workingset.c (a528910e12ec7ee203095eb1711468a66b9b60b0 → 449dd6984d0e47643c04c807f609dd56d48d5bcc)
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/writeback.h>

--- 237 unchanged lines hidden ---

/**
 * workingset_activation - note a page activation
 * @page: page that is being activated
 */
void workingset_activation(struct page *page)
{
	atomic_long_inc(&page_zone(page)->inactive_age);
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload. In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes. To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

struct list_lru workingset_shadow_nodes;

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long shadow_nodes;
	unsigned long max_nodes;
	unsigned long pages;

	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
	local_irq_disable();
	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
	local_irq_enable();

	pages = node_present_pages(sc->nid);
	/*
	 * Active cache pages are limited to 50% of memory, and shadow
	 * entries that represent a refault distance bigger than that
	 * do not have any effect. Limit the number of shadow nodes
	 * such that shadow entries do not exceed the number of active
	 * cache pages, assuming a worst-case node population density
	 * of 1/8th on average.
	 *
	 * On 64-bit with 7 radix_tree_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~2% of available memory:
	 *
	 * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
	 */
	max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
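	/*
	 * A worked instance of the limit above, assuming the usual
	 * RADIX_TREE_MAP_SHIFT of 6 (64 slots per node): the shift is
	 * 1 + 6 - 3 = 4, so max_nodes = pages / 16. At the assumed
	 * 1/8th density that is 8 shadow entries per node, for at
	 * most pages / 2 shadow entries, i.e. no more than the 50% of
	 * memory that active cache pages can occupy.
	 */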

	if (shadow_nodes <= max_nodes)
		return 0;

	return shadow_nodes - max_nodes;
}

static enum lru_status shadow_lru_isolate(struct list_head *item,
					  spinlock_t *lru_lock,
					  void *arg)
{
	struct address_space *mapping;
	struct radix_tree_node *node;
	unsigned int i;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the mapping->tree_lock and the
	 * lru_lock. Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the lru_lock pins any
	 * address_space that has radix tree nodes on the LRU.
	 *
	 * We can then safely transition to the mapping->tree_lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the lru_lock.
	 */
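	/*
	 * A sketch of the resulting lock order, following the nesting
	 * noted in count_shadow_nodes(): page cache paths take
	 * mapping->tree_lock and then the lru_lock, while this
	 * callback is entered holding only the lru_lock, so it has to
	 * trylock mapping->tree_lock below to avoid deadlocking
	 * against them.
	 */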

	node = container_of(item, struct radix_tree_node, private_list);
	mapping = node->private_data;

	/* Coming from the list, invert the lock order */
	if (!spin_trylock(&mapping->tree_lock)) {
		spin_unlock(lru_lock);
		ret = LRU_RETRY;
		goto out;
	}

	list_del_init(item);
	spin_unlock(lru_lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */

	BUG_ON(!node->count);
	BUG_ON(node->count & RADIX_TREE_COUNT_MASK);

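	/*
	 * Clear every remaining slot; each one must be an exceptional
	 * (shadow) entry, and the node's exceptional count and
	 * mapping->nrshadows are kept in step as they go away.
	 */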
	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
		if (node->slots[i]) {
			BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
			node->slots[i] = NULL;
			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
			BUG_ON(!mapping->nrshadows);
			mapping->nrshadows--;
		}
	}
	BUG_ON(node->count);
	inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
	if (!__radix_tree_delete_node(&mapping->page_tree, node))
		BUG();

	spin_unlock(&mapping->tree_lock);
	ret = LRU_REMOVED_RETRY;
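	/*
	 * The walk in scan_shadow_nodes() runs with IRQs disabled and
	 * the lru_lock held, so briefly enable IRQs and give up the
	 * CPU before retaking the lru_lock for the caller.
	 */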
out:
	local_irq_enable();
	cond_resched();
	local_irq_disable();
	spin_lock(lru_lock);
	return ret;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	unsigned long ret;

	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
	local_irq_disable();
	ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
				 shadow_lru_isolate, NULL, &sc->nr_to_scan);
	local_irq_enable();
	return ret;
}

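/*
 * The shrinker is NUMA aware: count_shadow_nodes() and
 * scan_shadow_nodes() are called per NUMA node, with sc->nid
 * selecting which node list of the list_lru to count or walk.
 */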
static struct shrinker workingset_shadow_shrinker = {
	.count_objects = count_shadow_nodes,
	.scan_objects = scan_shadow_nodes,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE,
};

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * mapping->tree_lock.
 */
static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
	int ret;

	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
	if (ret)
		goto err;
	ret = register_shrinker(&workingset_shadow_shrinker);
	if (ret)
		goto err_list_lru;
	return 0;
err_list_lru:
	list_lru_destroy(&workingset_shadow_nodes);
err:
	return ret;
}
module_init(workingset_init);