// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "swap_table.h"
#include "internal.h"

/*
 * Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list. Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail. Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 * Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory. In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages. But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot. This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * If we have swap, we should also take NR_inactive_anon and
 * NR_active_anon into account, so for page cache and anonymous pages
 * respectively:
 *
 *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
 *   + NR_inactive_anon + NR_active_anon
 *
 *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
 *   + NR_inactive_file + NR_active_file
 *
 * Which can be further simplified to:
 *
 *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
 *
 *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache). If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead. And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 * Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past. This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) pages in the userspace workingset, the refaulting page
 * is activated optimistically in the hope that (R - E) pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 * Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
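 *
 * A worked example, with made-up numbers: assume a node with
 * NR_inactive_file = 1000 and NR_active_file = 600, and no free swap.
 * A file page is evicted while the eviction/activation counter reads
 * E = 5000, and it refaults once the counter has advanced to R = 5400.
 * The refault distance R - E = 400 does not exceed NR_active_file, so
 * the page could have stayed resident had the inactive list been 400
 * slots bigger; the refault is therefore activated. Had the counter
 * advanced to R = 5700 instead, the distance of 700 would exceed
 * NR_active_file and the refaulting page would start out on the
 * inactive list like any other cold fault.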
 *
 *
 * Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page. This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

#define WORKINGSET_SHIFT	1
#define EVICTION_SHIFT		((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
				 WORKINGSET_SHIFT + NODES_SHIFT +	\
				 MEM_CGROUP_ID_SHIFT)
#define EVICTION_SHIFT_ANON	(EVICTION_SHIFT + SWAP_COUNT_SHIFT)
#define EVICTION_MASK		(~0UL >> EVICTION_SHIFT)
#define EVICTION_MASK_ANON	(~0UL >> EVICTION_SHIFT_ANON)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;

static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
			 bool workingset, bool file)
{
	eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
	eviction = (eviction << WORKINGSET_SHIFT) | workingset;

	return xa_mk_value(eviction);
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
			  unsigned long *evictionp, bool *workingsetp)
{
	unsigned long entry = xa_to_value(shadow);
	int memcgid, nid;
	bool workingset;

	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
	entry >>= WORKINGSET_SHIFT;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;

	*memcgidp = memcgid;
	*pgdat = NODE_DATA(nid);
	*evictionp = entry;
	*workingsetp = workingset;
}
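
/*
 * For illustration, the layout that pack_shadow() produces inside the
 * xarray value, from least to most significant bits (the actual field
 * widths depend on the kernel configuration):
 *
 *   | eviction timestamp | memcg ID            | node ID     | workingset |
 *   |  (remaining bits)  | MEM_CGROUP_ID_SHIFT | NODES_SHIFT |   1 bit    |
 *
 * xa_mk_value() additionally claims a bit of the word to tag it as a
 * value entry, which is what the (BITS_PER_LONG - BITS_PER_XA_VALUE)
 * term in EVICTION_SHIFT accounts for. unpack_shadow() simply reverses
 * these shifts.
 */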

#ifdef CONFIG_LRU_GEN

static void *lru_gen_eviction(struct folio *folio)
{
	int hist;
	unsigned long token;
	unsigned long min_seq;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);
	int refs = folio_lru_refs(folio);
	bool workingset = folio_test_workingset(folio);
	int tier = lru_tier_from_refs(refs, workingset);
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = folio_pgdat(folio);

	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
		     BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	lrugen = &lruvec->lrugen;
	min_seq = READ_ONCE(lrugen->min_seq[type]);
	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

	hist = lru_hist_from_seq(min_seq);
	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

	return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type);
}

/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset, bool file)
{
	int memcg_id;
	unsigned long max_seq;
	struct mem_cgroup *memcg;
	struct pglist_data *pgdat;

	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);

	memcg = mem_cgroup_from_private_id(memcg_id);
	*lruvec = mem_cgroup_lruvec(memcg, pgdat);

	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
	max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;

	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
	bool recent;
	int hist, tier, refs;
	bool workingset;
	unsigned long token;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);

	rcu_read_lock();

	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
	if (lruvec != folio_lruvec(folio))
		goto unlock;

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

	if (!recent)
		goto unlock;

	lrugen = &lruvec->lrugen;

	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
	tier = lru_tier_from_refs(refs, workingset);

	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);

	/* see folio_add_lru() where folio_set_active() will be called */
	if (lru_gen_in_fault())
		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

	if (workingset) {
		folio_set_workingset(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
	} else
		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
	rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

static void *lru_gen_eviction(struct folio *folio)
{
	return NULL;
}

static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset, bool file)
{
	return false;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * As in-memory pages are aged, non-resident pages need to be aged as
 * well, in order for the refault distances later on to be comparable
 * to the in-memory dimensions. This function allows reclaim and LRU
 * operations to drive the non-resident aging along in parallel.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
	/*
	 * Reclaiming a cgroup means reclaiming all its children in a
	 * round-robin fashion. That means that each cgroup has an LRU
	 * order that is composed of the LRU orders of its child
	 * cgroups; and every page has an LRU position not just in the
	 * cgroup that owns it, but in all of that group's ancestors.
	 *
	 * So when the physical inactive list of a leaf cgroup ages,
	 * the virtual inactive lists of all its parents, including
	 * the root cgroup's, age as well.
	 */
	do {
		atomic_long_add(nr_pages, &lruvec->nonresident_age);
	} while ((lruvec = parent_lruvec(lruvec)));
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @target_memcg: the cgroup that is causing the reclaim
 * @folio: the folio being evicted
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
	struct pglist_data *pgdat = folio_pgdat(folio);
	int file = folio_is_file_lru(folio);
	unsigned long eviction;
	struct lruvec *lruvec;
	int memcgid;

	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (lru_gen_enabled())
		return lru_gen_eviction(folio);

	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
	/* XXX: target_memcg can be NULL, go through lruvec */
	memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
	eviction = atomic_long_read(&lruvec->nonresident_age);
	eviction >>= bucket_order[file];
	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	return pack_shadow(memcgid, pgdat, eviction,
			folio_test_workingset(folio), file);
}

/**
 * workingset_test_recent - tests if the shadow entry is for a folio that was
 * recently evicted. Also fills in @workingset with the value unpacked from
 * shadow.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 * @flush: whether to flush cgroup rstat.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
				bool flush)
{
	struct mem_cgroup *eviction_memcg;
	struct lruvec *eviction_lruvec;
	unsigned long refault_distance;
	unsigned long workingset_size;
	unsigned long refault;
	int memcgid;
	struct pglist_data *pgdat;
	unsigned long eviction;

	if (lru_gen_enabled()) {
		bool recent;

		rcu_read_lock();
		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
					     workingset, file);
		rcu_read_unlock();
		return recent;
	}

	rcu_read_lock();
	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
	eviction <<= bucket_order[file];

	/*
	 * Look up the memcg associated with the stored ID. It might
	 * have been deleted since the folio's eviction.
	 *
	 * Note that in rare events the ID could have been recycled
	 * for a new cgroup that refaults a shared folio. This is
	 * impossible to tell from the available data. However, this
	 * should be a rare and limited disturbance, and activations
	 * are always speculative anyway. Ultimately, it's the aging
	 * algorithm's job to shake out the minimum access frequency
	 * for the active cache.
	 *
	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
	 * would be better if the root_mem_cgroup existed in all
	 * configurations instead.
	 */
	eviction_memcg = mem_cgroup_from_private_id(memcgid);
	if (!mem_cgroup_tryget(eviction_memcg))
		eviction_memcg = NULL;
	rcu_read_unlock();

	if (!mem_cgroup_disabled() && !eviction_memcg)
		return false;
	/*
	 * Flush stats (and potentially sleep) outside the RCU read section.
	 *
	 * Note that workingset_test_recent() itself might be called in an RCU
	 * read section (e.g. in cachestat) - these callers need to skip
	 * flushing stats (via the flush argument).
	 *
	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
	 * still needed here?
	 */
	if (flush)
		mem_cgroup_flush_stats_ratelimited(eviction_memcg);

	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
	refault = atomic_long_read(&eviction_lruvec->nonresident_age);

	/*
	 * Calculate the refault distance
	 *
	 * The unsigned subtraction here gives an accurate distance
	 * across nonresident_age overflows in most cases. There is a
	 * special case: usually, shadow entries have a short lifetime
	 * and are either refaulted or reclaimed along with the inode
	 * before they get too old. But it is not impossible for the
	 * nonresident_age to lap a shadow entry in the field, which
	 * can then result in a false small refault distance, leading
	 * to a false activation should this old entry actually
	 * refault again. However, earlier kernels used to deactivate
	 * unconditionally with *every* reclaim invocation for the
	 * longest time, so the occasional inappropriate activation
	 * leading to pressure on the active list is not a problem.
	 */
	refault_distance = ((refault - eviction) &
			(file ? EVICTION_MASK : EVICTION_MASK_ANON));

	/*
	 * Compare the distance to the existing workingset size. We
	 * don't activate pages that couldn't stay resident even if
	 * all the memory was available to the workingset. Whether
	 * workingset competition needs to consider anon or not depends
	 * on having free swap space.
	 */
	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
	if (!file) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_FILE);
	}
	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_ACTIVE_ANON);
		if (file) {
			workingset_size += lruvec_page_state(eviction_lruvec,
							     NR_INACTIVE_ANON);
		}
	}

	mem_cgroup_put(eviction_memcg);
	return refault_distance <= workingset_size;
}
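
/*
 * To illustrate the wraparound arithmetic above with hypothetical
 * numbers: suppose the usable timestamp space were only 16 bits wide
 * (EVICTION_MASK == 0xffff) and bucket_order were 0. A folio evicted
 * with a stored timestamp of 0xfff0 that refaults after the counter
 * has advanced by 0x20 unpacks refault such that the subtraction
 * (refault - eviction) & 0xffff still yields the correct distance of
 * 0x20, even though the counter wrapped within the masked space.
 */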

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted folio in the context of the node and the memcg whose memory
 * pressure caused the eviction.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
	bool file = folio_is_file_lru(folio);
	struct pglist_data *pgdat;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	bool workingset;
	long nr;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	/*
	 * The activation decision for this folio is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during folio reclaim is being determined.
	 *
	 * However, the cgroup that will own the folio is the one that
	 * is actually experiencing the refault event. Make sure the folio is
	 * locked to guarantee folio_memcg() stability throughout.
	 */
	nr = folio_nr_pages(folio);
	memcg = folio_memcg(folio);
	pgdat = folio_pgdat(folio);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

	if (!workingset_test_recent(shadow, file, &workingset, true))
		return;

	folio_set_active(folio);
	workingset_age_nonresident(lruvec, nr);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

	/* Folio was active prior to eviction */
	if (workingset) {
		folio_set_workingset(folio);
		/*
		 * XXX: Move to folio_add_lru() when it supports new vs
		 * putback
		 */
		lru_note_cost_refault(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
	}
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 */
void workingset_activation(struct folio *folio)
{
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 */
	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload. In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes. To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
	struct page *page = virt_to_page(node);

	/*
	 * Track non-empty nodes that contain only shadow entries;
	 * unlink those that contain pages or are being freed.
	 *
	 * Avoid acquiring the list_lru lock when the nodes are
	 * already where they should be. The list_empty() test is safe
	 * as node->private_list is protected by the i_pages lock.
	 */
	lockdep_assert_held(&node->array->xa_lock);

	if (node->count && node->count == node->nr_values) {
		if (list_empty(&node->private_list)) {
			list_lru_add_obj(&shadow_nodes, &node->private_list);
			__inc_node_page_state(page, WORKINGSET_NODES);
		}
	} else {
		if (!list_empty(&node->private_list)) {
			list_lru_del_obj(&shadow_nodes, &node->private_list);
			__dec_node_page_state(page, WORKINGSET_NODES);
		}
	}
}

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long max_nodes;
	unsigned long nodes;
	unsigned long pages;

	nodes = list_lru_shrink_count(&shadow_nodes, sc);
	if (!nodes)
		return SHRINK_EMPTY;

	/*
	 * Approximate a reasonable limit for the nodes
	 * containing shadow entries. We don't need to keep more
	 * shadow entries than possible pages on the active list,
	 * since refault distances bigger than that are dismissed.
	 *
	 * The size of the active list converges toward 100% of
	 * overall page cache as memory grows, with only a tiny
	 * inactive list. Assume the total cache size for that.
	 *
	 * Nodes might be sparsely populated, with only one shadow
	 * entry in the extreme case. Obviously, we cannot keep one
	 * node for every eligible shadow entry, so compromise on a
	 * worst-case density of 1/8th. Below that, not all eligible
	 * refaults can be detected anymore.
	 *
	 * On 64-bit with 7 xa_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~1.8% of available memory:
	 *
	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
	 */
#ifdef CONFIG_MEMCG
	if (sc->memcg) {
		struct lruvec *lruvec;
		int i;

		mem_cgroup_flush_stats_ratelimited(sc->memcg);
		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
			pages += lruvec_page_state_local(lruvec,
							 NR_LRU_BASE + i);
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
	} else
#endif
		pages = node_present_pages(sc->nid);

	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

	if (nodes <= max_nodes)
		return 0;
	return nodes - max_nodes;
}
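
/*
 * A back-of-the-envelope check of the limit above, with illustrative
 * numbers (assuming 4K pages, XA_CHUNK_SHIFT == 6, and the ~7 xa_nodes
 * per page cited in the comment): a node with 4 GiB of present memory
 * has about one million pages, so max_nodes = 2^20 >> 3 = 131072
 * nodes. At ~7 nodes per page that is roughly 18K pages' worth of
 * xa_nodes, i.e. about 1/56th (~1.8%) of the node's memory, matching
 * the estimate in count_shadow_nodes().
 */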

static enum lru_status shadow_lru_isolate(struct list_head *item,
					  struct list_lru_one *lru,
					  void *arg) __must_hold(lru->lock)
{
	struct xa_node *node = container_of(item, struct xa_node, private_list);
	struct address_space *mapping;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the i_pages lock and the
	 * &lru->lock. Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the &lru->lock pins any
	 * address_space that has nodes on the LRU.
	 *
	 * We can then safely transition to the i_pages lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
	 */

	mapping = container_of(node->array, struct address_space, i_pages);

	/* Coming from the list, invert the lock order */
	if (!xa_trylock(&mapping->i_pages)) {
		spin_unlock_irq(&lru->lock);
		ret = LRU_RETRY;
		goto out;
	}

	/* For page cache we need to hold i_lock */
	if (mapping->host != NULL) {
		if (!spin_trylock(&mapping->host->i_lock)) {
			xa_unlock(&mapping->i_pages);
			spin_unlock_irq(&lru->lock);
			ret = LRU_RETRY;
			goto out;
		}
	}

	list_lru_isolate(lru, item);
	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);

	spin_unlock(&lru->lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */
	if (WARN_ON_ONCE(!node->nr_values))
		goto out_invalid;
	if (WARN_ON_ONCE(node->count != node->nr_values))
		goto out_invalid;
	xa_delete_node(node, workingset_update_node);
	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);

out_invalid:
	xa_unlock_irq(&mapping->i_pages);
	if (mapping->host != NULL) {
		if (mapping_shrinkable(mapping))
			inode_lru_list_add(mapping->host);
		spin_unlock(&mapping->host->i_lock);
	}
	ret = LRU_REMOVED_RETRY;
out:
	cond_resched();
	return ret;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	/* list_lru lock nests inside the IRQ-safe i_pages lock */
	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
					NULL);
}

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 */
static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
	unsigned int timestamp_bits, timestamp_bits_anon;
	struct shrinker *workingset_shadow_shrinker;
	unsigned int max_order;
	int ret = -ENOMEM;

	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
	/*
	 * Calculate the eviction bucket size to cover the longest
	 * actionable refault distance, which is currently half of
	 * memory (totalram_pages/2). However, memory hotplug may add
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
	max_order = fls_long(totalram_pages() - 1);
	if (max_order > timestamp_bits)
		bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
	if (max_order > timestamp_bits_anon)
		bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
	pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
		timestamp_bits, timestamp_bits_anon, max_order,
		bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);

	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						    SHRINKER_MEMCG_AWARE,
						    "mm-shadow");
	if (!workingset_shadow_shrinker)
		goto err;

	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
				      &shadow_nodes_key);
	if (ret)
		goto err_list_lru;

	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
	/* ->count reports only fully expendable nodes */
	workingset_shadow_shrinker->seeks = 0;

	shrinker_register(workingset_shadow_shrinker);
	return 0;
err_list_lru:
	shrinker_free(workingset_shadow_shrinker);
err:
	return ret;
}
module_init(workingset_init);