1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Workingset detection 4 * 5 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner 6 */ 7 8 #include <linux/memcontrol.h> 9 #include <linux/mm_inline.h> 10 #include <linux/writeback.h> 11 #include <linux/shmem_fs.h> 12 #include <linux/pagemap.h> 13 #include <linux/atomic.h> 14 #include <linux/module.h> 15 #include <linux/swap.h> 16 #include <linux/dax.h> 17 #include <linux/fs.h> 18 #include <linux/mm.h> 19 #include "swap_table.h" 20 #include "internal.h" 21 22 /* 23 * Double CLOCK lists 24 * 25 * Per node, two clock lists are maintained for file pages: the 26 * inactive and the active list. Freshly faulted pages start out at 27 * the head of the inactive list and page reclaim scans pages from the 28 * tail. Pages that are accessed multiple times on the inactive list 29 * are promoted to the active list, to protect them from reclaim, 30 * whereas active pages are demoted to the inactive list when the 31 * active list grows too big. 32 * 33 * fault ------------------------+ 34 * | 35 * +--------------+ | +-------------+ 36 * reclaim <- | inactive | <-+-- demotion | active | <--+ 37 * +--------------+ +-------------+ | 38 * | | 39 * +-------------- promotion ------------------+ 40 * 41 * 42 * Access frequency and refault distance 43 * 44 * A workload is thrashing when its pages are frequently used but they 45 * are evicted from the inactive list every time before another access 46 * would have promoted them to the active list. 47 * 48 * In cases where the average access distance between thrashing pages 49 * is bigger than the size of memory there is nothing that can be 50 * done - the thrashing set could never fit into memory under any 51 * circumstance. 52 * 53 * However, the average access distance could be bigger than the 54 * inactive list, yet smaller than the size of memory. In this case, 55 * the set could fit into memory if it weren't for the currently 56 * active pages - which may be used more, hopefully less frequently: 57 * 58 * +-memory available to cache-+ 59 * | | 60 * +-inactive------+-active----+ 61 * a b | c d e f g h i | J K L M N | 62 * +---------------+-----------+ 63 * 64 * It is prohibitively expensive to accurately track access frequency 65 * of pages. But a reasonable approximation can be made to measure 66 * thrashing on the inactive list, after which refaulting pages can be 67 * activated optimistically to compete with the existing active pages. 68 * 69 * Approximating inactive page access frequency - Observations: 70 * 71 * 1. When a page is accessed for the first time, it is added to the 72 * head of the inactive list, slides every existing inactive page 73 * towards the tail by one slot, and pushes the current tail page 74 * out of memory. 75 * 76 * 2. When a page is accessed for the second time, it is promoted to 77 * the active list, shrinking the inactive list by one slot. This 78 * also slides all inactive pages that were faulted into the cache 79 * more recently than the activated page towards the tail of the 80 * inactive list. 81 * 82 * Thus: 83 * 84 * 1. The sum of evictions and activations between any two points in 85 * time indicate the minimum number of inactive pages accessed in 86 * between. 87 * 88 * 2. Moving one inactive page N page slots towards the tail of the 89 * list requires at least N inactive page accesses. 90 * 91 * Combining these: 92 * 93 * 1. When a page is finally evicted from memory, the number of 94 * inactive pages accessed while the page was in cache is at least 95 * the number of page slots on the inactive list. 96 * 97 * 2. In addition, measuring the sum of evictions and activations (E) 98 * at the time of a page's eviction, and comparing it to another 99 * reading (R) at the time the page faults back into memory tells 100 * the minimum number of accesses while the page was not cached. 101 * This is called the refault distance. 102 * 103 * Because the first access of the page was the fault and the second 104 * access the refault, we combine the in-cache distance with the 105 * out-of-cache distance to get the complete minimum access distance 106 * of this page: 107 * 108 * NR_inactive + (R - E) 109 * 110 * And knowing the minimum access distance of a page, we can easily 111 * tell if the page would be able to stay in cache assuming all page 112 * slots in the cache were available: 113 * 114 * NR_inactive + (R - E) <= NR_inactive + NR_active 115 * 116 * If we have swap we should consider about NR_inactive_anon and 117 * NR_active_anon, so for page cache and anonymous respectively: 118 * 119 * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file 120 * + NR_inactive_anon + NR_active_anon 121 * 122 * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon 123 * + NR_inactive_file + NR_active_file 124 * 125 * Which can be further simplified to: 126 * 127 * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon 128 * 129 * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file 130 * 131 * Put into words, the refault distance (out-of-cache) can be seen as 132 * a deficit in inactive list space (in-cache). If the inactive list 133 * had (R - E) more page slots, the page would not have been evicted 134 * in between accesses, but activated instead. And on a full system, 135 * the only thing eating into inactive list space is active pages. 136 * 137 * 138 * Refaulting inactive pages 139 * 140 * All that is known about the active list is that the pages have been 141 * accessed more than once in the past. This means that at any given 142 * time there is actually a good chance that pages on the active list 143 * are no longer in active use. 144 * 145 * So when a refault distance of (R - E) is observed and there are at 146 * least (R - E) pages in the userspace workingset, the refaulting page 147 * is activated optimistically in the hope that (R - E) pages are actually 148 * used less frequently than the refaulting page - or even not used at 149 * all anymore. 150 * 151 * That means if inactive cache is refaulting with a suitable refault 152 * distance, we assume the cache workingset is transitioning and put 153 * pressure on the current workingset. 154 * 155 * If this is wrong and demotion kicks in, the pages which are truly 156 * used more frequently will be reactivated while the less frequently 157 * used once will be evicted from memory. 158 * 159 * But if this is right, the stale pages will be pushed out of memory 160 * and the used pages get to stay in cache. 161 * 162 * Refaulting active pages 163 * 164 * If on the other hand the refaulting pages have recently been 165 * deactivated, it means that the active list is no longer protecting 166 * actively used cache from reclaim. The cache is NOT transitioning to 167 * a different workingset; the existing workingset is thrashing in the 168 * space allocated to the page cache. 169 * 170 * 171 * Implementation 172 * 173 * For each node's LRU lists, a counter for inactive evictions and 174 * activations is maintained (node->nonresident_age). 175 * 176 * On eviction, a snapshot of this counter (along with some bits to 177 * identify the node) is stored in the now empty page cache 178 * slot of the evicted page. This is called a shadow entry. 179 * 180 * On cache misses for which there are shadow entries, an eligible 181 * refault distance will immediately activate the refaulting page. 182 */ 183 184 #define WORKINGSET_SHIFT 1 185 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ 186 WORKINGSET_SHIFT + NODES_SHIFT + \ 187 MEM_CGROUP_ID_SHIFT) 188 #define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT) 189 #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) 190 #define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON) 191 192 /* 193 * Eviction timestamps need to be able to cover the full range of 194 * actionable refaults. However, bits are tight in the xarray 195 * entry, and after storing the identifier for the lruvec there might 196 * not be enough left to represent every single actionable refault. In 197 * that case, we have to sacrifice granularity for distance, and group 198 * evictions into coarser buckets by shaving off lower timestamp bits. 199 */ 200 static unsigned int bucket_order[ANON_AND_FILE] __read_mostly; 201 202 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, 203 bool workingset, bool file) 204 { 205 eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON; 206 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; 207 eviction = (eviction << NODES_SHIFT) | pgdat->node_id; 208 eviction = (eviction << WORKINGSET_SHIFT) | workingset; 209 210 return xa_mk_value(eviction); 211 } 212 213 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, 214 unsigned long *evictionp, bool *workingsetp) 215 { 216 unsigned long entry = xa_to_value(shadow); 217 int memcgid, nid; 218 bool workingset; 219 220 workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); 221 entry >>= WORKINGSET_SHIFT; 222 nid = entry & ((1UL << NODES_SHIFT) - 1); 223 entry >>= NODES_SHIFT; 224 memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); 225 entry >>= MEM_CGROUP_ID_SHIFT; 226 227 *memcgidp = memcgid; 228 *pgdat = NODE_DATA(nid); 229 *evictionp = entry; 230 *workingsetp = workingset; 231 } 232 233 #ifdef CONFIG_LRU_GEN 234 235 static void *lru_gen_eviction(struct folio *folio) 236 { 237 int hist; 238 unsigned long token; 239 unsigned long min_seq; 240 struct lruvec *lruvec; 241 struct lru_gen_folio *lrugen; 242 int type = folio_is_file_lru(folio); 243 int delta = folio_nr_pages(folio); 244 int refs = folio_lru_refs(folio); 245 bool workingset = folio_test_workingset(folio); 246 int tier = lru_tier_from_refs(refs, workingset); 247 struct mem_cgroup *memcg; 248 struct pglist_data *pgdat = folio_pgdat(folio); 249 unsigned short memcg_id; 250 251 BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > 252 BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); 253 254 rcu_read_lock(); 255 memcg = folio_memcg(folio); 256 lruvec = mem_cgroup_lruvec(memcg, pgdat); 257 lrugen = &lruvec->lrugen; 258 min_seq = READ_ONCE(lrugen->min_seq[type]); 259 token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); 260 261 hist = lru_hist_from_seq(min_seq); 262 atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); 263 memcg_id = mem_cgroup_private_id(memcg); 264 rcu_read_unlock(); 265 266 return pack_shadow(memcg_id, pgdat, token, workingset, type); 267 } 268 269 /* 270 * Tests if the shadow entry is for a folio that was recently evicted. 271 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. 272 */ 273 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, 274 unsigned long *token, bool *workingset, bool file) 275 { 276 int memcg_id; 277 unsigned long max_seq; 278 struct mem_cgroup *memcg; 279 struct pglist_data *pgdat; 280 281 unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); 282 283 memcg = mem_cgroup_from_private_id(memcg_id); 284 *lruvec = mem_cgroup_lruvec(memcg, pgdat); 285 286 max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); 287 max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH; 288 289 return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; 290 } 291 292 static void lru_gen_refault(struct folio *folio, void *shadow) 293 { 294 bool recent; 295 int hist, tier, refs; 296 bool workingset; 297 unsigned long token; 298 struct lruvec *lruvec; 299 struct lru_gen_folio *lrugen; 300 int type = folio_is_file_lru(folio); 301 int delta = folio_nr_pages(folio); 302 303 rcu_read_lock(); 304 305 recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type); 306 if (lruvec != folio_lruvec(folio)) 307 goto unlock; 308 309 mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); 310 311 if (!recent) 312 goto unlock; 313 314 lrugen = &lruvec->lrugen; 315 316 hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); 317 refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; 318 tier = lru_tier_from_refs(refs, workingset); 319 320 atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); 321 322 if (workingset) { 323 /* 324 * see folio_add_lru(), where folio_set_active() is 325 * called for workingset folios 326 */ 327 if (lru_gen_in_fault()) 328 mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); 329 folio_set_workingset(folio); 330 mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); 331 } else 332 set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); 333 unlock: 334 rcu_read_unlock(); 335 } 336 337 #else /* !CONFIG_LRU_GEN */ 338 339 static void *lru_gen_eviction(struct folio *folio) 340 { 341 return NULL; 342 } 343 344 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, 345 unsigned long *token, bool *workingset, bool file) 346 { 347 return false; 348 } 349 350 static void lru_gen_refault(struct folio *folio, void *shadow) 351 { 352 } 353 354 #endif /* CONFIG_LRU_GEN */ 355 356 /** 357 * workingset_age_nonresident - age non-resident entries as LRU ages 358 * @lruvec: the lruvec that was aged 359 * @nr_pages: the number of pages to count 360 * 361 * As in-memory pages are aged, non-resident pages need to be aged as 362 * well, in order for the refault distances later on to be comparable 363 * to the in-memory dimensions. This function allows reclaim and LRU 364 * operations to drive the non-resident aging along in parallel. 365 */ 366 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) 367 { 368 /* 369 * Reclaiming a cgroup means reclaiming all its children in a 370 * round-robin fashion. That means that each cgroup has an LRU 371 * order that is composed of the LRU orders of its child 372 * cgroups; and every page has an LRU position not just in the 373 * cgroup that owns it, but in all of that group's ancestors. 374 * 375 * So when the physical inactive list of a leaf cgroup ages, 376 * the virtual inactive lists of all its parents, including 377 * the root cgroup's, age as well. 378 */ 379 do { 380 atomic_long_add(nr_pages, &lruvec->nonresident_age); 381 } while ((lruvec = parent_lruvec(lruvec))); 382 } 383 384 /** 385 * workingset_eviction - note the eviction of a folio from memory 386 * @target_memcg: the cgroup that is causing the reclaim 387 * @folio: the folio being evicted 388 * 389 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place 390 * of the evicted @folio so that a later refault can be detected. 391 */ 392 void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) 393 { 394 struct pglist_data *pgdat = folio_pgdat(folio); 395 int file = folio_is_file_lru(folio); 396 unsigned long eviction; 397 struct lruvec *lruvec; 398 int memcgid; 399 400 /* Folio is fully exclusive and pins folio's memory cgroup pointer */ 401 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 402 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 403 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 404 405 if (lru_gen_enabled()) 406 return lru_gen_eviction(folio); 407 408 lruvec = mem_cgroup_lruvec(target_memcg, pgdat); 409 /* XXX: target_memcg can be NULL, go through lruvec */ 410 memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); 411 eviction = atomic_long_read(&lruvec->nonresident_age); 412 eviction >>= bucket_order[file]; 413 workingset_age_nonresident(lruvec, folio_nr_pages(folio)); 414 return pack_shadow(memcgid, pgdat, eviction, 415 folio_test_workingset(folio), file); 416 } 417 418 /** 419 * workingset_test_recent - tests if the shadow entry is for a folio that was 420 * recently evicted. Also fills in @workingset with the value unpacked from 421 * shadow. 422 * @shadow: the shadow entry to be tested. 423 * @file: whether the corresponding folio is from the file lru. 424 * @workingset: where the workingset value unpacked from shadow should 425 * be stored. 426 * @flush: whether to flush cgroup rstat. 427 * 428 * Return: true if the shadow is for a recently evicted folio; false otherwise. 429 */ 430 bool workingset_test_recent(void *shadow, bool file, bool *workingset, 431 bool flush) 432 { 433 struct mem_cgroup *eviction_memcg; 434 struct lruvec *eviction_lruvec; 435 unsigned long refault_distance; 436 unsigned long workingset_size; 437 unsigned long refault; 438 int memcgid; 439 struct pglist_data *pgdat; 440 unsigned long eviction; 441 442 if (lru_gen_enabled()) { 443 bool recent; 444 445 rcu_read_lock(); 446 recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, 447 workingset, file); 448 rcu_read_unlock(); 449 return recent; 450 } 451 452 rcu_read_lock(); 453 unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); 454 eviction <<= bucket_order[file]; 455 456 /* 457 * Look up the memcg associated with the stored ID. It might 458 * have been deleted since the folio's eviction. 459 * 460 * Note that in rare events the ID could have been recycled 461 * for a new cgroup that refaults a shared folio. This is 462 * impossible to tell from the available data. However, this 463 * should be a rare and limited disturbance, and activations 464 * are always speculative anyway. Ultimately, it's the aging 465 * algorithm's job to shake out the minimum access frequency 466 * for the active cache. 467 * 468 * XXX: On !CONFIG_MEMCG, this will always return NULL; it 469 * would be better if the root_mem_cgroup existed in all 470 * configurations instead. 471 */ 472 eviction_memcg = mem_cgroup_from_private_id(memcgid); 473 if (!mem_cgroup_tryget(eviction_memcg)) 474 eviction_memcg = NULL; 475 rcu_read_unlock(); 476 477 if (!mem_cgroup_disabled() && !eviction_memcg) 478 return false; 479 /* 480 * Flush stats (and potentially sleep) outside the RCU read section. 481 * 482 * Note that workingset_test_recent() itself might be called in RCU read 483 * section (for e.g, in cachestat) - these callers need to skip flushing 484 * stats (via the flush argument). 485 * 486 * XXX: With per-memcg flushing and thresholding, is ratelimiting 487 * still needed here? 488 */ 489 if (flush) 490 mem_cgroup_flush_stats_ratelimited(eviction_memcg); 491 492 eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); 493 refault = atomic_long_read(&eviction_lruvec->nonresident_age); 494 495 /* 496 * Calculate the refault distance 497 * 498 * The unsigned subtraction here gives an accurate distance 499 * across nonresident_age overflows in most cases. There is a 500 * special case: usually, shadow entries have a short lifetime 501 * and are either refaulted or reclaimed along with the inode 502 * before they get too old. But it is not impossible for the 503 * nonresident_age to lap a shadow entry in the field, which 504 * can then result in a false small refault distance, leading 505 * to a false activation should this old entry actually 506 * refault again. However, earlier kernels used to deactivate 507 * unconditionally with *every* reclaim invocation for the 508 * longest time, so the occasional inappropriate activation 509 * leading to pressure on the active list is not a problem. 510 */ 511 refault_distance = ((refault - eviction) & 512 (file ? EVICTION_MASK : EVICTION_MASK_ANON)); 513 514 /* 515 * Compare the distance to the existing workingset size. We 516 * don't activate pages that couldn't stay resident even if 517 * all the memory was available to the workingset. Whether 518 * workingset competition needs to consider anon or not depends 519 * on having free swap space. 520 */ 521 workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); 522 if (!file) { 523 workingset_size += lruvec_page_state(eviction_lruvec, 524 NR_INACTIVE_FILE); 525 } 526 if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { 527 workingset_size += lruvec_page_state(eviction_lruvec, 528 NR_ACTIVE_ANON); 529 if (file) { 530 workingset_size += lruvec_page_state(eviction_lruvec, 531 NR_INACTIVE_ANON); 532 } 533 } 534 535 mem_cgroup_put(eviction_memcg); 536 return refault_distance <= workingset_size; 537 } 538 539 /** 540 * workingset_refault - Evaluate the refault of a previously evicted folio. 541 * @folio: The freshly allocated replacement folio. 542 * @shadow: Shadow entry of the evicted folio. 543 * 544 * Calculates and evaluates the refault distance of the previously 545 * evicted folio in the context of the node and the memcg whose memory 546 * pressure caused the eviction. 547 */ 548 void workingset_refault(struct folio *folio, void *shadow) 549 { 550 bool file = folio_is_file_lru(folio); 551 struct mem_cgroup *memcg; 552 struct lruvec *lruvec; 553 bool workingset; 554 long nr; 555 556 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 557 558 if (lru_gen_enabled()) { 559 lru_gen_refault(folio, shadow); 560 return; 561 } 562 563 /* 564 * The activation decision for this folio is made at the level 565 * where the eviction occurred, as that is where the LRU order 566 * during folio reclaim is being determined. 567 * 568 * However, the cgroup that will own the folio is the one that 569 * is actually experiencing the refault event. Make sure the folio is 570 * locked to guarantee folio_memcg() stability throughout. 571 */ 572 nr = folio_nr_pages(folio); 573 memcg = get_mem_cgroup_from_folio(folio); 574 lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio)); 575 mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); 576 577 if (!workingset_test_recent(shadow, file, &workingset, true)) 578 goto out; 579 580 folio_set_active(folio); 581 workingset_age_nonresident(lruvec, nr); 582 mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); 583 584 /* Folio was active prior to eviction */ 585 if (workingset) { 586 folio_set_workingset(folio); 587 /* 588 * XXX: Move to folio_add_lru() when it supports new vs 589 * putback 590 */ 591 lru_note_cost_refault(folio); 592 mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); 593 } 594 out: 595 mem_cgroup_put(memcg); 596 } 597 598 /** 599 * workingset_activation - note a page activation 600 * @folio: Folio that is being activated. 601 */ 602 void workingset_activation(struct folio *folio) 603 { 604 /* 605 * Filter non-memcg pages here, e.g. unmap can call 606 * mark_page_accessed() on VDSO pages. 607 */ 608 if (mem_cgroup_disabled() || folio_memcg_charged(folio)) { 609 rcu_read_lock(); 610 workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); 611 rcu_read_unlock(); 612 } 613 } 614 615 /* 616 * Shadow entries reflect the share of the working set that does not 617 * fit into memory, so their number depends on the access pattern of 618 * the workload. In most cases, they will refault or get reclaimed 619 * along with the inode, but a (malicious) workload that streams 620 * through files with a total size several times that of available 621 * memory, while preventing the inodes from being reclaimed, can 622 * create excessive amounts of shadow nodes. To keep a lid on this, 623 * track shadow nodes and reclaim them when they grow way past the 624 * point where they would still be useful. 625 */ 626 627 struct list_lru shadow_nodes; 628 629 void workingset_update_node(struct xa_node *node) 630 { 631 struct page *page = virt_to_page(node); 632 633 /* 634 * Track non-empty nodes that contain only shadow entries; 635 * unlink those that contain pages or are being freed. 636 * 637 * Avoid acquiring the list_lru lock when the nodes are 638 * already where they should be. The list_empty() test is safe 639 * as node->private_list is protected by the i_pages lock. 640 */ 641 lockdep_assert_held(&node->array->xa_lock); 642 643 if (node->count && node->count == node->nr_values) { 644 if (list_empty(&node->private_list)) { 645 list_lru_add_obj(&shadow_nodes, &node->private_list); 646 __inc_node_page_state(page, WORKINGSET_NODES); 647 } 648 } else { 649 if (!list_empty(&node->private_list)) { 650 list_lru_del_obj(&shadow_nodes, &node->private_list); 651 __dec_node_page_state(page, WORKINGSET_NODES); 652 } 653 } 654 } 655 656 static unsigned long count_shadow_nodes(struct shrinker *shrinker, 657 struct shrink_control *sc) 658 { 659 unsigned long max_nodes; 660 unsigned long nodes; 661 unsigned long pages; 662 663 nodes = list_lru_shrink_count(&shadow_nodes, sc); 664 if (!nodes) 665 return SHRINK_EMPTY; 666 667 /* 668 * Approximate a reasonable limit for the nodes 669 * containing shadow entries. We don't need to keep more 670 * shadow entries than possible pages on the active list, 671 * since refault distances bigger than that are dismissed. 672 * 673 * The size of the active list converges toward 100% of 674 * overall page cache as memory grows, with only a tiny 675 * inactive list. Assume the total cache size for that. 676 * 677 * Nodes might be sparsely populated, with only one shadow 678 * entry in the extreme case. Obviously, we cannot keep one 679 * node for every eligible shadow entry, so compromise on a 680 * worst-case density of 1/8th. Below that, not all eligible 681 * refaults can be detected anymore. 682 * 683 * On 64-bit with 7 xa_nodes per page and 64 slots 684 * each, this will reclaim shadow entries when they consume 685 * ~1.8% of available memory: 686 * 687 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE 688 */ 689 #ifdef CONFIG_MEMCG 690 if (sc->memcg) { 691 struct lruvec *lruvec; 692 int i; 693 694 mem_cgroup_flush_stats_ratelimited(sc->memcg); 695 lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); 696 697 for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) 698 pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1); 699 700 pages += lruvec_page_state_local( 701 lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; 702 pages += lruvec_page_state_local( 703 lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; 704 } else 705 #endif 706 pages = node_present_pages(sc->nid); 707 708 max_nodes = pages >> (XA_CHUNK_SHIFT - 3); 709 710 if (nodes <= max_nodes) 711 return 0; 712 return nodes - max_nodes; 713 } 714 715 static enum lru_status shadow_lru_isolate(struct list_head *item, 716 struct list_lru_one *lru, 717 void *arg) __must_hold(lru->lock) 718 { 719 struct xa_node *node = container_of(item, struct xa_node, private_list); 720 struct address_space *mapping; 721 int ret; 722 723 /* 724 * Page cache insertions and deletions synchronously maintain 725 * the shadow node LRU under the i_pages lock and the 726 * &lru->lock. Because the page cache tree is emptied before 727 * the inode can be destroyed, holding the &lru->lock pins any 728 * address_space that has nodes on the LRU. 729 * 730 * We can then safely transition to the i_pages lock to 731 * pin only the address_space of the particular node we want 732 * to reclaim, take the node off-LRU, and drop the &lru->lock. 733 */ 734 735 mapping = container_of(node->array, struct address_space, i_pages); 736 737 /* Coming from the list, invert the lock order */ 738 if (!xa_trylock(&mapping->i_pages)) { 739 spin_unlock_irq(&lru->lock); 740 ret = LRU_RETRY; 741 goto out; 742 } 743 744 /* For page cache we need to hold i_lock */ 745 if (mapping->host != NULL) { 746 if (!spin_trylock(&mapping->host->i_lock)) { 747 xa_unlock(&mapping->i_pages); 748 spin_unlock_irq(&lru->lock); 749 ret = LRU_RETRY; 750 goto out; 751 } 752 } 753 754 list_lru_isolate(lru, item); 755 __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); 756 757 spin_unlock(&lru->lock); 758 759 /* 760 * The nodes should only contain one or more shadow entries, 761 * no pages, so we expect to be able to remove them all and 762 * delete and free the empty node afterwards. 763 */ 764 if (WARN_ON_ONCE(!node->nr_values)) 765 goto out_invalid; 766 if (WARN_ON_ONCE(node->count != node->nr_values)) 767 goto out_invalid; 768 xa_delete_node(node, workingset_update_node); 769 mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1); 770 771 out_invalid: 772 xa_unlock_irq(&mapping->i_pages); 773 if (mapping->host != NULL) { 774 if (mapping_shrinkable(mapping)) 775 inode_lru_list_add(mapping->host); 776 spin_unlock(&mapping->host->i_lock); 777 } 778 ret = LRU_REMOVED_RETRY; 779 out: 780 cond_resched(); 781 return ret; 782 } 783 784 static unsigned long scan_shadow_nodes(struct shrinker *shrinker, 785 struct shrink_control *sc) 786 { 787 /* list_lru lock nests inside the IRQ-safe i_pages lock */ 788 return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, 789 NULL); 790 } 791 792 /* 793 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe 794 * i_pages lock. 795 */ 796 static struct lock_class_key shadow_nodes_key; 797 798 static int __init workingset_init(void) 799 { 800 unsigned int timestamp_bits, timestamp_bits_anon; 801 struct shrinker *workingset_shadow_shrinker; 802 unsigned int max_order; 803 int ret = -ENOMEM; 804 805 BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); 806 /* 807 * Calculate the eviction bucket size to cover the longest 808 * actionable refault distance, which is currently half of 809 * memory (totalram_pages/2). However, memory hotplug may add 810 * some more pages at runtime, so keep working with up to 811 * double the initial memory by using totalram_pages as-is. 812 */ 813 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; 814 timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON; 815 max_order = fls_long(totalram_pages() - 1); 816 if (max_order > (BITS_PER_LONG - EVICTION_SHIFT)) 817 bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits; 818 if (max_order > timestamp_bits_anon) 819 bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon; 820 pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n", 821 timestamp_bits, timestamp_bits_anon, max_order, 822 bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]); 823 824 workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | 825 SHRINKER_MEMCG_AWARE, 826 "mm-shadow"); 827 if (!workingset_shadow_shrinker) 828 goto err; 829 830 ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker, 831 &shadow_nodes_key); 832 if (ret) 833 goto err_list_lru; 834 835 workingset_shadow_shrinker->count_objects = count_shadow_nodes; 836 workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; 837 /* ->count reports only fully expendable nodes */ 838 workingset_shadow_shrinker->seeks = 0; 839 840 shrinker_register(workingset_shadow_shrinker); 841 return 0; 842 err_list_lru: 843 shrinker_free(workingset_shadow_shrinker); 844 err: 845 return ret; 846 } 847 module_init(workingset_init); 848