// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "swap_table.h"
#include "internal.h"

/*
 *        Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list. Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail. Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *        Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory. In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages. But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot. This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *      NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * If swap is available, anonymous pages compete for the same space,
 * so NR_inactive_anon and NR_active_anon need to be considered as
 * well. For page cache and anonymous pages respectively:
 *
 *      NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
 *                                 + NR_inactive_anon + NR_active_anon
 *
 *      NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
 *                                 + NR_inactive_file + NR_active_file
 *
 * Which can be further simplified to:
 *
 *      (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
 *
 *      (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache). If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead. And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *        Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past. This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) pages in the userspace workingset, the refaulting page
 * is activated optimistically in the hope that (R - E) pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *        Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
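 *
 * To make the comparison concrete, a worked example with made-up
 * numbers: suppose a file page is evicted when the eviction/activation
 * counter reads E = 1000 and refaults when it reads R = 1300, for a
 * refault distance of 300. If there are 500 active file pages at
 * refault time (ignoring the anon lists for simplicity), then 300
 * additional inactive slots would have kept the page resident, so
 * (R - E) <= NR_active_file holds and the refaulting page is
 * activated.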
 *
 *
 *        Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (lruvec->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page. This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
                        WORKINGSET_SHIFT + NODES_SHIFT + \
                        MEM_CGROUP_ID_SHIFT)
#define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
#define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;

static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
                         bool workingset, bool file)
{
        eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
        eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
        eviction = (eviction << WORKINGSET_SHIFT) | workingset;

        return xa_mk_value(eviction);
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
                          unsigned long *evictionp, bool *workingsetp)
{
        unsigned long entry = xa_to_value(shadow);
        int memcgid, nid;
        bool workingset;

        workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
        entry >>= WORKINGSET_SHIFT;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
        entry >>= MEM_CGROUP_ID_SHIFT;

        *memcgidp = memcgid;
        *pgdat = NODE_DATA(nid);
        *evictionp = entry;
        *workingsetp = workingset;
}

#ifdef CONFIG_LRU_GEN

static void *lru_gen_eviction(struct folio *folio)
{
        int hist;
        unsigned long token;
        unsigned long min_seq;
        struct lruvec *lruvec;
        struct lru_gen_folio *lrugen;
        int type = folio_is_file_lru(folio);
        int delta = folio_nr_pages(folio);
        int refs = folio_lru_refs(folio);
        bool workingset = folio_test_workingset(folio);
        int tier = lru_tier_from_refs(refs, workingset);
        struct mem_cgroup *memcg;
        struct pglist_data *pgdat = folio_pgdat(folio);
        unsigned short memcg_id;

        BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
                     BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));

        rcu_read_lock();
        memcg = folio_memcg(folio);
        lruvec = mem_cgroup_lruvec(memcg, pgdat);
        lrugen = &lruvec->lrugen;
        min_seq = READ_ONCE(lrugen->min_seq[type]);
        token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

        hist = lru_hist_from_seq(min_seq);
        atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
        memcg_id = mem_cgroup_private_id(memcg);
        rcu_read_unlock();

        return pack_shadow(memcg_id, pgdat, token, workingset, type);
}

/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
                                unsigned long *token, bool *workingset, bool file)
{
        int memcg_id;
        unsigned long max_seq;
        struct mem_cgroup *memcg;
        struct pglist_data *pgdat;

        unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);

        memcg = mem_cgroup_from_private_id(memcg_id);
        *lruvec = mem_cgroup_lruvec(memcg, pgdat);

        max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
        max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;

        return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
        bool recent;
        int hist, tier, refs;
        bool workingset;
        unsigned long token;
        struct lruvec *lruvec;
        struct lru_gen_folio *lrugen;
        int type = folio_is_file_lru(folio);
        int delta = folio_nr_pages(folio);

        rcu_read_lock();

        recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
        if (lruvec != folio_lruvec(folio))
                goto unlock;

        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

        if (!recent)
                goto unlock;

        lrugen = &lruvec->lrugen;

        hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
        refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
        tier = lru_tier_from_refs(refs, workingset);

        atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);

        /* see folio_add_lru() where folio_set_active() will be called */
        if (lru_gen_in_fault())
                mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

        if (workingset) {
                folio_set_workingset(folio);
                mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
        } else
                set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
        rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

static void *lru_gen_eviction(struct folio *folio)
{
        return NULL;
}

static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
                                unsigned long *token, bool *workingset, bool file)
{
        return false;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * As in-memory pages are aged, non-resident pages need to be aged as
 * well, in order for the refault distances later on to be comparable
 * to the in-memory dimensions. This function allows reclaim and LRU
 * operations to drive the non-resident aging along in parallel.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
        /*
         * Reclaiming a cgroup means reclaiming all its children in a
         * round-robin fashion. That means that each cgroup has an LRU
         * order that is composed of the LRU orders of its child
         * cgroups; and every page has an LRU position not just in the
         * cgroup that owns it, but in all of that group's ancestors.
         *
         * So when the physical inactive list of a leaf cgroup ages,
         * the virtual inactive lists of all its parents, including
         * the root cgroup's, age as well.
         */
        do {
                atomic_long_add(nr_pages, &lruvec->nonresident_age);
        } while ((lruvec = parent_lruvec(lruvec)));
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @target_memcg: the cgroup that is causing the reclaim
 * @folio: the folio being evicted
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        int file = folio_is_file_lru(folio);
        unsigned long eviction;
        struct lruvec *lruvec;
        int memcgid;

        /* Folio is fully exclusive and pins folio's memory cgroup pointer */
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (lru_gen_enabled())
                return lru_gen_eviction(folio);

        lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
        /* XXX: target_memcg can be NULL, go through lruvec */
        memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
        eviction = atomic_long_read(&lruvec->nonresident_age);
        eviction >>= bucket_order[file];
        workingset_age_nonresident(lruvec, folio_nr_pages(folio));
        return pack_shadow(memcgid, pgdat, eviction,
                           folio_test_workingset(folio), file);
}

/**
 * workingset_test_recent - tests if the shadow entry is for a folio that was
 * recently evicted. Also fills in @workingset with the value unpacked from
 * shadow.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 * @flush: whether to flush cgroup rstat.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
                            bool flush)
{
        struct mem_cgroup *eviction_memcg;
        struct lruvec *eviction_lruvec;
        unsigned long refault_distance;
        unsigned long workingset_size;
        unsigned long refault;
        int memcgid;
        struct pglist_data *pgdat;
        unsigned long eviction;

        if (lru_gen_enabled()) {
                bool recent;

                rcu_read_lock();
                recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
                                             workingset, file);
                rcu_read_unlock();
                return recent;
        }

        rcu_read_lock();
        unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
        eviction <<= bucket_order[file];

        /*
         * Look up the memcg associated with the stored ID. It might
         * have been deleted since the folio's eviction.
         *
         * Note that in rare events the ID could have been recycled
         * for a new cgroup that refaults a shared folio. This is
         * impossible to tell from the available data. However, this
         * should be a rare and limited disturbance, and activations
         * are always speculative anyway. Ultimately, it's the aging
         * algorithm's job to shake out the minimum access frequency
         * for the active cache.
         *
         * XXX: On !CONFIG_MEMCG, this will always return NULL; it
         * would be better if the root_mem_cgroup existed in all
         * configurations instead.
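         *
         * With memcg enabled, a failed ID lookup - or a failed
         * reference grab below because the memcg is going away -
         * simply means the refault is not treated as recent.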
         */
        eviction_memcg = mem_cgroup_from_private_id(memcgid);
        if (!mem_cgroup_tryget(eviction_memcg))
                eviction_memcg = NULL;
        rcu_read_unlock();

        if (!mem_cgroup_disabled() && !eviction_memcg)
                return false;
        /*
         * Flush stats (and potentially sleep) outside the RCU read section.
         *
         * Note that workingset_test_recent() itself might be called in RCU read
         * section (e.g. in cachestat) - these callers need to skip flushing
         * stats (via the flush argument).
         *
         * XXX: With per-memcg flushing and thresholding, is ratelimiting
         * still needed here?
         */
        if (flush)
                mem_cgroup_flush_stats_ratelimited(eviction_memcg);

        eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
        refault = atomic_long_read(&eviction_lruvec->nonresident_age);

        /*
         * Calculate the refault distance
         *
         * The unsigned subtraction here gives an accurate distance
         * across nonresident_age overflows in most cases. There is a
         * special case: usually, shadow entries have a short lifetime
         * and are either refaulted or reclaimed along with the inode
         * before they get too old. But it is not impossible for the
         * nonresident_age to lap a shadow entry in the field, which
         * can then result in a false small refault distance, leading
         * to a false activation should this old entry actually
         * refault again. However, earlier kernels used to deactivate
         * unconditionally with *every* reclaim invocation for the
         * longest time, so the occasional inappropriate activation
         * leading to pressure on the active list is not a problem.
         */
        refault_distance = ((refault - eviction) &
                            (file ? EVICTION_MASK : EVICTION_MASK_ANON));

        /*
         * Compare the distance to the existing workingset size. We
         * don't activate pages that couldn't stay resident even if
         * all the memory was available to the workingset. Whether
         * workingset competition needs to consider anon or not depends
         * on having free swap space.
         */
        workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
        if (!file) {
                workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_INACTIVE_FILE);
        }
        if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
                workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_ACTIVE_ANON);
                if (file) {
                        workingset_size += lruvec_page_state(eviction_lruvec,
                                                             NR_INACTIVE_ANON);
                }
        }

        mem_cgroup_put(eviction_memcg);
        return refault_distance <= workingset_size;
}

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted folio in the context of the node and the memcg whose memory
 * pressure caused the eviction.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
        bool file = folio_is_file_lru(folio);
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
        bool workingset;
        long nr;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (lru_gen_enabled()) {
                lru_gen_refault(folio, shadow);
                return;
        }

        /*
         * The activation decision for this folio is made at the level
         * where the eviction occurred, as that is where the LRU order
         * during folio reclaim is being determined.
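         * (Hence workingset_test_recent() compares the refault
         * distance against the lruvec identified by the shadow entry,
         * while the refault statistics below are charged to the
         * refaulting folio's own lruvec.)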
         *
         * However, the cgroup that will own the folio is the one that
         * is actually experiencing the refault event. Make sure the folio is
         * locked to guarantee folio_memcg() stability throughout.
         */
        nr = folio_nr_pages(folio);
        memcg = get_mem_cgroup_from_folio(folio);
        lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));
        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

        if (!workingset_test_recent(shadow, file, &workingset, true))
                goto out;

        folio_set_active(folio);
        workingset_age_nonresident(lruvec, nr);
        mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

        /* Folio was active prior to eviction */
        if (workingset) {
                folio_set_workingset(folio);
                /*
                 * XXX: Move to folio_add_lru() when it supports new vs
                 * putback
                 */
                lru_note_cost_refault(folio);
                mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
        }
out:
        mem_cgroup_put(memcg);
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 */
void workingset_activation(struct folio *folio)
{
        /*
         * Filter non-memcg pages here, e.g. unmap can call
         * mark_page_accessed() on VDSO pages.
         */
        if (mem_cgroup_disabled() || folio_memcg_charged(folio)) {
                rcu_read_lock();
                workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
                rcu_read_unlock();
        }
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload. In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes. To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
        struct page *page = virt_to_page(node);

        /*
         * Track non-empty nodes that contain only shadow entries;
         * unlink those that contain pages or are being freed.
         *
         * Avoid acquiring the list_lru lock when the nodes are
         * already where they should be. The list_empty() test is safe
         * as node->private_list is protected by the i_pages lock.
         */
        lockdep_assert_held(&node->array->xa_lock);

        if (node->count && node->count == node->nr_values) {
                if (list_empty(&node->private_list)) {
                        list_lru_add_obj(&shadow_nodes, &node->private_list);
                        __inc_node_page_state(page, WORKINGSET_NODES);
                }
        } else {
                if (!list_empty(&node->private_list)) {
                        list_lru_del_obj(&shadow_nodes, &node->private_list);
                        __dec_node_page_state(page, WORKINGSET_NODES);
                }
        }
}

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
                                        struct shrink_control *sc)
{
        unsigned long max_nodes;
        unsigned long nodes;
        unsigned long pages;

        nodes = list_lru_shrink_count(&shadow_nodes, sc);
        if (!nodes)
                return SHRINK_EMPTY;

        /*
         * Approximate a reasonable limit for the nodes
         * containing shadow entries. We don't need to keep more
         * shadow entries than possible pages on the active list,
         * since refault distances bigger than that are dismissed.
         *
         * The size of the active list converges toward 100% of
         * overall page cache as memory grows, with only a tiny
         * inactive list. Assume the total cache size for that.
         *
         * Nodes might be sparsely populated, with only one shadow
         * entry in the extreme case. Obviously, we cannot keep one
         * node for every eligible shadow entry, so compromise on a
         * worst-case density of 1/8th. Below that, not all eligible
         * refaults can be detected anymore.
         *
         * On 64-bit with 7 xa_nodes per page and 64 slots
         * each, this will reclaim shadow entries when they consume
         * ~1.8% of available memory:
         *
         * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
         */
#ifdef CONFIG_MEMCG
        if (sc->memcg) {
                struct lruvec *lruvec;
                int i;

                mem_cgroup_flush_stats_ratelimited(sc->memcg);
                lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));

                for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
                        pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1);

                pages += lruvec_page_state_local(lruvec,
                                NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
                pages += lruvec_page_state_local(lruvec,
                                NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
        } else
#endif
                pages = node_present_pages(sc->nid);

        max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

        if (nodes <= max_nodes)
                return 0;
        return nodes - max_nodes;
}

static enum lru_status shadow_lru_isolate(struct list_head *item,
                                          struct list_lru_one *lru,
                                          void *arg) __must_hold(lru->lock)
{
        struct xa_node *node = container_of(item, struct xa_node, private_list);
        struct address_space *mapping;
        int ret;

        /*
         * Page cache insertions and deletions synchronously maintain
         * the shadow node LRU under the i_pages lock and the
         * &lru->lock. Because the page cache tree is emptied before
         * the inode can be destroyed, holding the &lru->lock pins any
         * address_space that has nodes on the LRU.
         *
         * We can then safely transition to the i_pages lock to
         * pin only the address_space of the particular node we want
         * to reclaim, take the node off-LRU, and drop the &lru->lock.
         */

        mapping = container_of(node->array, struct address_space, i_pages);

        /* Coming from the list, invert the lock order */
        if (!xa_trylock(&mapping->i_pages)) {
                spin_unlock_irq(&lru->lock);
                ret = LRU_RETRY;
                goto out;
        }

        /* For page cache we need to hold i_lock */
        if (mapping->host != NULL) {
                if (!spin_trylock(&mapping->host->i_lock)) {
                        xa_unlock(&mapping->i_pages);
                        spin_unlock_irq(&lru->lock);
                        ret = LRU_RETRY;
                        goto out;
                }
        }

        list_lru_isolate(lru, item);
        __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);

        spin_unlock(&lru->lock);

        /*
         * The nodes should only contain one or more shadow entries,
         * no pages, so we expect to be able to remove them all and
         * delete and free the empty node afterwards.
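         * If a node unexpectedly holds no shadow entries, or holds
         * entries other than shadow entries, the checks below warn
         * and skip the deletion.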
         */
        if (WARN_ON_ONCE(!node->nr_values))
                goto out_invalid;
        if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
        xa_delete_node(node, workingset_update_node);
        mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);

out_invalid:
        xa_unlock_irq(&mapping->i_pages);
        if (mapping->host != NULL) {
                if (mapping_shrinkable(mapping))
                        inode_lru_list_add(mapping->host);
                spin_unlock(&mapping->host->i_lock);
        }
        ret = LRU_REMOVED_RETRY;
out:
        cond_resched();
        return ret;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
                                       struct shrink_control *sc)
{
        /* list_lru lock nests inside the IRQ-safe i_pages lock */
        return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
                                        NULL);
}

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 */
static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
        unsigned int timestamp_bits, timestamp_bits_anon;
        struct shrinker *workingset_shadow_shrinker;
        unsigned int max_order;
        int ret = -ENOMEM;

        BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
        /*
         * Calculate the eviction bucket size to cover the longest
         * actionable refault distance, which is currently half of
         * memory (totalram_pages/2). However, memory hotplug may add
         * some more pages at runtime, so keep working with up to
         * double the initial memory by using totalram_pages as-is.
         */
        timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
        timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
        max_order = fls_long(totalram_pages() - 1);
        if (max_order > timestamp_bits)
                bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
        if (max_order > timestamp_bits_anon)
                bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
        pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
                timestamp_bits, timestamp_bits_anon, max_order,
                bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);

        workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
                                                    SHRINKER_MEMCG_AWARE,
                                                    "mm-shadow");
        if (!workingset_shadow_shrinker)
                goto err;

        ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
                                      &shadow_nodes_key);
        if (ret)
                goto err_list_lru;

        workingset_shadow_shrinker->count_objects = count_shadow_nodes;
        workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
        /* ->count reports only fully expendable nodes */
        workingset_shadow_shrinker->seeks = 0;

        shrinker_register(workingset_shadow_shrinker);
        return 0;
err_list_lru:
        shrinker_free(workingset_shadow_shrinker);
err:
        return ret;
}
module_init(workingset_init);