1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * 5 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * kswapd added: 7.1.96 sct 7 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Multiqueue VM started 5.8.00, Rik van Riel. 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/mm.h> 16 #include <linux/sched/mm.h> 17 #include <linux/module.h> 18 #include <linux/gfp.h> 19 #include <linux/kernel_stat.h> 20 #include <linux/swap.h> 21 #include <linux/pagemap.h> 22 #include <linux/init.h> 23 #include <linux/highmem.h> 24 #include <linux/vmpressure.h> 25 #include <linux/vmstat.h> 26 #include <linux/file.h> 27 #include <linux/writeback.h> 28 #include <linux/blkdev.h> 29 #include <linux/buffer_head.h> /* for buffer_heads_over_limit */ 30 #include <linux/mm_inline.h> 31 #include <linux/backing-dev.h> 32 #include <linux/rmap.h> 33 #include <linux/topology.h> 34 #include <linux/cpu.h> 35 #include <linux/cpuset.h> 36 #include <linux/compaction.h> 37 #include <linux/notifier.h> 38 #include <linux/rwsem.h> 39 #include <linux/delay.h> 40 #include <linux/kthread.h> 41 #include <linux/freezer.h> 42 #include <linux/memcontrol.h> 43 #include <linux/migrate.h> 44 #include <linux/delayacct.h> 45 #include <linux/sysctl.h> 46 #include <linux/memory-tiers.h> 47 #include <linux/oom.h> 48 #include <linux/pagevec.h> 49 #include <linux/prefetch.h> 50 #include <linux/printk.h> 51 #include <linux/dax.h> 52 #include <linux/psi.h> 53 #include <linux/pagewalk.h> 54 #include <linux/shmem_fs.h> 55 #include <linux/ctype.h> 56 #include <linux/debugfs.h> 57 #include <linux/khugepaged.h> 58 59 #include <asm/tlbflush.h> 60 #include <asm/div64.h> 61 62 #include <linux/swapops.h> 63 #include <linux/balloon_compaction.h> 64 #include <linux/sched/sysctl.h> 65 66 #include "internal.h" 67 #include "swap.h" 68 69 #define CREATE_TRACE_POINTS 70 #include <trace/events/vmscan.h> 71 72 struct scan_control { 73 /* How many pages shrink_list() should reclaim */ 74 unsigned long nr_to_reclaim; 75 76 /* 77 * Nodemask of nodes allowed by the caller. If NULL, all nodes 78 * are scanned. 79 */ 80 nodemask_t *nodemask; 81 82 /* 83 * The memory cgroup that hit its limit and as a result is the 84 * primary target of this reclaim invocation. 85 */ 86 struct mem_cgroup *target_mem_cgroup; 87 88 /* 89 * Scan pressure balancing between anon and file LRUs 90 */ 91 unsigned long anon_cost; 92 unsigned long file_cost; 93 94 /* Can active folios be deactivated as part of reclaim? */ 95 #define DEACTIVATE_ANON 1 96 #define DEACTIVATE_FILE 2 97 unsigned int may_deactivate:2; 98 unsigned int force_deactivate:1; 99 unsigned int skipped_deactivate:1; 100 101 /* Writepage batching in laptop mode; RECLAIM_WRITE */ 102 unsigned int may_writepage:1; 103 104 /* Can mapped folios be reclaimed? */ 105 unsigned int may_unmap:1; 106 107 /* Can folios be swapped as part of reclaim? */ 108 unsigned int may_swap:1; 109 110 /* Proactive reclaim invoked by userspace through memory.reclaim */ 111 unsigned int proactive:1; 112 113 /* 114 * Cgroup memory below memory.low is protected as long as we 115 * don't threaten to OOM. If any cgroup is reclaimed at 116 * reduced force or passed over entirely due to its memory.low 117 * setting (memcg_low_skipped), and nothing is reclaimed as a 118 * result, then go back for one more cycle that reclaims the protected 119 * memory (memcg_low_reclaim) to avert OOM. 120 */ 121 unsigned int memcg_low_reclaim:1; 122 unsigned int memcg_low_skipped:1; 123 124 unsigned int hibernation_mode:1; 125 126 /* One of the zones is ready for compaction */ 127 unsigned int compaction_ready:1; 128 129 /* There is easily reclaimable cold cache in the current node */ 130 unsigned int cache_trim_mode:1; 131 132 /* The file folios on the current node are dangerously low */ 133 unsigned int file_is_tiny:1; 134 135 /* Always discard instead of demoting to lower tier memory */ 136 unsigned int no_demotion:1; 137 138 #ifdef CONFIG_LRU_GEN 139 /* help kswapd make better choices among multiple memcgs */ 140 unsigned int memcgs_need_aging:1; 141 unsigned long last_reclaimed; 142 #endif 143 144 /* Allocation order */ 145 s8 order; 146 147 /* Scan (total_size >> priority) pages at once */ 148 s8 priority; 149 150 /* The highest zone to isolate folios for reclaim from */ 151 s8 reclaim_idx; 152 153 /* This context's GFP mask */ 154 gfp_t gfp_mask; 155 156 /* Incremented by the number of inactive pages that were scanned */ 157 unsigned long nr_scanned; 158 159 /* Number of pages freed so far during a call to shrink_zones() */ 160 unsigned long nr_reclaimed; 161 162 struct { 163 unsigned int dirty; 164 unsigned int unqueued_dirty; 165 unsigned int congested; 166 unsigned int writeback; 167 unsigned int immediate; 168 unsigned int file_taken; 169 unsigned int taken; 170 } nr; 171 172 /* for recording the reclaimed slab by now */ 173 struct reclaim_state reclaim_state; 174 }; 175 176 #ifdef ARCH_HAS_PREFETCHW 177 #define prefetchw_prev_lru_folio(_folio, _base, _field) \ 178 do { \ 179 if ((_folio)->lru.prev != _base) { \ 180 struct folio *prev; \ 181 \ 182 prev = lru_to_folio(&(_folio->lru)); \ 183 prefetchw(&prev->_field); \ 184 } \ 185 } while (0) 186 #else 187 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) 188 #endif 189 190 /* 191 * From 0 .. 200. Higher means more swappy. 192 */ 193 int vm_swappiness = 60; 194 195 static void set_task_reclaim_state(struct task_struct *task, 196 struct reclaim_state *rs) 197 { 198 /* Check for an overwrite */ 199 WARN_ON_ONCE(rs && task->reclaim_state); 200 201 /* Check for the nulling of an already-nulled member */ 202 WARN_ON_ONCE(!rs && !task->reclaim_state); 203 204 task->reclaim_state = rs; 205 } 206 207 LIST_HEAD(shrinker_list); 208 DECLARE_RWSEM(shrinker_rwsem); 209 210 #ifdef CONFIG_MEMCG 211 static int shrinker_nr_max; 212 213 /* The shrinker_info is expanded in a batch of BITS_PER_LONG */ 214 static inline int shrinker_map_size(int nr_items) 215 { 216 return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); 217 } 218 219 static inline int shrinker_defer_size(int nr_items) 220 { 221 return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); 222 } 223 224 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, 225 int nid) 226 { 227 return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, 228 lockdep_is_held(&shrinker_rwsem)); 229 } 230 231 static int expand_one_shrinker_info(struct mem_cgroup *memcg, 232 int map_size, int defer_size, 233 int old_map_size, int old_defer_size) 234 { 235 struct shrinker_info *new, *old; 236 struct mem_cgroup_per_node *pn; 237 int nid; 238 int size = map_size + defer_size; 239 240 for_each_node(nid) { 241 pn = memcg->nodeinfo[nid]; 242 old = shrinker_info_protected(memcg, nid); 243 /* Not yet online memcg */ 244 if (!old) 245 return 0; 246 247 new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); 248 if (!new) 249 return -ENOMEM; 250 251 new->nr_deferred = (atomic_long_t *)(new + 1); 252 new->map = (void *)new->nr_deferred + defer_size; 253 254 /* map: set all old bits, clear all new bits */ 255 memset(new->map, (int)0xff, old_map_size); 256 memset((void *)new->map + old_map_size, 0, map_size - old_map_size); 257 /* nr_deferred: copy old values, clear all new values */ 258 memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); 259 memset((void *)new->nr_deferred + old_defer_size, 0, 260 defer_size - old_defer_size); 261 262 rcu_assign_pointer(pn->shrinker_info, new); 263 kvfree_rcu(old, rcu); 264 } 265 266 return 0; 267 } 268 269 void free_shrinker_info(struct mem_cgroup *memcg) 270 { 271 struct mem_cgroup_per_node *pn; 272 struct shrinker_info *info; 273 int nid; 274 275 for_each_node(nid) { 276 pn = memcg->nodeinfo[nid]; 277 info = rcu_dereference_protected(pn->shrinker_info, true); 278 kvfree(info); 279 rcu_assign_pointer(pn->shrinker_info, NULL); 280 } 281 } 282 283 int alloc_shrinker_info(struct mem_cgroup *memcg) 284 { 285 struct shrinker_info *info; 286 int nid, size, ret = 0; 287 int map_size, defer_size = 0; 288 289 down_write(&shrinker_rwsem); 290 map_size = shrinker_map_size(shrinker_nr_max); 291 defer_size = shrinker_defer_size(shrinker_nr_max); 292 size = map_size + defer_size; 293 for_each_node(nid) { 294 info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); 295 if (!info) { 296 free_shrinker_info(memcg); 297 ret = -ENOMEM; 298 break; 299 } 300 info->nr_deferred = (atomic_long_t *)(info + 1); 301 info->map = (void *)info->nr_deferred + defer_size; 302 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); 303 } 304 up_write(&shrinker_rwsem); 305 306 return ret; 307 } 308 309 static inline bool need_expand(int nr_max) 310 { 311 return round_up(nr_max, BITS_PER_LONG) > 312 round_up(shrinker_nr_max, BITS_PER_LONG); 313 } 314 315 static int expand_shrinker_info(int new_id) 316 { 317 int ret = 0; 318 int new_nr_max = new_id + 1; 319 int map_size, defer_size = 0; 320 int old_map_size, old_defer_size = 0; 321 struct mem_cgroup *memcg; 322 323 if (!need_expand(new_nr_max)) 324 goto out; 325 326 if (!root_mem_cgroup) 327 goto out; 328 329 lockdep_assert_held(&shrinker_rwsem); 330 331 map_size = shrinker_map_size(new_nr_max); 332 defer_size = shrinker_defer_size(new_nr_max); 333 old_map_size = shrinker_map_size(shrinker_nr_max); 334 old_defer_size = shrinker_defer_size(shrinker_nr_max); 335 336 memcg = mem_cgroup_iter(NULL, NULL, NULL); 337 do { 338 ret = expand_one_shrinker_info(memcg, map_size, defer_size, 339 old_map_size, old_defer_size); 340 if (ret) { 341 mem_cgroup_iter_break(NULL, memcg); 342 goto out; 343 } 344 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 345 out: 346 if (!ret) 347 shrinker_nr_max = new_nr_max; 348 349 return ret; 350 } 351 352 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) 353 { 354 if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { 355 struct shrinker_info *info; 356 357 rcu_read_lock(); 358 info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); 359 /* Pairs with smp mb in shrink_slab() */ 360 smp_mb__before_atomic(); 361 set_bit(shrinker_id, info->map); 362 rcu_read_unlock(); 363 } 364 } 365 366 static DEFINE_IDR(shrinker_idr); 367 368 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 369 { 370 int id, ret = -ENOMEM; 371 372 if (mem_cgroup_disabled()) 373 return -ENOSYS; 374 375 down_write(&shrinker_rwsem); 376 /* This may call shrinker, so it must use down_read_trylock() */ 377 id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); 378 if (id < 0) 379 goto unlock; 380 381 if (id >= shrinker_nr_max) { 382 if (expand_shrinker_info(id)) { 383 idr_remove(&shrinker_idr, id); 384 goto unlock; 385 } 386 } 387 shrinker->id = id; 388 ret = 0; 389 unlock: 390 up_write(&shrinker_rwsem); 391 return ret; 392 } 393 394 static void unregister_memcg_shrinker(struct shrinker *shrinker) 395 { 396 int id = shrinker->id; 397 398 BUG_ON(id < 0); 399 400 lockdep_assert_held(&shrinker_rwsem); 401 402 idr_remove(&shrinker_idr, id); 403 } 404 405 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 406 struct mem_cgroup *memcg) 407 { 408 struct shrinker_info *info; 409 410 info = shrinker_info_protected(memcg, nid); 411 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); 412 } 413 414 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 415 struct mem_cgroup *memcg) 416 { 417 struct shrinker_info *info; 418 419 info = shrinker_info_protected(memcg, nid); 420 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); 421 } 422 423 void reparent_shrinker_deferred(struct mem_cgroup *memcg) 424 { 425 int i, nid; 426 long nr; 427 struct mem_cgroup *parent; 428 struct shrinker_info *child_info, *parent_info; 429 430 parent = parent_mem_cgroup(memcg); 431 if (!parent) 432 parent = root_mem_cgroup; 433 434 /* Prevent from concurrent shrinker_info expand */ 435 down_read(&shrinker_rwsem); 436 for_each_node(nid) { 437 child_info = shrinker_info_protected(memcg, nid); 438 parent_info = shrinker_info_protected(parent, nid); 439 for (i = 0; i < shrinker_nr_max; i++) { 440 nr = atomic_long_read(&child_info->nr_deferred[i]); 441 atomic_long_add(nr, &parent_info->nr_deferred[i]); 442 } 443 } 444 up_read(&shrinker_rwsem); 445 } 446 447 static bool cgroup_reclaim(struct scan_control *sc) 448 { 449 return sc->target_mem_cgroup; 450 } 451 452 /** 453 * writeback_throttling_sane - is the usual dirty throttling mechanism available? 454 * @sc: scan_control in question 455 * 456 * The normal page dirty throttling mechanism in balance_dirty_pages() is 457 * completely broken with the legacy memcg and direct stalling in 458 * shrink_folio_list() is used for throttling instead, which lacks all the 459 * niceties such as fairness, adaptive pausing, bandwidth proportional 460 * allocation and configurability. 461 * 462 * This function tests whether the vmscan currently in progress can assume 463 * that the normal dirty throttling mechanism is operational. 464 */ 465 static bool writeback_throttling_sane(struct scan_control *sc) 466 { 467 if (!cgroup_reclaim(sc)) 468 return true; 469 #ifdef CONFIG_CGROUP_WRITEBACK 470 if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 471 return true; 472 #endif 473 return false; 474 } 475 #else 476 static int prealloc_memcg_shrinker(struct shrinker *shrinker) 477 { 478 return -ENOSYS; 479 } 480 481 static void unregister_memcg_shrinker(struct shrinker *shrinker) 482 { 483 } 484 485 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, 486 struct mem_cgroup *memcg) 487 { 488 return 0; 489 } 490 491 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, 492 struct mem_cgroup *memcg) 493 { 494 return 0; 495 } 496 497 static bool cgroup_reclaim(struct scan_control *sc) 498 { 499 return false; 500 } 501 502 static bool writeback_throttling_sane(struct scan_control *sc) 503 { 504 return true; 505 } 506 #endif 507 508 static long xchg_nr_deferred(struct shrinker *shrinker, 509 struct shrink_control *sc) 510 { 511 int nid = sc->nid; 512 513 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 514 nid = 0; 515 516 if (sc->memcg && 517 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 518 return xchg_nr_deferred_memcg(nid, shrinker, 519 sc->memcg); 520 521 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 522 } 523 524 525 static long add_nr_deferred(long nr, struct shrinker *shrinker, 526 struct shrink_control *sc) 527 { 528 int nid = sc->nid; 529 530 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 531 nid = 0; 532 533 if (sc->memcg && 534 (shrinker->flags & SHRINKER_MEMCG_AWARE)) 535 return add_nr_deferred_memcg(nr, nid, shrinker, 536 sc->memcg); 537 538 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); 539 } 540 541 static bool can_demote(int nid, struct scan_control *sc) 542 { 543 if (!numa_demotion_enabled) 544 return false; 545 if (sc && sc->no_demotion) 546 return false; 547 if (next_demotion_node(nid) == NUMA_NO_NODE) 548 return false; 549 550 return true; 551 } 552 553 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, 554 int nid, 555 struct scan_control *sc) 556 { 557 if (memcg == NULL) { 558 /* 559 * For non-memcg reclaim, is there 560 * space in any swap device? 561 */ 562 if (get_nr_swap_pages() > 0) 563 return true; 564 } else { 565 /* Is the memcg below its swap limit? */ 566 if (mem_cgroup_get_nr_swap_pages(memcg) > 0) 567 return true; 568 } 569 570 /* 571 * The page can not be swapped. 572 * 573 * Can it be reclaimed from this node via demotion? 574 */ 575 return can_demote(nid, sc); 576 } 577 578 /* 579 * This misses isolated folios which are not accounted for to save counters. 580 * As the data only determines if reclaim or compaction continues, it is 581 * not expected that isolated folios will be a dominating factor. 582 */ 583 unsigned long zone_reclaimable_pages(struct zone *zone) 584 { 585 unsigned long nr; 586 587 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + 588 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); 589 if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) 590 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + 591 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); 592 593 return nr; 594 } 595 596 /** 597 * lruvec_lru_size - Returns the number of pages on the given LRU list. 598 * @lruvec: lru vector 599 * @lru: lru to use 600 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) 601 */ 602 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, 603 int zone_idx) 604 { 605 unsigned long size = 0; 606 int zid; 607 608 for (zid = 0; zid <= zone_idx; zid++) { 609 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; 610 611 if (!managed_zone(zone)) 612 continue; 613 614 if (!mem_cgroup_disabled()) 615 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); 616 else 617 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); 618 } 619 return size; 620 } 621 622 /* 623 * Add a shrinker callback to be called from the vm. 624 */ 625 static int __prealloc_shrinker(struct shrinker *shrinker) 626 { 627 unsigned int size; 628 int err; 629 630 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 631 err = prealloc_memcg_shrinker(shrinker); 632 if (err != -ENOSYS) 633 return err; 634 635 shrinker->flags &= ~SHRINKER_MEMCG_AWARE; 636 } 637 638 size = sizeof(*shrinker->nr_deferred); 639 if (shrinker->flags & SHRINKER_NUMA_AWARE) 640 size *= nr_node_ids; 641 642 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); 643 if (!shrinker->nr_deferred) 644 return -ENOMEM; 645 646 return 0; 647 } 648 649 #ifdef CONFIG_SHRINKER_DEBUG 650 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 651 { 652 va_list ap; 653 int err; 654 655 va_start(ap, fmt); 656 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 657 va_end(ap); 658 if (!shrinker->name) 659 return -ENOMEM; 660 661 err = __prealloc_shrinker(shrinker); 662 if (err) { 663 kfree_const(shrinker->name); 664 shrinker->name = NULL; 665 } 666 667 return err; 668 } 669 #else 670 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 671 { 672 return __prealloc_shrinker(shrinker); 673 } 674 #endif 675 676 void free_prealloced_shrinker(struct shrinker *shrinker) 677 { 678 #ifdef CONFIG_SHRINKER_DEBUG 679 kfree_const(shrinker->name); 680 shrinker->name = NULL; 681 #endif 682 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { 683 down_write(&shrinker_rwsem); 684 unregister_memcg_shrinker(shrinker); 685 up_write(&shrinker_rwsem); 686 return; 687 } 688 689 kfree(shrinker->nr_deferred); 690 shrinker->nr_deferred = NULL; 691 } 692 693 void register_shrinker_prepared(struct shrinker *shrinker) 694 { 695 down_write(&shrinker_rwsem); 696 list_add_tail(&shrinker->list, &shrinker_list); 697 shrinker->flags |= SHRINKER_REGISTERED; 698 shrinker_debugfs_add(shrinker); 699 up_write(&shrinker_rwsem); 700 } 701 702 static int __register_shrinker(struct shrinker *shrinker) 703 { 704 int err = __prealloc_shrinker(shrinker); 705 706 if (err) 707 return err; 708 register_shrinker_prepared(shrinker); 709 return 0; 710 } 711 712 #ifdef CONFIG_SHRINKER_DEBUG 713 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 714 { 715 va_list ap; 716 int err; 717 718 va_start(ap, fmt); 719 shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); 720 va_end(ap); 721 if (!shrinker->name) 722 return -ENOMEM; 723 724 err = __register_shrinker(shrinker); 725 if (err) { 726 kfree_const(shrinker->name); 727 shrinker->name = NULL; 728 } 729 return err; 730 } 731 #else 732 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 733 { 734 return __register_shrinker(shrinker); 735 } 736 #endif 737 EXPORT_SYMBOL(register_shrinker); 738 739 /* 740 * Remove one 741 */ 742 void unregister_shrinker(struct shrinker *shrinker) 743 { 744 if (!(shrinker->flags & SHRINKER_REGISTERED)) 745 return; 746 747 down_write(&shrinker_rwsem); 748 list_del(&shrinker->list); 749 shrinker->flags &= ~SHRINKER_REGISTERED; 750 if (shrinker->flags & SHRINKER_MEMCG_AWARE) 751 unregister_memcg_shrinker(shrinker); 752 shrinker_debugfs_remove(shrinker); 753 up_write(&shrinker_rwsem); 754 755 kfree(shrinker->nr_deferred); 756 shrinker->nr_deferred = NULL; 757 } 758 EXPORT_SYMBOL(unregister_shrinker); 759 760 /** 761 * synchronize_shrinkers - Wait for all running shrinkers to complete. 762 * 763 * This is equivalent to calling unregister_shrink() and register_shrinker(), 764 * but atomically and with less overhead. This is useful to guarantee that all 765 * shrinker invocations have seen an update, before freeing memory, similar to 766 * rcu. 767 */ 768 void synchronize_shrinkers(void) 769 { 770 down_write(&shrinker_rwsem); 771 up_write(&shrinker_rwsem); 772 } 773 EXPORT_SYMBOL(synchronize_shrinkers); 774 775 #define SHRINK_BATCH 128 776 777 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, 778 struct shrinker *shrinker, int priority) 779 { 780 unsigned long freed = 0; 781 unsigned long long delta; 782 long total_scan; 783 long freeable; 784 long nr; 785 long new_nr; 786 long batch_size = shrinker->batch ? shrinker->batch 787 : SHRINK_BATCH; 788 long scanned = 0, next_deferred; 789 790 freeable = shrinker->count_objects(shrinker, shrinkctl); 791 if (freeable == 0 || freeable == SHRINK_EMPTY) 792 return freeable; 793 794 /* 795 * copy the current shrinker scan count into a local variable 796 * and zero it so that other concurrent shrinker invocations 797 * don't also do this scanning work. 798 */ 799 nr = xchg_nr_deferred(shrinker, shrinkctl); 800 801 if (shrinker->seeks) { 802 delta = freeable >> priority; 803 delta *= 4; 804 do_div(delta, shrinker->seeks); 805 } else { 806 /* 807 * These objects don't require any IO to create. Trim 808 * them aggressively under memory pressure to keep 809 * them from causing refetches in the IO caches. 810 */ 811 delta = freeable / 2; 812 } 813 814 total_scan = nr >> priority; 815 total_scan += delta; 816 total_scan = min(total_scan, (2 * freeable)); 817 818 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 819 freeable, delta, total_scan, priority); 820 821 /* 822 * Normally, we should not scan less than batch_size objects in one 823 * pass to avoid too frequent shrinker calls, but if the slab has less 824 * than batch_size objects in total and we are really tight on memory, 825 * we will try to reclaim all available objects, otherwise we can end 826 * up failing allocations although there are plenty of reclaimable 827 * objects spread over several slabs with usage less than the 828 * batch_size. 829 * 830 * We detect the "tight on memory" situations by looking at the total 831 * number of objects we want to scan (total_scan). If it is greater 832 * than the total number of objects on slab (freeable), we must be 833 * scanning at high prio and therefore should try to reclaim as much as 834 * possible. 835 */ 836 while (total_scan >= batch_size || 837 total_scan >= freeable) { 838 unsigned long ret; 839 unsigned long nr_to_scan = min(batch_size, total_scan); 840 841 shrinkctl->nr_to_scan = nr_to_scan; 842 shrinkctl->nr_scanned = nr_to_scan; 843 ret = shrinker->scan_objects(shrinker, shrinkctl); 844 if (ret == SHRINK_STOP) 845 break; 846 freed += ret; 847 848 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); 849 total_scan -= shrinkctl->nr_scanned; 850 scanned += shrinkctl->nr_scanned; 851 852 cond_resched(); 853 } 854 855 /* 856 * The deferred work is increased by any new work (delta) that wasn't 857 * done, decreased by old deferred work that was done now. 858 * 859 * And it is capped to two times of the freeable items. 860 */ 861 next_deferred = max_t(long, (nr + delta - scanned), 0); 862 next_deferred = min(next_deferred, (2 * freeable)); 863 864 /* 865 * move the unused scan count back into the shrinker in a 866 * manner that handles concurrent updates. 867 */ 868 new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); 869 870 trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); 871 return freed; 872 } 873 874 #ifdef CONFIG_MEMCG 875 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 876 struct mem_cgroup *memcg, int priority) 877 { 878 struct shrinker_info *info; 879 unsigned long ret, freed = 0; 880 int i; 881 882 if (!mem_cgroup_online(memcg)) 883 return 0; 884 885 if (!down_read_trylock(&shrinker_rwsem)) 886 return 0; 887 888 info = shrinker_info_protected(memcg, nid); 889 if (unlikely(!info)) 890 goto unlock; 891 892 for_each_set_bit(i, info->map, shrinker_nr_max) { 893 struct shrink_control sc = { 894 .gfp_mask = gfp_mask, 895 .nid = nid, 896 .memcg = memcg, 897 }; 898 struct shrinker *shrinker; 899 900 shrinker = idr_find(&shrinker_idr, i); 901 if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { 902 if (!shrinker) 903 clear_bit(i, info->map); 904 continue; 905 } 906 907 /* Call non-slab shrinkers even though kmem is disabled */ 908 if (!memcg_kmem_enabled() && 909 !(shrinker->flags & SHRINKER_NONSLAB)) 910 continue; 911 912 ret = do_shrink_slab(&sc, shrinker, priority); 913 if (ret == SHRINK_EMPTY) { 914 clear_bit(i, info->map); 915 /* 916 * After the shrinker reported that it had no objects to 917 * free, but before we cleared the corresponding bit in 918 * the memcg shrinker map, a new object might have been 919 * added. To make sure, we have the bit set in this 920 * case, we invoke the shrinker one more time and reset 921 * the bit if it reports that it is not empty anymore. 922 * The memory barrier here pairs with the barrier in 923 * set_shrinker_bit(): 924 * 925 * list_lru_add() shrink_slab_memcg() 926 * list_add_tail() clear_bit() 927 * <MB> <MB> 928 * set_bit() do_shrink_slab() 929 */ 930 smp_mb__after_atomic(); 931 ret = do_shrink_slab(&sc, shrinker, priority); 932 if (ret == SHRINK_EMPTY) 933 ret = 0; 934 else 935 set_shrinker_bit(memcg, nid, i); 936 } 937 freed += ret; 938 939 if (rwsem_is_contended(&shrinker_rwsem)) { 940 freed = freed ? : 1; 941 break; 942 } 943 } 944 unlock: 945 up_read(&shrinker_rwsem); 946 return freed; 947 } 948 #else /* CONFIG_MEMCG */ 949 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 950 struct mem_cgroup *memcg, int priority) 951 { 952 return 0; 953 } 954 #endif /* CONFIG_MEMCG */ 955 956 /** 957 * shrink_slab - shrink slab caches 958 * @gfp_mask: allocation context 959 * @nid: node whose slab caches to target 960 * @memcg: memory cgroup whose slab caches to target 961 * @priority: the reclaim priority 962 * 963 * Call the shrink functions to age shrinkable caches. 964 * 965 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 966 * unaware shrinkers will receive a node id of 0 instead. 967 * 968 * @memcg specifies the memory cgroup to target. Unaware shrinkers 969 * are called only if it is the root cgroup. 970 * 971 * @priority is sc->priority, we take the number of objects and >> by priority 972 * in order to get the scan target. 973 * 974 * Returns the number of reclaimed slab objects. 975 */ 976 static unsigned long shrink_slab(gfp_t gfp_mask, int nid, 977 struct mem_cgroup *memcg, 978 int priority) 979 { 980 unsigned long ret, freed = 0; 981 struct shrinker *shrinker; 982 983 /* 984 * The root memcg might be allocated even though memcg is disabled 985 * via "cgroup_disable=memory" boot parameter. This could make 986 * mem_cgroup_is_root() return false, then just run memcg slab 987 * shrink, but skip global shrink. This may result in premature 988 * oom. 989 */ 990 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) 991 return shrink_slab_memcg(gfp_mask, nid, memcg, priority); 992 993 if (!down_read_trylock(&shrinker_rwsem)) 994 goto out; 995 996 list_for_each_entry(shrinker, &shrinker_list, list) { 997 struct shrink_control sc = { 998 .gfp_mask = gfp_mask, 999 .nid = nid, 1000 .memcg = memcg, 1001 }; 1002 1003 ret = do_shrink_slab(&sc, shrinker, priority); 1004 if (ret == SHRINK_EMPTY) 1005 ret = 0; 1006 freed += ret; 1007 /* 1008 * Bail out if someone want to register a new shrinker to 1009 * prevent the registration from being stalled for long periods 1010 * by parallel ongoing shrinking. 1011 */ 1012 if (rwsem_is_contended(&shrinker_rwsem)) { 1013 freed = freed ? : 1; 1014 break; 1015 } 1016 } 1017 1018 up_read(&shrinker_rwsem); 1019 out: 1020 cond_resched(); 1021 return freed; 1022 } 1023 1024 static void drop_slab_node(int nid) 1025 { 1026 unsigned long freed; 1027 int shift = 0; 1028 1029 do { 1030 struct mem_cgroup *memcg = NULL; 1031 1032 if (fatal_signal_pending(current)) 1033 return; 1034 1035 freed = 0; 1036 memcg = mem_cgroup_iter(NULL, NULL, NULL); 1037 do { 1038 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); 1039 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 1040 } while ((freed >> shift++) > 1); 1041 } 1042 1043 void drop_slab(void) 1044 { 1045 int nid; 1046 1047 for_each_online_node(nid) 1048 drop_slab_node(nid); 1049 } 1050 1051 static int reclaimer_offset(void) 1052 { 1053 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1054 PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); 1055 BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != 1056 PGSCAN_DIRECT - PGSCAN_KSWAPD); 1057 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1058 PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); 1059 BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != 1060 PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); 1061 1062 if (current_is_kswapd()) 1063 return 0; 1064 if (current_is_khugepaged()) 1065 return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; 1066 return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; 1067 } 1068 1069 static inline int is_page_cache_freeable(struct folio *folio) 1070 { 1071 /* 1072 * A freeable page cache folio is referenced only by the caller 1073 * that isolated the folio, the page cache and optional filesystem 1074 * private data at folio->private. 1075 */ 1076 return folio_ref_count(folio) - folio_test_private(folio) == 1077 1 + folio_nr_pages(folio); 1078 } 1079 1080 /* 1081 * We detected a synchronous write error writing a folio out. Probably 1082 * -ENOSPC. We need to propagate that into the address_space for a subsequent 1083 * fsync(), msync() or close(). 1084 * 1085 * The tricky part is that after writepage we cannot touch the mapping: nothing 1086 * prevents it from being freed up. But we have a ref on the folio and once 1087 * that folio is locked, the mapping is pinned. 1088 * 1089 * We're allowed to run sleeping folio_lock() here because we know the caller has 1090 * __GFP_FS. 1091 */ 1092 static void handle_write_error(struct address_space *mapping, 1093 struct folio *folio, int error) 1094 { 1095 folio_lock(folio); 1096 if (folio_mapping(folio) == mapping) 1097 mapping_set_error(mapping, error); 1098 folio_unlock(folio); 1099 } 1100 1101 static bool skip_throttle_noprogress(pg_data_t *pgdat) 1102 { 1103 int reclaimable = 0, write_pending = 0; 1104 int i; 1105 1106 /* 1107 * If kswapd is disabled, reschedule if necessary but do not 1108 * throttle as the system is likely near OOM. 1109 */ 1110 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 1111 return true; 1112 1113 /* 1114 * If there are a lot of dirty/writeback folios then do not 1115 * throttle as throttling will occur when the folios cycle 1116 * towards the end of the LRU if still under writeback. 1117 */ 1118 for (i = 0; i < MAX_NR_ZONES; i++) { 1119 struct zone *zone = pgdat->node_zones + i; 1120 1121 if (!managed_zone(zone)) 1122 continue; 1123 1124 reclaimable += zone_reclaimable_pages(zone); 1125 write_pending += zone_page_state_snapshot(zone, 1126 NR_ZONE_WRITE_PENDING); 1127 } 1128 if (2 * write_pending <= reclaimable) 1129 return true; 1130 1131 return false; 1132 } 1133 1134 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) 1135 { 1136 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; 1137 long timeout, ret; 1138 DEFINE_WAIT(wait); 1139 1140 /* 1141 * Do not throttle IO workers, kthreads other than kswapd or 1142 * workqueues. They may be required for reclaim to make 1143 * forward progress (e.g. journalling workqueues or kthreads). 1144 */ 1145 if (!current_is_kswapd() && 1146 current->flags & (PF_IO_WORKER|PF_KTHREAD)) { 1147 cond_resched(); 1148 return; 1149 } 1150 1151 /* 1152 * These figures are pulled out of thin air. 1153 * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many 1154 * parallel reclaimers which is a short-lived event so the timeout is 1155 * short. Failing to make progress or waiting on writeback are 1156 * potentially long-lived events so use a longer timeout. This is shaky 1157 * logic as a failure to make progress could be due to anything from 1158 * writeback to a slow device to excessive referenced folios at the tail 1159 * of the inactive LRU. 1160 */ 1161 switch(reason) { 1162 case VMSCAN_THROTTLE_WRITEBACK: 1163 timeout = HZ/10; 1164 1165 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { 1166 WRITE_ONCE(pgdat->nr_reclaim_start, 1167 node_page_state(pgdat, NR_THROTTLED_WRITTEN)); 1168 } 1169 1170 break; 1171 case VMSCAN_THROTTLE_CONGESTED: 1172 fallthrough; 1173 case VMSCAN_THROTTLE_NOPROGRESS: 1174 if (skip_throttle_noprogress(pgdat)) { 1175 cond_resched(); 1176 return; 1177 } 1178 1179 timeout = 1; 1180 1181 break; 1182 case VMSCAN_THROTTLE_ISOLATED: 1183 timeout = HZ/50; 1184 break; 1185 default: 1186 WARN_ON_ONCE(1); 1187 timeout = HZ; 1188 break; 1189 } 1190 1191 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 1192 ret = schedule_timeout(timeout); 1193 finish_wait(wqh, &wait); 1194 1195 if (reason == VMSCAN_THROTTLE_WRITEBACK) 1196 atomic_dec(&pgdat->nr_writeback_throttled); 1197 1198 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), 1199 jiffies_to_usecs(timeout - ret), 1200 reason); 1201 } 1202 1203 /* 1204 * Account for folios written if tasks are throttled waiting on dirty 1205 * folios to clean. If enough folios have been cleaned since throttling 1206 * started then wakeup the throttled tasks. 1207 */ 1208 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, 1209 int nr_throttled) 1210 { 1211 unsigned long nr_written; 1212 1213 node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); 1214 1215 /* 1216 * This is an inaccurate read as the per-cpu deltas may not 1217 * be synchronised. However, given that the system is 1218 * writeback throttled, it is not worth taking the penalty 1219 * of getting an accurate count. At worst, the throttle 1220 * timeout guarantees forward progress. 1221 */ 1222 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - 1223 READ_ONCE(pgdat->nr_reclaim_start); 1224 1225 if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) 1226 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); 1227 } 1228 1229 /* possible outcome of pageout() */ 1230 typedef enum { 1231 /* failed to write folio out, folio is locked */ 1232 PAGE_KEEP, 1233 /* move folio to the active list, folio is locked */ 1234 PAGE_ACTIVATE, 1235 /* folio has been sent to the disk successfully, folio is unlocked */ 1236 PAGE_SUCCESS, 1237 /* folio is clean and locked */ 1238 PAGE_CLEAN, 1239 } pageout_t; 1240 1241 /* 1242 * pageout is called by shrink_folio_list() for each dirty folio. 1243 * Calls ->writepage(). 1244 */ 1245 static pageout_t pageout(struct folio *folio, struct address_space *mapping, 1246 struct swap_iocb **plug) 1247 { 1248 /* 1249 * If the folio is dirty, only perform writeback if that write 1250 * will be non-blocking. To prevent this allocation from being 1251 * stalled by pagecache activity. But note that there may be 1252 * stalls if we need to run get_block(). We could test 1253 * PagePrivate for that. 1254 * 1255 * If this process is currently in __generic_file_write_iter() against 1256 * this folio's queue, we can perform writeback even if that 1257 * will block. 1258 * 1259 * If the folio is swapcache, write it back even if that would 1260 * block, for some throttling. This happens by accident, because 1261 * swap_backing_dev_info is bust: it doesn't reflect the 1262 * congestion state of the swapdevs. Easy to fix, if needed. 1263 */ 1264 if (!is_page_cache_freeable(folio)) 1265 return PAGE_KEEP; 1266 if (!mapping) { 1267 /* 1268 * Some data journaling orphaned folios can have 1269 * folio->mapping == NULL while being dirty with clean buffers. 1270 */ 1271 if (folio_test_private(folio)) { 1272 if (try_to_free_buffers(folio)) { 1273 folio_clear_dirty(folio); 1274 pr_info("%s: orphaned folio\n", __func__); 1275 return PAGE_CLEAN; 1276 } 1277 } 1278 return PAGE_KEEP; 1279 } 1280 if (mapping->a_ops->writepage == NULL) 1281 return PAGE_ACTIVATE; 1282 1283 if (folio_clear_dirty_for_io(folio)) { 1284 int res; 1285 struct writeback_control wbc = { 1286 .sync_mode = WB_SYNC_NONE, 1287 .nr_to_write = SWAP_CLUSTER_MAX, 1288 .range_start = 0, 1289 .range_end = LLONG_MAX, 1290 .for_reclaim = 1, 1291 .swap_plug = plug, 1292 }; 1293 1294 folio_set_reclaim(folio); 1295 res = mapping->a_ops->writepage(&folio->page, &wbc); 1296 if (res < 0) 1297 handle_write_error(mapping, folio, res); 1298 if (res == AOP_WRITEPAGE_ACTIVATE) { 1299 folio_clear_reclaim(folio); 1300 return PAGE_ACTIVATE; 1301 } 1302 1303 if (!folio_test_writeback(folio)) { 1304 /* synchronous write or broken a_ops? */ 1305 folio_clear_reclaim(folio); 1306 } 1307 trace_mm_vmscan_write_folio(folio); 1308 node_stat_add_folio(folio, NR_VMSCAN_WRITE); 1309 return PAGE_SUCCESS; 1310 } 1311 1312 return PAGE_CLEAN; 1313 } 1314 1315 /* 1316 * Same as remove_mapping, but if the folio is removed from the mapping, it 1317 * gets returned with a refcount of 0. 1318 */ 1319 static int __remove_mapping(struct address_space *mapping, struct folio *folio, 1320 bool reclaimed, struct mem_cgroup *target_memcg) 1321 { 1322 int refcount; 1323 void *shadow = NULL; 1324 1325 BUG_ON(!folio_test_locked(folio)); 1326 BUG_ON(mapping != folio_mapping(folio)); 1327 1328 if (!folio_test_swapcache(folio)) 1329 spin_lock(&mapping->host->i_lock); 1330 xa_lock_irq(&mapping->i_pages); 1331 /* 1332 * The non racy check for a busy folio. 1333 * 1334 * Must be careful with the order of the tests. When someone has 1335 * a ref to the folio, it may be possible that they dirty it then 1336 * drop the reference. So if the dirty flag is tested before the 1337 * refcount here, then the following race may occur: 1338 * 1339 * get_user_pages(&page); 1340 * [user mapping goes away] 1341 * write_to(page); 1342 * !folio_test_dirty(folio) [good] 1343 * folio_set_dirty(folio); 1344 * folio_put(folio); 1345 * !refcount(folio) [good, discard it] 1346 * 1347 * [oops, our write_to data is lost] 1348 * 1349 * Reversing the order of the tests ensures such a situation cannot 1350 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags 1351 * load is not satisfied before that of folio->_refcount. 1352 * 1353 * Note that if the dirty flag is always set via folio_mark_dirty, 1354 * and thus under the i_pages lock, then this ordering is not required. 1355 */ 1356 refcount = 1 + folio_nr_pages(folio); 1357 if (!folio_ref_freeze(folio, refcount)) 1358 goto cannot_free; 1359 /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ 1360 if (unlikely(folio_test_dirty(folio))) { 1361 folio_ref_unfreeze(folio, refcount); 1362 goto cannot_free; 1363 } 1364 1365 if (folio_test_swapcache(folio)) { 1366 swp_entry_t swap = folio_swap_entry(folio); 1367 1368 /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ 1369 if (reclaimed && !mapping_exiting(mapping)) 1370 shadow = workingset_eviction(folio, target_memcg); 1371 mem_cgroup_swapout(folio, swap); 1372 __delete_from_swap_cache(folio, swap, shadow); 1373 xa_unlock_irq(&mapping->i_pages); 1374 put_swap_folio(folio, swap); 1375 } else { 1376 void (*free_folio)(struct folio *); 1377 1378 free_folio = mapping->a_ops->free_folio; 1379 /* 1380 * Remember a shadow entry for reclaimed file cache in 1381 * order to detect refaults, thus thrashing, later on. 1382 * 1383 * But don't store shadows in an address space that is 1384 * already exiting. This is not just an optimization, 1385 * inode reclaim needs to empty out the radix tree or 1386 * the nodes are lost. Don't plant shadows behind its 1387 * back. 1388 * 1389 * We also don't store shadows for DAX mappings because the 1390 * only page cache folios found in these are zero pages 1391 * covering holes, and because we don't want to mix DAX 1392 * exceptional entries and shadow exceptional entries in the 1393 * same address_space. 1394 */ 1395 if (reclaimed && folio_is_file_lru(folio) && 1396 !mapping_exiting(mapping) && !dax_mapping(mapping)) 1397 shadow = workingset_eviction(folio, target_memcg); 1398 __filemap_remove_folio(folio, shadow); 1399 xa_unlock_irq(&mapping->i_pages); 1400 if (mapping_shrinkable(mapping)) 1401 inode_add_lru(mapping->host); 1402 spin_unlock(&mapping->host->i_lock); 1403 1404 if (free_folio) 1405 free_folio(folio); 1406 } 1407 1408 return 1; 1409 1410 cannot_free: 1411 xa_unlock_irq(&mapping->i_pages); 1412 if (!folio_test_swapcache(folio)) 1413 spin_unlock(&mapping->host->i_lock); 1414 return 0; 1415 } 1416 1417 /** 1418 * remove_mapping() - Attempt to remove a folio from its mapping. 1419 * @mapping: The address space. 1420 * @folio: The folio to remove. 1421 * 1422 * If the folio is dirty, under writeback or if someone else has a ref 1423 * on it, removal will fail. 1424 * Return: The number of pages removed from the mapping. 0 if the folio 1425 * could not be removed. 1426 * Context: The caller should have a single refcount on the folio and 1427 * hold its lock. 1428 */ 1429 long remove_mapping(struct address_space *mapping, struct folio *folio) 1430 { 1431 if (__remove_mapping(mapping, folio, false, NULL)) { 1432 /* 1433 * Unfreezing the refcount with 1 effectively 1434 * drops the pagecache ref for us without requiring another 1435 * atomic operation. 1436 */ 1437 folio_ref_unfreeze(folio, 1); 1438 return folio_nr_pages(folio); 1439 } 1440 return 0; 1441 } 1442 1443 /** 1444 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. 1445 * @folio: Folio to be returned to an LRU list. 1446 * 1447 * Add previously isolated @folio to appropriate LRU list. 1448 * The folio may still be unevictable for other reasons. 1449 * 1450 * Context: lru_lock must not be held, interrupts must be enabled. 1451 */ 1452 void folio_putback_lru(struct folio *folio) 1453 { 1454 folio_add_lru(folio); 1455 folio_put(folio); /* drop ref from isolate */ 1456 } 1457 1458 enum folio_references { 1459 FOLIOREF_RECLAIM, 1460 FOLIOREF_RECLAIM_CLEAN, 1461 FOLIOREF_KEEP, 1462 FOLIOREF_ACTIVATE, 1463 }; 1464 1465 static enum folio_references folio_check_references(struct folio *folio, 1466 struct scan_control *sc) 1467 { 1468 int referenced_ptes, referenced_folio; 1469 unsigned long vm_flags; 1470 1471 referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, 1472 &vm_flags); 1473 referenced_folio = folio_test_clear_referenced(folio); 1474 1475 /* 1476 * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. 1477 * Let the folio, now marked Mlocked, be moved to the unevictable list. 1478 */ 1479 if (vm_flags & VM_LOCKED) 1480 return FOLIOREF_ACTIVATE; 1481 1482 /* rmap lock contention: rotate */ 1483 if (referenced_ptes == -1) 1484 return FOLIOREF_KEEP; 1485 1486 if (referenced_ptes) { 1487 /* 1488 * All mapped folios start out with page table 1489 * references from the instantiating fault, so we need 1490 * to look twice if a mapped file/anon folio is used more 1491 * than once. 1492 * 1493 * Mark it and spare it for another trip around the 1494 * inactive list. Another page table reference will 1495 * lead to its activation. 1496 * 1497 * Note: the mark is set for activated folios as well 1498 * so that recently deactivated but used folios are 1499 * quickly recovered. 1500 */ 1501 folio_set_referenced(folio); 1502 1503 if (referenced_folio || referenced_ptes > 1) 1504 return FOLIOREF_ACTIVATE; 1505 1506 /* 1507 * Activate file-backed executable folios after first usage. 1508 */ 1509 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) 1510 return FOLIOREF_ACTIVATE; 1511 1512 return FOLIOREF_KEEP; 1513 } 1514 1515 /* Reclaim if clean, defer dirty folios to writeback */ 1516 if (referenced_folio && folio_is_file_lru(folio)) 1517 return FOLIOREF_RECLAIM_CLEAN; 1518 1519 return FOLIOREF_RECLAIM; 1520 } 1521 1522 /* Check if a folio is dirty or under writeback */ 1523 static void folio_check_dirty_writeback(struct folio *folio, 1524 bool *dirty, bool *writeback) 1525 { 1526 struct address_space *mapping; 1527 1528 /* 1529 * Anonymous folios are not handled by flushers and must be written 1530 * from reclaim context. Do not stall reclaim based on them. 1531 * MADV_FREE anonymous folios are put into inactive file list too. 1532 * They could be mistakenly treated as file lru. So further anon 1533 * test is needed. 1534 */ 1535 if (!folio_is_file_lru(folio) || 1536 (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { 1537 *dirty = false; 1538 *writeback = false; 1539 return; 1540 } 1541 1542 /* By default assume that the folio flags are accurate */ 1543 *dirty = folio_test_dirty(folio); 1544 *writeback = folio_test_writeback(folio); 1545 1546 /* Verify dirty/writeback state if the filesystem supports it */ 1547 if (!folio_test_private(folio)) 1548 return; 1549 1550 mapping = folio_mapping(folio); 1551 if (mapping && mapping->a_ops->is_dirty_writeback) 1552 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); 1553 } 1554 1555 static struct page *alloc_demote_page(struct page *page, unsigned long private) 1556 { 1557 struct page *target_page; 1558 nodemask_t *allowed_mask; 1559 struct migration_target_control *mtc; 1560 1561 mtc = (struct migration_target_control *)private; 1562 1563 allowed_mask = mtc->nmask; 1564 /* 1565 * make sure we allocate from the target node first also trying to 1566 * demote or reclaim pages from the target node via kswapd if we are 1567 * low on free memory on target node. If we don't do this and if 1568 * we have free memory on the slower(lower) memtier, we would start 1569 * allocating pages from slower(lower) memory tiers without even forcing 1570 * a demotion of cold pages from the target memtier. This can result 1571 * in the kernel placing hot pages in slower(lower) memory tiers. 1572 */ 1573 mtc->nmask = NULL; 1574 mtc->gfp_mask |= __GFP_THISNODE; 1575 target_page = alloc_migration_target(page, (unsigned long)mtc); 1576 if (target_page) 1577 return target_page; 1578 1579 mtc->gfp_mask &= ~__GFP_THISNODE; 1580 mtc->nmask = allowed_mask; 1581 1582 return alloc_migration_target(page, (unsigned long)mtc); 1583 } 1584 1585 /* 1586 * Take folios on @demote_folios and attempt to demote them to another node. 1587 * Folios which are not demoted are left on @demote_folios. 1588 */ 1589 static unsigned int demote_folio_list(struct list_head *demote_folios, 1590 struct pglist_data *pgdat) 1591 { 1592 int target_nid = next_demotion_node(pgdat->node_id); 1593 unsigned int nr_succeeded; 1594 nodemask_t allowed_mask; 1595 1596 struct migration_target_control mtc = { 1597 /* 1598 * Allocate from 'node', or fail quickly and quietly. 1599 * When this happens, 'page' will likely just be discarded 1600 * instead of migrated. 1601 */ 1602 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | 1603 __GFP_NOMEMALLOC | GFP_NOWAIT, 1604 .nid = target_nid, 1605 .nmask = &allowed_mask 1606 }; 1607 1608 if (list_empty(demote_folios)) 1609 return 0; 1610 1611 if (target_nid == NUMA_NO_NODE) 1612 return 0; 1613 1614 node_get_allowed_targets(pgdat, &allowed_mask); 1615 1616 /* Demotion ignores all cpuset and mempolicy settings */ 1617 migrate_pages(demote_folios, alloc_demote_page, NULL, 1618 (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, 1619 &nr_succeeded); 1620 1621 __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); 1622 1623 return nr_succeeded; 1624 } 1625 1626 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) 1627 { 1628 if (gfp_mask & __GFP_FS) 1629 return true; 1630 if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) 1631 return false; 1632 /* 1633 * We can "enter_fs" for swap-cache with only __GFP_IO 1634 * providing this isn't SWP_FS_OPS. 1635 * ->flags can be updated non-atomicially (scan_swap_map_slots), 1636 * but that will never affect SWP_FS_OPS, so the data_race 1637 * is safe. 1638 */ 1639 return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); 1640 } 1641 1642 /* 1643 * shrink_folio_list() returns the number of reclaimed pages 1644 */ 1645 static unsigned int shrink_folio_list(struct list_head *folio_list, 1646 struct pglist_data *pgdat, struct scan_control *sc, 1647 struct reclaim_stat *stat, bool ignore_references) 1648 { 1649 LIST_HEAD(ret_folios); 1650 LIST_HEAD(free_folios); 1651 LIST_HEAD(demote_folios); 1652 unsigned int nr_reclaimed = 0; 1653 unsigned int pgactivate = 0; 1654 bool do_demote_pass; 1655 struct swap_iocb *plug = NULL; 1656 1657 memset(stat, 0, sizeof(*stat)); 1658 cond_resched(); 1659 do_demote_pass = can_demote(pgdat->node_id, sc); 1660 1661 retry: 1662 while (!list_empty(folio_list)) { 1663 struct address_space *mapping; 1664 struct folio *folio; 1665 enum folio_references references = FOLIOREF_RECLAIM; 1666 bool dirty, writeback; 1667 unsigned int nr_pages; 1668 1669 cond_resched(); 1670 1671 folio = lru_to_folio(folio_list); 1672 list_del(&folio->lru); 1673 1674 if (!folio_trylock(folio)) 1675 goto keep; 1676 1677 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 1678 1679 nr_pages = folio_nr_pages(folio); 1680 1681 /* Account the number of base pages */ 1682 sc->nr_scanned += nr_pages; 1683 1684 if (unlikely(!folio_evictable(folio))) 1685 goto activate_locked; 1686 1687 if (!sc->may_unmap && folio_mapped(folio)) 1688 goto keep_locked; 1689 1690 /* folio_update_gen() tried to promote this page? */ 1691 if (lru_gen_enabled() && !ignore_references && 1692 folio_mapped(folio) && folio_test_referenced(folio)) 1693 goto keep_locked; 1694 1695 /* 1696 * The number of dirty pages determines if a node is marked 1697 * reclaim_congested. kswapd will stall and start writing 1698 * folios if the tail of the LRU is all dirty unqueued folios. 1699 */ 1700 folio_check_dirty_writeback(folio, &dirty, &writeback); 1701 if (dirty || writeback) 1702 stat->nr_dirty += nr_pages; 1703 1704 if (dirty && !writeback) 1705 stat->nr_unqueued_dirty += nr_pages; 1706 1707 /* 1708 * Treat this folio as congested if folios are cycling 1709 * through the LRU so quickly that the folios marked 1710 * for immediate reclaim are making it to the end of 1711 * the LRU a second time. 1712 */ 1713 if (writeback && folio_test_reclaim(folio)) 1714 stat->nr_congested += nr_pages; 1715 1716 /* 1717 * If a folio at the tail of the LRU is under writeback, there 1718 * are three cases to consider. 1719 * 1720 * 1) If reclaim is encountering an excessive number 1721 * of folios under writeback and this folio has both 1722 * the writeback and reclaim flags set, then it 1723 * indicates that folios are being queued for I/O but 1724 * are being recycled through the LRU before the I/O 1725 * can complete. Waiting on the folio itself risks an 1726 * indefinite stall if it is impossible to writeback 1727 * the folio due to I/O error or disconnected storage 1728 * so instead note that the LRU is being scanned too 1729 * quickly and the caller can stall after the folio 1730 * list has been processed. 1731 * 1732 * 2) Global or new memcg reclaim encounters a folio that is 1733 * not marked for immediate reclaim, or the caller does not 1734 * have __GFP_FS (or __GFP_IO if it's simply going to swap, 1735 * not to fs). In this case mark the folio for immediate 1736 * reclaim and continue scanning. 1737 * 1738 * Require may_enter_fs() because we would wait on fs, which 1739 * may not have submitted I/O yet. And the loop driver might 1740 * enter reclaim, and deadlock if it waits on a folio for 1741 * which it is needed to do the write (loop masks off 1742 * __GFP_IO|__GFP_FS for this reason); but more thought 1743 * would probably show more reasons. 1744 * 1745 * 3) Legacy memcg encounters a folio that already has the 1746 * reclaim flag set. memcg does not have any dirty folio 1747 * throttling so we could easily OOM just because too many 1748 * folios are in writeback and there is nothing else to 1749 * reclaim. Wait for the writeback to complete. 1750 * 1751 * In cases 1) and 2) we activate the folios to get them out of 1752 * the way while we continue scanning for clean folios on the 1753 * inactive list and refilling from the active list. The 1754 * observation here is that waiting for disk writes is more 1755 * expensive than potentially causing reloads down the line. 1756 * Since they're marked for immediate reclaim, they won't put 1757 * memory pressure on the cache working set any longer than it 1758 * takes to write them to disk. 1759 */ 1760 if (folio_test_writeback(folio)) { 1761 /* Case 1 above */ 1762 if (current_is_kswapd() && 1763 folio_test_reclaim(folio) && 1764 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { 1765 stat->nr_immediate += nr_pages; 1766 goto activate_locked; 1767 1768 /* Case 2 above */ 1769 } else if (writeback_throttling_sane(sc) || 1770 !folio_test_reclaim(folio) || 1771 !may_enter_fs(folio, sc->gfp_mask)) { 1772 /* 1773 * This is slightly racy - 1774 * folio_end_writeback() might have 1775 * just cleared the reclaim flag, then 1776 * setting the reclaim flag here ends up 1777 * interpreted as the readahead flag - but 1778 * that does not matter enough to care. 1779 * What we do want is for this folio to 1780 * have the reclaim flag set next time 1781 * memcg reclaim reaches the tests above, 1782 * so it will then wait for writeback to 1783 * avoid OOM; and it's also appropriate 1784 * in global reclaim. 1785 */ 1786 folio_set_reclaim(folio); 1787 stat->nr_writeback += nr_pages; 1788 goto activate_locked; 1789 1790 /* Case 3 above */ 1791 } else { 1792 folio_unlock(folio); 1793 folio_wait_writeback(folio); 1794 /* then go back and try same folio again */ 1795 list_add_tail(&folio->lru, folio_list); 1796 continue; 1797 } 1798 } 1799 1800 if (!ignore_references) 1801 references = folio_check_references(folio, sc); 1802 1803 switch (references) { 1804 case FOLIOREF_ACTIVATE: 1805 goto activate_locked; 1806 case FOLIOREF_KEEP: 1807 stat->nr_ref_keep += nr_pages; 1808 goto keep_locked; 1809 case FOLIOREF_RECLAIM: 1810 case FOLIOREF_RECLAIM_CLEAN: 1811 ; /* try to reclaim the folio below */ 1812 } 1813 1814 /* 1815 * Before reclaiming the folio, try to relocate 1816 * its contents to another node. 1817 */ 1818 if (do_demote_pass && 1819 (thp_migration_supported() || !folio_test_large(folio))) { 1820 list_add(&folio->lru, &demote_folios); 1821 folio_unlock(folio); 1822 continue; 1823 } 1824 1825 /* 1826 * Anonymous process memory has backing store? 1827 * Try to allocate it some swap space here. 1828 * Lazyfree folio could be freed directly 1829 */ 1830 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { 1831 if (!folio_test_swapcache(folio)) { 1832 if (!(sc->gfp_mask & __GFP_IO)) 1833 goto keep_locked; 1834 if (folio_maybe_dma_pinned(folio)) 1835 goto keep_locked; 1836 if (folio_test_large(folio)) { 1837 /* cannot split folio, skip it */ 1838 if (!can_split_folio(folio, NULL)) 1839 goto activate_locked; 1840 /* 1841 * Split folios without a PMD map right 1842 * away. Chances are some or all of the 1843 * tail pages can be freed without IO. 1844 */ 1845 if (!folio_entire_mapcount(folio) && 1846 split_folio_to_list(folio, 1847 folio_list)) 1848 goto activate_locked; 1849 } 1850 if (!add_to_swap(folio)) { 1851 if (!folio_test_large(folio)) 1852 goto activate_locked_split; 1853 /* Fallback to swap normal pages */ 1854 if (split_folio_to_list(folio, 1855 folio_list)) 1856 goto activate_locked; 1857 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1858 count_vm_event(THP_SWPOUT_FALLBACK); 1859 #endif 1860 if (!add_to_swap(folio)) 1861 goto activate_locked_split; 1862 } 1863 } 1864 } else if (folio_test_swapbacked(folio) && 1865 folio_test_large(folio)) { 1866 /* Split shmem folio */ 1867 if (split_folio_to_list(folio, folio_list)) 1868 goto keep_locked; 1869 } 1870 1871 /* 1872 * If the folio was split above, the tail pages will make 1873 * their own pass through this function and be accounted 1874 * then. 1875 */ 1876 if ((nr_pages > 1) && !folio_test_large(folio)) { 1877 sc->nr_scanned -= (nr_pages - 1); 1878 nr_pages = 1; 1879 } 1880 1881 /* 1882 * The folio is mapped into the page tables of one or more 1883 * processes. Try to unmap it here. 1884 */ 1885 if (folio_mapped(folio)) { 1886 enum ttu_flags flags = TTU_BATCH_FLUSH; 1887 bool was_swapbacked = folio_test_swapbacked(folio); 1888 1889 if (folio_test_pmd_mappable(folio)) 1890 flags |= TTU_SPLIT_HUGE_PMD; 1891 1892 try_to_unmap(folio, flags); 1893 if (folio_mapped(folio)) { 1894 stat->nr_unmap_fail += nr_pages; 1895 if (!was_swapbacked && 1896 folio_test_swapbacked(folio)) 1897 stat->nr_lazyfree_fail += nr_pages; 1898 goto activate_locked; 1899 } 1900 } 1901 1902 mapping = folio_mapping(folio); 1903 if (folio_test_dirty(folio)) { 1904 /* 1905 * Only kswapd can writeback filesystem folios 1906 * to avoid risk of stack overflow. But avoid 1907 * injecting inefficient single-folio I/O into 1908 * flusher writeback as much as possible: only 1909 * write folios when we've encountered many 1910 * dirty folios, and when we've already scanned 1911 * the rest of the LRU for clean folios and see 1912 * the same dirty folios again (with the reclaim 1913 * flag set). 1914 */ 1915 if (folio_is_file_lru(folio) && 1916 (!current_is_kswapd() || 1917 !folio_test_reclaim(folio) || 1918 !test_bit(PGDAT_DIRTY, &pgdat->flags))) { 1919 /* 1920 * Immediately reclaim when written back. 1921 * Similar in principle to deactivate_page() 1922 * except we already have the folio isolated 1923 * and know it's dirty 1924 */ 1925 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, 1926 nr_pages); 1927 folio_set_reclaim(folio); 1928 1929 goto activate_locked; 1930 } 1931 1932 if (references == FOLIOREF_RECLAIM_CLEAN) 1933 goto keep_locked; 1934 if (!may_enter_fs(folio, sc->gfp_mask)) 1935 goto keep_locked; 1936 if (!sc->may_writepage) 1937 goto keep_locked; 1938 1939 /* 1940 * Folio is dirty. Flush the TLB if a writable entry 1941 * potentially exists to avoid CPU writes after I/O 1942 * starts and then write it out here. 1943 */ 1944 try_to_unmap_flush_dirty(); 1945 switch (pageout(folio, mapping, &plug)) { 1946 case PAGE_KEEP: 1947 goto keep_locked; 1948 case PAGE_ACTIVATE: 1949 goto activate_locked; 1950 case PAGE_SUCCESS: 1951 stat->nr_pageout += nr_pages; 1952 1953 if (folio_test_writeback(folio)) 1954 goto keep; 1955 if (folio_test_dirty(folio)) 1956 goto keep; 1957 1958 /* 1959 * A synchronous write - probably a ramdisk. Go 1960 * ahead and try to reclaim the folio. 1961 */ 1962 if (!folio_trylock(folio)) 1963 goto keep; 1964 if (folio_test_dirty(folio) || 1965 folio_test_writeback(folio)) 1966 goto keep_locked; 1967 mapping = folio_mapping(folio); 1968 fallthrough; 1969 case PAGE_CLEAN: 1970 ; /* try to free the folio below */ 1971 } 1972 } 1973 1974 /* 1975 * If the folio has buffers, try to free the buffer 1976 * mappings associated with this folio. If we succeed 1977 * we try to free the folio as well. 1978 * 1979 * We do this even if the folio is dirty. 1980 * filemap_release_folio() does not perform I/O, but it 1981 * is possible for a folio to have the dirty flag set, 1982 * but it is actually clean (all its buffers are clean). 1983 * This happens if the buffers were written out directly, 1984 * with submit_bh(). ext3 will do this, as well as 1985 * the blockdev mapping. filemap_release_folio() will 1986 * discover that cleanness and will drop the buffers 1987 * and mark the folio clean - it can be freed. 1988 * 1989 * Rarely, folios can have buffers and no ->mapping. 1990 * These are the folios which were not successfully 1991 * invalidated in truncate_cleanup_folio(). We try to 1992 * drop those buffers here and if that worked, and the 1993 * folio is no longer mapped into process address space 1994 * (refcount == 1) it can be freed. Otherwise, leave 1995 * the folio on the LRU so it is swappable. 1996 */ 1997 if (folio_has_private(folio)) { 1998 if (!filemap_release_folio(folio, sc->gfp_mask)) 1999 goto activate_locked; 2000 if (!mapping && folio_ref_count(folio) == 1) { 2001 folio_unlock(folio); 2002 if (folio_put_testzero(folio)) 2003 goto free_it; 2004 else { 2005 /* 2006 * rare race with speculative reference. 2007 * the speculative reference will free 2008 * this folio shortly, so we may 2009 * increment nr_reclaimed here (and 2010 * leave it off the LRU). 2011 */ 2012 nr_reclaimed += nr_pages; 2013 continue; 2014 } 2015 } 2016 } 2017 2018 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { 2019 /* follow __remove_mapping for reference */ 2020 if (!folio_ref_freeze(folio, 1)) 2021 goto keep_locked; 2022 /* 2023 * The folio has only one reference left, which is 2024 * from the isolation. After the caller puts the 2025 * folio back on the lru and drops the reference, the 2026 * folio will be freed anyway. It doesn't matter 2027 * which lru it goes on. So we don't bother checking 2028 * the dirty flag here. 2029 */ 2030 count_vm_events(PGLAZYFREED, nr_pages); 2031 count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); 2032 } else if (!mapping || !__remove_mapping(mapping, folio, true, 2033 sc->target_mem_cgroup)) 2034 goto keep_locked; 2035 2036 folio_unlock(folio); 2037 free_it: 2038 /* 2039 * Folio may get swapped out as a whole, need to account 2040 * all pages in it. 2041 */ 2042 nr_reclaimed += nr_pages; 2043 2044 /* 2045 * Is there need to periodically free_folio_list? It would 2046 * appear not as the counts should be low 2047 */ 2048 if (unlikely(folio_test_large(folio))) 2049 destroy_large_folio(folio); 2050 else 2051 list_add(&folio->lru, &free_folios); 2052 continue; 2053 2054 activate_locked_split: 2055 /* 2056 * The tail pages that are failed to add into swap cache 2057 * reach here. Fixup nr_scanned and nr_pages. 2058 */ 2059 if (nr_pages > 1) { 2060 sc->nr_scanned -= (nr_pages - 1); 2061 nr_pages = 1; 2062 } 2063 activate_locked: 2064 /* Not a candidate for swapping, so reclaim swap space. */ 2065 if (folio_test_swapcache(folio) && 2066 (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) 2067 folio_free_swap(folio); 2068 VM_BUG_ON_FOLIO(folio_test_active(folio), folio); 2069 if (!folio_test_mlocked(folio)) { 2070 int type = folio_is_file_lru(folio); 2071 folio_set_active(folio); 2072 stat->nr_activate[type] += nr_pages; 2073 count_memcg_folio_events(folio, PGACTIVATE, nr_pages); 2074 } 2075 keep_locked: 2076 folio_unlock(folio); 2077 keep: 2078 list_add(&folio->lru, &ret_folios); 2079 VM_BUG_ON_FOLIO(folio_test_lru(folio) || 2080 folio_test_unevictable(folio), folio); 2081 } 2082 /* 'folio_list' is always empty here */ 2083 2084 /* Migrate folios selected for demotion */ 2085 nr_reclaimed += demote_folio_list(&demote_folios, pgdat); 2086 /* Folios that could not be demoted are still in @demote_folios */ 2087 if (!list_empty(&demote_folios)) { 2088 /* Folios which weren't demoted go back on @folio_list for retry: */ 2089 list_splice_init(&demote_folios, folio_list); 2090 do_demote_pass = false; 2091 goto retry; 2092 } 2093 2094 pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; 2095 2096 mem_cgroup_uncharge_list(&free_folios); 2097 try_to_unmap_flush(); 2098 free_unref_page_list(&free_folios); 2099 2100 list_splice(&ret_folios, folio_list); 2101 count_vm_events(PGACTIVATE, pgactivate); 2102 2103 if (plug) 2104 swap_write_unplug(plug); 2105 return nr_reclaimed; 2106 } 2107 2108 unsigned int reclaim_clean_pages_from_list(struct zone *zone, 2109 struct list_head *folio_list) 2110 { 2111 struct scan_control sc = { 2112 .gfp_mask = GFP_KERNEL, 2113 .may_unmap = 1, 2114 }; 2115 struct reclaim_stat stat; 2116 unsigned int nr_reclaimed; 2117 struct folio *folio, *next; 2118 LIST_HEAD(clean_folios); 2119 unsigned int noreclaim_flag; 2120 2121 list_for_each_entry_safe(folio, next, folio_list, lru) { 2122 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && 2123 !folio_test_dirty(folio) && !__folio_test_movable(folio) && 2124 !folio_test_unevictable(folio)) { 2125 folio_clear_active(folio); 2126 list_move(&folio->lru, &clean_folios); 2127 } 2128 } 2129 2130 /* 2131 * We should be safe here since we are only dealing with file pages and 2132 * we are not kswapd and therefore cannot write dirty file pages. But 2133 * call memalloc_noreclaim_save() anyway, just in case these conditions 2134 * change in the future. 2135 */ 2136 noreclaim_flag = memalloc_noreclaim_save(); 2137 nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, 2138 &stat, true); 2139 memalloc_noreclaim_restore(noreclaim_flag); 2140 2141 list_splice(&clean_folios, folio_list); 2142 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2143 -(long)nr_reclaimed); 2144 /* 2145 * Since lazyfree pages are isolated from file LRU from the beginning, 2146 * they will rotate back to anonymous LRU in the end if it failed to 2147 * discard so isolated count will be mismatched. 2148 * Compensate the isolated count for both LRU lists. 2149 */ 2150 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, 2151 stat.nr_lazyfree_fail); 2152 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, 2153 -(long)stat.nr_lazyfree_fail); 2154 return nr_reclaimed; 2155 } 2156 2157 /* 2158 * Update LRU sizes after isolating pages. The LRU size updates must 2159 * be complete before mem_cgroup_update_lru_size due to a sanity check. 2160 */ 2161 static __always_inline void update_lru_sizes(struct lruvec *lruvec, 2162 enum lru_list lru, unsigned long *nr_zone_taken) 2163 { 2164 int zid; 2165 2166 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2167 if (!nr_zone_taken[zid]) 2168 continue; 2169 2170 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); 2171 } 2172 2173 } 2174 2175 /* 2176 * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. 2177 * 2178 * lruvec->lru_lock is heavily contended. Some of the functions that 2179 * shrink the lists perform better by taking out a batch of pages 2180 * and working on them outside the LRU lock. 2181 * 2182 * For pagecache intensive workloads, this function is the hottest 2183 * spot in the kernel (apart from copy_*_user functions). 2184 * 2185 * Lru_lock must be held before calling this function. 2186 * 2187 * @nr_to_scan: The number of eligible pages to look through on the list. 2188 * @lruvec: The LRU vector to pull pages from. 2189 * @dst: The temp list to put pages on to. 2190 * @nr_scanned: The number of pages that were scanned. 2191 * @sc: The scan_control struct for this reclaim session 2192 * @lru: LRU list id for isolating 2193 * 2194 * returns how many pages were moved onto *@dst. 2195 */ 2196 static unsigned long isolate_lru_folios(unsigned long nr_to_scan, 2197 struct lruvec *lruvec, struct list_head *dst, 2198 unsigned long *nr_scanned, struct scan_control *sc, 2199 enum lru_list lru) 2200 { 2201 struct list_head *src = &lruvec->lists[lru]; 2202 unsigned long nr_taken = 0; 2203 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; 2204 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; 2205 unsigned long skipped = 0; 2206 unsigned long scan, total_scan, nr_pages; 2207 LIST_HEAD(folios_skipped); 2208 2209 total_scan = 0; 2210 scan = 0; 2211 while (scan < nr_to_scan && !list_empty(src)) { 2212 struct list_head *move_to = src; 2213 struct folio *folio; 2214 2215 folio = lru_to_folio(src); 2216 prefetchw_prev_lru_folio(folio, src, flags); 2217 2218 nr_pages = folio_nr_pages(folio); 2219 total_scan += nr_pages; 2220 2221 if (folio_zonenum(folio) > sc->reclaim_idx) { 2222 nr_skipped[folio_zonenum(folio)] += nr_pages; 2223 move_to = &folios_skipped; 2224 goto move; 2225 } 2226 2227 /* 2228 * Do not count skipped folios because that makes the function 2229 * return with no isolated folios if the LRU mostly contains 2230 * ineligible folios. This causes the VM to not reclaim any 2231 * folios, triggering a premature OOM. 2232 * Account all pages in a folio. 2233 */ 2234 scan += nr_pages; 2235 2236 if (!folio_test_lru(folio)) 2237 goto move; 2238 if (!sc->may_unmap && folio_mapped(folio)) 2239 goto move; 2240 2241 /* 2242 * Be careful not to clear the lru flag until after we're 2243 * sure the folio is not being freed elsewhere -- the 2244 * folio release code relies on it. 2245 */ 2246 if (unlikely(!folio_try_get(folio))) 2247 goto move; 2248 2249 if (!folio_test_clear_lru(folio)) { 2250 /* Another thread is already isolating this folio */ 2251 folio_put(folio); 2252 goto move; 2253 } 2254 2255 nr_taken += nr_pages; 2256 nr_zone_taken[folio_zonenum(folio)] += nr_pages; 2257 move_to = dst; 2258 move: 2259 list_move(&folio->lru, move_to); 2260 } 2261 2262 /* 2263 * Splice any skipped folios to the start of the LRU list. Note that 2264 * this disrupts the LRU order when reclaiming for lower zones but 2265 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX 2266 * scanning would soon rescan the same folios to skip and waste lots 2267 * of cpu cycles. 2268 */ 2269 if (!list_empty(&folios_skipped)) { 2270 int zid; 2271 2272 list_splice(&folios_skipped, src); 2273 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 2274 if (!nr_skipped[zid]) 2275 continue; 2276 2277 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); 2278 skipped += nr_skipped[zid]; 2279 } 2280 } 2281 *nr_scanned = total_scan; 2282 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, 2283 total_scan, skipped, nr_taken, 2284 sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); 2285 update_lru_sizes(lruvec, lru, nr_zone_taken); 2286 return nr_taken; 2287 } 2288 2289 /** 2290 * folio_isolate_lru() - Try to isolate a folio from its LRU list. 2291 * @folio: Folio to isolate from its LRU list. 2292 * 2293 * Isolate a @folio from an LRU list and adjust the vmstat statistic 2294 * corresponding to whatever LRU list the folio was on. 2295 * 2296 * The folio will have its LRU flag cleared. If it was found on the 2297 * active list, it will have the Active flag set. If it was found on the 2298 * unevictable list, it will have the Unevictable flag set. These flags 2299 * may need to be cleared by the caller before letting the page go. 2300 * 2301 * Context: 2302 * 2303 * (1) Must be called with an elevated refcount on the folio. This is a 2304 * fundamental difference from isolate_lru_folios() (which is called 2305 * without a stable reference). 2306 * (2) The lru_lock must not be held. 2307 * (3) Interrupts must be enabled. 2308 * 2309 * Return: 0 if the folio was removed from an LRU list. 2310 * -EBUSY if the folio was not on an LRU list. 2311 */ 2312 int folio_isolate_lru(struct folio *folio) 2313 { 2314 int ret = -EBUSY; 2315 2316 VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); 2317 2318 if (folio_test_clear_lru(folio)) { 2319 struct lruvec *lruvec; 2320 2321 folio_get(folio); 2322 lruvec = folio_lruvec_lock_irq(folio); 2323 lruvec_del_folio(lruvec, folio); 2324 unlock_page_lruvec_irq(lruvec); 2325 ret = 0; 2326 } 2327 2328 return ret; 2329 } 2330 2331 /* 2332 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 2333 * then get rescheduled. When there are massive number of tasks doing page 2334 * allocation, such sleeping direct reclaimers may keep piling up on each CPU, 2335 * the LRU list will go small and be scanned faster than necessary, leading to 2336 * unnecessary swapping, thrashing and OOM. 2337 */ 2338 static int too_many_isolated(struct pglist_data *pgdat, int file, 2339 struct scan_control *sc) 2340 { 2341 unsigned long inactive, isolated; 2342 bool too_many; 2343 2344 if (current_is_kswapd()) 2345 return 0; 2346 2347 if (!writeback_throttling_sane(sc)) 2348 return 0; 2349 2350 if (file) { 2351 inactive = node_page_state(pgdat, NR_INACTIVE_FILE); 2352 isolated = node_page_state(pgdat, NR_ISOLATED_FILE); 2353 } else { 2354 inactive = node_page_state(pgdat, NR_INACTIVE_ANON); 2355 isolated = node_page_state(pgdat, NR_ISOLATED_ANON); 2356 } 2357 2358 /* 2359 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they 2360 * won't get blocked by normal direct-reclaimers, forming a circular 2361 * deadlock. 2362 */ 2363 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 2364 inactive >>= 3; 2365 2366 too_many = isolated > inactive; 2367 2368 /* Wake up tasks throttled due to too_many_isolated. */ 2369 if (!too_many) 2370 wake_throttle_isolated(pgdat); 2371 2372 return too_many; 2373 } 2374 2375 /* 2376 * move_folios_to_lru() moves folios from private @list to appropriate LRU list. 2377 * On return, @list is reused as a list of folios to be freed by the caller. 2378 * 2379 * Returns the number of pages moved to the given lruvec. 2380 */ 2381 static unsigned int move_folios_to_lru(struct lruvec *lruvec, 2382 struct list_head *list) 2383 { 2384 int nr_pages, nr_moved = 0; 2385 LIST_HEAD(folios_to_free); 2386 2387 while (!list_empty(list)) { 2388 struct folio *folio = lru_to_folio(list); 2389 2390 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 2391 list_del(&folio->lru); 2392 if (unlikely(!folio_evictable(folio))) { 2393 spin_unlock_irq(&lruvec->lru_lock); 2394 folio_putback_lru(folio); 2395 spin_lock_irq(&lruvec->lru_lock); 2396 continue; 2397 } 2398 2399 /* 2400 * The folio_set_lru needs to be kept here for list integrity. 2401 * Otherwise: 2402 * #0 move_folios_to_lru #1 release_pages 2403 * if (!folio_put_testzero()) 2404 * if (folio_put_testzero()) 2405 * !lru //skip lru_lock 2406 * folio_set_lru() 2407 * list_add(&folio->lru,) 2408 * list_add(&folio->lru,) 2409 */ 2410 folio_set_lru(folio); 2411 2412 if (unlikely(folio_put_testzero(folio))) { 2413 __folio_clear_lru_flags(folio); 2414 2415 if (unlikely(folio_test_large(folio))) { 2416 spin_unlock_irq(&lruvec->lru_lock); 2417 destroy_large_folio(folio); 2418 spin_lock_irq(&lruvec->lru_lock); 2419 } else 2420 list_add(&folio->lru, &folios_to_free); 2421 2422 continue; 2423 } 2424 2425 /* 2426 * All pages were isolated from the same lruvec (and isolation 2427 * inhibits memcg migration). 2428 */ 2429 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); 2430 lruvec_add_folio(lruvec, folio); 2431 nr_pages = folio_nr_pages(folio); 2432 nr_moved += nr_pages; 2433 if (folio_test_active(folio)) 2434 workingset_age_nonresident(lruvec, nr_pages); 2435 } 2436 2437 /* 2438 * To save our caller's stack, now use input list for pages to free. 2439 */ 2440 list_splice(&folios_to_free, list); 2441 2442 return nr_moved; 2443 } 2444 2445 /* 2446 * If a kernel thread (such as nfsd for loop-back mounts) services a backing 2447 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case 2448 * we should not throttle. Otherwise it is safe to do so. 2449 */ 2450 static int current_may_throttle(void) 2451 { 2452 return !(current->flags & PF_LOCAL_THROTTLE); 2453 } 2454 2455 /* 2456 * shrink_inactive_list() is a helper for shrink_node(). It returns the number 2457 * of reclaimed pages 2458 */ 2459 static unsigned long shrink_inactive_list(unsigned long nr_to_scan, 2460 struct lruvec *lruvec, struct scan_control *sc, 2461 enum lru_list lru) 2462 { 2463 LIST_HEAD(folio_list); 2464 unsigned long nr_scanned; 2465 unsigned int nr_reclaimed = 0; 2466 unsigned long nr_taken; 2467 struct reclaim_stat stat; 2468 bool file = is_file_lru(lru); 2469 enum vm_event_item item; 2470 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2471 bool stalled = false; 2472 2473 while (unlikely(too_many_isolated(pgdat, file, sc))) { 2474 if (stalled) 2475 return 0; 2476 2477 /* wait a bit for the reclaimer. */ 2478 stalled = true; 2479 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); 2480 2481 /* We are about to die and free our memory. Return now. */ 2482 if (fatal_signal_pending(current)) 2483 return SWAP_CLUSTER_MAX; 2484 } 2485 2486 lru_add_drain(); 2487 2488 spin_lock_irq(&lruvec->lru_lock); 2489 2490 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, 2491 &nr_scanned, sc, lru); 2492 2493 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2494 item = PGSCAN_KSWAPD + reclaimer_offset(); 2495 if (!cgroup_reclaim(sc)) 2496 __count_vm_events(item, nr_scanned); 2497 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); 2498 __count_vm_events(PGSCAN_ANON + file, nr_scanned); 2499 2500 spin_unlock_irq(&lruvec->lru_lock); 2501 2502 if (nr_taken == 0) 2503 return 0; 2504 2505 nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); 2506 2507 spin_lock_irq(&lruvec->lru_lock); 2508 move_folios_to_lru(lruvec, &folio_list); 2509 2510 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2511 item = PGSTEAL_KSWAPD + reclaimer_offset(); 2512 if (!cgroup_reclaim(sc)) 2513 __count_vm_events(item, nr_reclaimed); 2514 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); 2515 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); 2516 spin_unlock_irq(&lruvec->lru_lock); 2517 2518 lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); 2519 mem_cgroup_uncharge_list(&folio_list); 2520 free_unref_page_list(&folio_list); 2521 2522 /* 2523 * If dirty folios are scanned that are not queued for IO, it 2524 * implies that flushers are not doing their job. This can 2525 * happen when memory pressure pushes dirty folios to the end of 2526 * the LRU before the dirty limits are breached and the dirty 2527 * data has expired. It can also happen when the proportion of 2528 * dirty folios grows not through writes but through memory 2529 * pressure reclaiming all the clean cache. And in some cases, 2530 * the flushers simply cannot keep up with the allocation 2531 * rate. Nudge the flusher threads in case they are asleep. 2532 */ 2533 if (stat.nr_unqueued_dirty == nr_taken) { 2534 wakeup_flusher_threads(WB_REASON_VMSCAN); 2535 /* 2536 * For cgroupv1 dirty throttling is achieved by waking up 2537 * the kernel flusher here and later waiting on folios 2538 * which are in writeback to finish (see shrink_folio_list()). 2539 * 2540 * Flusher may not be able to issue writeback quickly 2541 * enough for cgroupv1 writeback throttling to work 2542 * on a large system. 2543 */ 2544 if (!writeback_throttling_sane(sc)) 2545 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 2546 } 2547 2548 sc->nr.dirty += stat.nr_dirty; 2549 sc->nr.congested += stat.nr_congested; 2550 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; 2551 sc->nr.writeback += stat.nr_writeback; 2552 sc->nr.immediate += stat.nr_immediate; 2553 sc->nr.taken += nr_taken; 2554 if (file) 2555 sc->nr.file_taken += nr_taken; 2556 2557 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, 2558 nr_scanned, nr_reclaimed, &stat, sc->priority, file); 2559 return nr_reclaimed; 2560 } 2561 2562 /* 2563 * shrink_active_list() moves folios from the active LRU to the inactive LRU. 2564 * 2565 * We move them the other way if the folio is referenced by one or more 2566 * processes. 2567 * 2568 * If the folios are mostly unmapped, the processing is fast and it is 2569 * appropriate to hold lru_lock across the whole operation. But if 2570 * the folios are mapped, the processing is slow (folio_referenced()), so 2571 * we should drop lru_lock around each folio. It's impossible to balance 2572 * this, so instead we remove the folios from the LRU while processing them. 2573 * It is safe to rely on the active flag against the non-LRU folios in here 2574 * because nobody will play with that bit on a non-LRU folio. 2575 * 2576 * The downside is that we have to touch folio->_refcount against each folio. 2577 * But we had to alter folio->flags anyway. 2578 */ 2579 static void shrink_active_list(unsigned long nr_to_scan, 2580 struct lruvec *lruvec, 2581 struct scan_control *sc, 2582 enum lru_list lru) 2583 { 2584 unsigned long nr_taken; 2585 unsigned long nr_scanned; 2586 unsigned long vm_flags; 2587 LIST_HEAD(l_hold); /* The folios which were snipped off */ 2588 LIST_HEAD(l_active); 2589 LIST_HEAD(l_inactive); 2590 unsigned nr_deactivate, nr_activate; 2591 unsigned nr_rotated = 0; 2592 int file = is_file_lru(lru); 2593 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2594 2595 lru_add_drain(); 2596 2597 spin_lock_irq(&lruvec->lru_lock); 2598 2599 nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, 2600 &nr_scanned, sc, lru); 2601 2602 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); 2603 2604 if (!cgroup_reclaim(sc)) 2605 __count_vm_events(PGREFILL, nr_scanned); 2606 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); 2607 2608 spin_unlock_irq(&lruvec->lru_lock); 2609 2610 while (!list_empty(&l_hold)) { 2611 struct folio *folio; 2612 2613 cond_resched(); 2614 folio = lru_to_folio(&l_hold); 2615 list_del(&folio->lru); 2616 2617 if (unlikely(!folio_evictable(folio))) { 2618 folio_putback_lru(folio); 2619 continue; 2620 } 2621 2622 if (unlikely(buffer_heads_over_limit)) { 2623 if (folio_test_private(folio) && folio_trylock(folio)) { 2624 if (folio_test_private(folio)) 2625 filemap_release_folio(folio, 0); 2626 folio_unlock(folio); 2627 } 2628 } 2629 2630 /* Referenced or rmap lock contention: rotate */ 2631 if (folio_referenced(folio, 0, sc->target_mem_cgroup, 2632 &vm_flags) != 0) { 2633 /* 2634 * Identify referenced, file-backed active folios and 2635 * give them one more trip around the active list. So 2636 * that executable code get better chances to stay in 2637 * memory under moderate memory pressure. Anon folios 2638 * are not likely to be evicted by use-once streaming 2639 * IO, plus JVM can create lots of anon VM_EXEC folios, 2640 * so we ignore them here. 2641 */ 2642 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { 2643 nr_rotated += folio_nr_pages(folio); 2644 list_add(&folio->lru, &l_active); 2645 continue; 2646 } 2647 } 2648 2649 folio_clear_active(folio); /* we are de-activating */ 2650 folio_set_workingset(folio); 2651 list_add(&folio->lru, &l_inactive); 2652 } 2653 2654 /* 2655 * Move folios back to the lru list. 2656 */ 2657 spin_lock_irq(&lruvec->lru_lock); 2658 2659 nr_activate = move_folios_to_lru(lruvec, &l_active); 2660 nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); 2661 /* Keep all free folios in l_active list */ 2662 list_splice(&l_inactive, &l_active); 2663 2664 __count_vm_events(PGDEACTIVATE, nr_deactivate); 2665 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); 2666 2667 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); 2668 spin_unlock_irq(&lruvec->lru_lock); 2669 2670 if (nr_rotated) 2671 lru_note_cost(lruvec, file, 0, nr_rotated); 2672 mem_cgroup_uncharge_list(&l_active); 2673 free_unref_page_list(&l_active); 2674 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, 2675 nr_deactivate, nr_rotated, sc->priority, file); 2676 } 2677 2678 static unsigned int reclaim_folio_list(struct list_head *folio_list, 2679 struct pglist_data *pgdat) 2680 { 2681 struct reclaim_stat dummy_stat; 2682 unsigned int nr_reclaimed; 2683 struct folio *folio; 2684 struct scan_control sc = { 2685 .gfp_mask = GFP_KERNEL, 2686 .may_writepage = 1, 2687 .may_unmap = 1, 2688 .may_swap = 1, 2689 .no_demotion = 1, 2690 }; 2691 2692 nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); 2693 while (!list_empty(folio_list)) { 2694 folio = lru_to_folio(folio_list); 2695 list_del(&folio->lru); 2696 folio_putback_lru(folio); 2697 } 2698 2699 return nr_reclaimed; 2700 } 2701 2702 unsigned long reclaim_pages(struct list_head *folio_list) 2703 { 2704 int nid; 2705 unsigned int nr_reclaimed = 0; 2706 LIST_HEAD(node_folio_list); 2707 unsigned int noreclaim_flag; 2708 2709 if (list_empty(folio_list)) 2710 return nr_reclaimed; 2711 2712 noreclaim_flag = memalloc_noreclaim_save(); 2713 2714 nid = folio_nid(lru_to_folio(folio_list)); 2715 do { 2716 struct folio *folio = lru_to_folio(folio_list); 2717 2718 if (nid == folio_nid(folio)) { 2719 folio_clear_active(folio); 2720 list_move(&folio->lru, &node_folio_list); 2721 continue; 2722 } 2723 2724 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2725 nid = folio_nid(lru_to_folio(folio_list)); 2726 } while (!list_empty(folio_list)); 2727 2728 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); 2729 2730 memalloc_noreclaim_restore(noreclaim_flag); 2731 2732 return nr_reclaimed; 2733 } 2734 2735 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 2736 struct lruvec *lruvec, struct scan_control *sc) 2737 { 2738 if (is_active_lru(lru)) { 2739 if (sc->may_deactivate & (1 << is_file_lru(lru))) 2740 shrink_active_list(nr_to_scan, lruvec, sc, lru); 2741 else 2742 sc->skipped_deactivate = 1; 2743 return 0; 2744 } 2745 2746 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 2747 } 2748 2749 /* 2750 * The inactive anon list should be small enough that the VM never has 2751 * to do too much work. 2752 * 2753 * The inactive file list should be small enough to leave most memory 2754 * to the established workingset on the scan-resistant active list, 2755 * but large enough to avoid thrashing the aggregate readahead window. 2756 * 2757 * Both inactive lists should also be large enough that each inactive 2758 * folio has a chance to be referenced again before it is reclaimed. 2759 * 2760 * If that fails and refaulting is observed, the inactive list grows. 2761 * 2762 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios 2763 * on this LRU, maintained by the pageout code. An inactive_ratio 2764 * of 3 means 3:1 or 25% of the folios are kept on the inactive list. 2765 * 2766 * total target max 2767 * memory ratio inactive 2768 * ------------------------------------- 2769 * 10MB 1 5MB 2770 * 100MB 1 50MB 2771 * 1GB 3 250MB 2772 * 10GB 10 0.9GB 2773 * 100GB 31 3GB 2774 * 1TB 101 10GB 2775 * 10TB 320 32GB 2776 */ 2777 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) 2778 { 2779 enum lru_list active_lru = inactive_lru + LRU_ACTIVE; 2780 unsigned long inactive, active; 2781 unsigned long inactive_ratio; 2782 unsigned long gb; 2783 2784 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); 2785 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); 2786 2787 gb = (inactive + active) >> (30 - PAGE_SHIFT); 2788 if (gb) 2789 inactive_ratio = int_sqrt(10 * gb); 2790 else 2791 inactive_ratio = 1; 2792 2793 return inactive * inactive_ratio < active; 2794 } 2795 2796 enum scan_balance { 2797 SCAN_EQUAL, 2798 SCAN_FRACT, 2799 SCAN_ANON, 2800 SCAN_FILE, 2801 }; 2802 2803 static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) 2804 { 2805 unsigned long file; 2806 struct lruvec *target_lruvec; 2807 2808 if (lru_gen_enabled()) 2809 return; 2810 2811 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 2812 2813 /* 2814 * Flush the memory cgroup stats, so that we read accurate per-memcg 2815 * lruvec stats for heuristics. 2816 */ 2817 mem_cgroup_flush_stats(); 2818 2819 /* 2820 * Determine the scan balance between anon and file LRUs. 2821 */ 2822 spin_lock_irq(&target_lruvec->lru_lock); 2823 sc->anon_cost = target_lruvec->anon_cost; 2824 sc->file_cost = target_lruvec->file_cost; 2825 spin_unlock_irq(&target_lruvec->lru_lock); 2826 2827 /* 2828 * Target desirable inactive:active list ratios for the anon 2829 * and file LRU lists. 2830 */ 2831 if (!sc->force_deactivate) { 2832 unsigned long refaults; 2833 2834 /* 2835 * When refaults are being observed, it means a new 2836 * workingset is being established. Deactivate to get 2837 * rid of any stale active pages quickly. 2838 */ 2839 refaults = lruvec_page_state(target_lruvec, 2840 WORKINGSET_ACTIVATE_ANON); 2841 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || 2842 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) 2843 sc->may_deactivate |= DEACTIVATE_ANON; 2844 else 2845 sc->may_deactivate &= ~DEACTIVATE_ANON; 2846 2847 refaults = lruvec_page_state(target_lruvec, 2848 WORKINGSET_ACTIVATE_FILE); 2849 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || 2850 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) 2851 sc->may_deactivate |= DEACTIVATE_FILE; 2852 else 2853 sc->may_deactivate &= ~DEACTIVATE_FILE; 2854 } else 2855 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; 2856 2857 /* 2858 * If we have plenty of inactive file pages that aren't 2859 * thrashing, try to reclaim those first before touching 2860 * anonymous pages. 2861 */ 2862 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); 2863 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) 2864 sc->cache_trim_mode = 1; 2865 else 2866 sc->cache_trim_mode = 0; 2867 2868 /* 2869 * Prevent the reclaimer from falling into the cache trap: as 2870 * cache pages start out inactive, every cache fault will tip 2871 * the scan balance towards the file LRU. And as the file LRU 2872 * shrinks, so does the window for rotation from references. 2873 * This means we have a runaway feedback loop where a tiny 2874 * thrashing file LRU becomes infinitely more attractive than 2875 * anon pages. Try to detect this based on file LRU size. 2876 */ 2877 if (!cgroup_reclaim(sc)) { 2878 unsigned long total_high_wmark = 0; 2879 unsigned long free, anon; 2880 int z; 2881 2882 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); 2883 file = node_page_state(pgdat, NR_ACTIVE_FILE) + 2884 node_page_state(pgdat, NR_INACTIVE_FILE); 2885 2886 for (z = 0; z < MAX_NR_ZONES; z++) { 2887 struct zone *zone = &pgdat->node_zones[z]; 2888 2889 if (!managed_zone(zone)) 2890 continue; 2891 2892 total_high_wmark += high_wmark_pages(zone); 2893 } 2894 2895 /* 2896 * Consider anon: if that's low too, this isn't a 2897 * runaway file reclaim problem, but rather just 2898 * extreme pressure. Reclaim as per usual then. 2899 */ 2900 anon = node_page_state(pgdat, NR_INACTIVE_ANON); 2901 2902 sc->file_is_tiny = 2903 file + free <= total_high_wmark && 2904 !(sc->may_deactivate & DEACTIVATE_ANON) && 2905 anon >> sc->priority; 2906 } 2907 } 2908 2909 /* 2910 * Determine how aggressively the anon and file LRU lists should be 2911 * scanned. 2912 * 2913 * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan 2914 * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan 2915 */ 2916 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 2917 unsigned long *nr) 2918 { 2919 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 2920 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 2921 unsigned long anon_cost, file_cost, total_cost; 2922 int swappiness = mem_cgroup_swappiness(memcg); 2923 u64 fraction[ANON_AND_FILE]; 2924 u64 denominator = 0; /* gcc */ 2925 enum scan_balance scan_balance; 2926 unsigned long ap, fp; 2927 enum lru_list lru; 2928 2929 /* If we have no swap space, do not bother scanning anon folios. */ 2930 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { 2931 scan_balance = SCAN_FILE; 2932 goto out; 2933 } 2934 2935 /* 2936 * Global reclaim will swap to prevent OOM even with no 2937 * swappiness, but memcg users want to use this knob to 2938 * disable swapping for individual groups completely when 2939 * using the memory controller's swap limit feature would be 2940 * too expensive. 2941 */ 2942 if (cgroup_reclaim(sc) && !swappiness) { 2943 scan_balance = SCAN_FILE; 2944 goto out; 2945 } 2946 2947 /* 2948 * Do not apply any pressure balancing cleverness when the 2949 * system is close to OOM, scan both anon and file equally 2950 * (unless the swappiness setting disagrees with swapping). 2951 */ 2952 if (!sc->priority && swappiness) { 2953 scan_balance = SCAN_EQUAL; 2954 goto out; 2955 } 2956 2957 /* 2958 * If the system is almost out of file pages, force-scan anon. 2959 */ 2960 if (sc->file_is_tiny) { 2961 scan_balance = SCAN_ANON; 2962 goto out; 2963 } 2964 2965 /* 2966 * If there is enough inactive page cache, we do not reclaim 2967 * anything from the anonymous working right now. 2968 */ 2969 if (sc->cache_trim_mode) { 2970 scan_balance = SCAN_FILE; 2971 goto out; 2972 } 2973 2974 scan_balance = SCAN_FRACT; 2975 /* 2976 * Calculate the pressure balance between anon and file pages. 2977 * 2978 * The amount of pressure we put on each LRU is inversely 2979 * proportional to the cost of reclaiming each list, as 2980 * determined by the share of pages that are refaulting, times 2981 * the relative IO cost of bringing back a swapped out 2982 * anonymous page vs reloading a filesystem page (swappiness). 2983 * 2984 * Although we limit that influence to ensure no list gets 2985 * left behind completely: at least a third of the pressure is 2986 * applied, before swappiness. 2987 * 2988 * With swappiness at 100, anon and file have equal IO cost. 2989 */ 2990 total_cost = sc->anon_cost + sc->file_cost; 2991 anon_cost = total_cost + sc->anon_cost; 2992 file_cost = total_cost + sc->file_cost; 2993 total_cost = anon_cost + file_cost; 2994 2995 ap = swappiness * (total_cost + 1); 2996 ap /= anon_cost + 1; 2997 2998 fp = (200 - swappiness) * (total_cost + 1); 2999 fp /= file_cost + 1; 3000 3001 fraction[0] = ap; 3002 fraction[1] = fp; 3003 denominator = ap + fp; 3004 out: 3005 for_each_evictable_lru(lru) { 3006 int file = is_file_lru(lru); 3007 unsigned long lruvec_size; 3008 unsigned long low, min; 3009 unsigned long scan; 3010 3011 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); 3012 mem_cgroup_protection(sc->target_mem_cgroup, memcg, 3013 &min, &low); 3014 3015 if (min || low) { 3016 /* 3017 * Scale a cgroup's reclaim pressure by proportioning 3018 * its current usage to its memory.low or memory.min 3019 * setting. 3020 * 3021 * This is important, as otherwise scanning aggression 3022 * becomes extremely binary -- from nothing as we 3023 * approach the memory protection threshold, to totally 3024 * nominal as we exceed it. This results in requiring 3025 * setting extremely liberal protection thresholds. It 3026 * also means we simply get no protection at all if we 3027 * set it too low, which is not ideal. 3028 * 3029 * If there is any protection in place, we reduce scan 3030 * pressure by how much of the total memory used is 3031 * within protection thresholds. 3032 * 3033 * There is one special case: in the first reclaim pass, 3034 * we skip over all groups that are within their low 3035 * protection. If that fails to reclaim enough pages to 3036 * satisfy the reclaim goal, we come back and override 3037 * the best-effort low protection. However, we still 3038 * ideally want to honor how well-behaved groups are in 3039 * that case instead of simply punishing them all 3040 * equally. As such, we reclaim them based on how much 3041 * memory they are using, reducing the scan pressure 3042 * again by how much of the total memory used is under 3043 * hard protection. 3044 */ 3045 unsigned long cgroup_size = mem_cgroup_size(memcg); 3046 unsigned long protection; 3047 3048 /* memory.low scaling, make sure we retry before OOM */ 3049 if (!sc->memcg_low_reclaim && low > min) { 3050 protection = low; 3051 sc->memcg_low_skipped = 1; 3052 } else { 3053 protection = min; 3054 } 3055 3056 /* Avoid TOCTOU with earlier protection check */ 3057 cgroup_size = max(cgroup_size, protection); 3058 3059 scan = lruvec_size - lruvec_size * protection / 3060 (cgroup_size + 1); 3061 3062 /* 3063 * Minimally target SWAP_CLUSTER_MAX pages to keep 3064 * reclaim moving forwards, avoiding decrementing 3065 * sc->priority further than desirable. 3066 */ 3067 scan = max(scan, SWAP_CLUSTER_MAX); 3068 } else { 3069 scan = lruvec_size; 3070 } 3071 3072 scan >>= sc->priority; 3073 3074 /* 3075 * If the cgroup's already been deleted, make sure to 3076 * scrape out the remaining cache. 3077 */ 3078 if (!scan && !mem_cgroup_online(memcg)) 3079 scan = min(lruvec_size, SWAP_CLUSTER_MAX); 3080 3081 switch (scan_balance) { 3082 case SCAN_EQUAL: 3083 /* Scan lists relative to size */ 3084 break; 3085 case SCAN_FRACT: 3086 /* 3087 * Scan types proportional to swappiness and 3088 * their relative recent reclaim efficiency. 3089 * Make sure we don't miss the last page on 3090 * the offlined memory cgroups because of a 3091 * round-off error. 3092 */ 3093 scan = mem_cgroup_online(memcg) ? 3094 div64_u64(scan * fraction[file], denominator) : 3095 DIV64_U64_ROUND_UP(scan * fraction[file], 3096 denominator); 3097 break; 3098 case SCAN_FILE: 3099 case SCAN_ANON: 3100 /* Scan one type exclusively */ 3101 if ((scan_balance == SCAN_FILE) != file) 3102 scan = 0; 3103 break; 3104 default: 3105 /* Look ma, no brain */ 3106 BUG(); 3107 } 3108 3109 nr[lru] = scan; 3110 } 3111 } 3112 3113 /* 3114 * Anonymous LRU management is a waste if there is 3115 * ultimately no way to reclaim the memory. 3116 */ 3117 static bool can_age_anon_pages(struct pglist_data *pgdat, 3118 struct scan_control *sc) 3119 { 3120 /* Aging the anon LRU is valuable if swap is present: */ 3121 if (total_swap_pages > 0) 3122 return true; 3123 3124 /* Also valuable if anon pages can be demoted: */ 3125 return can_demote(pgdat->node_id, sc); 3126 } 3127 3128 #ifdef CONFIG_LRU_GEN 3129 3130 #ifdef CONFIG_LRU_GEN_ENABLED 3131 DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); 3132 #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) 3133 #else 3134 DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); 3135 #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) 3136 #endif 3137 3138 /****************************************************************************** 3139 * shorthand helpers 3140 ******************************************************************************/ 3141 3142 #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) 3143 3144 #define DEFINE_MAX_SEQ(lruvec) \ 3145 unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) 3146 3147 #define DEFINE_MIN_SEQ(lruvec) \ 3148 unsigned long min_seq[ANON_AND_FILE] = { \ 3149 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ 3150 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ 3151 } 3152 3153 #define for_each_gen_type_zone(gen, type, zone) \ 3154 for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ 3155 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ 3156 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) 3157 3158 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) 3159 { 3160 struct pglist_data *pgdat = NODE_DATA(nid); 3161 3162 #ifdef CONFIG_MEMCG 3163 if (memcg) { 3164 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; 3165 3166 /* for hotadd_new_pgdat() */ 3167 if (!lruvec->pgdat) 3168 lruvec->pgdat = pgdat; 3169 3170 return lruvec; 3171 } 3172 #endif 3173 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3174 3175 return pgdat ? &pgdat->__lruvec : NULL; 3176 } 3177 3178 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) 3179 { 3180 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3181 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 3182 3183 if (!can_demote(pgdat->node_id, sc) && 3184 mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) 3185 return 0; 3186 3187 return mem_cgroup_swappiness(memcg); 3188 } 3189 3190 static int get_nr_gens(struct lruvec *lruvec, int type) 3191 { 3192 return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; 3193 } 3194 3195 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) 3196 { 3197 /* see the comment on lru_gen_struct */ 3198 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && 3199 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && 3200 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; 3201 } 3202 3203 /****************************************************************************** 3204 * mm_struct list 3205 ******************************************************************************/ 3206 3207 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) 3208 { 3209 static struct lru_gen_mm_list mm_list = { 3210 .fifo = LIST_HEAD_INIT(mm_list.fifo), 3211 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), 3212 }; 3213 3214 #ifdef CONFIG_MEMCG 3215 if (memcg) 3216 return &memcg->mm_list; 3217 #endif 3218 VM_WARN_ON_ONCE(!mem_cgroup_disabled()); 3219 3220 return &mm_list; 3221 } 3222 3223 void lru_gen_add_mm(struct mm_struct *mm) 3224 { 3225 int nid; 3226 struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); 3227 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3228 3229 VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); 3230 #ifdef CONFIG_MEMCG 3231 VM_WARN_ON_ONCE(mm->lru_gen.memcg); 3232 mm->lru_gen.memcg = memcg; 3233 #endif 3234 spin_lock(&mm_list->lock); 3235 3236 for_each_node_state(nid, N_MEMORY) { 3237 struct lruvec *lruvec = get_lruvec(memcg, nid); 3238 3239 if (!lruvec) 3240 continue; 3241 3242 /* the first addition since the last iteration */ 3243 if (lruvec->mm_state.tail == &mm_list->fifo) 3244 lruvec->mm_state.tail = &mm->lru_gen.list; 3245 } 3246 3247 list_add_tail(&mm->lru_gen.list, &mm_list->fifo); 3248 3249 spin_unlock(&mm_list->lock); 3250 } 3251 3252 void lru_gen_del_mm(struct mm_struct *mm) 3253 { 3254 int nid; 3255 struct lru_gen_mm_list *mm_list; 3256 struct mem_cgroup *memcg = NULL; 3257 3258 if (list_empty(&mm->lru_gen.list)) 3259 return; 3260 3261 #ifdef CONFIG_MEMCG 3262 memcg = mm->lru_gen.memcg; 3263 #endif 3264 mm_list = get_mm_list(memcg); 3265 3266 spin_lock(&mm_list->lock); 3267 3268 for_each_node(nid) { 3269 struct lruvec *lruvec = get_lruvec(memcg, nid); 3270 3271 if (!lruvec) 3272 continue; 3273 3274 /* where the last iteration ended (exclusive) */ 3275 if (lruvec->mm_state.tail == &mm->lru_gen.list) 3276 lruvec->mm_state.tail = lruvec->mm_state.tail->next; 3277 3278 /* where the current iteration continues (inclusive) */ 3279 if (lruvec->mm_state.head != &mm->lru_gen.list) 3280 continue; 3281 3282 lruvec->mm_state.head = lruvec->mm_state.head->next; 3283 /* the deletion ends the current iteration */ 3284 if (lruvec->mm_state.head == &mm_list->fifo) 3285 WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); 3286 } 3287 3288 list_del_init(&mm->lru_gen.list); 3289 3290 spin_unlock(&mm_list->lock); 3291 3292 #ifdef CONFIG_MEMCG 3293 mem_cgroup_put(mm->lru_gen.memcg); 3294 mm->lru_gen.memcg = NULL; 3295 #endif 3296 } 3297 3298 #ifdef CONFIG_MEMCG 3299 void lru_gen_migrate_mm(struct mm_struct *mm) 3300 { 3301 struct mem_cgroup *memcg; 3302 struct task_struct *task = rcu_dereference_protected(mm->owner, true); 3303 3304 VM_WARN_ON_ONCE(task->mm != mm); 3305 lockdep_assert_held(&task->alloc_lock); 3306 3307 /* for mm_update_next_owner() */ 3308 if (mem_cgroup_disabled()) 3309 return; 3310 3311 rcu_read_lock(); 3312 memcg = mem_cgroup_from_task(task); 3313 rcu_read_unlock(); 3314 if (memcg == mm->lru_gen.memcg) 3315 return; 3316 3317 VM_WARN_ON_ONCE(!mm->lru_gen.memcg); 3318 VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); 3319 3320 lru_gen_del_mm(mm); 3321 lru_gen_add_mm(mm); 3322 } 3323 #endif 3324 3325 /* 3326 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when 3327 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of 3328 * bits in a bitmap, k is the number of hash functions and n is the number of 3329 * inserted items. 3330 * 3331 * Page table walkers use one of the two filters to reduce their search space. 3332 * To get rid of non-leaf entries that no longer have enough leaf entries, the 3333 * aging uses the double-buffering technique to flip to the other filter each 3334 * time it produces a new generation. For non-leaf entries that have enough 3335 * leaf entries, the aging carries them over to the next generation in 3336 * walk_pmd_range(); the eviction also report them when walking the rmap 3337 * in lru_gen_look_around(). 3338 * 3339 * For future optimizations: 3340 * 1. It's not necessary to keep both filters all the time. The spare one can be 3341 * freed after the RCU grace period and reallocated if needed again. 3342 * 2. And when reallocating, it's worth scaling its size according to the number 3343 * of inserted entries in the other filter, to reduce the memory overhead on 3344 * small systems and false positives on large systems. 3345 * 3. Jenkins' hash function is an alternative to Knuth's. 3346 */ 3347 #define BLOOM_FILTER_SHIFT 15 3348 3349 static inline int filter_gen_from_seq(unsigned long seq) 3350 { 3351 return seq % NR_BLOOM_FILTERS; 3352 } 3353 3354 static void get_item_key(void *item, int *key) 3355 { 3356 u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); 3357 3358 BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); 3359 3360 key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); 3361 key[1] = hash >> BLOOM_FILTER_SHIFT; 3362 } 3363 3364 static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) 3365 { 3366 unsigned long *filter; 3367 int gen = filter_gen_from_seq(seq); 3368 3369 filter = lruvec->mm_state.filters[gen]; 3370 if (filter) { 3371 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); 3372 return; 3373 } 3374 3375 filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), 3376 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 3377 WRITE_ONCE(lruvec->mm_state.filters[gen], filter); 3378 } 3379 3380 static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3381 { 3382 int key[2]; 3383 unsigned long *filter; 3384 int gen = filter_gen_from_seq(seq); 3385 3386 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3387 if (!filter) 3388 return; 3389 3390 get_item_key(item, key); 3391 3392 if (!test_bit(key[0], filter)) 3393 set_bit(key[0], filter); 3394 if (!test_bit(key[1], filter)) 3395 set_bit(key[1], filter); 3396 } 3397 3398 static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) 3399 { 3400 int key[2]; 3401 unsigned long *filter; 3402 int gen = filter_gen_from_seq(seq); 3403 3404 filter = READ_ONCE(lruvec->mm_state.filters[gen]); 3405 if (!filter) 3406 return true; 3407 3408 get_item_key(item, key); 3409 3410 return test_bit(key[0], filter) && test_bit(key[1], filter); 3411 } 3412 3413 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) 3414 { 3415 int i; 3416 int hist; 3417 3418 lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); 3419 3420 if (walk) { 3421 hist = lru_hist_from_seq(walk->max_seq); 3422 3423 for (i = 0; i < NR_MM_STATS; i++) { 3424 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 3425 lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); 3426 walk->mm_stats[i] = 0; 3427 } 3428 } 3429 3430 if (NR_HIST_GENS > 1 && last) { 3431 hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); 3432 3433 for (i = 0; i < NR_MM_STATS; i++) 3434 WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); 3435 } 3436 } 3437 3438 static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) 3439 { 3440 int type; 3441 unsigned long size = 0; 3442 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3443 int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); 3444 3445 if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) 3446 return true; 3447 3448 clear_bit(key, &mm->lru_gen.bitmap); 3449 3450 for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { 3451 size += type ? get_mm_counter(mm, MM_FILEPAGES) : 3452 get_mm_counter(mm, MM_ANONPAGES) + 3453 get_mm_counter(mm, MM_SHMEMPAGES); 3454 } 3455 3456 if (size < MIN_LRU_BATCH) 3457 return true; 3458 3459 return !mmget_not_zero(mm); 3460 } 3461 3462 static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, 3463 struct mm_struct **iter) 3464 { 3465 bool first = false; 3466 bool last = true; 3467 struct mm_struct *mm = NULL; 3468 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3469 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3470 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3471 3472 /* 3473 * There are four interesting cases for this page table walker: 3474 * 1. It tries to start a new iteration of mm_list with a stale max_seq; 3475 * there is nothing left to do. 3476 * 2. It's the first of the current generation, and it needs to reset 3477 * the Bloom filter for the next generation. 3478 * 3. It reaches the end of mm_list, and it needs to increment 3479 * mm_state->seq; the iteration is done. 3480 * 4. It's the last of the current generation, and it needs to reset the 3481 * mm stats counters for the next generation. 3482 */ 3483 spin_lock(&mm_list->lock); 3484 3485 VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); 3486 VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); 3487 VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); 3488 3489 if (walk->max_seq <= mm_state->seq) { 3490 if (!*iter) 3491 last = false; 3492 goto done; 3493 } 3494 3495 if (!mm_state->nr_walkers) { 3496 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3497 3498 mm_state->head = mm_list->fifo.next; 3499 first = true; 3500 } 3501 3502 while (!mm && mm_state->head != &mm_list->fifo) { 3503 mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); 3504 3505 mm_state->head = mm_state->head->next; 3506 3507 /* force scan for those added after the last iteration */ 3508 if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { 3509 mm_state->tail = mm_state->head; 3510 walk->force_scan = true; 3511 } 3512 3513 if (should_skip_mm(mm, walk)) 3514 mm = NULL; 3515 } 3516 3517 if (mm_state->head == &mm_list->fifo) 3518 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3519 done: 3520 if (*iter && !mm) 3521 mm_state->nr_walkers--; 3522 if (!*iter && mm) 3523 mm_state->nr_walkers++; 3524 3525 if (mm_state->nr_walkers) 3526 last = false; 3527 3528 if (*iter || last) 3529 reset_mm_stats(lruvec, walk, last); 3530 3531 spin_unlock(&mm_list->lock); 3532 3533 if (mm && first) 3534 reset_bloom_filter(lruvec, walk->max_seq + 1); 3535 3536 if (*iter) 3537 mmput_async(*iter); 3538 3539 *iter = mm; 3540 3541 return last; 3542 } 3543 3544 static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) 3545 { 3546 bool success = false; 3547 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 3548 struct lru_gen_mm_list *mm_list = get_mm_list(memcg); 3549 struct lru_gen_mm_state *mm_state = &lruvec->mm_state; 3550 3551 spin_lock(&mm_list->lock); 3552 3553 VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); 3554 3555 if (max_seq > mm_state->seq && !mm_state->nr_walkers) { 3556 VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); 3557 3558 WRITE_ONCE(mm_state->seq, mm_state->seq + 1); 3559 reset_mm_stats(lruvec, NULL, true); 3560 success = true; 3561 } 3562 3563 spin_unlock(&mm_list->lock); 3564 3565 return success; 3566 } 3567 3568 /****************************************************************************** 3569 * refault feedback loop 3570 ******************************************************************************/ 3571 3572 /* 3573 * A feedback loop based on Proportional-Integral-Derivative (PID) controller. 3574 * 3575 * The P term is refaulted/(evicted+protected) from a tier in the generation 3576 * currently being evicted; the I term is the exponential moving average of the 3577 * P term over the generations previously evicted, using the smoothing factor 3578 * 1/2; the D term isn't supported. 3579 * 3580 * The setpoint (SP) is always the first tier of one type; the process variable 3581 * (PV) is either any tier of the other type or any other tier of the same 3582 * type. 3583 * 3584 * The error is the difference between the SP and the PV; the correction is to 3585 * turn off protection when SP>PV or turn on protection when SP<PV. 3586 * 3587 * For future optimizations: 3588 * 1. The D term may discount the other two terms over time so that long-lived 3589 * generations can resist stale information. 3590 */ 3591 struct ctrl_pos { 3592 unsigned long refaulted; 3593 unsigned long total; 3594 int gain; 3595 }; 3596 3597 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, 3598 struct ctrl_pos *pos) 3599 { 3600 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3601 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 3602 3603 pos->refaulted = lrugen->avg_refaulted[type][tier] + 3604 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3605 pos->total = lrugen->avg_total[type][tier] + 3606 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3607 if (tier) 3608 pos->total += lrugen->protected[hist][type][tier - 1]; 3609 pos->gain = gain; 3610 } 3611 3612 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) 3613 { 3614 int hist, tier; 3615 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3616 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; 3617 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; 3618 3619 lockdep_assert_held(&lruvec->lru_lock); 3620 3621 if (!carryover && !clear) 3622 return; 3623 3624 hist = lru_hist_from_seq(seq); 3625 3626 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 3627 if (carryover) { 3628 unsigned long sum; 3629 3630 sum = lrugen->avg_refaulted[type][tier] + 3631 atomic_long_read(&lrugen->refaulted[hist][type][tier]); 3632 WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); 3633 3634 sum = lrugen->avg_total[type][tier] + 3635 atomic_long_read(&lrugen->evicted[hist][type][tier]); 3636 if (tier) 3637 sum += lrugen->protected[hist][type][tier - 1]; 3638 WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); 3639 } 3640 3641 if (clear) { 3642 atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); 3643 atomic_long_set(&lrugen->evicted[hist][type][tier], 0); 3644 if (tier) 3645 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); 3646 } 3647 } 3648 } 3649 3650 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) 3651 { 3652 /* 3653 * Return true if the PV has a limited number of refaults or a lower 3654 * refaulted/total than the SP. 3655 */ 3656 return pv->refaulted < MIN_LRU_BATCH || 3657 pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= 3658 (sp->refaulted + 1) * pv->total * pv->gain; 3659 } 3660 3661 /****************************************************************************** 3662 * the aging 3663 ******************************************************************************/ 3664 3665 /* promote pages accessed through page tables */ 3666 static int folio_update_gen(struct folio *folio, int gen) 3667 { 3668 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3669 3670 VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); 3671 VM_WARN_ON_ONCE(!rcu_read_lock_held()); 3672 3673 do { 3674 /* lru_gen_del_folio() has isolated this page? */ 3675 if (!(old_flags & LRU_GEN_MASK)) { 3676 /* for shrink_folio_list() */ 3677 new_flags = old_flags | BIT(PG_referenced); 3678 continue; 3679 } 3680 3681 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3682 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; 3683 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3684 3685 return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3686 } 3687 3688 /* protect pages accessed multiple times through file descriptors */ 3689 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) 3690 { 3691 int type = folio_is_file_lru(folio); 3692 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3693 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 3694 unsigned long new_flags, old_flags = READ_ONCE(folio->flags); 3695 3696 VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); 3697 3698 do { 3699 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; 3700 /* folio_update_gen() has promoted this page? */ 3701 if (new_gen >= 0 && new_gen != old_gen) 3702 return new_gen; 3703 3704 new_gen = (old_gen + 1) % MAX_NR_GENS; 3705 3706 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); 3707 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; 3708 /* for folio_end_writeback() */ 3709 if (reclaiming) 3710 new_flags |= BIT(PG_reclaim); 3711 } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); 3712 3713 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 3714 3715 return new_gen; 3716 } 3717 3718 static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, 3719 int old_gen, int new_gen) 3720 { 3721 int type = folio_is_file_lru(folio); 3722 int zone = folio_zonenum(folio); 3723 int delta = folio_nr_pages(folio); 3724 3725 VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); 3726 VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); 3727 3728 walk->batched++; 3729 3730 walk->nr_pages[old_gen][type][zone] -= delta; 3731 walk->nr_pages[new_gen][type][zone] += delta; 3732 } 3733 3734 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) 3735 { 3736 int gen, type, zone; 3737 struct lru_gen_struct *lrugen = &lruvec->lrugen; 3738 3739 walk->batched = 0; 3740 3741 for_each_gen_type_zone(gen, type, zone) { 3742 enum lru_list lru = type * LRU_INACTIVE_FILE; 3743 int delta = walk->nr_pages[gen][type][zone]; 3744 3745 if (!delta) 3746 continue; 3747 3748 walk->nr_pages[gen][type][zone] = 0; 3749 WRITE_ONCE(lrugen->nr_pages[gen][type][zone], 3750 lrugen->nr_pages[gen][type][zone] + delta); 3751 3752 if (lru_gen_is_active(lruvec, gen)) 3753 lru += LRU_ACTIVE; 3754 __update_lru_size(lruvec, lru, zone, delta); 3755 } 3756 } 3757 3758 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) 3759 { 3760 struct address_space *mapping; 3761 struct vm_area_struct *vma = args->vma; 3762 struct lru_gen_mm_walk *walk = args->private; 3763 3764 if (!vma_is_accessible(vma)) 3765 return true; 3766 3767 if (is_vm_hugetlb_page(vma)) 3768 return true; 3769 3770 if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) 3771 return true; 3772 3773 if (vma == get_gate_vma(vma->vm_mm)) 3774 return true; 3775 3776 if (vma_is_anonymous(vma)) 3777 return !walk->can_swap; 3778 3779 if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) 3780 return true; 3781 3782 mapping = vma->vm_file->f_mapping; 3783 if (mapping_unevictable(mapping)) 3784 return true; 3785 3786 if (shmem_mapping(mapping)) 3787 return !walk->can_swap; 3788 3789 /* to exclude special mappings like dax, etc. */ 3790 return !mapping->a_ops->read_folio; 3791 } 3792 3793 /* 3794 * Some userspace memory allocators map many single-page VMAs. Instead of 3795 * returning back to the PGD table for each of such VMAs, finish an entire PMD 3796 * table to reduce zigzags and improve cache performance. 3797 */ 3798 static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, 3799 unsigned long *vm_start, unsigned long *vm_end) 3800 { 3801 unsigned long start = round_up(*vm_end, size); 3802 unsigned long end = (start | ~mask) + 1; 3803 VMA_ITERATOR(vmi, args->mm, start); 3804 3805 VM_WARN_ON_ONCE(mask & size); 3806 VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); 3807 3808 for_each_vma(vmi, args->vma) { 3809 if (end && end <= args->vma->vm_start) 3810 return false; 3811 3812 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) 3813 continue; 3814 3815 *vm_start = max(start, args->vma->vm_start); 3816 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; 3817 3818 return true; 3819 } 3820 3821 return false; 3822 } 3823 3824 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) 3825 { 3826 unsigned long pfn = pte_pfn(pte); 3827 3828 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3829 3830 if (!pte_present(pte) || is_zero_pfn(pfn)) 3831 return -1; 3832 3833 if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) 3834 return -1; 3835 3836 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3837 return -1; 3838 3839 return pfn; 3840 } 3841 3842 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3843 static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) 3844 { 3845 unsigned long pfn = pmd_pfn(pmd); 3846 3847 VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); 3848 3849 if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) 3850 return -1; 3851 3852 if (WARN_ON_ONCE(pmd_devmap(pmd))) 3853 return -1; 3854 3855 if (WARN_ON_ONCE(!pfn_valid(pfn))) 3856 return -1; 3857 3858 return pfn; 3859 } 3860 #endif 3861 3862 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, 3863 struct pglist_data *pgdat, bool can_swap) 3864 { 3865 struct folio *folio; 3866 3867 /* try to avoid unnecessary memory loads */ 3868 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 3869 return NULL; 3870 3871 folio = pfn_folio(pfn); 3872 if (folio_nid(folio) != pgdat->node_id) 3873 return NULL; 3874 3875 if (folio_memcg_rcu(folio) != memcg) 3876 return NULL; 3877 3878 /* file VMAs can contain anon pages from COW */ 3879 if (!folio_is_file_lru(folio) && !can_swap) 3880 return NULL; 3881 3882 return folio; 3883 } 3884 3885 static bool suitable_to_scan(int total, int young) 3886 { 3887 int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); 3888 3889 /* suitable if the average number of young PTEs per cacheline is >=1 */ 3890 return young * n >= total; 3891 } 3892 3893 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, 3894 struct mm_walk *args) 3895 { 3896 int i; 3897 pte_t *pte; 3898 spinlock_t *ptl; 3899 unsigned long addr; 3900 int total = 0; 3901 int young = 0; 3902 struct lru_gen_mm_walk *walk = args->private; 3903 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 3904 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3905 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 3906 3907 VM_WARN_ON_ONCE(pmd_leaf(*pmd)); 3908 3909 ptl = pte_lockptr(args->mm, pmd); 3910 if (!spin_trylock(ptl)) 3911 return false; 3912 3913 arch_enter_lazy_mmu_mode(); 3914 3915 pte = pte_offset_map(pmd, start & PMD_MASK); 3916 restart: 3917 for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { 3918 unsigned long pfn; 3919 struct folio *folio; 3920 3921 total++; 3922 walk->mm_stats[MM_LEAF_TOTAL]++; 3923 3924 pfn = get_pte_pfn(pte[i], args->vma, addr); 3925 if (pfn == -1) 3926 continue; 3927 3928 if (!pte_young(pte[i])) { 3929 walk->mm_stats[MM_LEAF_OLD]++; 3930 continue; 3931 } 3932 3933 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 3934 if (!folio) 3935 continue; 3936 3937 if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) 3938 VM_WARN_ON_ONCE(true); 3939 3940 young++; 3941 walk->mm_stats[MM_LEAF_YOUNG]++; 3942 3943 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 3944 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 3945 !folio_test_swapcache(folio))) 3946 folio_mark_dirty(folio); 3947 3948 old_gen = folio_update_gen(folio, new_gen); 3949 if (old_gen >= 0 && old_gen != new_gen) 3950 update_batch_size(walk, folio, old_gen, new_gen); 3951 } 3952 3953 if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) 3954 goto restart; 3955 3956 pte_unmap(pte); 3957 3958 arch_leave_lazy_mmu_mode(); 3959 spin_unlock(ptl); 3960 3961 return suitable_to_scan(total, young); 3962 } 3963 3964 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 3965 static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, 3966 struct mm_walk *args, unsigned long *bitmap, unsigned long *start) 3967 { 3968 int i; 3969 pmd_t *pmd; 3970 spinlock_t *ptl; 3971 struct lru_gen_mm_walk *walk = args->private; 3972 struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); 3973 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 3974 int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); 3975 3976 VM_WARN_ON_ONCE(pud_leaf(*pud)); 3977 3978 /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ 3979 if (*start == -1) { 3980 *start = next; 3981 return; 3982 } 3983 3984 i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); 3985 if (i && i <= MIN_LRU_BATCH) { 3986 __set_bit(i - 1, bitmap); 3987 return; 3988 } 3989 3990 pmd = pmd_offset(pud, *start); 3991 3992 ptl = pmd_lockptr(args->mm, pmd); 3993 if (!spin_trylock(ptl)) 3994 goto done; 3995 3996 arch_enter_lazy_mmu_mode(); 3997 3998 do { 3999 unsigned long pfn; 4000 struct folio *folio; 4001 unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; 4002 4003 pfn = get_pmd_pfn(pmd[i], vma, addr); 4004 if (pfn == -1) 4005 goto next; 4006 4007 if (!pmd_trans_huge(pmd[i])) { 4008 if (arch_has_hw_nonleaf_pmd_young() && 4009 get_cap(LRU_GEN_NONLEAF_YOUNG)) 4010 pmdp_test_and_clear_young(vma, addr, pmd + i); 4011 goto next; 4012 } 4013 4014 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); 4015 if (!folio) 4016 goto next; 4017 4018 if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) 4019 goto next; 4020 4021 walk->mm_stats[MM_LEAF_YOUNG]++; 4022 4023 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && 4024 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4025 !folio_test_swapcache(folio))) 4026 folio_mark_dirty(folio); 4027 4028 old_gen = folio_update_gen(folio, new_gen); 4029 if (old_gen >= 0 && old_gen != new_gen) 4030 update_batch_size(walk, folio, old_gen, new_gen); 4031 next: 4032 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; 4033 } while (i <= MIN_LRU_BATCH); 4034 4035 arch_leave_lazy_mmu_mode(); 4036 spin_unlock(ptl); 4037 done: 4038 *start = -1; 4039 bitmap_zero(bitmap, MIN_LRU_BATCH); 4040 } 4041 #else 4042 static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, 4043 struct mm_walk *args, unsigned long *bitmap, unsigned long *start) 4044 { 4045 } 4046 #endif 4047 4048 static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, 4049 struct mm_walk *args) 4050 { 4051 int i; 4052 pmd_t *pmd; 4053 unsigned long next; 4054 unsigned long addr; 4055 struct vm_area_struct *vma; 4056 unsigned long pos = -1; 4057 struct lru_gen_mm_walk *walk = args->private; 4058 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 4059 4060 VM_WARN_ON_ONCE(pud_leaf(*pud)); 4061 4062 /* 4063 * Finish an entire PMD in two passes: the first only reaches to PTE 4064 * tables to avoid taking the PMD lock; the second, if necessary, takes 4065 * the PMD lock to clear the accessed bit in PMD entries. 4066 */ 4067 pmd = pmd_offset(pud, start & PUD_MASK); 4068 restart: 4069 /* walk_pte_range() may call get_next_vma() */ 4070 vma = args->vma; 4071 for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { 4072 pmd_t val = pmd_read_atomic(pmd + i); 4073 4074 /* for pmd_read_atomic() */ 4075 barrier(); 4076 4077 next = pmd_addr_end(addr, end); 4078 4079 if (!pmd_present(val) || is_huge_zero_pmd(val)) { 4080 walk->mm_stats[MM_LEAF_TOTAL]++; 4081 continue; 4082 } 4083 4084 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4085 if (pmd_trans_huge(val)) { 4086 unsigned long pfn = pmd_pfn(val); 4087 struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); 4088 4089 walk->mm_stats[MM_LEAF_TOTAL]++; 4090 4091 if (!pmd_young(val)) { 4092 walk->mm_stats[MM_LEAF_OLD]++; 4093 continue; 4094 } 4095 4096 /* try to avoid unnecessary memory loads */ 4097 if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) 4098 continue; 4099 4100 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4101 continue; 4102 } 4103 #endif 4104 walk->mm_stats[MM_NONLEAF_TOTAL]++; 4105 4106 if (arch_has_hw_nonleaf_pmd_young() && 4107 get_cap(LRU_GEN_NONLEAF_YOUNG)) { 4108 if (!pmd_young(val)) 4109 continue; 4110 4111 walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); 4112 } 4113 4114 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) 4115 continue; 4116 4117 walk->mm_stats[MM_NONLEAF_FOUND]++; 4118 4119 if (!walk_pte_range(&val, addr, next, args)) 4120 continue; 4121 4122 walk->mm_stats[MM_NONLEAF_ADDED]++; 4123 4124 /* carry over to the next generation */ 4125 update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); 4126 } 4127 4128 walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); 4129 4130 if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) 4131 goto restart; 4132 } 4133 4134 static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, 4135 struct mm_walk *args) 4136 { 4137 int i; 4138 pud_t *pud; 4139 unsigned long addr; 4140 unsigned long next; 4141 struct lru_gen_mm_walk *walk = args->private; 4142 4143 VM_WARN_ON_ONCE(p4d_leaf(*p4d)); 4144 4145 pud = pud_offset(p4d, start & P4D_MASK); 4146 restart: 4147 for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { 4148 pud_t val = READ_ONCE(pud[i]); 4149 4150 next = pud_addr_end(addr, end); 4151 4152 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) 4153 continue; 4154 4155 walk_pmd_range(&val, addr, next, args); 4156 4157 /* a racy check to curtail the waiting time */ 4158 if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) 4159 return 1; 4160 4161 if (need_resched() || walk->batched >= MAX_LRU_BATCH) { 4162 end = (addr | ~PUD_MASK) + 1; 4163 goto done; 4164 } 4165 } 4166 4167 if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) 4168 goto restart; 4169 4170 end = round_up(end, P4D_SIZE); 4171 done: 4172 if (!end || !args->vma) 4173 return 1; 4174 4175 walk->next_addr = max(end, args->vma->vm_start); 4176 4177 return -EAGAIN; 4178 } 4179 4180 static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) 4181 { 4182 static const struct mm_walk_ops mm_walk_ops = { 4183 .test_walk = should_skip_vma, 4184 .p4d_entry = walk_pud_range, 4185 }; 4186 4187 int err; 4188 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4189 4190 walk->next_addr = FIRST_USER_ADDRESS; 4191 4192 do { 4193 err = -EBUSY; 4194 4195 /* folio_update_gen() requires stable folio_memcg() */ 4196 if (!mem_cgroup_trylock_pages(memcg)) 4197 break; 4198 4199 /* the caller might be holding the lock for write */ 4200 if (mmap_read_trylock(mm)) { 4201 err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); 4202 4203 mmap_read_unlock(mm); 4204 } 4205 4206 mem_cgroup_unlock_pages(); 4207 4208 if (walk->batched) { 4209 spin_lock_irq(&lruvec->lru_lock); 4210 reset_batch_size(lruvec, walk); 4211 spin_unlock_irq(&lruvec->lru_lock); 4212 } 4213 4214 cond_resched(); 4215 } while (err == -EAGAIN); 4216 } 4217 4218 static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) 4219 { 4220 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4221 4222 if (pgdat && current_is_kswapd()) { 4223 VM_WARN_ON_ONCE(walk); 4224 4225 walk = &pgdat->mm_walk; 4226 } else if (!pgdat && !walk) { 4227 VM_WARN_ON_ONCE(current_is_kswapd()); 4228 4229 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 4230 } 4231 4232 current->reclaim_state->mm_walk = walk; 4233 4234 return walk; 4235 } 4236 4237 static void clear_mm_walk(void) 4238 { 4239 struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; 4240 4241 VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); 4242 VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); 4243 4244 current->reclaim_state->mm_walk = NULL; 4245 4246 if (!current_is_kswapd()) 4247 kfree(walk); 4248 } 4249 4250 static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) 4251 { 4252 int zone; 4253 int remaining = MAX_LRU_BATCH; 4254 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4255 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); 4256 4257 if (type == LRU_GEN_ANON && !can_swap) 4258 goto done; 4259 4260 /* prevent cold/hot inversion if force_scan is true */ 4261 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4262 struct list_head *head = &lrugen->lists[old_gen][type][zone]; 4263 4264 while (!list_empty(head)) { 4265 struct folio *folio = lru_to_folio(head); 4266 4267 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4268 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4269 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4270 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4271 4272 new_gen = folio_inc_gen(lruvec, folio, false); 4273 list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); 4274 4275 if (!--remaining) 4276 return false; 4277 } 4278 } 4279 done: 4280 reset_ctrl_pos(lruvec, type, true); 4281 WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); 4282 4283 return true; 4284 } 4285 4286 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) 4287 { 4288 int gen, type, zone; 4289 bool success = false; 4290 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4291 DEFINE_MIN_SEQ(lruvec); 4292 4293 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4294 4295 /* find the oldest populated generation */ 4296 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4297 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { 4298 gen = lru_gen_from_seq(min_seq[type]); 4299 4300 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4301 if (!list_empty(&lrugen->lists[gen][type][zone])) 4302 goto next; 4303 } 4304 4305 min_seq[type]++; 4306 } 4307 next: 4308 ; 4309 } 4310 4311 /* see the comment on lru_gen_struct */ 4312 if (can_swap) { 4313 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); 4314 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); 4315 } 4316 4317 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4318 if (min_seq[type] == lrugen->min_seq[type]) 4319 continue; 4320 4321 reset_ctrl_pos(lruvec, type, true); 4322 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); 4323 success = true; 4324 } 4325 4326 return success; 4327 } 4328 4329 static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) 4330 { 4331 int prev, next; 4332 int type, zone; 4333 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4334 4335 spin_lock_irq(&lruvec->lru_lock); 4336 4337 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 4338 4339 for (type = ANON_AND_FILE - 1; type >= 0; type--) { 4340 if (get_nr_gens(lruvec, type) != MAX_NR_GENS) 4341 continue; 4342 4343 VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); 4344 4345 while (!inc_min_seq(lruvec, type, can_swap)) { 4346 spin_unlock_irq(&lruvec->lru_lock); 4347 cond_resched(); 4348 spin_lock_irq(&lruvec->lru_lock); 4349 } 4350 } 4351 4352 /* 4353 * Update the active/inactive LRU sizes for compatibility. Both sides of 4354 * the current max_seq need to be covered, since max_seq+1 can overlap 4355 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do 4356 * overlap, cold/hot inversion happens. 4357 */ 4358 prev = lru_gen_from_seq(lrugen->max_seq - 1); 4359 next = lru_gen_from_seq(lrugen->max_seq + 1); 4360 4361 for (type = 0; type < ANON_AND_FILE; type++) { 4362 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4363 enum lru_list lru = type * LRU_INACTIVE_FILE; 4364 long delta = lrugen->nr_pages[prev][type][zone] - 4365 lrugen->nr_pages[next][type][zone]; 4366 4367 if (!delta) 4368 continue; 4369 4370 __update_lru_size(lruvec, lru, zone, delta); 4371 __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); 4372 } 4373 } 4374 4375 for (type = 0; type < ANON_AND_FILE; type++) 4376 reset_ctrl_pos(lruvec, type, false); 4377 4378 WRITE_ONCE(lrugen->timestamps[next], jiffies); 4379 /* make sure preceding modifications appear */ 4380 smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); 4381 4382 spin_unlock_irq(&lruvec->lru_lock); 4383 } 4384 4385 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, 4386 struct scan_control *sc, bool can_swap, bool force_scan) 4387 { 4388 bool success; 4389 struct lru_gen_mm_walk *walk; 4390 struct mm_struct *mm = NULL; 4391 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4392 4393 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); 4394 4395 /* see the comment in iterate_mm_list() */ 4396 if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { 4397 success = false; 4398 goto done; 4399 } 4400 4401 /* 4402 * If the hardware doesn't automatically set the accessed bit, fallback 4403 * to lru_gen_look_around(), which only clears the accessed bit in a 4404 * handful of PTEs. Spreading the work out over a period of time usually 4405 * is less efficient, but it avoids bursty page faults. 4406 */ 4407 if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { 4408 success = iterate_mm_list_nowalk(lruvec, max_seq); 4409 goto done; 4410 } 4411 4412 walk = set_mm_walk(NULL); 4413 if (!walk) { 4414 success = iterate_mm_list_nowalk(lruvec, max_seq); 4415 goto done; 4416 } 4417 4418 walk->lruvec = lruvec; 4419 walk->max_seq = max_seq; 4420 walk->can_swap = can_swap; 4421 walk->force_scan = force_scan; 4422 4423 do { 4424 success = iterate_mm_list(lruvec, walk, &mm); 4425 if (mm) 4426 walk_mm(lruvec, mm, walk); 4427 4428 cond_resched(); 4429 } while (mm); 4430 done: 4431 if (!success) { 4432 if (sc->priority <= DEF_PRIORITY - 2) 4433 wait_event_killable(lruvec->mm_state.wait, 4434 max_seq < READ_ONCE(lrugen->max_seq)); 4435 4436 return max_seq < READ_ONCE(lrugen->max_seq); 4437 } 4438 4439 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); 4440 4441 inc_max_seq(lruvec, can_swap, force_scan); 4442 /* either this sees any waiters or they will see updated max_seq */ 4443 if (wq_has_sleeper(&lruvec->mm_state.wait)) 4444 wake_up_all(&lruvec->mm_state.wait); 4445 4446 return true; 4447 } 4448 4449 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, 4450 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) 4451 { 4452 int gen, type, zone; 4453 unsigned long old = 0; 4454 unsigned long young = 0; 4455 unsigned long total = 0; 4456 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4457 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4458 4459 for (type = !can_swap; type < ANON_AND_FILE; type++) { 4460 unsigned long seq; 4461 4462 for (seq = min_seq[type]; seq <= max_seq; seq++) { 4463 unsigned long size = 0; 4464 4465 gen = lru_gen_from_seq(seq); 4466 4467 for (zone = 0; zone < MAX_NR_ZONES; zone++) 4468 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 4469 4470 total += size; 4471 if (seq == max_seq) 4472 young += size; 4473 else if (seq + MIN_NR_GENS == max_seq) 4474 old += size; 4475 } 4476 } 4477 4478 /* try to scrape all its memory if this memcg was deleted */ 4479 *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; 4480 4481 /* 4482 * The aging tries to be lazy to reduce the overhead, while the eviction 4483 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the 4484 * ideal number of generations is MIN_NR_GENS+1. 4485 */ 4486 if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) 4487 return true; 4488 if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) 4489 return false; 4490 4491 /* 4492 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) 4493 * of the total number of pages for each generation. A reasonable range 4494 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The 4495 * aging cares about the upper bound of hot pages, while the eviction 4496 * cares about the lower bound of cold pages. 4497 */ 4498 if (young * MIN_NR_GENS > total) 4499 return true; 4500 if (old * (MIN_NR_GENS + 2) < total) 4501 return true; 4502 4503 return false; 4504 } 4505 4506 static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) 4507 { 4508 bool need_aging; 4509 unsigned long nr_to_scan; 4510 int swappiness = get_swappiness(lruvec, sc); 4511 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4512 DEFINE_MAX_SEQ(lruvec); 4513 DEFINE_MIN_SEQ(lruvec); 4514 4515 VM_WARN_ON_ONCE(sc->memcg_low_reclaim); 4516 4517 mem_cgroup_calculate_protection(NULL, memcg); 4518 4519 if (mem_cgroup_below_min(memcg)) 4520 return false; 4521 4522 need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); 4523 4524 if (min_ttl) { 4525 int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); 4526 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 4527 4528 if (time_is_after_jiffies(birth + min_ttl)) 4529 return false; 4530 4531 /* the size is likely too small to be helpful */ 4532 if (!nr_to_scan && sc->priority != DEF_PRIORITY) 4533 return false; 4534 } 4535 4536 if (need_aging) 4537 try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); 4538 4539 return true; 4540 } 4541 4542 /* to protect the working set of the last N jiffies */ 4543 static unsigned long lru_gen_min_ttl __read_mostly; 4544 4545 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 4546 { 4547 struct mem_cgroup *memcg; 4548 bool success = false; 4549 unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); 4550 4551 VM_WARN_ON_ONCE(!current_is_kswapd()); 4552 4553 sc->last_reclaimed = sc->nr_reclaimed; 4554 4555 /* 4556 * To reduce the chance of going into the aging path, which can be 4557 * costly, optimistically skip it if the flag below was cleared in the 4558 * eviction path. This improves the overall performance when multiple 4559 * memcgs are available. 4560 */ 4561 if (!sc->memcgs_need_aging) { 4562 sc->memcgs_need_aging = true; 4563 return; 4564 } 4565 4566 set_mm_walk(pgdat); 4567 4568 memcg = mem_cgroup_iter(NULL, NULL, NULL); 4569 do { 4570 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4571 4572 if (age_lruvec(lruvec, sc, min_ttl)) 4573 success = true; 4574 4575 cond_resched(); 4576 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 4577 4578 clear_mm_walk(); 4579 4580 /* check the order to exclude compaction-induced reclaim */ 4581 if (success || !min_ttl || sc->order) 4582 return; 4583 4584 /* 4585 * The main goal is to OOM kill if every generation from all memcgs is 4586 * younger than min_ttl. However, another possibility is all memcgs are 4587 * either below min or empty. 4588 */ 4589 if (mutex_trylock(&oom_lock)) { 4590 struct oom_control oc = { 4591 .gfp_mask = sc->gfp_mask, 4592 }; 4593 4594 out_of_memory(&oc); 4595 4596 mutex_unlock(&oom_lock); 4597 } 4598 } 4599 4600 /* 4601 * This function exploits spatial locality when shrink_folio_list() walks the 4602 * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If 4603 * the scan was done cacheline efficiently, it adds the PMD entry pointing to 4604 * the PTE table to the Bloom filter. This forms a feedback loop between the 4605 * eviction and the aging. 4606 */ 4607 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) 4608 { 4609 int i; 4610 pte_t *pte; 4611 unsigned long start; 4612 unsigned long end; 4613 unsigned long addr; 4614 struct lru_gen_mm_walk *walk; 4615 int young = 0; 4616 unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; 4617 struct folio *folio = pfn_folio(pvmw->pfn); 4618 struct mem_cgroup *memcg = folio_memcg(folio); 4619 struct pglist_data *pgdat = folio_pgdat(folio); 4620 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 4621 DEFINE_MAX_SEQ(lruvec); 4622 int old_gen, new_gen = lru_gen_from_seq(max_seq); 4623 4624 lockdep_assert_held(pvmw->ptl); 4625 VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); 4626 4627 if (spin_is_contended(pvmw->ptl)) 4628 return; 4629 4630 /* avoid taking the LRU lock under the PTL when possible */ 4631 walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; 4632 4633 start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); 4634 end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; 4635 4636 if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { 4637 if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) 4638 end = start + MIN_LRU_BATCH * PAGE_SIZE; 4639 else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) 4640 start = end - MIN_LRU_BATCH * PAGE_SIZE; 4641 else { 4642 start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; 4643 end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; 4644 } 4645 } 4646 4647 pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; 4648 4649 rcu_read_lock(); 4650 arch_enter_lazy_mmu_mode(); 4651 4652 for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { 4653 unsigned long pfn; 4654 4655 pfn = get_pte_pfn(pte[i], pvmw->vma, addr); 4656 if (pfn == -1) 4657 continue; 4658 4659 if (!pte_young(pte[i])) 4660 continue; 4661 4662 folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); 4663 if (!folio) 4664 continue; 4665 4666 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) 4667 VM_WARN_ON_ONCE(true); 4668 4669 young++; 4670 4671 if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && 4672 !(folio_test_anon(folio) && folio_test_swapbacked(folio) && 4673 !folio_test_swapcache(folio))) 4674 folio_mark_dirty(folio); 4675 4676 old_gen = folio_lru_gen(folio); 4677 if (old_gen < 0) 4678 folio_set_referenced(folio); 4679 else if (old_gen != new_gen) 4680 __set_bit(i, bitmap); 4681 } 4682 4683 arch_leave_lazy_mmu_mode(); 4684 rcu_read_unlock(); 4685 4686 /* feedback from rmap walkers to page table walkers */ 4687 if (suitable_to_scan(i, young)) 4688 update_bloom_filter(lruvec, max_seq, pvmw->pmd); 4689 4690 if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { 4691 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 4692 folio = pfn_folio(pte_pfn(pte[i])); 4693 folio_activate(folio); 4694 } 4695 return; 4696 } 4697 4698 /* folio_update_gen() requires stable folio_memcg() */ 4699 if (!mem_cgroup_trylock_pages(memcg)) 4700 return; 4701 4702 if (!walk) { 4703 spin_lock_irq(&lruvec->lru_lock); 4704 new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); 4705 } 4706 4707 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { 4708 folio = pfn_folio(pte_pfn(pte[i])); 4709 if (folio_memcg_rcu(folio) != memcg) 4710 continue; 4711 4712 old_gen = folio_update_gen(folio, new_gen); 4713 if (old_gen < 0 || old_gen == new_gen) 4714 continue; 4715 4716 if (walk) 4717 update_batch_size(walk, folio, old_gen, new_gen); 4718 else 4719 lru_gen_update_size(lruvec, folio, old_gen, new_gen); 4720 } 4721 4722 if (!walk) 4723 spin_unlock_irq(&lruvec->lru_lock); 4724 4725 mem_cgroup_unlock_pages(); 4726 } 4727 4728 /****************************************************************************** 4729 * the eviction 4730 ******************************************************************************/ 4731 4732 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) 4733 { 4734 bool success; 4735 int gen = folio_lru_gen(folio); 4736 int type = folio_is_file_lru(folio); 4737 int zone = folio_zonenum(folio); 4738 int delta = folio_nr_pages(folio); 4739 int refs = folio_lru_refs(folio); 4740 int tier = lru_tier_from_refs(refs); 4741 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4742 4743 VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); 4744 4745 /* unevictable */ 4746 if (!folio_evictable(folio)) { 4747 success = lru_gen_del_folio(lruvec, folio, true); 4748 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4749 folio_set_unevictable(folio); 4750 lruvec_add_folio(lruvec, folio); 4751 __count_vm_events(UNEVICTABLE_PGCULLED, delta); 4752 return true; 4753 } 4754 4755 /* dirty lazyfree */ 4756 if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { 4757 success = lru_gen_del_folio(lruvec, folio, true); 4758 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4759 folio_set_swapbacked(folio); 4760 lruvec_add_folio_tail(lruvec, folio); 4761 return true; 4762 } 4763 4764 /* promoted */ 4765 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { 4766 list_move(&folio->lru, &lrugen->lists[gen][type][zone]); 4767 return true; 4768 } 4769 4770 /* protected */ 4771 if (tier > tier_idx) { 4772 int hist = lru_hist_from_seq(lrugen->min_seq[type]); 4773 4774 gen = folio_inc_gen(lruvec, folio, false); 4775 list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); 4776 4777 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 4778 lrugen->protected[hist][type][tier - 1] + delta); 4779 __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); 4780 return true; 4781 } 4782 4783 /* waiting for writeback */ 4784 if (folio_test_locked(folio) || folio_test_writeback(folio) || 4785 (type == LRU_GEN_FILE && folio_test_dirty(folio))) { 4786 gen = folio_inc_gen(lruvec, folio, true); 4787 list_move(&folio->lru, &lrugen->lists[gen][type][zone]); 4788 return true; 4789 } 4790 4791 return false; 4792 } 4793 4794 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) 4795 { 4796 bool success; 4797 4798 /* unmapping inhibited */ 4799 if (!sc->may_unmap && folio_mapped(folio)) 4800 return false; 4801 4802 /* swapping inhibited */ 4803 if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && 4804 (folio_test_dirty(folio) || 4805 (folio_test_anon(folio) && !folio_test_swapcache(folio)))) 4806 return false; 4807 4808 /* raced with release_pages() */ 4809 if (!folio_try_get(folio)) 4810 return false; 4811 4812 /* raced with another isolation */ 4813 if (!folio_test_clear_lru(folio)) { 4814 folio_put(folio); 4815 return false; 4816 } 4817 4818 /* see the comment on MAX_NR_TIERS */ 4819 if (!folio_test_referenced(folio)) 4820 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); 4821 4822 /* for shrink_folio_list() */ 4823 folio_clear_reclaim(folio); 4824 folio_clear_referenced(folio); 4825 4826 success = lru_gen_del_folio(lruvec, folio, true); 4827 VM_WARN_ON_ONCE_FOLIO(!success, folio); 4828 4829 return true; 4830 } 4831 4832 static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, 4833 int type, int tier, struct list_head *list) 4834 { 4835 int gen, zone; 4836 enum vm_event_item item; 4837 int sorted = 0; 4838 int scanned = 0; 4839 int isolated = 0; 4840 int remaining = MAX_LRU_BATCH; 4841 struct lru_gen_struct *lrugen = &lruvec->lrugen; 4842 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 4843 4844 VM_WARN_ON_ONCE(!list_empty(list)); 4845 4846 if (get_nr_gens(lruvec, type) == MIN_NR_GENS) 4847 return 0; 4848 4849 gen = lru_gen_from_seq(lrugen->min_seq[type]); 4850 4851 for (zone = sc->reclaim_idx; zone >= 0; zone--) { 4852 LIST_HEAD(moved); 4853 int skipped = 0; 4854 struct list_head *head = &lrugen->lists[gen][type][zone]; 4855 4856 while (!list_empty(head)) { 4857 struct folio *folio = lru_to_folio(head); 4858 int delta = folio_nr_pages(folio); 4859 4860 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 4861 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 4862 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 4863 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 4864 4865 scanned += delta; 4866 4867 if (sort_folio(lruvec, folio, tier)) 4868 sorted += delta; 4869 else if (isolate_folio(lruvec, folio, sc)) { 4870 list_add(&folio->lru, list); 4871 isolated += delta; 4872 } else { 4873 list_move(&folio->lru, &moved); 4874 skipped += delta; 4875 } 4876 4877 if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) 4878 break; 4879 } 4880 4881 if (skipped) { 4882 list_splice(&moved, head); 4883 __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); 4884 } 4885 4886 if (!remaining || isolated >= MIN_LRU_BATCH) 4887 break; 4888 } 4889 4890 item = PGSCAN_KSWAPD + reclaimer_offset(); 4891 if (!cgroup_reclaim(sc)) { 4892 __count_vm_events(item, isolated); 4893 __count_vm_events(PGREFILL, sorted); 4894 } 4895 __count_memcg_events(memcg, item, isolated); 4896 __count_memcg_events(memcg, PGREFILL, sorted); 4897 __count_vm_events(PGSCAN_ANON + type, isolated); 4898 4899 /* 4900 * There might not be eligible pages due to reclaim_idx, may_unmap and 4901 * may_writepage. Check the remaining to prevent livelock if it's not 4902 * making progress. 4903 */ 4904 return isolated || !remaining ? scanned : 0; 4905 } 4906 4907 static int get_tier_idx(struct lruvec *lruvec, int type) 4908 { 4909 int tier; 4910 struct ctrl_pos sp, pv; 4911 4912 /* 4913 * To leave a margin for fluctuations, use a larger gain factor (1:2). 4914 * This value is chosen because any other tier would have at least twice 4915 * as many refaults as the first tier. 4916 */ 4917 read_ctrl_pos(lruvec, type, 0, 1, &sp); 4918 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 4919 read_ctrl_pos(lruvec, type, tier, 2, &pv); 4920 if (!positive_ctrl_err(&sp, &pv)) 4921 break; 4922 } 4923 4924 return tier - 1; 4925 } 4926 4927 static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) 4928 { 4929 int type, tier; 4930 struct ctrl_pos sp, pv; 4931 int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; 4932 4933 /* 4934 * Compare the first tier of anon with that of file to determine which 4935 * type to scan. Also need to compare other tiers of the selected type 4936 * with the first tier of the other type to determine the last tier (of 4937 * the selected type) to evict. 4938 */ 4939 read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); 4940 read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); 4941 type = positive_ctrl_err(&sp, &pv); 4942 4943 read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); 4944 for (tier = 1; tier < MAX_NR_TIERS; tier++) { 4945 read_ctrl_pos(lruvec, type, tier, gain[type], &pv); 4946 if (!positive_ctrl_err(&sp, &pv)) 4947 break; 4948 } 4949 4950 *tier_idx = tier - 1; 4951 4952 return type; 4953 } 4954 4955 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, 4956 int *type_scanned, struct list_head *list) 4957 { 4958 int i; 4959 int type; 4960 int scanned; 4961 int tier = -1; 4962 DEFINE_MIN_SEQ(lruvec); 4963 4964 /* 4965 * Try to make the obvious choice first. When anon and file are both 4966 * available from the same generation, interpret swappiness 1 as file 4967 * first and 200 as anon first. 4968 */ 4969 if (!swappiness) 4970 type = LRU_GEN_FILE; 4971 else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) 4972 type = LRU_GEN_ANON; 4973 else if (swappiness == 1) 4974 type = LRU_GEN_FILE; 4975 else if (swappiness == 200) 4976 type = LRU_GEN_ANON; 4977 else 4978 type = get_type_to_scan(lruvec, swappiness, &tier); 4979 4980 for (i = !swappiness; i < ANON_AND_FILE; i++) { 4981 if (tier < 0) 4982 tier = get_tier_idx(lruvec, type); 4983 4984 scanned = scan_folios(lruvec, sc, type, tier, list); 4985 if (scanned) 4986 break; 4987 4988 type = !type; 4989 tier = -1; 4990 } 4991 4992 *type_scanned = type; 4993 4994 return scanned; 4995 } 4996 4997 static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, 4998 bool *need_swapping) 4999 { 5000 int type; 5001 int scanned; 5002 int reclaimed; 5003 LIST_HEAD(list); 5004 LIST_HEAD(clean); 5005 struct folio *folio; 5006 struct folio *next; 5007 enum vm_event_item item; 5008 struct reclaim_stat stat; 5009 struct lru_gen_mm_walk *walk; 5010 bool skip_retry = false; 5011 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5012 struct pglist_data *pgdat = lruvec_pgdat(lruvec); 5013 5014 spin_lock_irq(&lruvec->lru_lock); 5015 5016 scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); 5017 5018 scanned += try_to_inc_min_seq(lruvec, swappiness); 5019 5020 if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) 5021 scanned = 0; 5022 5023 spin_unlock_irq(&lruvec->lru_lock); 5024 5025 if (list_empty(&list)) 5026 return scanned; 5027 retry: 5028 reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); 5029 sc->nr_reclaimed += reclaimed; 5030 5031 list_for_each_entry_safe_reverse(folio, next, &list, lru) { 5032 if (!folio_evictable(folio)) { 5033 list_del(&folio->lru); 5034 folio_putback_lru(folio); 5035 continue; 5036 } 5037 5038 if (folio_test_reclaim(folio) && 5039 (folio_test_dirty(folio) || folio_test_writeback(folio))) { 5040 /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ 5041 if (folio_test_workingset(folio)) 5042 folio_set_referenced(folio); 5043 continue; 5044 } 5045 5046 if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || 5047 folio_mapped(folio) || folio_test_locked(folio) || 5048 folio_test_dirty(folio) || folio_test_writeback(folio)) { 5049 /* don't add rejected folios to the oldest generation */ 5050 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 5051 BIT(PG_active)); 5052 continue; 5053 } 5054 5055 /* retry folios that may have missed folio_rotate_reclaimable() */ 5056 list_move(&folio->lru, &clean); 5057 sc->nr_scanned -= folio_nr_pages(folio); 5058 } 5059 5060 spin_lock_irq(&lruvec->lru_lock); 5061 5062 move_folios_to_lru(lruvec, &list); 5063 5064 walk = current->reclaim_state->mm_walk; 5065 if (walk && walk->batched) 5066 reset_batch_size(lruvec, walk); 5067 5068 item = PGSTEAL_KSWAPD + reclaimer_offset(); 5069 if (!cgroup_reclaim(sc)) 5070 __count_vm_events(item, reclaimed); 5071 __count_memcg_events(memcg, item, reclaimed); 5072 __count_vm_events(PGSTEAL_ANON + type, reclaimed); 5073 5074 spin_unlock_irq(&lruvec->lru_lock); 5075 5076 mem_cgroup_uncharge_list(&list); 5077 free_unref_page_list(&list); 5078 5079 INIT_LIST_HEAD(&list); 5080 list_splice_init(&clean, &list); 5081 5082 if (!list_empty(&list)) { 5083 skip_retry = true; 5084 goto retry; 5085 } 5086 5087 if (need_swapping && type == LRU_GEN_ANON) 5088 *need_swapping = true; 5089 5090 return scanned; 5091 } 5092 5093 /* 5094 * For future optimizations: 5095 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg 5096 * reclaim. 5097 */ 5098 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, 5099 bool can_swap, bool *need_aging) 5100 { 5101 unsigned long nr_to_scan; 5102 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5103 DEFINE_MAX_SEQ(lruvec); 5104 DEFINE_MIN_SEQ(lruvec); 5105 5106 if (mem_cgroup_below_min(memcg) || 5107 (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) 5108 return 0; 5109 5110 *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); 5111 if (!*need_aging) 5112 return nr_to_scan; 5113 5114 /* skip the aging path at the default priority */ 5115 if (sc->priority == DEF_PRIORITY) 5116 goto done; 5117 5118 /* leave the work to lru_gen_age_node() */ 5119 if (current_is_kswapd()) 5120 return 0; 5121 5122 if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) 5123 return nr_to_scan; 5124 done: 5125 return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; 5126 } 5127 5128 static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, 5129 struct scan_control *sc, bool need_swapping) 5130 { 5131 int i; 5132 DEFINE_MAX_SEQ(lruvec); 5133 5134 if (!current_is_kswapd()) { 5135 /* age each memcg at most once to ensure fairness */ 5136 if (max_seq - seq > 1) 5137 return true; 5138 5139 /* over-swapping can increase allocation latency */ 5140 if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) 5141 return true; 5142 5143 /* give this thread a chance to exit and free its memory */ 5144 if (fatal_signal_pending(current)) { 5145 sc->nr_reclaimed += MIN_LRU_BATCH; 5146 return true; 5147 } 5148 5149 if (cgroup_reclaim(sc)) 5150 return false; 5151 } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) 5152 return false; 5153 5154 /* keep scanning at low priorities to ensure fairness */ 5155 if (sc->priority > DEF_PRIORITY - 2) 5156 return false; 5157 5158 /* 5159 * A minimum amount of work was done under global memory pressure. For 5160 * kswapd, it may be overshooting. For direct reclaim, the allocation 5161 * may succeed if all suitable zones are somewhat safe. In either case, 5162 * it's better to stop now, and restart later if necessary. 5163 */ 5164 for (i = 0; i <= sc->reclaim_idx; i++) { 5165 unsigned long wmark; 5166 struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; 5167 5168 if (!managed_zone(zone)) 5169 continue; 5170 5171 wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); 5172 if (wmark > zone_page_state(zone, NR_FREE_PAGES)) 5173 return false; 5174 } 5175 5176 sc->nr_reclaimed += MIN_LRU_BATCH; 5177 5178 return true; 5179 } 5180 5181 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5182 { 5183 struct blk_plug plug; 5184 bool need_aging = false; 5185 bool need_swapping = false; 5186 unsigned long scanned = 0; 5187 unsigned long reclaimed = sc->nr_reclaimed; 5188 DEFINE_MAX_SEQ(lruvec); 5189 5190 lru_add_drain(); 5191 5192 blk_start_plug(&plug); 5193 5194 set_mm_walk(lruvec_pgdat(lruvec)); 5195 5196 while (true) { 5197 int delta; 5198 int swappiness; 5199 unsigned long nr_to_scan; 5200 5201 if (sc->may_swap) 5202 swappiness = get_swappiness(lruvec, sc); 5203 else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) 5204 swappiness = 1; 5205 else 5206 swappiness = 0; 5207 5208 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); 5209 if (!nr_to_scan) 5210 goto done; 5211 5212 delta = evict_folios(lruvec, sc, swappiness, &need_swapping); 5213 if (!delta) 5214 goto done; 5215 5216 scanned += delta; 5217 if (scanned >= nr_to_scan) 5218 break; 5219 5220 if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) 5221 break; 5222 5223 cond_resched(); 5224 } 5225 5226 /* see the comment in lru_gen_age_node() */ 5227 if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) 5228 sc->memcgs_need_aging = false; 5229 done: 5230 clear_mm_walk(); 5231 5232 blk_finish_plug(&plug); 5233 } 5234 5235 /****************************************************************************** 5236 * state change 5237 ******************************************************************************/ 5238 5239 static bool __maybe_unused state_is_valid(struct lruvec *lruvec) 5240 { 5241 struct lru_gen_struct *lrugen = &lruvec->lrugen; 5242 5243 if (lrugen->enabled) { 5244 enum lru_list lru; 5245 5246 for_each_evictable_lru(lru) { 5247 if (!list_empty(&lruvec->lists[lru])) 5248 return false; 5249 } 5250 } else { 5251 int gen, type, zone; 5252 5253 for_each_gen_type_zone(gen, type, zone) { 5254 if (!list_empty(&lrugen->lists[gen][type][zone])) 5255 return false; 5256 } 5257 } 5258 5259 return true; 5260 } 5261 5262 static bool fill_evictable(struct lruvec *lruvec) 5263 { 5264 enum lru_list lru; 5265 int remaining = MAX_LRU_BATCH; 5266 5267 for_each_evictable_lru(lru) { 5268 int type = is_file_lru(lru); 5269 bool active = is_active_lru(lru); 5270 struct list_head *head = &lruvec->lists[lru]; 5271 5272 while (!list_empty(head)) { 5273 bool success; 5274 struct folio *folio = lru_to_folio(head); 5275 5276 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5277 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); 5278 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5279 VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); 5280 5281 lruvec_del_folio(lruvec, folio); 5282 success = lru_gen_add_folio(lruvec, folio, false); 5283 VM_WARN_ON_ONCE(!success); 5284 5285 if (!--remaining) 5286 return false; 5287 } 5288 } 5289 5290 return true; 5291 } 5292 5293 static bool drain_evictable(struct lruvec *lruvec) 5294 { 5295 int gen, type, zone; 5296 int remaining = MAX_LRU_BATCH; 5297 5298 for_each_gen_type_zone(gen, type, zone) { 5299 struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; 5300 5301 while (!list_empty(head)) { 5302 bool success; 5303 struct folio *folio = lru_to_folio(head); 5304 5305 VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); 5306 VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); 5307 VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); 5308 VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); 5309 5310 success = lru_gen_del_folio(lruvec, folio, false); 5311 VM_WARN_ON_ONCE(!success); 5312 lruvec_add_folio(lruvec, folio); 5313 5314 if (!--remaining) 5315 return false; 5316 } 5317 } 5318 5319 return true; 5320 } 5321 5322 static void lru_gen_change_state(bool enabled) 5323 { 5324 static DEFINE_MUTEX(state_mutex); 5325 5326 struct mem_cgroup *memcg; 5327 5328 cgroup_lock(); 5329 cpus_read_lock(); 5330 get_online_mems(); 5331 mutex_lock(&state_mutex); 5332 5333 if (enabled == lru_gen_enabled()) 5334 goto unlock; 5335 5336 if (enabled) 5337 static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5338 else 5339 static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); 5340 5341 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5342 do { 5343 int nid; 5344 5345 for_each_node(nid) { 5346 struct lruvec *lruvec = get_lruvec(memcg, nid); 5347 5348 if (!lruvec) 5349 continue; 5350 5351 spin_lock_irq(&lruvec->lru_lock); 5352 5353 VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); 5354 VM_WARN_ON_ONCE(!state_is_valid(lruvec)); 5355 5356 lruvec->lrugen.enabled = enabled; 5357 5358 while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { 5359 spin_unlock_irq(&lruvec->lru_lock); 5360 cond_resched(); 5361 spin_lock_irq(&lruvec->lru_lock); 5362 } 5363 5364 spin_unlock_irq(&lruvec->lru_lock); 5365 } 5366 5367 cond_resched(); 5368 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5369 unlock: 5370 mutex_unlock(&state_mutex); 5371 put_online_mems(); 5372 cpus_read_unlock(); 5373 cgroup_unlock(); 5374 } 5375 5376 /****************************************************************************** 5377 * sysfs interface 5378 ******************************************************************************/ 5379 5380 static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5381 { 5382 return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); 5383 } 5384 5385 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5386 static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, 5387 const char *buf, size_t len) 5388 { 5389 unsigned int msecs; 5390 5391 if (kstrtouint(buf, 0, &msecs)) 5392 return -EINVAL; 5393 5394 WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); 5395 5396 return len; 5397 } 5398 5399 static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( 5400 min_ttl_ms, 0644, show_min_ttl, store_min_ttl 5401 ); 5402 5403 static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) 5404 { 5405 unsigned int caps = 0; 5406 5407 if (get_cap(LRU_GEN_CORE)) 5408 caps |= BIT(LRU_GEN_CORE); 5409 5410 if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) 5411 caps |= BIT(LRU_GEN_MM_WALK); 5412 5413 if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) 5414 caps |= BIT(LRU_GEN_NONLEAF_YOUNG); 5415 5416 return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); 5417 } 5418 5419 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5420 static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, 5421 const char *buf, size_t len) 5422 { 5423 int i; 5424 unsigned int caps; 5425 5426 if (tolower(*buf) == 'n') 5427 caps = 0; 5428 else if (tolower(*buf) == 'y') 5429 caps = -1; 5430 else if (kstrtouint(buf, 0, &caps)) 5431 return -EINVAL; 5432 5433 for (i = 0; i < NR_LRU_GEN_CAPS; i++) { 5434 bool enabled = caps & BIT(i); 5435 5436 if (i == LRU_GEN_CORE) 5437 lru_gen_change_state(enabled); 5438 else if (enabled) 5439 static_branch_enable(&lru_gen_caps[i]); 5440 else 5441 static_branch_disable(&lru_gen_caps[i]); 5442 } 5443 5444 return len; 5445 } 5446 5447 static struct kobj_attribute lru_gen_enabled_attr = __ATTR( 5448 enabled, 0644, show_enabled, store_enabled 5449 ); 5450 5451 static struct attribute *lru_gen_attrs[] = { 5452 &lru_gen_min_ttl_attr.attr, 5453 &lru_gen_enabled_attr.attr, 5454 NULL 5455 }; 5456 5457 static struct attribute_group lru_gen_attr_group = { 5458 .name = "lru_gen", 5459 .attrs = lru_gen_attrs, 5460 }; 5461 5462 /****************************************************************************** 5463 * debugfs interface 5464 ******************************************************************************/ 5465 5466 static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) 5467 { 5468 struct mem_cgroup *memcg; 5469 loff_t nr_to_skip = *pos; 5470 5471 m->private = kvmalloc(PATH_MAX, GFP_KERNEL); 5472 if (!m->private) 5473 return ERR_PTR(-ENOMEM); 5474 5475 memcg = mem_cgroup_iter(NULL, NULL, NULL); 5476 do { 5477 int nid; 5478 5479 for_each_node_state(nid, N_MEMORY) { 5480 if (!nr_to_skip--) 5481 return get_lruvec(memcg, nid); 5482 } 5483 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); 5484 5485 return NULL; 5486 } 5487 5488 static void lru_gen_seq_stop(struct seq_file *m, void *v) 5489 { 5490 if (!IS_ERR_OR_NULL(v)) 5491 mem_cgroup_iter_break(NULL, lruvec_memcg(v)); 5492 5493 kvfree(m->private); 5494 m->private = NULL; 5495 } 5496 5497 static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) 5498 { 5499 int nid = lruvec_pgdat(v)->node_id; 5500 struct mem_cgroup *memcg = lruvec_memcg(v); 5501 5502 ++*pos; 5503 5504 nid = next_memory_node(nid); 5505 if (nid == MAX_NUMNODES) { 5506 memcg = mem_cgroup_iter(NULL, memcg, NULL); 5507 if (!memcg) 5508 return NULL; 5509 5510 nid = first_memory_node; 5511 } 5512 5513 return get_lruvec(memcg, nid); 5514 } 5515 5516 static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, 5517 unsigned long max_seq, unsigned long *min_seq, 5518 unsigned long seq) 5519 { 5520 int i; 5521 int type, tier; 5522 int hist = lru_hist_from_seq(seq); 5523 struct lru_gen_struct *lrugen = &lruvec->lrugen; 5524 5525 for (tier = 0; tier < MAX_NR_TIERS; tier++) { 5526 seq_printf(m, " %10d", tier); 5527 for (type = 0; type < ANON_AND_FILE; type++) { 5528 const char *s = " "; 5529 unsigned long n[3] = {}; 5530 5531 if (seq == max_seq) { 5532 s = "RT "; 5533 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); 5534 n[1] = READ_ONCE(lrugen->avg_total[type][tier]); 5535 } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { 5536 s = "rep"; 5537 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); 5538 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); 5539 if (tier) 5540 n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); 5541 } 5542 5543 for (i = 0; i < 3; i++) 5544 seq_printf(m, " %10lu%c", n[i], s[i]); 5545 } 5546 seq_putc(m, '\n'); 5547 } 5548 5549 seq_puts(m, " "); 5550 for (i = 0; i < NR_MM_STATS; i++) { 5551 const char *s = " "; 5552 unsigned long n = 0; 5553 5554 if (seq == max_seq && NR_HIST_GENS == 1) { 5555 s = "LOYNFA"; 5556 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5557 } else if (seq != max_seq && NR_HIST_GENS > 1) { 5558 s = "loynfa"; 5559 n = READ_ONCE(lruvec->mm_state.stats[hist][i]); 5560 } 5561 5562 seq_printf(m, " %10lu%c", n, s[i]); 5563 } 5564 seq_putc(m, '\n'); 5565 } 5566 5567 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5568 static int lru_gen_seq_show(struct seq_file *m, void *v) 5569 { 5570 unsigned long seq; 5571 bool full = !debugfs_real_fops(m->file)->write; 5572 struct lruvec *lruvec = v; 5573 struct lru_gen_struct *lrugen = &lruvec->lrugen; 5574 int nid = lruvec_pgdat(lruvec)->node_id; 5575 struct mem_cgroup *memcg = lruvec_memcg(lruvec); 5576 DEFINE_MAX_SEQ(lruvec); 5577 DEFINE_MIN_SEQ(lruvec); 5578 5579 if (nid == first_memory_node) { 5580 const char *path = memcg ? m->private : ""; 5581 5582 #ifdef CONFIG_MEMCG 5583 if (memcg) 5584 cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); 5585 #endif 5586 seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); 5587 } 5588 5589 seq_printf(m, " node %5d\n", nid); 5590 5591 if (!full) 5592 seq = min_seq[LRU_GEN_ANON]; 5593 else if (max_seq >= MAX_NR_GENS) 5594 seq = max_seq - MAX_NR_GENS + 1; 5595 else 5596 seq = 0; 5597 5598 for (; seq <= max_seq; seq++) { 5599 int type, zone; 5600 int gen = lru_gen_from_seq(seq); 5601 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); 5602 5603 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); 5604 5605 for (type = 0; type < ANON_AND_FILE; type++) { 5606 unsigned long size = 0; 5607 char mark = full && seq < min_seq[type] ? 'x' : ' '; 5608 5609 for (zone = 0; zone < MAX_NR_ZONES; zone++) 5610 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); 5611 5612 seq_printf(m, " %10lu%c", size, mark); 5613 } 5614 5615 seq_putc(m, '\n'); 5616 5617 if (full) 5618 lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); 5619 } 5620 5621 return 0; 5622 } 5623 5624 static const struct seq_operations lru_gen_seq_ops = { 5625 .start = lru_gen_seq_start, 5626 .stop = lru_gen_seq_stop, 5627 .next = lru_gen_seq_next, 5628 .show = lru_gen_seq_show, 5629 }; 5630 5631 static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5632 bool can_swap, bool force_scan) 5633 { 5634 DEFINE_MAX_SEQ(lruvec); 5635 DEFINE_MIN_SEQ(lruvec); 5636 5637 if (seq < max_seq) 5638 return 0; 5639 5640 if (seq > max_seq) 5641 return -EINVAL; 5642 5643 if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) 5644 return -ERANGE; 5645 5646 try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); 5647 5648 return 0; 5649 } 5650 5651 static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, 5652 int swappiness, unsigned long nr_to_reclaim) 5653 { 5654 DEFINE_MAX_SEQ(lruvec); 5655 5656 if (seq + MIN_NR_GENS > max_seq) 5657 return -EINVAL; 5658 5659 sc->nr_reclaimed = 0; 5660 5661 while (!signal_pending(current)) { 5662 DEFINE_MIN_SEQ(lruvec); 5663 5664 if (seq < min_seq[!swappiness]) 5665 return 0; 5666 5667 if (sc->nr_reclaimed >= nr_to_reclaim) 5668 return 0; 5669 5670 if (!evict_folios(lruvec, sc, swappiness, NULL)) 5671 return 0; 5672 5673 cond_resched(); 5674 } 5675 5676 return -EINTR; 5677 } 5678 5679 static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, 5680 struct scan_control *sc, int swappiness, unsigned long opt) 5681 { 5682 struct lruvec *lruvec; 5683 int err = -EINVAL; 5684 struct mem_cgroup *memcg = NULL; 5685 5686 if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) 5687 return -EINVAL; 5688 5689 if (!mem_cgroup_disabled()) { 5690 rcu_read_lock(); 5691 memcg = mem_cgroup_from_id(memcg_id); 5692 #ifdef CONFIG_MEMCG 5693 if (memcg && !css_tryget(&memcg->css)) 5694 memcg = NULL; 5695 #endif 5696 rcu_read_unlock(); 5697 5698 if (!memcg) 5699 return -EINVAL; 5700 } 5701 5702 if (memcg_id != mem_cgroup_id(memcg)) 5703 goto done; 5704 5705 lruvec = get_lruvec(memcg, nid); 5706 5707 if (swappiness < 0) 5708 swappiness = get_swappiness(lruvec, sc); 5709 else if (swappiness > 200) 5710 goto done; 5711 5712 switch (cmd) { 5713 case '+': 5714 err = run_aging(lruvec, seq, sc, swappiness, opt); 5715 break; 5716 case '-': 5717 err = run_eviction(lruvec, seq, sc, swappiness, opt); 5718 break; 5719 } 5720 done: 5721 mem_cgroup_put(memcg); 5722 5723 return err; 5724 } 5725 5726 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ 5727 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, 5728 size_t len, loff_t *pos) 5729 { 5730 void *buf; 5731 char *cur, *next; 5732 unsigned int flags; 5733 struct blk_plug plug; 5734 int err = -EINVAL; 5735 struct scan_control sc = { 5736 .may_writepage = true, 5737 .may_unmap = true, 5738 .may_swap = true, 5739 .reclaim_idx = MAX_NR_ZONES - 1, 5740 .gfp_mask = GFP_KERNEL, 5741 }; 5742 5743 buf = kvmalloc(len + 1, GFP_KERNEL); 5744 if (!buf) 5745 return -ENOMEM; 5746 5747 if (copy_from_user(buf, src, len)) { 5748 kvfree(buf); 5749 return -EFAULT; 5750 } 5751 5752 set_task_reclaim_state(current, &sc.reclaim_state); 5753 flags = memalloc_noreclaim_save(); 5754 blk_start_plug(&plug); 5755 if (!set_mm_walk(NULL)) { 5756 err = -ENOMEM; 5757 goto done; 5758 } 5759 5760 next = buf; 5761 next[len] = '\0'; 5762 5763 while ((cur = strsep(&next, ",;\n"))) { 5764 int n; 5765 int end; 5766 char cmd; 5767 unsigned int memcg_id; 5768 unsigned int nid; 5769 unsigned long seq; 5770 unsigned int swappiness = -1; 5771 unsigned long opt = -1; 5772 5773 cur = skip_spaces(cur); 5774 if (!*cur) 5775 continue; 5776 5777 n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, 5778 &seq, &end, &swappiness, &end, &opt, &end); 5779 if (n < 4 || cur[end]) { 5780 err = -EINVAL; 5781 break; 5782 } 5783 5784 err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); 5785 if (err) 5786 break; 5787 } 5788 done: 5789 clear_mm_walk(); 5790 blk_finish_plug(&plug); 5791 memalloc_noreclaim_restore(flags); 5792 set_task_reclaim_state(current, NULL); 5793 5794 kvfree(buf); 5795 5796 return err ? : len; 5797 } 5798 5799 static int lru_gen_seq_open(struct inode *inode, struct file *file) 5800 { 5801 return seq_open(file, &lru_gen_seq_ops); 5802 } 5803 5804 static const struct file_operations lru_gen_rw_fops = { 5805 .open = lru_gen_seq_open, 5806 .read = seq_read, 5807 .write = lru_gen_seq_write, 5808 .llseek = seq_lseek, 5809 .release = seq_release, 5810 }; 5811 5812 static const struct file_operations lru_gen_ro_fops = { 5813 .open = lru_gen_seq_open, 5814 .read = seq_read, 5815 .llseek = seq_lseek, 5816 .release = seq_release, 5817 }; 5818 5819 /****************************************************************************** 5820 * initialization 5821 ******************************************************************************/ 5822 5823 void lru_gen_init_lruvec(struct lruvec *lruvec) 5824 { 5825 int i; 5826 int gen, type, zone; 5827 struct lru_gen_struct *lrugen = &lruvec->lrugen; 5828 5829 lrugen->max_seq = MIN_NR_GENS + 1; 5830 lrugen->enabled = lru_gen_enabled(); 5831 5832 for (i = 0; i <= MIN_NR_GENS + 1; i++) 5833 lrugen->timestamps[i] = jiffies; 5834 5835 for_each_gen_type_zone(gen, type, zone) 5836 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); 5837 5838 lruvec->mm_state.seq = MIN_NR_GENS; 5839 init_waitqueue_head(&lruvec->mm_state.wait); 5840 } 5841 5842 #ifdef CONFIG_MEMCG 5843 void lru_gen_init_memcg(struct mem_cgroup *memcg) 5844 { 5845 INIT_LIST_HEAD(&memcg->mm_list.fifo); 5846 spin_lock_init(&memcg->mm_list.lock); 5847 } 5848 5849 void lru_gen_exit_memcg(struct mem_cgroup *memcg) 5850 { 5851 int i; 5852 int nid; 5853 5854 for_each_node(nid) { 5855 struct lruvec *lruvec = get_lruvec(memcg, nid); 5856 5857 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, 5858 sizeof(lruvec->lrugen.nr_pages))); 5859 5860 for (i = 0; i < NR_BLOOM_FILTERS; i++) { 5861 bitmap_free(lruvec->mm_state.filters[i]); 5862 lruvec->mm_state.filters[i] = NULL; 5863 } 5864 } 5865 } 5866 #endif 5867 5868 static int __init init_lru_gen(void) 5869 { 5870 BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); 5871 BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); 5872 5873 if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) 5874 pr_err("lru_gen: failed to create sysfs group\n"); 5875 5876 debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); 5877 debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); 5878 5879 return 0; 5880 }; 5881 late_initcall(init_lru_gen); 5882 5883 #else /* !CONFIG_LRU_GEN */ 5884 5885 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) 5886 { 5887 } 5888 5889 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5890 { 5891 } 5892 5893 #endif /* CONFIG_LRU_GEN */ 5894 5895 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 5896 { 5897 unsigned long nr[NR_LRU_LISTS]; 5898 unsigned long targets[NR_LRU_LISTS]; 5899 unsigned long nr_to_scan; 5900 enum lru_list lru; 5901 unsigned long nr_reclaimed = 0; 5902 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 5903 bool proportional_reclaim; 5904 struct blk_plug plug; 5905 5906 if (lru_gen_enabled()) { 5907 lru_gen_shrink_lruvec(lruvec, sc); 5908 return; 5909 } 5910 5911 get_scan_count(lruvec, sc, nr); 5912 5913 /* Record the original scan target for proportional adjustments later */ 5914 memcpy(targets, nr, sizeof(nr)); 5915 5916 /* 5917 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 5918 * event that can occur when there is little memory pressure e.g. 5919 * multiple streaming readers/writers. Hence, we do not abort scanning 5920 * when the requested number of pages are reclaimed when scanning at 5921 * DEF_PRIORITY on the assumption that the fact we are direct 5922 * reclaiming implies that kswapd is not keeping up and it is best to 5923 * do a batch of work at once. For memcg reclaim one check is made to 5924 * abort proportional reclaim if either the file or anon lru has already 5925 * dropped to zero at the first pass. 5926 */ 5927 proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && 5928 sc->priority == DEF_PRIORITY); 5929 5930 blk_start_plug(&plug); 5931 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 5932 nr[LRU_INACTIVE_FILE]) { 5933 unsigned long nr_anon, nr_file, percentage; 5934 unsigned long nr_scanned; 5935 5936 for_each_evictable_lru(lru) { 5937 if (nr[lru]) { 5938 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 5939 nr[lru] -= nr_to_scan; 5940 5941 nr_reclaimed += shrink_list(lru, nr_to_scan, 5942 lruvec, sc); 5943 } 5944 } 5945 5946 cond_resched(); 5947 5948 if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) 5949 continue; 5950 5951 /* 5952 * For kswapd and memcg, reclaim at least the number of pages 5953 * requested. Ensure that the anon and file LRUs are scanned 5954 * proportionally what was requested by get_scan_count(). We 5955 * stop reclaiming one LRU and reduce the amount scanning 5956 * proportional to the original scan target. 5957 */ 5958 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 5959 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 5960 5961 /* 5962 * It's just vindictive to attack the larger once the smaller 5963 * has gone to zero. And given the way we stop scanning the 5964 * smaller below, this makes sure that we only make one nudge 5965 * towards proportionality once we've got nr_to_reclaim. 5966 */ 5967 if (!nr_file || !nr_anon) 5968 break; 5969 5970 if (nr_file > nr_anon) { 5971 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 5972 targets[LRU_ACTIVE_ANON] + 1; 5973 lru = LRU_BASE; 5974 percentage = nr_anon * 100 / scan_target; 5975 } else { 5976 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 5977 targets[LRU_ACTIVE_FILE] + 1; 5978 lru = LRU_FILE; 5979 percentage = nr_file * 100 / scan_target; 5980 } 5981 5982 /* Stop scanning the smaller of the LRU */ 5983 nr[lru] = 0; 5984 nr[lru + LRU_ACTIVE] = 0; 5985 5986 /* 5987 * Recalculate the other LRU scan count based on its original 5988 * scan target and the percentage scanning already complete 5989 */ 5990 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; 5991 nr_scanned = targets[lru] - nr[lru]; 5992 nr[lru] = targets[lru] * (100 - percentage) / 100; 5993 nr[lru] -= min(nr[lru], nr_scanned); 5994 5995 lru += LRU_ACTIVE; 5996 nr_scanned = targets[lru] - nr[lru]; 5997 nr[lru] = targets[lru] * (100 - percentage) / 100; 5998 nr[lru] -= min(nr[lru], nr_scanned); 5999 } 6000 blk_finish_plug(&plug); 6001 sc->nr_reclaimed += nr_reclaimed; 6002 6003 /* 6004 * Even if we did not try to evict anon pages at all, we want to 6005 * rebalance the anon lru active/inactive ratio. 6006 */ 6007 if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && 6008 inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 6009 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 6010 sc, LRU_ACTIVE_ANON); 6011 } 6012 6013 /* Use reclaim/compaction for costly allocs or under memory pressure */ 6014 static bool in_reclaim_compaction(struct scan_control *sc) 6015 { 6016 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 6017 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 6018 sc->priority < DEF_PRIORITY - 2)) 6019 return true; 6020 6021 return false; 6022 } 6023 6024 /* 6025 * Reclaim/compaction is used for high-order allocation requests. It reclaims 6026 * order-0 pages before compacting the zone. should_continue_reclaim() returns 6027 * true if more pages should be reclaimed such that when the page allocator 6028 * calls try_to_compact_pages() that it will have enough free pages to succeed. 6029 * It will give up earlier than that if there is difficulty reclaiming pages. 6030 */ 6031 static inline bool should_continue_reclaim(struct pglist_data *pgdat, 6032 unsigned long nr_reclaimed, 6033 struct scan_control *sc) 6034 { 6035 unsigned long pages_for_compaction; 6036 unsigned long inactive_lru_pages; 6037 int z; 6038 6039 /* If not in reclaim/compaction mode, stop */ 6040 if (!in_reclaim_compaction(sc)) 6041 return false; 6042 6043 /* 6044 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX 6045 * number of pages that were scanned. This will return to the caller 6046 * with the risk reclaim/compaction and the resulting allocation attempt 6047 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL 6048 * allocations through requiring that the full LRU list has been scanned 6049 * first, by assuming that zero delta of sc->nr_scanned means full LRU 6050 * scan, but that approximation was wrong, and there were corner cases 6051 * where always a non-zero amount of pages were scanned. 6052 */ 6053 if (!nr_reclaimed) 6054 return false; 6055 6056 /* If compaction would go ahead or the allocation would succeed, stop */ 6057 for (z = 0; z <= sc->reclaim_idx; z++) { 6058 struct zone *zone = &pgdat->node_zones[z]; 6059 if (!managed_zone(zone)) 6060 continue; 6061 6062 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { 6063 case COMPACT_SUCCESS: 6064 case COMPACT_CONTINUE: 6065 return false; 6066 default: 6067 /* check next zone */ 6068 ; 6069 } 6070 } 6071 6072 /* 6073 * If we have not reclaimed enough pages for compaction and the 6074 * inactive lists are large enough, continue reclaiming 6075 */ 6076 pages_for_compaction = compact_gap(sc->order); 6077 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 6078 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) 6079 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); 6080 6081 return inactive_lru_pages > pages_for_compaction; 6082 } 6083 6084 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) 6085 { 6086 struct mem_cgroup *target_memcg = sc->target_mem_cgroup; 6087 struct mem_cgroup *memcg; 6088 6089 memcg = mem_cgroup_iter(target_memcg, NULL, NULL); 6090 do { 6091 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 6092 unsigned long reclaimed; 6093 unsigned long scanned; 6094 6095 /* 6096 * This loop can become CPU-bound when target memcgs 6097 * aren't eligible for reclaim - either because they 6098 * don't have any reclaimable pages, or because their 6099 * memory is explicitly protected. Avoid soft lockups. 6100 */ 6101 cond_resched(); 6102 6103 mem_cgroup_calculate_protection(target_memcg, memcg); 6104 6105 if (mem_cgroup_below_min(memcg)) { 6106 /* 6107 * Hard protection. 6108 * If there is no reclaimable memory, OOM. 6109 */ 6110 continue; 6111 } else if (mem_cgroup_below_low(memcg)) { 6112 /* 6113 * Soft protection. 6114 * Respect the protection only as long as 6115 * there is an unprotected supply 6116 * of reclaimable memory from other cgroups. 6117 */ 6118 if (!sc->memcg_low_reclaim) { 6119 sc->memcg_low_skipped = 1; 6120 continue; 6121 } 6122 memcg_memory_event(memcg, MEMCG_LOW); 6123 } 6124 6125 reclaimed = sc->nr_reclaimed; 6126 scanned = sc->nr_scanned; 6127 6128 shrink_lruvec(lruvec, sc); 6129 6130 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, 6131 sc->priority); 6132 6133 /* Record the group's reclaim efficiency */ 6134 if (!sc->proactive) 6135 vmpressure(sc->gfp_mask, memcg, false, 6136 sc->nr_scanned - scanned, 6137 sc->nr_reclaimed - reclaimed); 6138 6139 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); 6140 } 6141 6142 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) 6143 { 6144 struct reclaim_state *reclaim_state = current->reclaim_state; 6145 unsigned long nr_reclaimed, nr_scanned; 6146 struct lruvec *target_lruvec; 6147 bool reclaimable = false; 6148 6149 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); 6150 6151 again: 6152 memset(&sc->nr, 0, sizeof(sc->nr)); 6153 6154 nr_reclaimed = sc->nr_reclaimed; 6155 nr_scanned = sc->nr_scanned; 6156 6157 prepare_scan_count(pgdat, sc); 6158 6159 shrink_node_memcgs(pgdat, sc); 6160 6161 if (reclaim_state) { 6162 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 6163 reclaim_state->reclaimed_slab = 0; 6164 } 6165 6166 /* Record the subtree's reclaim efficiency */ 6167 if (!sc->proactive) 6168 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, 6169 sc->nr_scanned - nr_scanned, 6170 sc->nr_reclaimed - nr_reclaimed); 6171 6172 if (sc->nr_reclaimed - nr_reclaimed) 6173 reclaimable = true; 6174 6175 if (current_is_kswapd()) { 6176 /* 6177 * If reclaim is isolating dirty pages under writeback, 6178 * it implies that the long-lived page allocation rate 6179 * is exceeding the page laundering rate. Either the 6180 * global limits are not being effective at throttling 6181 * processes due to the page distribution throughout 6182 * zones or there is heavy usage of a slow backing 6183 * device. The only option is to throttle from reclaim 6184 * context which is not ideal as there is no guarantee 6185 * the dirtying process is throttled in the same way 6186 * balance_dirty_pages() manages. 6187 * 6188 * Once a node is flagged PGDAT_WRITEBACK, kswapd will 6189 * count the number of pages under pages flagged for 6190 * immediate reclaim and stall if any are encountered 6191 * in the nr_immediate check below. 6192 */ 6193 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) 6194 set_bit(PGDAT_WRITEBACK, &pgdat->flags); 6195 6196 /* Allow kswapd to start writing pages during reclaim.*/ 6197 if (sc->nr.unqueued_dirty == sc->nr.file_taken) 6198 set_bit(PGDAT_DIRTY, &pgdat->flags); 6199 6200 /* 6201 * If kswapd scans pages marked for immediate 6202 * reclaim and under writeback (nr_immediate), it 6203 * implies that pages are cycling through the LRU 6204 * faster than they are written so forcibly stall 6205 * until some pages complete writeback. 6206 */ 6207 if (sc->nr.immediate) 6208 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); 6209 } 6210 6211 /* 6212 * Tag a node/memcg as congested if all the dirty pages were marked 6213 * for writeback and immediate reclaim (counted in nr.congested). 6214 * 6215 * Legacy memcg will stall in page writeback so avoid forcibly 6216 * stalling in reclaim_throttle(). 6217 */ 6218 if ((current_is_kswapd() || 6219 (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && 6220 sc->nr.dirty && sc->nr.dirty == sc->nr.congested) 6221 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); 6222 6223 /* 6224 * Stall direct reclaim for IO completions if the lruvec is 6225 * node is congested. Allow kswapd to continue until it 6226 * starts encountering unqueued dirty pages or cycling through 6227 * the LRU too quickly. 6228 */ 6229 if (!current_is_kswapd() && current_may_throttle() && 6230 !sc->hibernation_mode && 6231 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) 6232 reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); 6233 6234 if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, 6235 sc)) 6236 goto again; 6237 6238 /* 6239 * Kswapd gives up on balancing particular nodes after too 6240 * many failures to reclaim anything from them and goes to 6241 * sleep. On reclaim progress, reset the failure counter. A 6242 * successful direct reclaim run will revive a dormant kswapd. 6243 */ 6244 if (reclaimable) 6245 pgdat->kswapd_failures = 0; 6246 } 6247 6248 /* 6249 * Returns true if compaction should go ahead for a costly-order request, or 6250 * the allocation would already succeed without compaction. Return false if we 6251 * should reclaim first. 6252 */ 6253 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 6254 { 6255 unsigned long watermark; 6256 enum compact_result suitable; 6257 6258 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); 6259 if (suitable == COMPACT_SUCCESS) 6260 /* Allocation should succeed already. Don't reclaim. */ 6261 return true; 6262 if (suitable == COMPACT_SKIPPED) 6263 /* Compaction cannot yet proceed. Do reclaim. */ 6264 return false; 6265 6266 /* 6267 * Compaction is already possible, but it takes time to run and there 6268 * are potentially other callers using the pages just freed. So proceed 6269 * with reclaim to make a buffer of free pages available to give 6270 * compaction a reasonable chance of completing and allocating the page. 6271 * Note that we won't actually reclaim the whole buffer in one attempt 6272 * as the target watermark in should_continue_reclaim() is lower. But if 6273 * we are already above the high+gap watermark, don't reclaim at all. 6274 */ 6275 watermark = high_wmark_pages(zone) + compact_gap(sc->order); 6276 6277 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); 6278 } 6279 6280 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) 6281 { 6282 /* 6283 * If reclaim is making progress greater than 12% efficiency then 6284 * wake all the NOPROGRESS throttled tasks. 6285 */ 6286 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { 6287 wait_queue_head_t *wqh; 6288 6289 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; 6290 if (waitqueue_active(wqh)) 6291 wake_up(wqh); 6292 6293 return; 6294 } 6295 6296 /* 6297 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will 6298 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages 6299 * under writeback and marked for immediate reclaim at the tail of the 6300 * LRU. 6301 */ 6302 if (current_is_kswapd() || cgroup_reclaim(sc)) 6303 return; 6304 6305 /* Throttle if making no progress at high prioities. */ 6306 if (sc->priority == 1 && !sc->nr_reclaimed) 6307 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); 6308 } 6309 6310 /* 6311 * This is the direct reclaim path, for page-allocating processes. We only 6312 * try to reclaim pages from zones which will satisfy the caller's allocation 6313 * request. 6314 * 6315 * If a zone is deemed to be full of pinned pages then just give it a light 6316 * scan then give up on it. 6317 */ 6318 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 6319 { 6320 struct zoneref *z; 6321 struct zone *zone; 6322 unsigned long nr_soft_reclaimed; 6323 unsigned long nr_soft_scanned; 6324 gfp_t orig_mask; 6325 pg_data_t *last_pgdat = NULL; 6326 pg_data_t *first_pgdat = NULL; 6327 6328 /* 6329 * If the number of buffer_heads in the machine exceeds the maximum 6330 * allowed level, force direct reclaim to scan the highmem zone as 6331 * highmem pages could be pinning lowmem pages storing buffer_heads 6332 */ 6333 orig_mask = sc->gfp_mask; 6334 if (buffer_heads_over_limit) { 6335 sc->gfp_mask |= __GFP_HIGHMEM; 6336 sc->reclaim_idx = gfp_zone(sc->gfp_mask); 6337 } 6338 6339 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6340 sc->reclaim_idx, sc->nodemask) { 6341 /* 6342 * Take care memory controller reclaiming has small influence 6343 * to global LRU. 6344 */ 6345 if (!cgroup_reclaim(sc)) { 6346 if (!cpuset_zone_allowed(zone, 6347 GFP_KERNEL | __GFP_HARDWALL)) 6348 continue; 6349 6350 /* 6351 * If we already have plenty of memory free for 6352 * compaction in this zone, don't free any more. 6353 * Even though compaction is invoked for any 6354 * non-zero order, only frequent costly order 6355 * reclamation is disruptive enough to become a 6356 * noticeable problem, like transparent huge 6357 * page allocations. 6358 */ 6359 if (IS_ENABLED(CONFIG_COMPACTION) && 6360 sc->order > PAGE_ALLOC_COSTLY_ORDER && 6361 compaction_ready(zone, sc)) { 6362 sc->compaction_ready = true; 6363 continue; 6364 } 6365 6366 /* 6367 * Shrink each node in the zonelist once. If the 6368 * zonelist is ordered by zone (not the default) then a 6369 * node may be shrunk multiple times but in that case 6370 * the user prefers lower zones being preserved. 6371 */ 6372 if (zone->zone_pgdat == last_pgdat) 6373 continue; 6374 6375 /* 6376 * This steals pages from memory cgroups over softlimit 6377 * and returns the number of reclaimed pages and 6378 * scanned pages. This works for global memory pressure 6379 * and balancing, not for a memcg's limit. 6380 */ 6381 nr_soft_scanned = 0; 6382 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, 6383 sc->order, sc->gfp_mask, 6384 &nr_soft_scanned); 6385 sc->nr_reclaimed += nr_soft_reclaimed; 6386 sc->nr_scanned += nr_soft_scanned; 6387 /* need some check for avoid more shrink_zone() */ 6388 } 6389 6390 if (!first_pgdat) 6391 first_pgdat = zone->zone_pgdat; 6392 6393 /* See comment about same check for global reclaim above */ 6394 if (zone->zone_pgdat == last_pgdat) 6395 continue; 6396 last_pgdat = zone->zone_pgdat; 6397 shrink_node(zone->zone_pgdat, sc); 6398 } 6399 6400 if (first_pgdat) 6401 consider_reclaim_throttle(first_pgdat, sc); 6402 6403 /* 6404 * Restore to original mask to avoid the impact on the caller if we 6405 * promoted it to __GFP_HIGHMEM. 6406 */ 6407 sc->gfp_mask = orig_mask; 6408 } 6409 6410 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) 6411 { 6412 struct lruvec *target_lruvec; 6413 unsigned long refaults; 6414 6415 if (lru_gen_enabled()) 6416 return; 6417 6418 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); 6419 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); 6420 target_lruvec->refaults[WORKINGSET_ANON] = refaults; 6421 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); 6422 target_lruvec->refaults[WORKINGSET_FILE] = refaults; 6423 } 6424 6425 /* 6426 * This is the main entry point to direct page reclaim. 6427 * 6428 * If a full scan of the inactive list fails to free enough memory then we 6429 * are "out of memory" and something needs to be killed. 6430 * 6431 * If the caller is !__GFP_FS then the probability of a failure is reasonably 6432 * high - the zone may be full of dirty or under-writeback pages, which this 6433 * caller can't do much about. We kick the writeback threads and take explicit 6434 * naps in the hope that some of these pages can be written. But if the 6435 * allocating task holds filesystem locks which prevent writeout this might not 6436 * work, and the allocation attempt will fail. 6437 * 6438 * returns: 0, if no pages reclaimed 6439 * else, the number of pages reclaimed 6440 */ 6441 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 6442 struct scan_control *sc) 6443 { 6444 int initial_priority = sc->priority; 6445 pg_data_t *last_pgdat; 6446 struct zoneref *z; 6447 struct zone *zone; 6448 retry: 6449 delayacct_freepages_start(); 6450 6451 if (!cgroup_reclaim(sc)) 6452 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); 6453 6454 do { 6455 if (!sc->proactive) 6456 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 6457 sc->priority); 6458 sc->nr_scanned = 0; 6459 shrink_zones(zonelist, sc); 6460 6461 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 6462 break; 6463 6464 if (sc->compaction_ready) 6465 break; 6466 6467 /* 6468 * If we're getting trouble reclaiming, start doing 6469 * writepage even in laptop mode. 6470 */ 6471 if (sc->priority < DEF_PRIORITY - 2) 6472 sc->may_writepage = 1; 6473 } while (--sc->priority >= 0); 6474 6475 last_pgdat = NULL; 6476 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, 6477 sc->nodemask) { 6478 if (zone->zone_pgdat == last_pgdat) 6479 continue; 6480 last_pgdat = zone->zone_pgdat; 6481 6482 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); 6483 6484 if (cgroup_reclaim(sc)) { 6485 struct lruvec *lruvec; 6486 6487 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, 6488 zone->zone_pgdat); 6489 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 6490 } 6491 } 6492 6493 delayacct_freepages_end(); 6494 6495 if (sc->nr_reclaimed) 6496 return sc->nr_reclaimed; 6497 6498 /* Aborted reclaim to try compaction? don't OOM, then */ 6499 if (sc->compaction_ready) 6500 return 1; 6501 6502 /* 6503 * We make inactive:active ratio decisions based on the node's 6504 * composition of memory, but a restrictive reclaim_idx or a 6505 * memory.low cgroup setting can exempt large amounts of 6506 * memory from reclaim. Neither of which are very common, so 6507 * instead of doing costly eligibility calculations of the 6508 * entire cgroup subtree up front, we assume the estimates are 6509 * good, and retry with forcible deactivation if that fails. 6510 */ 6511 if (sc->skipped_deactivate) { 6512 sc->priority = initial_priority; 6513 sc->force_deactivate = 1; 6514 sc->skipped_deactivate = 0; 6515 goto retry; 6516 } 6517 6518 /* Untapped cgroup reserves? Don't OOM, retry. */ 6519 if (sc->memcg_low_skipped) { 6520 sc->priority = initial_priority; 6521 sc->force_deactivate = 0; 6522 sc->memcg_low_reclaim = 1; 6523 sc->memcg_low_skipped = 0; 6524 goto retry; 6525 } 6526 6527 return 0; 6528 } 6529 6530 static bool allow_direct_reclaim(pg_data_t *pgdat) 6531 { 6532 struct zone *zone; 6533 unsigned long pfmemalloc_reserve = 0; 6534 unsigned long free_pages = 0; 6535 int i; 6536 bool wmark_ok; 6537 6538 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 6539 return true; 6540 6541 for (i = 0; i <= ZONE_NORMAL; i++) { 6542 zone = &pgdat->node_zones[i]; 6543 if (!managed_zone(zone)) 6544 continue; 6545 6546 if (!zone_reclaimable_pages(zone)) 6547 continue; 6548 6549 pfmemalloc_reserve += min_wmark_pages(zone); 6550 free_pages += zone_page_state(zone, NR_FREE_PAGES); 6551 } 6552 6553 /* If there are no reserves (unexpected config) then do not throttle */ 6554 if (!pfmemalloc_reserve) 6555 return true; 6556 6557 wmark_ok = free_pages > pfmemalloc_reserve / 2; 6558 6559 /* kswapd must be awake if processes are being throttled */ 6560 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 6561 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) 6562 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); 6563 6564 wake_up_interruptible(&pgdat->kswapd_wait); 6565 } 6566 6567 return wmark_ok; 6568 } 6569 6570 /* 6571 * Throttle direct reclaimers if backing storage is backed by the network 6572 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 6573 * depleted. kswapd will continue to make progress and wake the processes 6574 * when the low watermark is reached. 6575 * 6576 * Returns true if a fatal signal was delivered during throttling. If this 6577 * happens, the page allocator should not consider triggering the OOM killer. 6578 */ 6579 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 6580 nodemask_t *nodemask) 6581 { 6582 struct zoneref *z; 6583 struct zone *zone; 6584 pg_data_t *pgdat = NULL; 6585 6586 /* 6587 * Kernel threads should not be throttled as they may be indirectly 6588 * responsible for cleaning pages necessary for reclaim to make forward 6589 * progress. kjournald for example may enter direct reclaim while 6590 * committing a transaction where throttling it could forcing other 6591 * processes to block on log_wait_commit(). 6592 */ 6593 if (current->flags & PF_KTHREAD) 6594 goto out; 6595 6596 /* 6597 * If a fatal signal is pending, this process should not throttle. 6598 * It should return quickly so it can exit and free its memory 6599 */ 6600 if (fatal_signal_pending(current)) 6601 goto out; 6602 6603 /* 6604 * Check if the pfmemalloc reserves are ok by finding the first node 6605 * with a usable ZONE_NORMAL or lower zone. The expectation is that 6606 * GFP_KERNEL will be required for allocating network buffers when 6607 * swapping over the network so ZONE_HIGHMEM is unusable. 6608 * 6609 * Throttling is based on the first usable node and throttled processes 6610 * wait on a queue until kswapd makes progress and wakes them. There 6611 * is an affinity then between processes waking up and where reclaim 6612 * progress has been made assuming the process wakes on the same node. 6613 * More importantly, processes running on remote nodes will not compete 6614 * for remote pfmemalloc reserves and processes on different nodes 6615 * should make reasonable progress. 6616 */ 6617 for_each_zone_zonelist_nodemask(zone, z, zonelist, 6618 gfp_zone(gfp_mask), nodemask) { 6619 if (zone_idx(zone) > ZONE_NORMAL) 6620 continue; 6621 6622 /* Throttle based on the first usable node */ 6623 pgdat = zone->zone_pgdat; 6624 if (allow_direct_reclaim(pgdat)) 6625 goto out; 6626 break; 6627 } 6628 6629 /* If no zone was usable by the allocation flags then do not throttle */ 6630 if (!pgdat) 6631 goto out; 6632 6633 /* Account for the throttling */ 6634 count_vm_event(PGSCAN_DIRECT_THROTTLE); 6635 6636 /* 6637 * If the caller cannot enter the filesystem, it's possible that it 6638 * is due to the caller holding an FS lock or performing a journal 6639 * transaction in the case of a filesystem like ext[3|4]. In this case, 6640 * it is not safe to block on pfmemalloc_wait as kswapd could be 6641 * blocked waiting on the same lock. Instead, throttle for up to a 6642 * second before continuing. 6643 */ 6644 if (!(gfp_mask & __GFP_FS)) 6645 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 6646 allow_direct_reclaim(pgdat), HZ); 6647 else 6648 /* Throttle until kswapd wakes the process */ 6649 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 6650 allow_direct_reclaim(pgdat)); 6651 6652 if (fatal_signal_pending(current)) 6653 return true; 6654 6655 out: 6656 return false; 6657 } 6658 6659 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 6660 gfp_t gfp_mask, nodemask_t *nodemask) 6661 { 6662 unsigned long nr_reclaimed; 6663 struct scan_control sc = { 6664 .nr_to_reclaim = SWAP_CLUSTER_MAX, 6665 .gfp_mask = current_gfp_context(gfp_mask), 6666 .reclaim_idx = gfp_zone(gfp_mask), 6667 .order = order, 6668 .nodemask = nodemask, 6669 .priority = DEF_PRIORITY, 6670 .may_writepage = !laptop_mode, 6671 .may_unmap = 1, 6672 .may_swap = 1, 6673 }; 6674 6675 /* 6676 * scan_control uses s8 fields for order, priority, and reclaim_idx. 6677 * Confirm they are large enough for max values. 6678 */ 6679 BUILD_BUG_ON(MAX_ORDER > S8_MAX); 6680 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); 6681 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); 6682 6683 /* 6684 * Do not enter reclaim if fatal signal was delivered while throttled. 6685 * 1 is returned so that the page allocator does not OOM kill at this 6686 * point. 6687 */ 6688 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) 6689 return 1; 6690 6691 set_task_reclaim_state(current, &sc.reclaim_state); 6692 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); 6693 6694 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 6695 6696 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 6697 set_task_reclaim_state(current, NULL); 6698 6699 return nr_reclaimed; 6700 } 6701 6702 #ifdef CONFIG_MEMCG 6703 6704 /* Only used by soft limit reclaim. Do not reuse for anything else. */ 6705 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, 6706 gfp_t gfp_mask, bool noswap, 6707 pg_data_t *pgdat, 6708 unsigned long *nr_scanned) 6709 { 6710 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); 6711 struct scan_control sc = { 6712 .nr_to_reclaim = SWAP_CLUSTER_MAX, 6713 .target_mem_cgroup = memcg, 6714 .may_writepage = !laptop_mode, 6715 .may_unmap = 1, 6716 .reclaim_idx = MAX_NR_ZONES - 1, 6717 .may_swap = !noswap, 6718 }; 6719 6720 WARN_ON_ONCE(!current->reclaim_state); 6721 6722 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 6723 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 6724 6725 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 6726 sc.gfp_mask); 6727 6728 /* 6729 * NOTE: Although we can get the priority field, using it 6730 * here is not a good idea, since it limits the pages we can scan. 6731 * if we don't reclaim here, the shrink_node from balance_pgdat 6732 * will pick up pages from other mem cgroup's as well. We hack 6733 * the priority and make it zero. 6734 */ 6735 shrink_lruvec(lruvec, &sc); 6736 6737 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 6738 6739 *nr_scanned = sc.nr_scanned; 6740 6741 return sc.nr_reclaimed; 6742 } 6743 6744 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 6745 unsigned long nr_pages, 6746 gfp_t gfp_mask, 6747 unsigned int reclaim_options) 6748 { 6749 unsigned long nr_reclaimed; 6750 unsigned int noreclaim_flag; 6751 struct scan_control sc = { 6752 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 6753 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | 6754 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 6755 .reclaim_idx = MAX_NR_ZONES - 1, 6756 .target_mem_cgroup = memcg, 6757 .priority = DEF_PRIORITY, 6758 .may_writepage = !laptop_mode, 6759 .may_unmap = 1, 6760 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), 6761 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), 6762 }; 6763 /* 6764 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put 6765 * equal pressure on all the nodes. This is based on the assumption that 6766 * the reclaim does not bail out early. 6767 */ 6768 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 6769 6770 set_task_reclaim_state(current, &sc.reclaim_state); 6771 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); 6772 noreclaim_flag = memalloc_noreclaim_save(); 6773 6774 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 6775 6776 memalloc_noreclaim_restore(noreclaim_flag); 6777 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 6778 set_task_reclaim_state(current, NULL); 6779 6780 return nr_reclaimed; 6781 } 6782 #endif 6783 6784 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) 6785 { 6786 struct mem_cgroup *memcg; 6787 struct lruvec *lruvec; 6788 6789 if (lru_gen_enabled()) { 6790 lru_gen_age_node(pgdat, sc); 6791 return; 6792 } 6793 6794 if (!can_age_anon_pages(pgdat, sc)) 6795 return; 6796 6797 lruvec = mem_cgroup_lruvec(NULL, pgdat); 6798 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) 6799 return; 6800 6801 memcg = mem_cgroup_iter(NULL, NULL, NULL); 6802 do { 6803 lruvec = mem_cgroup_lruvec(memcg, pgdat); 6804 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 6805 sc, LRU_ACTIVE_ANON); 6806 memcg = mem_cgroup_iter(NULL, memcg, NULL); 6807 } while (memcg); 6808 } 6809 6810 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) 6811 { 6812 int i; 6813 struct zone *zone; 6814 6815 /* 6816 * Check for watermark boosts top-down as the higher zones 6817 * are more likely to be boosted. Both watermarks and boosts 6818 * should not be checked at the same time as reclaim would 6819 * start prematurely when there is no boosting and a lower 6820 * zone is balanced. 6821 */ 6822 for (i = highest_zoneidx; i >= 0; i--) { 6823 zone = pgdat->node_zones + i; 6824 if (!managed_zone(zone)) 6825 continue; 6826 6827 if (zone->watermark_boost) 6828 return true; 6829 } 6830 6831 return false; 6832 } 6833 6834 /* 6835 * Returns true if there is an eligible zone balanced for the request order 6836 * and highest_zoneidx 6837 */ 6838 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) 6839 { 6840 int i; 6841 unsigned long mark = -1; 6842 struct zone *zone; 6843 6844 /* 6845 * Check watermarks bottom-up as lower zones are more likely to 6846 * meet watermarks. 6847 */ 6848 for (i = 0; i <= highest_zoneidx; i++) { 6849 zone = pgdat->node_zones + i; 6850 6851 if (!managed_zone(zone)) 6852 continue; 6853 6854 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) 6855 mark = wmark_pages(zone, WMARK_PROMO); 6856 else 6857 mark = high_wmark_pages(zone); 6858 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) 6859 return true; 6860 } 6861 6862 /* 6863 * If a node has no managed zone within highest_zoneidx, it does not 6864 * need balancing by definition. This can happen if a zone-restricted 6865 * allocation tries to wake a remote kswapd. 6866 */ 6867 if (mark == -1) 6868 return true; 6869 6870 return false; 6871 } 6872 6873 /* Clear pgdat state for congested, dirty or under writeback. */ 6874 static void clear_pgdat_congested(pg_data_t *pgdat) 6875 { 6876 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); 6877 6878 clear_bit(LRUVEC_CONGESTED, &lruvec->flags); 6879 clear_bit(PGDAT_DIRTY, &pgdat->flags); 6880 clear_bit(PGDAT_WRITEBACK, &pgdat->flags); 6881 } 6882 6883 /* 6884 * Prepare kswapd for sleeping. This verifies that there are no processes 6885 * waiting in throttle_direct_reclaim() and that watermarks have been met. 6886 * 6887 * Returns true if kswapd is ready to sleep 6888 */ 6889 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, 6890 int highest_zoneidx) 6891 { 6892 /* 6893 * The throttled processes are normally woken up in balance_pgdat() as 6894 * soon as allow_direct_reclaim() is true. But there is a potential 6895 * race between when kswapd checks the watermarks and a process gets 6896 * throttled. There is also a potential race if processes get 6897 * throttled, kswapd wakes, a large process exits thereby balancing the 6898 * zones, which causes kswapd to exit balance_pgdat() before reaching 6899 * the wake up checks. If kswapd is going to sleep, no process should 6900 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If 6901 * the wake up is premature, processes will wake kswapd and get 6902 * throttled again. The difference from wake ups in balance_pgdat() is 6903 * that here we are under prepare_to_wait(). 6904 */ 6905 if (waitqueue_active(&pgdat->pfmemalloc_wait)) 6906 wake_up_all(&pgdat->pfmemalloc_wait); 6907 6908 /* Hopeless node, leave it to direct reclaim */ 6909 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 6910 return true; 6911 6912 if (pgdat_balanced(pgdat, order, highest_zoneidx)) { 6913 clear_pgdat_congested(pgdat); 6914 return true; 6915 } 6916 6917 return false; 6918 } 6919 6920 /* 6921 * kswapd shrinks a node of pages that are at or below the highest usable 6922 * zone that is currently unbalanced. 6923 * 6924 * Returns true if kswapd scanned at least the requested number of pages to 6925 * reclaim or if the lack of progress was due to pages under writeback. 6926 * This is used to determine if the scanning priority needs to be raised. 6927 */ 6928 static bool kswapd_shrink_node(pg_data_t *pgdat, 6929 struct scan_control *sc) 6930 { 6931 struct zone *zone; 6932 int z; 6933 6934 /* Reclaim a number of pages proportional to the number of zones */ 6935 sc->nr_to_reclaim = 0; 6936 for (z = 0; z <= sc->reclaim_idx; z++) { 6937 zone = pgdat->node_zones + z; 6938 if (!managed_zone(zone)) 6939 continue; 6940 6941 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); 6942 } 6943 6944 /* 6945 * Historically care was taken to put equal pressure on all zones but 6946 * now pressure is applied based on node LRU order. 6947 */ 6948 shrink_node(pgdat, sc); 6949 6950 /* 6951 * Fragmentation may mean that the system cannot be rebalanced for 6952 * high-order allocations. If twice the allocation size has been 6953 * reclaimed then recheck watermarks only at order-0 to prevent 6954 * excessive reclaim. Assume that a process requested a high-order 6955 * can direct reclaim/compact. 6956 */ 6957 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) 6958 sc->order = 0; 6959 6960 return sc->nr_scanned >= sc->nr_to_reclaim; 6961 } 6962 6963 /* Page allocator PCP high watermark is lowered if reclaim is active. */ 6964 static inline void 6965 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) 6966 { 6967 int i; 6968 struct zone *zone; 6969 6970 for (i = 0; i <= highest_zoneidx; i++) { 6971 zone = pgdat->node_zones + i; 6972 6973 if (!managed_zone(zone)) 6974 continue; 6975 6976 if (active) 6977 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 6978 else 6979 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); 6980 } 6981 } 6982 6983 static inline void 6984 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 6985 { 6986 update_reclaim_active(pgdat, highest_zoneidx, true); 6987 } 6988 6989 static inline void 6990 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) 6991 { 6992 update_reclaim_active(pgdat, highest_zoneidx, false); 6993 } 6994 6995 /* 6996 * For kswapd, balance_pgdat() will reclaim pages across a node from zones 6997 * that are eligible for use by the caller until at least one zone is 6998 * balanced. 6999 * 7000 * Returns the order kswapd finished reclaiming at. 7001 * 7002 * kswapd scans the zones in the highmem->normal->dma direction. It skips 7003 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 7004 * found to have free_pages <= high_wmark_pages(zone), any page in that zone 7005 * or lower is eligible for reclaim until at least one usable zone is 7006 * balanced. 7007 */ 7008 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) 7009 { 7010 int i; 7011 unsigned long nr_soft_reclaimed; 7012 unsigned long nr_soft_scanned; 7013 unsigned long pflags; 7014 unsigned long nr_boost_reclaim; 7015 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; 7016 bool boosted; 7017 struct zone *zone; 7018 struct scan_control sc = { 7019 .gfp_mask = GFP_KERNEL, 7020 .order = order, 7021 .may_unmap = 1, 7022 }; 7023 7024 set_task_reclaim_state(current, &sc.reclaim_state); 7025 psi_memstall_enter(&pflags); 7026 __fs_reclaim_acquire(_THIS_IP_); 7027 7028 count_vm_event(PAGEOUTRUN); 7029 7030 /* 7031 * Account for the reclaim boost. Note that the zone boost is left in 7032 * place so that parallel allocations that are near the watermark will 7033 * stall or direct reclaim until kswapd is finished. 7034 */ 7035 nr_boost_reclaim = 0; 7036 for (i = 0; i <= highest_zoneidx; i++) { 7037 zone = pgdat->node_zones + i; 7038 if (!managed_zone(zone)) 7039 continue; 7040 7041 nr_boost_reclaim += zone->watermark_boost; 7042 zone_boosts[i] = zone->watermark_boost; 7043 } 7044 boosted = nr_boost_reclaim; 7045 7046 restart: 7047 set_reclaim_active(pgdat, highest_zoneidx); 7048 sc.priority = DEF_PRIORITY; 7049 do { 7050 unsigned long nr_reclaimed = sc.nr_reclaimed; 7051 bool raise_priority = true; 7052 bool balanced; 7053 bool ret; 7054 7055 sc.reclaim_idx = highest_zoneidx; 7056 7057 /* 7058 * If the number of buffer_heads exceeds the maximum allowed 7059 * then consider reclaiming from all zones. This has a dual 7060 * purpose -- on 64-bit systems it is expected that 7061 * buffer_heads are stripped during active rotation. On 32-bit 7062 * systems, highmem pages can pin lowmem memory and shrinking 7063 * buffers can relieve lowmem pressure. Reclaim may still not 7064 * go ahead if all eligible zones for the original allocation 7065 * request are balanced to avoid excessive reclaim from kswapd. 7066 */ 7067 if (buffer_heads_over_limit) { 7068 for (i = MAX_NR_ZONES - 1; i >= 0; i--) { 7069 zone = pgdat->node_zones + i; 7070 if (!managed_zone(zone)) 7071 continue; 7072 7073 sc.reclaim_idx = i; 7074 break; 7075 } 7076 } 7077 7078 /* 7079 * If the pgdat is imbalanced then ignore boosting and preserve 7080 * the watermarks for a later time and restart. Note that the 7081 * zone watermarks will be still reset at the end of balancing 7082 * on the grounds that the normal reclaim should be enough to 7083 * re-evaluate if boosting is required when kswapd next wakes. 7084 */ 7085 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); 7086 if (!balanced && nr_boost_reclaim) { 7087 nr_boost_reclaim = 0; 7088 goto restart; 7089 } 7090 7091 /* 7092 * If boosting is not active then only reclaim if there are no 7093 * eligible zones. Note that sc.reclaim_idx is not used as 7094 * buffer_heads_over_limit may have adjusted it. 7095 */ 7096 if (!nr_boost_reclaim && balanced) 7097 goto out; 7098 7099 /* Limit the priority of boosting to avoid reclaim writeback */ 7100 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) 7101 raise_priority = false; 7102 7103 /* 7104 * Do not writeback or swap pages for boosted reclaim. The 7105 * intent is to relieve pressure not issue sub-optimal IO 7106 * from reclaim context. If no pages are reclaimed, the 7107 * reclaim will be aborted. 7108 */ 7109 sc.may_writepage = !laptop_mode && !nr_boost_reclaim; 7110 sc.may_swap = !nr_boost_reclaim; 7111 7112 /* 7113 * Do some background aging, to give pages a chance to be 7114 * referenced before reclaiming. All pages are rotated 7115 * regardless of classzone as this is about consistent aging. 7116 */ 7117 kswapd_age_node(pgdat, &sc); 7118 7119 /* 7120 * If we're getting trouble reclaiming, start doing writepage 7121 * even in laptop mode. 7122 */ 7123 if (sc.priority < DEF_PRIORITY - 2) 7124 sc.may_writepage = 1; 7125 7126 /* Call soft limit reclaim before calling shrink_node. */ 7127 sc.nr_scanned = 0; 7128 nr_soft_scanned = 0; 7129 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, 7130 sc.gfp_mask, &nr_soft_scanned); 7131 sc.nr_reclaimed += nr_soft_reclaimed; 7132 7133 /* 7134 * There should be no need to raise the scanning priority if 7135 * enough pages are already being scanned that that high 7136 * watermark would be met at 100% efficiency. 7137 */ 7138 if (kswapd_shrink_node(pgdat, &sc)) 7139 raise_priority = false; 7140 7141 /* 7142 * If the low watermark is met there is no need for processes 7143 * to be throttled on pfmemalloc_wait as they should not be 7144 * able to safely make forward progress. Wake them 7145 */ 7146 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 7147 allow_direct_reclaim(pgdat)) 7148 wake_up_all(&pgdat->pfmemalloc_wait); 7149 7150 /* Check if kswapd should be suspending */ 7151 __fs_reclaim_release(_THIS_IP_); 7152 ret = try_to_freeze(); 7153 __fs_reclaim_acquire(_THIS_IP_); 7154 if (ret || kthread_should_stop()) 7155 break; 7156 7157 /* 7158 * Raise priority if scanning rate is too low or there was no 7159 * progress in reclaiming pages 7160 */ 7161 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; 7162 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); 7163 7164 /* 7165 * If reclaim made no progress for a boost, stop reclaim as 7166 * IO cannot be queued and it could be an infinite loop in 7167 * extreme circumstances. 7168 */ 7169 if (nr_boost_reclaim && !nr_reclaimed) 7170 break; 7171 7172 if (raise_priority || !nr_reclaimed) 7173 sc.priority--; 7174 } while (sc.priority >= 1); 7175 7176 if (!sc.nr_reclaimed) 7177 pgdat->kswapd_failures++; 7178 7179 out: 7180 clear_reclaim_active(pgdat, highest_zoneidx); 7181 7182 /* If reclaim was boosted, account for the reclaim done in this pass */ 7183 if (boosted) { 7184 unsigned long flags; 7185 7186 for (i = 0; i <= highest_zoneidx; i++) { 7187 if (!zone_boosts[i]) 7188 continue; 7189 7190 /* Increments are under the zone lock */ 7191 zone = pgdat->node_zones + i; 7192 spin_lock_irqsave(&zone->lock, flags); 7193 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); 7194 spin_unlock_irqrestore(&zone->lock, flags); 7195 } 7196 7197 /* 7198 * As there is now likely space, wakeup kcompact to defragment 7199 * pageblocks. 7200 */ 7201 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); 7202 } 7203 7204 snapshot_refaults(NULL, pgdat); 7205 __fs_reclaim_release(_THIS_IP_); 7206 psi_memstall_leave(&pflags); 7207 set_task_reclaim_state(current, NULL); 7208 7209 /* 7210 * Return the order kswapd stopped reclaiming at as 7211 * prepare_kswapd_sleep() takes it into account. If another caller 7212 * entered the allocator slow path while kswapd was awake, order will 7213 * remain at the higher level. 7214 */ 7215 return sc.order; 7216 } 7217 7218 /* 7219 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to 7220 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is 7221 * not a valid index then either kswapd runs for first time or kswapd couldn't 7222 * sleep after previous reclaim attempt (node is still unbalanced). In that 7223 * case return the zone index of the previous kswapd reclaim cycle. 7224 */ 7225 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, 7226 enum zone_type prev_highest_zoneidx) 7227 { 7228 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); 7229 7230 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; 7231 } 7232 7233 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, 7234 unsigned int highest_zoneidx) 7235 { 7236 long remaining = 0; 7237 DEFINE_WAIT(wait); 7238 7239 if (freezing(current) || kthread_should_stop()) 7240 return; 7241 7242 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 7243 7244 /* 7245 * Try to sleep for a short interval. Note that kcompactd will only be 7246 * woken if it is possible to sleep for a short interval. This is 7247 * deliberate on the assumption that if reclaim cannot keep an 7248 * eligible zone balanced that it's also unlikely that compaction will 7249 * succeed. 7250 */ 7251 if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { 7252 /* 7253 * Compaction records what page blocks it recently failed to 7254 * isolate pages from and skips them in the future scanning. 7255 * When kswapd is going to sleep, it is reasonable to assume 7256 * that pages and compaction may succeed so reset the cache. 7257 */ 7258 reset_isolation_suitable(pgdat); 7259 7260 /* 7261 * We have freed the memory, now we should compact it to make 7262 * allocation of the requested order possible. 7263 */ 7264 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); 7265 7266 remaining = schedule_timeout(HZ/10); 7267 7268 /* 7269 * If woken prematurely then reset kswapd_highest_zoneidx and 7270 * order. The values will either be from a wakeup request or 7271 * the previous request that slept prematurely. 7272 */ 7273 if (remaining) { 7274 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, 7275 kswapd_highest_zoneidx(pgdat, 7276 highest_zoneidx)); 7277 7278 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) 7279 WRITE_ONCE(pgdat->kswapd_order, reclaim_order); 7280 } 7281 7282 finish_wait(&pgdat->kswapd_wait, &wait); 7283 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 7284 } 7285 7286 /* 7287 * After a short sleep, check if it was a premature sleep. If not, then 7288 * go fully to sleep until explicitly woken up. 7289 */ 7290 if (!remaining && 7291 prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { 7292 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 7293 7294 /* 7295 * vmstat counters are not perfectly accurate and the estimated 7296 * value for counters such as NR_FREE_PAGES can deviate from the 7297 * true value by nr_online_cpus * threshold. To avoid the zone 7298 * watermarks being breached while under pressure, we reduce the 7299 * per-cpu vmstat threshold while kswapd is awake and restore 7300 * them before going back to sleep. 7301 */ 7302 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 7303 7304 if (!kthread_should_stop()) 7305 schedule(); 7306 7307 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 7308 } else { 7309 if (remaining) 7310 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 7311 else 7312 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 7313 } 7314 finish_wait(&pgdat->kswapd_wait, &wait); 7315 } 7316 7317 /* 7318 * The background pageout daemon, started as a kernel thread 7319 * from the init process. 7320 * 7321 * This basically trickles out pages so that we have _some_ 7322 * free memory available even if there is no other activity 7323 * that frees anything up. This is needed for things like routing 7324 * etc, where we otherwise might have all activity going on in 7325 * asynchronous contexts that cannot page things out. 7326 * 7327 * If there are applications that are active memory-allocators 7328 * (most normal use), this basically shouldn't matter. 7329 */ 7330 static int kswapd(void *p) 7331 { 7332 unsigned int alloc_order, reclaim_order; 7333 unsigned int highest_zoneidx = MAX_NR_ZONES - 1; 7334 pg_data_t *pgdat = (pg_data_t *)p; 7335 struct task_struct *tsk = current; 7336 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 7337 7338 if (!cpumask_empty(cpumask)) 7339 set_cpus_allowed_ptr(tsk, cpumask); 7340 7341 /* 7342 * Tell the memory management that we're a "memory allocator", 7343 * and that if we need more memory we should get access to it 7344 * regardless (see "__alloc_pages()"). "kswapd" should 7345 * never get caught in the normal page freeing logic. 7346 * 7347 * (Kswapd normally doesn't need memory anyway, but sometimes 7348 * you need a small amount of memory in order to be able to 7349 * page out something else, and this flag essentially protects 7350 * us from recursively trying to free more memory as we're 7351 * trying to free the first piece of memory in the first place). 7352 */ 7353 tsk->flags |= PF_MEMALLOC | PF_KSWAPD; 7354 set_freezable(); 7355 7356 WRITE_ONCE(pgdat->kswapd_order, 0); 7357 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); 7358 atomic_set(&pgdat->nr_writeback_throttled, 0); 7359 for ( ; ; ) { 7360 bool ret; 7361 7362 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); 7363 highest_zoneidx = kswapd_highest_zoneidx(pgdat, 7364 highest_zoneidx); 7365 7366 kswapd_try_sleep: 7367 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, 7368 highest_zoneidx); 7369 7370 /* Read the new order and highest_zoneidx */ 7371 alloc_order = READ_ONCE(pgdat->kswapd_order); 7372 highest_zoneidx = kswapd_highest_zoneidx(pgdat, 7373 highest_zoneidx); 7374 WRITE_ONCE(pgdat->kswapd_order, 0); 7375 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); 7376 7377 ret = try_to_freeze(); 7378 if (kthread_should_stop()) 7379 break; 7380 7381 /* 7382 * We can speed up thawing tasks if we don't call balance_pgdat 7383 * after returning from the refrigerator 7384 */ 7385 if (ret) 7386 continue; 7387 7388 /* 7389 * Reclaim begins at the requested order but if a high-order 7390 * reclaim fails then kswapd falls back to reclaiming for 7391 * order-0. If that happens, kswapd will consider sleeping 7392 * for the order it finished reclaiming at (reclaim_order) 7393 * but kcompactd is woken to compact for the original 7394 * request (alloc_order). 7395 */ 7396 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, 7397 alloc_order); 7398 reclaim_order = balance_pgdat(pgdat, alloc_order, 7399 highest_zoneidx); 7400 if (reclaim_order < alloc_order) 7401 goto kswapd_try_sleep; 7402 } 7403 7404 tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); 7405 7406 return 0; 7407 } 7408 7409 /* 7410 * A zone is low on free memory or too fragmented for high-order memory. If 7411 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's 7412 * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim 7413 * has failed or is not needed, still wake up kcompactd if only compaction is 7414 * needed. 7415 */ 7416 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, 7417 enum zone_type highest_zoneidx) 7418 { 7419 pg_data_t *pgdat; 7420 enum zone_type curr_idx; 7421 7422 if (!managed_zone(zone)) 7423 return; 7424 7425 if (!cpuset_zone_allowed(zone, gfp_flags)) 7426 return; 7427 7428 pgdat = zone->zone_pgdat; 7429 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); 7430 7431 if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) 7432 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); 7433 7434 if (READ_ONCE(pgdat->kswapd_order) < order) 7435 WRITE_ONCE(pgdat->kswapd_order, order); 7436 7437 if (!waitqueue_active(&pgdat->kswapd_wait)) 7438 return; 7439 7440 /* Hopeless node, leave it to direct reclaim if possible */ 7441 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || 7442 (pgdat_balanced(pgdat, order, highest_zoneidx) && 7443 !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { 7444 /* 7445 * There may be plenty of free memory available, but it's too 7446 * fragmented for high-order allocations. Wake up kcompactd 7447 * and rely on compaction_suitable() to determine if it's 7448 * needed. If it fails, it will defer subsequent attempts to 7449 * ratelimit its work. 7450 */ 7451 if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) 7452 wakeup_kcompactd(pgdat, order, highest_zoneidx); 7453 return; 7454 } 7455 7456 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, 7457 gfp_flags); 7458 wake_up_interruptible(&pgdat->kswapd_wait); 7459 } 7460 7461 #ifdef CONFIG_HIBERNATION 7462 /* 7463 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 7464 * freed pages. 7465 * 7466 * Rather than trying to age LRUs the aim is to preserve the overall 7467 * LRU order by reclaiming preferentially 7468 * inactive > active > active referenced > active mapped 7469 */ 7470 unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 7471 { 7472 struct scan_control sc = { 7473 .nr_to_reclaim = nr_to_reclaim, 7474 .gfp_mask = GFP_HIGHUSER_MOVABLE, 7475 .reclaim_idx = MAX_NR_ZONES - 1, 7476 .priority = DEF_PRIORITY, 7477 .may_writepage = 1, 7478 .may_unmap = 1, 7479 .may_swap = 1, 7480 .hibernation_mode = 1, 7481 }; 7482 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 7483 unsigned long nr_reclaimed; 7484 unsigned int noreclaim_flag; 7485 7486 fs_reclaim_acquire(sc.gfp_mask); 7487 noreclaim_flag = memalloc_noreclaim_save(); 7488 set_task_reclaim_state(current, &sc.reclaim_state); 7489 7490 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 7491 7492 set_task_reclaim_state(current, NULL); 7493 memalloc_noreclaim_restore(noreclaim_flag); 7494 fs_reclaim_release(sc.gfp_mask); 7495 7496 return nr_reclaimed; 7497 } 7498 #endif /* CONFIG_HIBERNATION */ 7499 7500 /* 7501 * This kswapd start function will be called by init and node-hot-add. 7502 */ 7503 void kswapd_run(int nid) 7504 { 7505 pg_data_t *pgdat = NODE_DATA(nid); 7506 7507 pgdat_kswapd_lock(pgdat); 7508 if (!pgdat->kswapd) { 7509 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 7510 if (IS_ERR(pgdat->kswapd)) { 7511 /* failure at boot is fatal */ 7512 BUG_ON(system_state < SYSTEM_RUNNING); 7513 pr_err("Failed to start kswapd on node %d\n", nid); 7514 pgdat->kswapd = NULL; 7515 } 7516 } 7517 pgdat_kswapd_unlock(pgdat); 7518 } 7519 7520 /* 7521 * Called by memory hotplug when all memory in a node is offlined. Caller must 7522 * be holding mem_hotplug_begin/done(). 7523 */ 7524 void kswapd_stop(int nid) 7525 { 7526 pg_data_t *pgdat = NODE_DATA(nid); 7527 struct task_struct *kswapd; 7528 7529 pgdat_kswapd_lock(pgdat); 7530 kswapd = pgdat->kswapd; 7531 if (kswapd) { 7532 kthread_stop(kswapd); 7533 pgdat->kswapd = NULL; 7534 } 7535 pgdat_kswapd_unlock(pgdat); 7536 } 7537 7538 static int __init kswapd_init(void) 7539 { 7540 int nid; 7541 7542 swap_setup(); 7543 for_each_node_state(nid, N_MEMORY) 7544 kswapd_run(nid); 7545 return 0; 7546 } 7547 7548 module_init(kswapd_init) 7549 7550 #ifdef CONFIG_NUMA 7551 /* 7552 * Node reclaim mode 7553 * 7554 * If non-zero call node_reclaim when the number of free pages falls below 7555 * the watermarks. 7556 */ 7557 int node_reclaim_mode __read_mostly; 7558 7559 /* 7560 * Priority for NODE_RECLAIM. This determines the fraction of pages 7561 * of a node considered for each zone_reclaim. 4 scans 1/16th of 7562 * a zone. 7563 */ 7564 #define NODE_RECLAIM_PRIORITY 4 7565 7566 /* 7567 * Percentage of pages in a zone that must be unmapped for node_reclaim to 7568 * occur. 7569 */ 7570 int sysctl_min_unmapped_ratio = 1; 7571 7572 /* 7573 * If the number of slab pages in a zone grows beyond this percentage then 7574 * slab reclaim needs to occur. 7575 */ 7576 int sysctl_min_slab_ratio = 5; 7577 7578 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) 7579 { 7580 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); 7581 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + 7582 node_page_state(pgdat, NR_ACTIVE_FILE); 7583 7584 /* 7585 * It's possible for there to be more file mapped pages than 7586 * accounted for by the pages on the file LRU lists because 7587 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 7588 */ 7589 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 7590 } 7591 7592 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 7593 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) 7594 { 7595 unsigned long nr_pagecache_reclaimable; 7596 unsigned long delta = 0; 7597 7598 /* 7599 * If RECLAIM_UNMAP is set, then all file pages are considered 7600 * potentially reclaimable. Otherwise, we have to worry about 7601 * pages like swapcache and node_unmapped_file_pages() provides 7602 * a better estimate 7603 */ 7604 if (node_reclaim_mode & RECLAIM_UNMAP) 7605 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); 7606 else 7607 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); 7608 7609 /* If we can't clean pages, remove dirty pages from consideration */ 7610 if (!(node_reclaim_mode & RECLAIM_WRITE)) 7611 delta += node_page_state(pgdat, NR_FILE_DIRTY); 7612 7613 /* Watch for any possible underflows due to delta */ 7614 if (unlikely(delta > nr_pagecache_reclaimable)) 7615 delta = nr_pagecache_reclaimable; 7616 7617 return nr_pagecache_reclaimable - delta; 7618 } 7619 7620 /* 7621 * Try to free up some pages from this node through reclaim. 7622 */ 7623 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) 7624 { 7625 /* Minimum pages needed in order to stay on node */ 7626 const unsigned long nr_pages = 1 << order; 7627 struct task_struct *p = current; 7628 unsigned int noreclaim_flag; 7629 struct scan_control sc = { 7630 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 7631 .gfp_mask = current_gfp_context(gfp_mask), 7632 .order = order, 7633 .priority = NODE_RECLAIM_PRIORITY, 7634 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), 7635 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), 7636 .may_swap = 1, 7637 .reclaim_idx = gfp_zone(gfp_mask), 7638 }; 7639 unsigned long pflags; 7640 7641 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, 7642 sc.gfp_mask); 7643 7644 cond_resched(); 7645 psi_memstall_enter(&pflags); 7646 fs_reclaim_acquire(sc.gfp_mask); 7647 /* 7648 * We need to be able to allocate from the reserves for RECLAIM_UNMAP 7649 */ 7650 noreclaim_flag = memalloc_noreclaim_save(); 7651 set_task_reclaim_state(p, &sc.reclaim_state); 7652 7653 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || 7654 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { 7655 /* 7656 * Free memory by calling shrink node with increasing 7657 * priorities until we have enough memory freed. 7658 */ 7659 do { 7660 shrink_node(pgdat, &sc); 7661 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 7662 } 7663 7664 set_task_reclaim_state(p, NULL); 7665 memalloc_noreclaim_restore(noreclaim_flag); 7666 fs_reclaim_release(sc.gfp_mask); 7667 psi_memstall_leave(&pflags); 7668 7669 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); 7670 7671 return sc.nr_reclaimed >= nr_pages; 7672 } 7673 7674 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) 7675 { 7676 int ret; 7677 7678 /* 7679 * Node reclaim reclaims unmapped file backed pages and 7680 * slab pages if we are over the defined limits. 7681 * 7682 * A small portion of unmapped file backed pages is needed for 7683 * file I/O otherwise pages read by file I/O will be immediately 7684 * thrown out if the node is overallocated. So we do not reclaim 7685 * if less than a specified percentage of the node is used by 7686 * unmapped file backed pages. 7687 */ 7688 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && 7689 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= 7690 pgdat->min_slab_pages) 7691 return NODE_RECLAIM_FULL; 7692 7693 /* 7694 * Do not scan if the allocation should not be delayed. 7695 */ 7696 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) 7697 return NODE_RECLAIM_NOSCAN; 7698 7699 /* 7700 * Only run node reclaim on the local node or on nodes that do not 7701 * have associated processors. This will favor the local processor 7702 * over remote processors and spread off node memory allocations 7703 * as wide as possible. 7704 */ 7705 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) 7706 return NODE_RECLAIM_NOSCAN; 7707 7708 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) 7709 return NODE_RECLAIM_NOSCAN; 7710 7711 ret = __node_reclaim(pgdat, gfp_mask, order); 7712 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); 7713 7714 if (!ret) 7715 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 7716 7717 return ret; 7718 } 7719 #endif 7720 7721 void check_move_unevictable_pages(struct pagevec *pvec) 7722 { 7723 struct folio_batch fbatch; 7724 unsigned i; 7725 7726 folio_batch_init(&fbatch); 7727 for (i = 0; i < pvec->nr; i++) { 7728 struct page *page = pvec->pages[i]; 7729 7730 if (PageTransTail(page)) 7731 continue; 7732 folio_batch_add(&fbatch, page_folio(page)); 7733 } 7734 check_move_unevictable_folios(&fbatch); 7735 } 7736 EXPORT_SYMBOL_GPL(check_move_unevictable_pages); 7737 7738 /** 7739 * check_move_unevictable_folios - Move evictable folios to appropriate zone 7740 * lru list 7741 * @fbatch: Batch of lru folios to check. 7742 * 7743 * Checks folios for evictability, if an evictable folio is in the unevictable 7744 * lru list, moves it to the appropriate evictable lru list. This function 7745 * should be only used for lru folios. 7746 */ 7747 void check_move_unevictable_folios(struct folio_batch *fbatch) 7748 { 7749 struct lruvec *lruvec = NULL; 7750 int pgscanned = 0; 7751 int pgrescued = 0; 7752 int i; 7753 7754 for (i = 0; i < fbatch->nr; i++) { 7755 struct folio *folio = fbatch->folios[i]; 7756 int nr_pages = folio_nr_pages(folio); 7757 7758 pgscanned += nr_pages; 7759 7760 /* block memcg migration while the folio moves between lrus */ 7761 if (!folio_test_clear_lru(folio)) 7762 continue; 7763 7764 lruvec = folio_lruvec_relock_irq(folio, lruvec); 7765 if (folio_evictable(folio) && folio_test_unevictable(folio)) { 7766 lruvec_del_folio(lruvec, folio); 7767 folio_clear_unevictable(folio); 7768 lruvec_add_folio(lruvec, folio); 7769 pgrescued += nr_pages; 7770 } 7771 folio_set_lru(folio); 7772 } 7773 7774 if (lruvec) { 7775 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 7776 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 7777 unlock_page_lruvec_irq(lruvec); 7778 } else if (pgscanned) { 7779 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 7780 } 7781 } 7782 EXPORT_SYMBOL_GPL(check_move_unevictable_folios); 7783