xref: /linux/mm/vmscan.c (revision b05f8d7e077952d14acb63e3ccdf5f64404b59a4)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
4  *
5  *  Swap reorganised 29.12.95, Stephen Tweedie.
6  *  kswapd added: 7.1.96  sct
7  *  Removed kswapd_ctl limits, and swap out as many pages as needed
8  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
9  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
10  *  Multiqueue VM started 5.8.00, Rik van Riel.
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/mm.h>
16 #include <linux/sched/mm.h>
17 #include <linux/module.h>
18 #include <linux/gfp.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/swap.h>
21 #include <linux/pagemap.h>
22 #include <linux/init.h>
23 #include <linux/highmem.h>
24 #include <linux/vmpressure.h>
25 #include <linux/vmstat.h>
26 #include <linux/file.h>
27 #include <linux/writeback.h>
28 #include <linux/blkdev.h>
29 #include <linux/buffer_head.h>	/* for buffer_heads_over_limit */
30 #include <linux/mm_inline.h>
31 #include <linux/backing-dev.h>
32 #include <linux/rmap.h>
33 #include <linux/topology.h>
34 #include <linux/cpu.h>
35 #include <linux/cpuset.h>
36 #include <linux/compaction.h>
37 #include <linux/notifier.h>
38 #include <linux/delay.h>
39 #include <linux/kthread.h>
40 #include <linux/freezer.h>
41 #include <linux/memcontrol.h>
42 #include <linux/migrate.h>
43 #include <linux/delayacct.h>
44 #include <linux/sysctl.h>
45 #include <linux/memory-tiers.h>
46 #include <linux/oom.h>
47 #include <linux/pagevec.h>
48 #include <linux/prefetch.h>
49 #include <linux/printk.h>
50 #include <linux/dax.h>
51 #include <linux/psi.h>
52 #include <linux/pagewalk.h>
53 #include <linux/shmem_fs.h>
54 #include <linux/ctype.h>
55 #include <linux/debugfs.h>
56 #include <linux/khugepaged.h>
57 #include <linux/rculist_nulls.h>
58 #include <linux/random.h>
59 #include <linux/mmu_notifier.h>
60 
61 #include <asm/tlbflush.h>
62 #include <asm/div64.h>
63 
64 #include <linux/swapops.h>
65 #include <linux/balloon_compaction.h>
66 #include <linux/sched/sysctl.h>
67 
68 #include "internal.h"
69 #include "swap.h"
70 
71 #define CREATE_TRACE_POINTS
72 #include <trace/events/vmscan.h>
73 
74 struct scan_control {
75 	/* How many pages shrink_list() should reclaim */
76 	unsigned long nr_to_reclaim;
77 
78 	/*
79 	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
80 	 * are scanned.
81 	 */
82 	nodemask_t	*nodemask;
83 
84 	/*
85 	 * The memory cgroup that hit its limit and as a result is the
86 	 * primary target of this reclaim invocation.
87 	 */
88 	struct mem_cgroup *target_mem_cgroup;
89 
90 	/*
91 	 * Scan pressure balancing between anon and file LRUs
92 	 */
93 	unsigned long	anon_cost;
94 	unsigned long	file_cost;
95 
96 #ifdef CONFIG_MEMCG
97 	/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
98 	int *proactive_swappiness;
99 #endif
100 
101 	/* Can active folios be deactivated as part of reclaim? */
102 #define DEACTIVATE_ANON 1
103 #define DEACTIVATE_FILE 2
104 	unsigned int may_deactivate:2;
105 	unsigned int force_deactivate:1;
106 	unsigned int skipped_deactivate:1;
107 
108 	/* Writepage batching in laptop mode; RECLAIM_WRITE */
109 	unsigned int may_writepage:1;
110 
111 	/* Can mapped folios be reclaimed? */
112 	unsigned int may_unmap:1;
113 
114 	/* Can folios be swapped as part of reclaim? */
115 	unsigned int may_swap:1;
116 
117 	/* Not allow cache_trim_mode to be turned on as part of reclaim? */
118 	unsigned int no_cache_trim_mode:1;
119 
120 	/* Has cache_trim_mode failed at least once? */
121 	unsigned int cache_trim_mode_failed:1;
122 
123 	/* Proactive reclaim invoked by userspace through memory.reclaim */
124 	unsigned int proactive:1;
125 
126 	/*
127 	 * Cgroup memory below memory.low is protected as long as we
128 	 * don't threaten to OOM. If any cgroup is reclaimed at
129 	 * reduced force or passed over entirely due to its memory.low
130 	 * setting (memcg_low_skipped), and nothing is reclaimed as a
131 	 * result, then go back for one more cycle that reclaims the protected
132 	 * memory (memcg_low_reclaim) to avert OOM.
133 	 */
134 	unsigned int memcg_low_reclaim:1;
135 	unsigned int memcg_low_skipped:1;
136 
137 	/* Shared cgroup tree walk failed, rescan the whole tree */
138 	unsigned int memcg_full_walk:1;
139 
140 	unsigned int hibernation_mode:1;
141 
142 	/* One of the zones is ready for compaction */
143 	unsigned int compaction_ready:1;
144 
145 	/* There is easily reclaimable cold cache in the current node */
146 	unsigned int cache_trim_mode:1;
147 
148 	/* The file folios on the current node are dangerously low */
149 	unsigned int file_is_tiny:1;
150 
151 	/* Always discard instead of demoting to lower tier memory */
152 	unsigned int no_demotion:1;
153 
154 	/* Allocation order */
155 	s8 order;
156 
157 	/* Scan (total_size >> priority) pages at once */
158 	s8 priority;
159 
160 	/* The highest zone to isolate folios for reclaim from */
161 	s8 reclaim_idx;
162 
163 	/* This context's GFP mask */
164 	gfp_t gfp_mask;
165 
166 	/* Incremented by the number of inactive pages that were scanned */
167 	unsigned long nr_scanned;
168 
169 	/* Number of pages freed so far during a call to shrink_zones() */
170 	unsigned long nr_reclaimed;
171 
172 	struct {
173 		unsigned int dirty;
174 		unsigned int unqueued_dirty;
175 		unsigned int congested;
176 		unsigned int writeback;
177 		unsigned int immediate;
178 		unsigned int file_taken;
179 		unsigned int taken;
180 	} nr;
181 
182 	/* for recording the reclaimed slab by now */
183 	struct reclaim_state reclaim_state;
184 };
185 
186 #ifdef ARCH_HAS_PREFETCHW
187 #define prefetchw_prev_lru_folio(_folio, _base, _field)			\
188 	do {								\
189 		if ((_folio)->lru.prev != _base) {			\
190 			struct folio *prev;				\
191 									\
192 			prev = lru_to_folio(&(_folio->lru));		\
193 			prefetchw(&prev->_field);			\
194 		}							\
195 	} while (0)
196 #else
197 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
198 #endif
199 
200 /*
201  * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
202  */
203 int vm_swappiness = 60;
204 
205 #ifdef CONFIG_MEMCG
206 
207 /* Returns true for reclaim through cgroup limits or cgroup interfaces. */
208 static bool cgroup_reclaim(struct scan_control *sc)
209 {
210 	return sc->target_mem_cgroup;
211 }
212 
213 /*
214  * Returns true for reclaim on the root cgroup. This is true for direct
215  * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
216  */
217 static bool root_reclaim(struct scan_control *sc)
218 {
219 	return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
220 }
221 
222 /**
223  * writeback_throttling_sane - is the usual dirty throttling mechanism available?
224  * @sc: scan_control in question
225  *
226  * The normal page dirty throttling mechanism in balance_dirty_pages() is
227  * completely broken with the legacy memcg and direct stalling in
228  * shrink_folio_list() is used for throttling instead, which lacks all the
229  * niceties such as fairness, adaptive pausing, bandwidth proportional
230  * allocation and configurability.
231  *
232  * This function tests whether the vmscan currently in progress can assume
233  * that the normal dirty throttling mechanism is operational.
234  */
235 static bool writeback_throttling_sane(struct scan_control *sc)
236 {
237 	if (!cgroup_reclaim(sc))
238 		return true;
239 #ifdef CONFIG_CGROUP_WRITEBACK
240 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
241 		return true;
242 #endif
243 	return false;
244 }
245 
246 static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
247 {
248 	if (sc->proactive && sc->proactive_swappiness)
249 		return *sc->proactive_swappiness;
250 	return mem_cgroup_swappiness(memcg);
251 }
252 #else
253 static bool cgroup_reclaim(struct scan_control *sc)
254 {
255 	return false;
256 }
257 
258 static bool root_reclaim(struct scan_control *sc)
259 {
260 	return true;
261 }
262 
263 static bool writeback_throttling_sane(struct scan_control *sc)
264 {
265 	return true;
266 }
267 
268 static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
269 {
270 	return READ_ONCE(vm_swappiness);
271 }
272 #endif
273 
274 /* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
275  * and including the specified highidx
276  * @zone: The current zone in the iterator
277  * @pgdat: The pgdat which node_zones are being iterated
278  * @idx: The index variable
279  * @highidx: The index of the highest zone to return
280  *
281  * This macro iterates through all managed zones up to and including the specified highidx.
282  * The zone iterator enters an invalid state after macro call and must be reinitialized
283  * before it can be used again.
284  */
285 #define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
286 	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
287 	    (idx) <= (highidx);					\
288 	    (idx)++, (zone)++)					\
289 		if (!managed_zone(zone))			\
290 			continue;				\
291 		else
292 
293 static void set_task_reclaim_state(struct task_struct *task,
294 				   struct reclaim_state *rs)
295 {
296 	/* Check for an overwrite */
297 	WARN_ON_ONCE(rs && task->reclaim_state);
298 
299 	/* Check for the nulling of an already-nulled member */
300 	WARN_ON_ONCE(!rs && !task->reclaim_state);
301 
302 	task->reclaim_state = rs;
303 }
304 
305 /*
306  * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
307  * scan_control->nr_reclaimed.
308  */
309 static void flush_reclaim_state(struct scan_control *sc)
310 {
311 	/*
312 	 * Currently, reclaim_state->reclaimed includes three types of pages
313 	 * freed outside of vmscan:
314 	 * (1) Slab pages.
315 	 * (2) Clean file pages from pruned inodes (on highmem systems).
316 	 * (3) XFS freed buffer pages.
317 	 *
318 	 * For all of these cases, we cannot universally link the pages to a
319 	 * single memcg. For example, a memcg-aware shrinker can free one object
320 	 * charged to the target memcg, causing an entire page to be freed.
321 	 * If we count the entire page as reclaimed from the memcg, we end up
322 	 * overestimating the reclaimed amount (potentially under-reclaiming).
323 	 *
324 	 * Only count such pages for global reclaim to prevent under-reclaiming
325 	 * from the target memcg; preventing unnecessary retries during memcg
326 	 * charging and false positives from proactive reclaim.
327 	 *
328 	 * For uncommon cases where the freed pages were actually mostly
329 	 * charged to the target memcg, we end up underestimating the reclaimed
330 	 * amount. This should be fine. The freed pages will be uncharged
331 	 * anyway, even if they are not counted here properly, and we will be
332 	 * able to make forward progress in charging (which is usually in a
333 	 * retry loop).
334 	 *
335 	 * We can go one step further, and report the uncharged objcg pages in
336 	 * memcg reclaim, to make reporting more accurate and reduce
337 	 * underestimation, but it's probably not worth the complexity for now.
338 	 */
339 	if (current->reclaim_state && root_reclaim(sc)) {
340 		sc->nr_reclaimed += current->reclaim_state->reclaimed;
341 		current->reclaim_state->reclaimed = 0;
342 	}
343 }
344 
345 static bool can_demote(int nid, struct scan_control *sc)
346 {
347 	if (!numa_demotion_enabled)
348 		return false;
349 	if (sc && sc->no_demotion)
350 		return false;
351 	if (next_demotion_node(nid) == NUMA_NO_NODE)
352 		return false;
353 
354 	return true;
355 }
356 
357 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
358 					  int nid,
359 					  struct scan_control *sc)
360 {
361 	if (memcg == NULL) {
362 		/*
363 		 * For non-memcg reclaim, is there
364 		 * space in any swap device?
365 		 */
366 		if (get_nr_swap_pages() > 0)
367 			return true;
368 	} else {
369 		/* Is the memcg below its swap limit? */
370 		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
371 			return true;
372 	}
373 
374 	/*
375 	 * The page can not be swapped.
376 	 *
377 	 * Can it be reclaimed from this node via demotion?
378 	 */
379 	return can_demote(nid, sc);
380 }
381 
382 /*
383  * This misses isolated folios which are not accounted for to save counters.
384  * As the data only determines if reclaim or compaction continues, it is
385  * not expected that isolated folios will be a dominating factor.
386  */
387 unsigned long zone_reclaimable_pages(struct zone *zone)
388 {
389 	unsigned long nr;
390 
391 	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
392 		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
393 	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
394 		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
395 			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
396 	/*
397 	 * If there are no reclaimable file-backed or anonymous pages,
398 	 * ensure zones with sufficient free pages are not skipped.
399 	 * This prevents zones like DMA32 from being ignored in reclaim
400 	 * scenarios where they can still help alleviate memory pressure.
401 	 */
402 	if (nr == 0)
403 		nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
404 	return nr;
405 }
406 
407 /**
408  * lruvec_lru_size -  Returns the number of pages on the given LRU list.
409  * @lruvec: lru vector
410  * @lru: lru to use
411  * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
412  */
413 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
414 				     int zone_idx)
415 {
416 	unsigned long size = 0;
417 	int zid;
418 	struct zone *zone;
419 
420 	for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
421 		if (!mem_cgroup_disabled())
422 			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
423 		else
424 			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
425 	}
426 	return size;
427 }
428 
429 static unsigned long drop_slab_node(int nid)
430 {
431 	unsigned long freed = 0;
432 	struct mem_cgroup *memcg = NULL;
433 
434 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
435 	do {
436 		freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
437 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
438 
439 	return freed;
440 }
441 
442 void drop_slab(void)
443 {
444 	int nid;
445 	int shift = 0;
446 	unsigned long freed;
447 
448 	do {
449 		freed = 0;
450 		for_each_online_node(nid) {
451 			if (fatal_signal_pending(current))
452 				return;
453 
454 			freed += drop_slab_node(nid);
455 		}
456 	} while ((freed >> shift++) > 1);
457 }
458 
459 #define CHECK_RECLAIMER_OFFSET(type)					\
460 	do {								\
461 		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
462 			     PGDEMOTE_##type - PGDEMOTE_KSWAPD);	\
463 		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
464 			     PGSCAN_##type - PGSCAN_KSWAPD);		\
465 	} while (0)
466 
467 static int reclaimer_offset(struct scan_control *sc)
468 {
469 	CHECK_RECLAIMER_OFFSET(DIRECT);
470 	CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
471 	CHECK_RECLAIMER_OFFSET(PROACTIVE);
472 
473 	if (current_is_kswapd())
474 		return 0;
475 	if (current_is_khugepaged())
476 		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
477 	if (sc->proactive)
478 		return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
479 	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
480 }
481 
482 static inline int is_page_cache_freeable(struct folio *folio)
483 {
484 	/*
485 	 * A freeable page cache folio is referenced only by the caller
486 	 * that isolated the folio, the page cache and optional filesystem
487 	 * private data at folio->private.
488 	 */
489 	return folio_ref_count(folio) - folio_test_private(folio) ==
490 		1 + folio_nr_pages(folio);
491 }
492 
493 /*
494  * We detected a synchronous write error writing a folio out.  Probably
495  * -ENOSPC.  We need to propagate that into the address_space for a subsequent
496  * fsync(), msync() or close().
497  *
498  * The tricky part is that after writepage we cannot touch the mapping: nothing
499  * prevents it from being freed up.  But we have a ref on the folio and once
500  * that folio is locked, the mapping is pinned.
501  *
502  * We're allowed to run sleeping folio_lock() here because we know the caller has
503  * __GFP_FS.
504  */
505 static void handle_write_error(struct address_space *mapping,
506 				struct folio *folio, int error)
507 {
508 	folio_lock(folio);
509 	if (folio_mapping(folio) == mapping)
510 		mapping_set_error(mapping, error);
511 	folio_unlock(folio);
512 }
513 
514 static bool skip_throttle_noprogress(pg_data_t *pgdat)
515 {
516 	int reclaimable = 0, write_pending = 0;
517 	int i;
518 	struct zone *zone;
519 	/*
520 	 * If kswapd is disabled, reschedule if necessary but do not
521 	 * throttle as the system is likely near OOM.
522 	 */
523 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
524 		return true;
525 
526 	/*
527 	 * If there are a lot of dirty/writeback folios then do not
528 	 * throttle as throttling will occur when the folios cycle
529 	 * towards the end of the LRU if still under writeback.
530 	 */
531 	for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) {
532 		reclaimable += zone_reclaimable_pages(zone);
533 		write_pending += zone_page_state_snapshot(zone,
534 						  NR_ZONE_WRITE_PENDING);
535 	}
536 	if (2 * write_pending <= reclaimable)
537 		return true;
538 
539 	return false;
540 }
541 
542 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
543 {
544 	wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
545 	long timeout, ret;
546 	DEFINE_WAIT(wait);
547 
548 	/*
549 	 * Do not throttle user workers, kthreads other than kswapd or
550 	 * workqueues. They may be required for reclaim to make
551 	 * forward progress (e.g. journalling workqueues or kthreads).
552 	 */
553 	if (!current_is_kswapd() &&
554 	    current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
555 		cond_resched();
556 		return;
557 	}
558 
559 	/*
560 	 * These figures are pulled out of thin air.
561 	 * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
562 	 * parallel reclaimers which is a short-lived event so the timeout is
563 	 * short. Failing to make progress or waiting on writeback are
564 	 * potentially long-lived events so use a longer timeout. This is shaky
565 	 * logic as a failure to make progress could be due to anything from
566 	 * writeback to a slow device to excessive referenced folios at the tail
567 	 * of the inactive LRU.
568 	 */
569 	switch(reason) {
570 	case VMSCAN_THROTTLE_WRITEBACK:
571 		timeout = HZ/10;
572 
573 		if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
574 			WRITE_ONCE(pgdat->nr_reclaim_start,
575 				node_page_state(pgdat, NR_THROTTLED_WRITTEN));
576 		}
577 
578 		break;
579 	case VMSCAN_THROTTLE_CONGESTED:
580 		fallthrough;
581 	case VMSCAN_THROTTLE_NOPROGRESS:
582 		if (skip_throttle_noprogress(pgdat)) {
583 			cond_resched();
584 			return;
585 		}
586 
587 		timeout = 1;
588 
589 		break;
590 	case VMSCAN_THROTTLE_ISOLATED:
591 		timeout = HZ/50;
592 		break;
593 	default:
594 		WARN_ON_ONCE(1);
595 		timeout = HZ;
596 		break;
597 	}
598 
599 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
600 	ret = schedule_timeout(timeout);
601 	finish_wait(wqh, &wait);
602 
603 	if (reason == VMSCAN_THROTTLE_WRITEBACK)
604 		atomic_dec(&pgdat->nr_writeback_throttled);
605 
606 	trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
607 				jiffies_to_usecs(timeout - ret),
608 				reason);
609 }
610 
611 /*
612  * Account for folios written if tasks are throttled waiting on dirty
613  * folios to clean. If enough folios have been cleaned since throttling
614  * started then wakeup the throttled tasks.
615  */
616 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
617 							int nr_throttled)
618 {
619 	unsigned long nr_written;
620 
621 	node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
622 
623 	/*
624 	 * This is an inaccurate read as the per-cpu deltas may not
625 	 * be synchronised. However, given that the system is
626 	 * writeback throttled, it is not worth taking the penalty
627 	 * of getting an accurate count. At worst, the throttle
628 	 * timeout guarantees forward progress.
629 	 */
630 	nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
631 		READ_ONCE(pgdat->nr_reclaim_start);
632 
633 	if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
634 		wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
635 }
636 
637 /* possible outcome of pageout() */
638 typedef enum {
639 	/* failed to write folio out, folio is locked */
640 	PAGE_KEEP,
641 	/* move folio to the active list, folio is locked */
642 	PAGE_ACTIVATE,
643 	/* folio has been sent to the disk successfully, folio is unlocked */
644 	PAGE_SUCCESS,
645 	/* folio is clean and locked */
646 	PAGE_CLEAN,
647 } pageout_t;
648 
649 /*
650  * pageout is called by shrink_folio_list() for each dirty folio.
651  * Calls ->writepage().
652  */
653 static pageout_t pageout(struct folio *folio, struct address_space *mapping,
654 			 struct swap_iocb **plug, struct list_head *folio_list)
655 {
656 	/*
657 	 * If the folio is dirty, only perform writeback if that write
658 	 * will be non-blocking.  To prevent this allocation from being
659 	 * stalled by pagecache activity.  But note that there may be
660 	 * stalls if we need to run get_block().  We could test
661 	 * PagePrivate for that.
662 	 *
663 	 * If this process is currently in __generic_file_write_iter() against
664 	 * this folio's queue, we can perform writeback even if that
665 	 * will block.
666 	 *
667 	 * If the folio is swapcache, write it back even if that would
668 	 * block, for some throttling. This happens by accident, because
669 	 * swap_backing_dev_info is bust: it doesn't reflect the
670 	 * congestion state of the swapdevs.  Easy to fix, if needed.
671 	 */
672 	if (!is_page_cache_freeable(folio))
673 		return PAGE_KEEP;
674 	if (!mapping) {
675 		/*
676 		 * Some data journaling orphaned folios can have
677 		 * folio->mapping == NULL while being dirty with clean buffers.
678 		 */
679 		if (folio_test_private(folio)) {
680 			if (try_to_free_buffers(folio)) {
681 				folio_clear_dirty(folio);
682 				pr_info("%s: orphaned folio\n", __func__);
683 				return PAGE_CLEAN;
684 			}
685 		}
686 		return PAGE_KEEP;
687 	}
688 	if (mapping->a_ops->writepage == NULL)
689 		return PAGE_ACTIVATE;
690 
691 	if (folio_clear_dirty_for_io(folio)) {
692 		int res;
693 		struct writeback_control wbc = {
694 			.sync_mode = WB_SYNC_NONE,
695 			.nr_to_write = SWAP_CLUSTER_MAX,
696 			.range_start = 0,
697 			.range_end = LLONG_MAX,
698 			.for_reclaim = 1,
699 			.swap_plug = plug,
700 		};
701 
702 		/*
703 		 * The large shmem folio can be split if CONFIG_THP_SWAP is
704 		 * not enabled or contiguous swap entries are failed to
705 		 * allocate.
706 		 */
707 		if (shmem_mapping(mapping) && folio_test_large(folio))
708 			wbc.list = folio_list;
709 
710 		folio_set_reclaim(folio);
711 		res = mapping->a_ops->writepage(&folio->page, &wbc);
712 		if (res < 0)
713 			handle_write_error(mapping, folio, res);
714 		if (res == AOP_WRITEPAGE_ACTIVATE) {
715 			folio_clear_reclaim(folio);
716 			return PAGE_ACTIVATE;
717 		}
718 
719 		if (!folio_test_writeback(folio)) {
720 			/* synchronous write or broken a_ops? */
721 			folio_clear_reclaim(folio);
722 		}
723 		trace_mm_vmscan_write_folio(folio);
724 		node_stat_add_folio(folio, NR_VMSCAN_WRITE);
725 		return PAGE_SUCCESS;
726 	}
727 
728 	return PAGE_CLEAN;
729 }
730 
731 /*
732  * Same as remove_mapping, but if the folio is removed from the mapping, it
733  * gets returned with a refcount of 0.
734  */
735 static int __remove_mapping(struct address_space *mapping, struct folio *folio,
736 			    bool reclaimed, struct mem_cgroup *target_memcg)
737 {
738 	int refcount;
739 	void *shadow = NULL;
740 
741 	BUG_ON(!folio_test_locked(folio));
742 	BUG_ON(mapping != folio_mapping(folio));
743 
744 	if (!folio_test_swapcache(folio))
745 		spin_lock(&mapping->host->i_lock);
746 	xa_lock_irq(&mapping->i_pages);
747 	/*
748 	 * The non racy check for a busy folio.
749 	 *
750 	 * Must be careful with the order of the tests. When someone has
751 	 * a ref to the folio, it may be possible that they dirty it then
752 	 * drop the reference. So if the dirty flag is tested before the
753 	 * refcount here, then the following race may occur:
754 	 *
755 	 * get_user_pages(&page);
756 	 * [user mapping goes away]
757 	 * write_to(page);
758 	 *				!folio_test_dirty(folio)    [good]
759 	 * folio_set_dirty(folio);
760 	 * folio_put(folio);
761 	 *				!refcount(folio)   [good, discard it]
762 	 *
763 	 * [oops, our write_to data is lost]
764 	 *
765 	 * Reversing the order of the tests ensures such a situation cannot
766 	 * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
767 	 * load is not satisfied before that of folio->_refcount.
768 	 *
769 	 * Note that if the dirty flag is always set via folio_mark_dirty,
770 	 * and thus under the i_pages lock, then this ordering is not required.
771 	 */
772 	refcount = 1 + folio_nr_pages(folio);
773 	if (!folio_ref_freeze(folio, refcount))
774 		goto cannot_free;
775 	/* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
776 	if (unlikely(folio_test_dirty(folio))) {
777 		folio_ref_unfreeze(folio, refcount);
778 		goto cannot_free;
779 	}
780 
781 	if (folio_test_swapcache(folio)) {
782 		swp_entry_t swap = folio->swap;
783 
784 		if (reclaimed && !mapping_exiting(mapping))
785 			shadow = workingset_eviction(folio, target_memcg);
786 		__delete_from_swap_cache(folio, swap, shadow);
787 		memcg1_swapout(folio, swap);
788 		xa_unlock_irq(&mapping->i_pages);
789 		put_swap_folio(folio, swap);
790 	} else {
791 		void (*free_folio)(struct folio *);
792 
793 		free_folio = mapping->a_ops->free_folio;
794 		/*
795 		 * Remember a shadow entry for reclaimed file cache in
796 		 * order to detect refaults, thus thrashing, later on.
797 		 *
798 		 * But don't store shadows in an address space that is
799 		 * already exiting.  This is not just an optimization,
800 		 * inode reclaim needs to empty out the radix tree or
801 		 * the nodes are lost.  Don't plant shadows behind its
802 		 * back.
803 		 *
804 		 * We also don't store shadows for DAX mappings because the
805 		 * only page cache folios found in these are zero pages
806 		 * covering holes, and because we don't want to mix DAX
807 		 * exceptional entries and shadow exceptional entries in the
808 		 * same address_space.
809 		 */
810 		if (reclaimed && folio_is_file_lru(folio) &&
811 		    !mapping_exiting(mapping) && !dax_mapping(mapping))
812 			shadow = workingset_eviction(folio, target_memcg);
813 		__filemap_remove_folio(folio, shadow);
814 		xa_unlock_irq(&mapping->i_pages);
815 		if (mapping_shrinkable(mapping))
816 			inode_add_lru(mapping->host);
817 		spin_unlock(&mapping->host->i_lock);
818 
819 		if (free_folio)
820 			free_folio(folio);
821 	}
822 
823 	return 1;
824 
825 cannot_free:
826 	xa_unlock_irq(&mapping->i_pages);
827 	if (!folio_test_swapcache(folio))
828 		spin_unlock(&mapping->host->i_lock);
829 	return 0;
830 }
831 
832 /**
833  * remove_mapping() - Attempt to remove a folio from its mapping.
834  * @mapping: The address space.
835  * @folio: The folio to remove.
836  *
837  * If the folio is dirty, under writeback or if someone else has a ref
838  * on it, removal will fail.
839  * Return: The number of pages removed from the mapping.  0 if the folio
840  * could not be removed.
841  * Context: The caller should have a single refcount on the folio and
842  * hold its lock.
843  */
844 long remove_mapping(struct address_space *mapping, struct folio *folio)
845 {
846 	if (__remove_mapping(mapping, folio, false, NULL)) {
847 		/*
848 		 * Unfreezing the refcount with 1 effectively
849 		 * drops the pagecache ref for us without requiring another
850 		 * atomic operation.
851 		 */
852 		folio_ref_unfreeze(folio, 1);
853 		return folio_nr_pages(folio);
854 	}
855 	return 0;
856 }
857 
858 /**
859  * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
860  * @folio: Folio to be returned to an LRU list.
861  *
862  * Add previously isolated @folio to appropriate LRU list.
863  * The folio may still be unevictable for other reasons.
864  *
865  * Context: lru_lock must not be held, interrupts must be enabled.
866  */
867 void folio_putback_lru(struct folio *folio)
868 {
869 	folio_add_lru(folio);
870 	folio_put(folio);		/* drop ref from isolate */
871 }
872 
873 enum folio_references {
874 	FOLIOREF_RECLAIM,
875 	FOLIOREF_RECLAIM_CLEAN,
876 	FOLIOREF_KEEP,
877 	FOLIOREF_ACTIVATE,
878 };
879 
880 #ifdef CONFIG_LRU_GEN
881 /*
882  * Only used on a mapped folio in the eviction (rmap walk) path, where promotion
883  * needs to be done by taking the folio off the LRU list and then adding it back
884  * with PG_active set. In contrast, the aging (page table walk) path uses
885  * folio_update_gen().
886  */
887 static bool lru_gen_set_refs(struct folio *folio)
888 {
889 	/* see the comment on LRU_REFS_FLAGS */
890 	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
891 		set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
892 		return false;
893 	}
894 
895 	set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset));
896 	return true;
897 }
898 #else
899 static bool lru_gen_set_refs(struct folio *folio)
900 {
901 	return false;
902 }
903 #endif /* CONFIG_LRU_GEN */
904 
905 static enum folio_references folio_check_references(struct folio *folio,
906 						  struct scan_control *sc)
907 {
908 	int referenced_ptes, referenced_folio;
909 	unsigned long vm_flags;
910 
911 	referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
912 					   &vm_flags);
913 
914 	/*
915 	 * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
916 	 * Let the folio, now marked Mlocked, be moved to the unevictable list.
917 	 */
918 	if (vm_flags & VM_LOCKED)
919 		return FOLIOREF_ACTIVATE;
920 
921 	/*
922 	 * There are two cases to consider.
923 	 * 1) Rmap lock contention: rotate.
924 	 * 2) Skip the non-shared swapbacked folio mapped solely by
925 	 *    the exiting or OOM-reaped process.
926 	 */
927 	if (referenced_ptes == -1)
928 		return FOLIOREF_KEEP;
929 
930 	if (lru_gen_enabled()) {
931 		if (!referenced_ptes)
932 			return FOLIOREF_RECLAIM;
933 
934 		return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
935 	}
936 
937 	referenced_folio = folio_test_clear_referenced(folio);
938 
939 	if (referenced_ptes) {
940 		/*
941 		 * All mapped folios start out with page table
942 		 * references from the instantiating fault, so we need
943 		 * to look twice if a mapped file/anon folio is used more
944 		 * than once.
945 		 *
946 		 * Mark it and spare it for another trip around the
947 		 * inactive list.  Another page table reference will
948 		 * lead to its activation.
949 		 *
950 		 * Note: the mark is set for activated folios as well
951 		 * so that recently deactivated but used folios are
952 		 * quickly recovered.
953 		 */
954 		folio_set_referenced(folio);
955 
956 		if (referenced_folio || referenced_ptes > 1)
957 			return FOLIOREF_ACTIVATE;
958 
959 		/*
960 		 * Activate file-backed executable folios after first usage.
961 		 */
962 		if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
963 			return FOLIOREF_ACTIVATE;
964 
965 		return FOLIOREF_KEEP;
966 	}
967 
968 	/* Reclaim if clean, defer dirty folios to writeback */
969 	if (referenced_folio && folio_is_file_lru(folio))
970 		return FOLIOREF_RECLAIM_CLEAN;
971 
972 	return FOLIOREF_RECLAIM;
973 }
974 
975 /* Check if a folio is dirty or under writeback */
976 static void folio_check_dirty_writeback(struct folio *folio,
977 				       bool *dirty, bool *writeback)
978 {
979 	struct address_space *mapping;
980 
981 	/*
982 	 * Anonymous folios are not handled by flushers and must be written
983 	 * from reclaim context. Do not stall reclaim based on them.
984 	 * MADV_FREE anonymous folios are put into inactive file list too.
985 	 * They could be mistakenly treated as file lru. So further anon
986 	 * test is needed.
987 	 */
988 	if (!folio_is_file_lru(folio) ||
989 	    (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
990 		*dirty = false;
991 		*writeback = false;
992 		return;
993 	}
994 
995 	/* By default assume that the folio flags are accurate */
996 	*dirty = folio_test_dirty(folio);
997 	*writeback = folio_test_writeback(folio);
998 
999 	/* Verify dirty/writeback state if the filesystem supports it */
1000 	if (!folio_test_private(folio))
1001 		return;
1002 
1003 	mapping = folio_mapping(folio);
1004 	if (mapping && mapping->a_ops->is_dirty_writeback)
1005 		mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
1006 }
1007 
1008 struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
1009 {
1010 	struct folio *dst;
1011 	nodemask_t *allowed_mask;
1012 	struct migration_target_control *mtc;
1013 
1014 	mtc = (struct migration_target_control *)private;
1015 
1016 	allowed_mask = mtc->nmask;
1017 	/*
1018 	 * make sure we allocate from the target node first also trying to
1019 	 * demote or reclaim pages from the target node via kswapd if we are
1020 	 * low on free memory on target node. If we don't do this and if
1021 	 * we have free memory on the slower(lower) memtier, we would start
1022 	 * allocating pages from slower(lower) memory tiers without even forcing
1023 	 * a demotion of cold pages from the target memtier. This can result
1024 	 * in the kernel placing hot pages in slower(lower) memory tiers.
1025 	 */
1026 	mtc->nmask = NULL;
1027 	mtc->gfp_mask |= __GFP_THISNODE;
1028 	dst = alloc_migration_target(src, (unsigned long)mtc);
1029 	if (dst)
1030 		return dst;
1031 
1032 	mtc->gfp_mask &= ~__GFP_THISNODE;
1033 	mtc->nmask = allowed_mask;
1034 
1035 	return alloc_migration_target(src, (unsigned long)mtc);
1036 }
1037 
1038 /*
1039  * Take folios on @demote_folios and attempt to demote them to another node.
1040  * Folios which are not demoted are left on @demote_folios.
1041  */
1042 static unsigned int demote_folio_list(struct list_head *demote_folios,
1043 				     struct pglist_data *pgdat)
1044 {
1045 	int target_nid = next_demotion_node(pgdat->node_id);
1046 	unsigned int nr_succeeded;
1047 	nodemask_t allowed_mask;
1048 
1049 	struct migration_target_control mtc = {
1050 		/*
1051 		 * Allocate from 'node', or fail quickly and quietly.
1052 		 * When this happens, 'page' will likely just be discarded
1053 		 * instead of migrated.
1054 		 */
1055 		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
1056 			__GFP_NOMEMALLOC | GFP_NOWAIT,
1057 		.nid = target_nid,
1058 		.nmask = &allowed_mask,
1059 		.reason = MR_DEMOTION,
1060 	};
1061 
1062 	if (list_empty(demote_folios))
1063 		return 0;
1064 
1065 	if (target_nid == NUMA_NO_NODE)
1066 		return 0;
1067 
1068 	node_get_allowed_targets(pgdat, &allowed_mask);
1069 
1070 	/* Demotion ignores all cpuset and mempolicy settings */
1071 	migrate_pages(demote_folios, alloc_migrate_folio, NULL,
1072 		      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
1073 		      &nr_succeeded);
1074 
1075 	return nr_succeeded;
1076 }
1077 
1078 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1079 {
1080 	if (gfp_mask & __GFP_FS)
1081 		return true;
1082 	if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
1083 		return false;
1084 	/*
1085 	 * We can "enter_fs" for swap-cache with only __GFP_IO
1086 	 * providing this isn't SWP_FS_OPS.
1087 	 * ->flags can be updated non-atomicially (scan_swap_map_slots),
1088 	 * but that will never affect SWP_FS_OPS, so the data_race
1089 	 * is safe.
1090 	 */
1091 	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1092 }
1093 
1094 /*
1095  * shrink_folio_list() returns the number of reclaimed pages
1096  */
1097 static unsigned int shrink_folio_list(struct list_head *folio_list,
1098 		struct pglist_data *pgdat, struct scan_control *sc,
1099 		struct reclaim_stat *stat, bool ignore_references)
1100 {
1101 	struct folio_batch free_folios;
1102 	LIST_HEAD(ret_folios);
1103 	LIST_HEAD(demote_folios);
1104 	unsigned int nr_reclaimed = 0, nr_demoted = 0;
1105 	unsigned int pgactivate = 0;
1106 	bool do_demote_pass;
1107 	struct swap_iocb *plug = NULL;
1108 
1109 	folio_batch_init(&free_folios);
1110 	memset(stat, 0, sizeof(*stat));
1111 	cond_resched();
1112 	do_demote_pass = can_demote(pgdat->node_id, sc);
1113 
1114 retry:
1115 	while (!list_empty(folio_list)) {
1116 		struct address_space *mapping;
1117 		struct folio *folio;
1118 		enum folio_references references = FOLIOREF_RECLAIM;
1119 		bool dirty, writeback;
1120 		unsigned int nr_pages;
1121 
1122 		cond_resched();
1123 
1124 		folio = lru_to_folio(folio_list);
1125 		list_del(&folio->lru);
1126 
1127 		if (!folio_trylock(folio))
1128 			goto keep;
1129 
1130 		if (folio_contain_hwpoisoned_page(folio)) {
1131 			unmap_poisoned_folio(folio, folio_pfn(folio), false);
1132 			folio_unlock(folio);
1133 			folio_put(folio);
1134 			continue;
1135 		}
1136 
1137 		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1138 
1139 		nr_pages = folio_nr_pages(folio);
1140 
1141 		/* Account the number of base pages */
1142 		sc->nr_scanned += nr_pages;
1143 
1144 		if (unlikely(!folio_evictable(folio)))
1145 			goto activate_locked;
1146 
1147 		if (!sc->may_unmap && folio_mapped(folio))
1148 			goto keep_locked;
1149 
1150 		/*
1151 		 * The number of dirty pages determines if a node is marked
1152 		 * reclaim_congested. kswapd will stall and start writing
1153 		 * folios if the tail of the LRU is all dirty unqueued folios.
1154 		 */
1155 		folio_check_dirty_writeback(folio, &dirty, &writeback);
1156 		if (dirty || writeback)
1157 			stat->nr_dirty += nr_pages;
1158 
1159 		if (dirty && !writeback)
1160 			stat->nr_unqueued_dirty += nr_pages;
1161 
1162 		/*
1163 		 * Treat this folio as congested if folios are cycling
1164 		 * through the LRU so quickly that the folios marked
1165 		 * for immediate reclaim are making it to the end of
1166 		 * the LRU a second time.
1167 		 */
1168 		if (writeback && folio_test_reclaim(folio))
1169 			stat->nr_congested += nr_pages;
1170 
1171 		/*
1172 		 * If a folio at the tail of the LRU is under writeback, there
1173 		 * are three cases to consider.
1174 		 *
1175 		 * 1) If reclaim is encountering an excessive number
1176 		 *    of folios under writeback and this folio has both
1177 		 *    the writeback and reclaim flags set, then it
1178 		 *    indicates that folios are being queued for I/O but
1179 		 *    are being recycled through the LRU before the I/O
1180 		 *    can complete. Waiting on the folio itself risks an
1181 		 *    indefinite stall if it is impossible to writeback
1182 		 *    the folio due to I/O error or disconnected storage
1183 		 *    so instead note that the LRU is being scanned too
1184 		 *    quickly and the caller can stall after the folio
1185 		 *    list has been processed.
1186 		 *
1187 		 * 2) Global or new memcg reclaim encounters a folio that is
1188 		 *    not marked for immediate reclaim, or the caller does not
1189 		 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
1190 		 *    not to fs). In this case mark the folio for immediate
1191 		 *    reclaim and continue scanning.
1192 		 *
1193 		 *    Require may_enter_fs() because we would wait on fs, which
1194 		 *    may not have submitted I/O yet. And the loop driver might
1195 		 *    enter reclaim, and deadlock if it waits on a folio for
1196 		 *    which it is needed to do the write (loop masks off
1197 		 *    __GFP_IO|__GFP_FS for this reason); but more thought
1198 		 *    would probably show more reasons.
1199 		 *
1200 		 * 3) Legacy memcg encounters a folio that already has the
1201 		 *    reclaim flag set. memcg does not have any dirty folio
1202 		 *    throttling so we could easily OOM just because too many
1203 		 *    folios are in writeback and there is nothing else to
1204 		 *    reclaim. Wait for the writeback to complete.
1205 		 *
1206 		 * In cases 1) and 2) we activate the folios to get them out of
1207 		 * the way while we continue scanning for clean folios on the
1208 		 * inactive list and refilling from the active list. The
1209 		 * observation here is that waiting for disk writes is more
1210 		 * expensive than potentially causing reloads down the line.
1211 		 * Since they're marked for immediate reclaim, they won't put
1212 		 * memory pressure on the cache working set any longer than it
1213 		 * takes to write them to disk.
1214 		 */
1215 		if (folio_test_writeback(folio)) {
1216 			/* Case 1 above */
1217 			if (current_is_kswapd() &&
1218 			    folio_test_reclaim(folio) &&
1219 			    test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1220 				stat->nr_immediate += nr_pages;
1221 				goto activate_locked;
1222 
1223 			/* Case 2 above */
1224 			} else if (writeback_throttling_sane(sc) ||
1225 			    !folio_test_reclaim(folio) ||
1226 			    !may_enter_fs(folio, sc->gfp_mask)) {
1227 				/*
1228 				 * This is slightly racy -
1229 				 * folio_end_writeback() might have
1230 				 * just cleared the reclaim flag, then
1231 				 * setting the reclaim flag here ends up
1232 				 * interpreted as the readahead flag - but
1233 				 * that does not matter enough to care.
1234 				 * What we do want is for this folio to
1235 				 * have the reclaim flag set next time
1236 				 * memcg reclaim reaches the tests above,
1237 				 * so it will then wait for writeback to
1238 				 * avoid OOM; and it's also appropriate
1239 				 * in global reclaim.
1240 				 */
1241 				folio_set_reclaim(folio);
1242 				stat->nr_writeback += nr_pages;
1243 				goto activate_locked;
1244 
1245 			/* Case 3 above */
1246 			} else {
1247 				folio_unlock(folio);
1248 				folio_wait_writeback(folio);
1249 				/* then go back and try same folio again */
1250 				list_add_tail(&folio->lru, folio_list);
1251 				continue;
1252 			}
1253 		}
1254 
1255 		if (!ignore_references)
1256 			references = folio_check_references(folio, sc);
1257 
1258 		switch (references) {
1259 		case FOLIOREF_ACTIVATE:
1260 			goto activate_locked;
1261 		case FOLIOREF_KEEP:
1262 			stat->nr_ref_keep += nr_pages;
1263 			goto keep_locked;
1264 		case FOLIOREF_RECLAIM:
1265 		case FOLIOREF_RECLAIM_CLEAN:
1266 			; /* try to reclaim the folio below */
1267 		}
1268 
1269 		/*
1270 		 * Before reclaiming the folio, try to relocate
1271 		 * its contents to another node.
1272 		 */
1273 		if (do_demote_pass &&
1274 		    (thp_migration_supported() || !folio_test_large(folio))) {
1275 			list_add(&folio->lru, &demote_folios);
1276 			folio_unlock(folio);
1277 			continue;
1278 		}
1279 
1280 		/*
1281 		 * Anonymous process memory has backing store?
1282 		 * Try to allocate it some swap space here.
1283 		 * Lazyfree folio could be freed directly
1284 		 */
1285 		if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1286 			if (!folio_test_swapcache(folio)) {
1287 				if (!(sc->gfp_mask & __GFP_IO))
1288 					goto keep_locked;
1289 				if (folio_maybe_dma_pinned(folio))
1290 					goto keep_locked;
1291 				if (folio_test_large(folio)) {
1292 					/* cannot split folio, skip it */
1293 					if (!can_split_folio(folio, 1, NULL))
1294 						goto activate_locked;
1295 					/*
1296 					 * Split partially mapped folios right away.
1297 					 * We can free the unmapped pages without IO.
1298 					 */
1299 					if (data_race(!list_empty(&folio->_deferred_list) &&
1300 					    folio_test_partially_mapped(folio)) &&
1301 					    split_folio_to_list(folio, folio_list))
1302 						goto activate_locked;
1303 				}
1304 				if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
1305 					int __maybe_unused order = folio_order(folio);
1306 
1307 					if (!folio_test_large(folio))
1308 						goto activate_locked_split;
1309 					/* Fallback to swap normal pages */
1310 					if (split_folio_to_list(folio, folio_list))
1311 						goto activate_locked;
1312 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1313 					if (nr_pages >= HPAGE_PMD_NR) {
1314 						count_memcg_folio_events(folio,
1315 							THP_SWPOUT_FALLBACK, 1);
1316 						count_vm_event(THP_SWPOUT_FALLBACK);
1317 					}
1318 #endif
1319 					count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
1320 					if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
1321 						goto activate_locked_split;
1322 				}
1323 				/*
1324 				 * Normally the folio will be dirtied in unmap because its
1325 				 * pte should be dirty. A special case is MADV_FREE page. The
1326 				 * page's pte could have dirty bit cleared but the folio's
1327 				 * SwapBacked flag is still set because clearing the dirty bit
1328 				 * and SwapBacked flag has no lock protected. For such folio,
1329 				 * unmap will not set dirty bit for it, so folio reclaim will
1330 				 * not write the folio out. This can cause data corruption when
1331 				 * the folio is swapped in later. Always setting the dirty flag
1332 				 * for the folio solves the problem.
1333 				 */
1334 				folio_mark_dirty(folio);
1335 			}
1336 		}
1337 
1338 		/*
1339 		 * If the folio was split above, the tail pages will make
1340 		 * their own pass through this function and be accounted
1341 		 * then.
1342 		 */
1343 		if ((nr_pages > 1) && !folio_test_large(folio)) {
1344 			sc->nr_scanned -= (nr_pages - 1);
1345 			nr_pages = 1;
1346 		}
1347 
1348 		/*
1349 		 * The folio is mapped into the page tables of one or more
1350 		 * processes. Try to unmap it here.
1351 		 */
1352 		if (folio_mapped(folio)) {
1353 			enum ttu_flags flags = TTU_BATCH_FLUSH;
1354 			bool was_swapbacked = folio_test_swapbacked(folio);
1355 
1356 			if (folio_test_pmd_mappable(folio))
1357 				flags |= TTU_SPLIT_HUGE_PMD;
1358 			/*
1359 			 * Without TTU_SYNC, try_to_unmap will only begin to
1360 			 * hold PTL from the first present PTE within a large
1361 			 * folio. Some initial PTEs might be skipped due to
1362 			 * races with parallel PTE writes in which PTEs can be
1363 			 * cleared temporarily before being written new present
1364 			 * values. This will lead to a large folio is still
1365 			 * mapped while some subpages have been partially
1366 			 * unmapped after try_to_unmap; TTU_SYNC helps
1367 			 * try_to_unmap acquire PTL from the first PTE,
1368 			 * eliminating the influence of temporary PTE values.
1369 			 */
1370 			if (folio_test_large(folio))
1371 				flags |= TTU_SYNC;
1372 
1373 			try_to_unmap(folio, flags);
1374 			if (folio_mapped(folio)) {
1375 				stat->nr_unmap_fail += nr_pages;
1376 				if (!was_swapbacked &&
1377 				    folio_test_swapbacked(folio))
1378 					stat->nr_lazyfree_fail += nr_pages;
1379 				goto activate_locked;
1380 			}
1381 		}
1382 
1383 		/*
1384 		 * Folio is unmapped now so it cannot be newly pinned anymore.
1385 		 * No point in trying to reclaim folio if it is pinned.
1386 		 * Furthermore we don't want to reclaim underlying fs metadata
1387 		 * if the folio is pinned and thus potentially modified by the
1388 		 * pinning process as that may upset the filesystem.
1389 		 */
1390 		if (folio_maybe_dma_pinned(folio))
1391 			goto activate_locked;
1392 
1393 		mapping = folio_mapping(folio);
1394 		if (folio_test_dirty(folio)) {
1395 			/*
1396 			 * Only kswapd can writeback filesystem folios
1397 			 * to avoid risk of stack overflow. But avoid
1398 			 * injecting inefficient single-folio I/O into
1399 			 * flusher writeback as much as possible: only
1400 			 * write folios when we've encountered many
1401 			 * dirty folios, and when we've already scanned
1402 			 * the rest of the LRU for clean folios and see
1403 			 * the same dirty folios again (with the reclaim
1404 			 * flag set).
1405 			 */
1406 			if (folio_is_file_lru(folio) &&
1407 			    (!current_is_kswapd() ||
1408 			     !folio_test_reclaim(folio) ||
1409 			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1410 				/*
1411 				 * Immediately reclaim when written back.
1412 				 * Similar in principle to folio_deactivate()
1413 				 * except we already have the folio isolated
1414 				 * and know it's dirty
1415 				 */
1416 				node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
1417 						nr_pages);
1418 				folio_set_reclaim(folio);
1419 
1420 				goto activate_locked;
1421 			}
1422 
1423 			if (references == FOLIOREF_RECLAIM_CLEAN)
1424 				goto keep_locked;
1425 			if (!may_enter_fs(folio, sc->gfp_mask))
1426 				goto keep_locked;
1427 			if (!sc->may_writepage)
1428 				goto keep_locked;
1429 
1430 			/*
1431 			 * Folio is dirty. Flush the TLB if a writable entry
1432 			 * potentially exists to avoid CPU writes after I/O
1433 			 * starts and then write it out here.
1434 			 */
1435 			try_to_unmap_flush_dirty();
1436 			switch (pageout(folio, mapping, &plug, folio_list)) {
1437 			case PAGE_KEEP:
1438 				goto keep_locked;
1439 			case PAGE_ACTIVATE:
1440 				/*
1441 				 * If shmem folio is split when writeback to swap,
1442 				 * the tail pages will make their own pass through
1443 				 * this function and be accounted then.
1444 				 */
1445 				if (nr_pages > 1 && !folio_test_large(folio)) {
1446 					sc->nr_scanned -= (nr_pages - 1);
1447 					nr_pages = 1;
1448 				}
1449 				goto activate_locked;
1450 			case PAGE_SUCCESS:
1451 				if (nr_pages > 1 && !folio_test_large(folio)) {
1452 					sc->nr_scanned -= (nr_pages - 1);
1453 					nr_pages = 1;
1454 				}
1455 				stat->nr_pageout += nr_pages;
1456 
1457 				if (folio_test_writeback(folio))
1458 					goto keep;
1459 				if (folio_test_dirty(folio))
1460 					goto keep;
1461 
1462 				/*
1463 				 * A synchronous write - probably a ramdisk.  Go
1464 				 * ahead and try to reclaim the folio.
1465 				 */
1466 				if (!folio_trylock(folio))
1467 					goto keep;
1468 				if (folio_test_dirty(folio) ||
1469 				    folio_test_writeback(folio))
1470 					goto keep_locked;
1471 				mapping = folio_mapping(folio);
1472 				fallthrough;
1473 			case PAGE_CLEAN:
1474 				; /* try to free the folio below */
1475 			}
1476 		}
1477 
1478 		/*
1479 		 * If the folio has buffers, try to free the buffer
1480 		 * mappings associated with this folio. If we succeed
1481 		 * we try to free the folio as well.
1482 		 *
1483 		 * We do this even if the folio is dirty.
1484 		 * filemap_release_folio() does not perform I/O, but it
1485 		 * is possible for a folio to have the dirty flag set,
1486 		 * but it is actually clean (all its buffers are clean).
1487 		 * This happens if the buffers were written out directly,
1488 		 * with submit_bh(). ext3 will do this, as well as
1489 		 * the blockdev mapping.  filemap_release_folio() will
1490 		 * discover that cleanness and will drop the buffers
1491 		 * and mark the folio clean - it can be freed.
1492 		 *
1493 		 * Rarely, folios can have buffers and no ->mapping.
1494 		 * These are the folios which were not successfully
1495 		 * invalidated in truncate_cleanup_folio().  We try to
1496 		 * drop those buffers here and if that worked, and the
1497 		 * folio is no longer mapped into process address space
1498 		 * (refcount == 1) it can be freed.  Otherwise, leave
1499 		 * the folio on the LRU so it is swappable.
1500 		 */
1501 		if (folio_needs_release(folio)) {
1502 			if (!filemap_release_folio(folio, sc->gfp_mask))
1503 				goto activate_locked;
1504 			if (!mapping && folio_ref_count(folio) == 1) {
1505 				folio_unlock(folio);
1506 				if (folio_put_testzero(folio))
1507 					goto free_it;
1508 				else {
1509 					/*
1510 					 * rare race with speculative reference.
1511 					 * the speculative reference will free
1512 					 * this folio shortly, so we may
1513 					 * increment nr_reclaimed here (and
1514 					 * leave it off the LRU).
1515 					 */
1516 					nr_reclaimed += nr_pages;
1517 					continue;
1518 				}
1519 			}
1520 		}
1521 
1522 		if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1523 			/* follow __remove_mapping for reference */
1524 			if (!folio_ref_freeze(folio, 1))
1525 				goto keep_locked;
1526 			/*
1527 			 * The folio has only one reference left, which is
1528 			 * from the isolation. After the caller puts the
1529 			 * folio back on the lru and drops the reference, the
1530 			 * folio will be freed anyway. It doesn't matter
1531 			 * which lru it goes on. So we don't bother checking
1532 			 * the dirty flag here.
1533 			 */
1534 			count_vm_events(PGLAZYFREED, nr_pages);
1535 			count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
1536 		} else if (!mapping || !__remove_mapping(mapping, folio, true,
1537 							 sc->target_mem_cgroup))
1538 			goto keep_locked;
1539 
1540 		folio_unlock(folio);
1541 free_it:
1542 		/*
1543 		 * Folio may get swapped out as a whole, need to account
1544 		 * all pages in it.
1545 		 */
1546 		nr_reclaimed += nr_pages;
1547 
1548 		folio_unqueue_deferred_split(folio);
1549 		if (folio_batch_add(&free_folios, folio) == 0) {
1550 			mem_cgroup_uncharge_folios(&free_folios);
1551 			try_to_unmap_flush();
1552 			free_unref_folios(&free_folios);
1553 		}
1554 		continue;
1555 
1556 activate_locked_split:
1557 		/*
1558 		 * The tail pages that are failed to add into swap cache
1559 		 * reach here.  Fixup nr_scanned and nr_pages.
1560 		 */
1561 		if (nr_pages > 1) {
1562 			sc->nr_scanned -= (nr_pages - 1);
1563 			nr_pages = 1;
1564 		}
1565 activate_locked:
1566 		/* Not a candidate for swapping, so reclaim swap space. */
1567 		if (folio_test_swapcache(folio) &&
1568 		    (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
1569 			folio_free_swap(folio);
1570 		VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1571 		if (!folio_test_mlocked(folio)) {
1572 			int type = folio_is_file_lru(folio);
1573 			folio_set_active(folio);
1574 			stat->nr_activate[type] += nr_pages;
1575 			count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
1576 		}
1577 keep_locked:
1578 		folio_unlock(folio);
1579 keep:
1580 		list_add(&folio->lru, &ret_folios);
1581 		VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
1582 				folio_test_unevictable(folio), folio);
1583 	}
1584 	/* 'folio_list' is always empty here */
1585 
1586 	/* Migrate folios selected for demotion */
1587 	nr_demoted = demote_folio_list(&demote_folios, pgdat);
1588 	nr_reclaimed += nr_demoted;
1589 	stat->nr_demoted += nr_demoted;
1590 	/* Folios that could not be demoted are still in @demote_folios */
1591 	if (!list_empty(&demote_folios)) {
1592 		/* Folios which weren't demoted go back on @folio_list */
1593 		list_splice_init(&demote_folios, folio_list);
1594 
1595 		/*
1596 		 * goto retry to reclaim the undemoted folios in folio_list if
1597 		 * desired.
1598 		 *
1599 		 * Reclaiming directly from top tier nodes is not often desired
1600 		 * due to it breaking the LRU ordering: in general memory
1601 		 * should be reclaimed from lower tier nodes and demoted from
1602 		 * top tier nodes.
1603 		 *
1604 		 * However, disabling reclaim from top tier nodes entirely
1605 		 * would cause ooms in edge scenarios where lower tier memory
1606 		 * is unreclaimable for whatever reason, eg memory being
1607 		 * mlocked or too hot to reclaim. We can disable reclaim
1608 		 * from top tier nodes in proactive reclaim though as that is
1609 		 * not real memory pressure.
1610 		 */
1611 		if (!sc->proactive) {
1612 			do_demote_pass = false;
1613 			goto retry;
1614 		}
1615 	}
1616 
1617 	pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1618 
1619 	mem_cgroup_uncharge_folios(&free_folios);
1620 	try_to_unmap_flush();
1621 	free_unref_folios(&free_folios);
1622 
1623 	list_splice(&ret_folios, folio_list);
1624 	count_vm_events(PGACTIVATE, pgactivate);
1625 
1626 	if (plug)
1627 		swap_write_unplug(plug);
1628 	return nr_reclaimed;
1629 }
1630 
1631 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1632 					   struct list_head *folio_list)
1633 {
1634 	struct scan_control sc = {
1635 		.gfp_mask = GFP_KERNEL,
1636 		.may_unmap = 1,
1637 	};
1638 	struct reclaim_stat stat;
1639 	unsigned int nr_reclaimed;
1640 	struct folio *folio, *next;
1641 	LIST_HEAD(clean_folios);
1642 	unsigned int noreclaim_flag;
1643 
1644 	list_for_each_entry_safe(folio, next, folio_list, lru) {
1645 		if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
1646 		    !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
1647 		    !folio_test_unevictable(folio)) {
1648 			folio_clear_active(folio);
1649 			list_move(&folio->lru, &clean_folios);
1650 		}
1651 	}
1652 
1653 	/*
1654 	 * We should be safe here since we are only dealing with file pages and
1655 	 * we are not kswapd and therefore cannot write dirty file pages. But
1656 	 * call memalloc_noreclaim_save() anyway, just in case these conditions
1657 	 * change in the future.
1658 	 */
1659 	noreclaim_flag = memalloc_noreclaim_save();
1660 	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
1661 					&stat, true);
1662 	memalloc_noreclaim_restore(noreclaim_flag);
1663 
1664 	list_splice(&clean_folios, folio_list);
1665 	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1666 			    -(long)nr_reclaimed);
1667 	/*
1668 	 * Since lazyfree pages are isolated from file LRU from the beginning,
1669 	 * they will rotate back to anonymous LRU in the end if it failed to
1670 	 * discard so isolated count will be mismatched.
1671 	 * Compensate the isolated count for both LRU lists.
1672 	 */
1673 	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1674 			    stat.nr_lazyfree_fail);
1675 	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1676 			    -(long)stat.nr_lazyfree_fail);
1677 	return nr_reclaimed;
1678 }
1679 
1680 /*
1681  * Update LRU sizes after isolating pages. The LRU size updates must
1682  * be complete before mem_cgroup_update_lru_size due to a sanity check.
1683  */
1684 static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1685 			enum lru_list lru, unsigned long *nr_zone_taken)
1686 {
1687 	int zid;
1688 
1689 	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1690 		if (!nr_zone_taken[zid])
1691 			continue;
1692 
1693 		update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1694 	}
1695 
1696 }
1697 
1698 /*
1699  * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
1700  *
1701  * lruvec->lru_lock is heavily contended.  Some of the functions that
1702  * shrink the lists perform better by taking out a batch of pages
1703  * and working on them outside the LRU lock.
1704  *
1705  * For pagecache intensive workloads, this function is the hottest
1706  * spot in the kernel (apart from copy_*_user functions).
1707  *
1708  * Lru_lock must be held before calling this function.
1709  *
1710  * @nr_to_scan:	The number of eligible pages to look through on the list.
1711  * @lruvec:	The LRU vector to pull pages from.
1712  * @dst:	The temp list to put pages on to.
1713  * @nr_scanned:	The number of pages that were scanned.
1714  * @sc:		The scan_control struct for this reclaim session
1715  * @lru:	LRU list id for isolating
1716  *
1717  * returns how many pages were moved onto *@dst.
1718  */
1719 static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
1720 		struct lruvec *lruvec, struct list_head *dst,
1721 		unsigned long *nr_scanned, struct scan_control *sc,
1722 		enum lru_list lru)
1723 {
1724 	struct list_head *src = &lruvec->lists[lru];
1725 	unsigned long nr_taken = 0;
1726 	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1727 	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1728 	unsigned long skipped = 0, total_scan = 0, scan = 0;
1729 	unsigned long nr_pages;
1730 	unsigned long max_nr_skipped = 0;
1731 	LIST_HEAD(folios_skipped);
1732 
1733 	while (scan < nr_to_scan && !list_empty(src)) {
1734 		struct list_head *move_to = src;
1735 		struct folio *folio;
1736 
1737 		folio = lru_to_folio(src);
1738 		prefetchw_prev_lru_folio(folio, src, flags);
1739 
1740 		nr_pages = folio_nr_pages(folio);
1741 		total_scan += nr_pages;
1742 
1743 		/* Using max_nr_skipped to prevent hard LOCKUP*/
1744 		if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
1745 		    (folio_zonenum(folio) > sc->reclaim_idx)) {
1746 			nr_skipped[folio_zonenum(folio)] += nr_pages;
1747 			move_to = &folios_skipped;
1748 			max_nr_skipped++;
1749 			goto move;
1750 		}
1751 
1752 		/*
1753 		 * Do not count skipped folios because that makes the function
1754 		 * return with no isolated folios if the LRU mostly contains
1755 		 * ineligible folios.  This causes the VM to not reclaim any
1756 		 * folios, triggering a premature OOM.
1757 		 * Account all pages in a folio.
1758 		 */
1759 		scan += nr_pages;
1760 
1761 		if (!folio_test_lru(folio))
1762 			goto move;
1763 		if (!sc->may_unmap && folio_mapped(folio))
1764 			goto move;
1765 
1766 		/*
1767 		 * Be careful not to clear the lru flag until after we're
1768 		 * sure the folio is not being freed elsewhere -- the
1769 		 * folio release code relies on it.
1770 		 */
1771 		if (unlikely(!folio_try_get(folio)))
1772 			goto move;
1773 
1774 		if (!folio_test_clear_lru(folio)) {
1775 			/* Another thread is already isolating this folio */
1776 			folio_put(folio);
1777 			goto move;
1778 		}
1779 
1780 		nr_taken += nr_pages;
1781 		nr_zone_taken[folio_zonenum(folio)] += nr_pages;
1782 		move_to = dst;
1783 move:
1784 		list_move(&folio->lru, move_to);
1785 	}
1786 
1787 	/*
1788 	 * Splice any skipped folios to the start of the LRU list. Note that
1789 	 * this disrupts the LRU order when reclaiming for lower zones but
1790 	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1791 	 * scanning would soon rescan the same folios to skip and waste lots
1792 	 * of cpu cycles.
1793 	 */
1794 	if (!list_empty(&folios_skipped)) {
1795 		int zid;
1796 
1797 		list_splice(&folios_skipped, src);
1798 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1799 			if (!nr_skipped[zid])
1800 				continue;
1801 
1802 			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1803 			skipped += nr_skipped[zid];
1804 		}
1805 	}
1806 	*nr_scanned = total_scan;
1807 	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1808 				    total_scan, skipped, nr_taken, lru);
1809 	update_lru_sizes(lruvec, lru, nr_zone_taken);
1810 	return nr_taken;
1811 }
1812 
1813 /**
1814  * folio_isolate_lru() - Try to isolate a folio from its LRU list.
1815  * @folio: Folio to isolate from its LRU list.
1816  *
1817  * Isolate a @folio from an LRU list and adjust the vmstat statistic
1818  * corresponding to whatever LRU list the folio was on.
1819  *
1820  * The folio will have its LRU flag cleared.  If it was found on the
1821  * active list, it will have the Active flag set.  If it was found on the
1822  * unevictable list, it will have the Unevictable flag set.  These flags
1823  * may need to be cleared by the caller before letting the page go.
1824  *
1825  * Context:
1826  *
1827  * (1) Must be called with an elevated refcount on the folio. This is a
1828  *     fundamental difference from isolate_lru_folios() (which is called
1829  *     without a stable reference).
1830  * (2) The lru_lock must not be held.
1831  * (3) Interrupts must be enabled.
1832  *
1833  * Return: true if the folio was removed from an LRU list.
1834  * false if the folio was not on an LRU list.
1835  */
1836 bool folio_isolate_lru(struct folio *folio)
1837 {
1838 	bool ret = false;
1839 
1840 	VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
1841 
1842 	if (folio_test_clear_lru(folio)) {
1843 		struct lruvec *lruvec;
1844 
1845 		folio_get(folio);
1846 		lruvec = folio_lruvec_lock_irq(folio);
1847 		lruvec_del_folio(lruvec, folio);
1848 		unlock_page_lruvec_irq(lruvec);
1849 		ret = true;
1850 	}
1851 
1852 	return ret;
1853 }
1854 
1855 /*
1856  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1857  * then get rescheduled. When there are massive number of tasks doing page
1858  * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1859  * the LRU list will go small and be scanned faster than necessary, leading to
1860  * unnecessary swapping, thrashing and OOM.
1861  */
1862 static bool too_many_isolated(struct pglist_data *pgdat, int file,
1863 		struct scan_control *sc)
1864 {
1865 	unsigned long inactive, isolated;
1866 	bool too_many;
1867 
1868 	if (current_is_kswapd())
1869 		return false;
1870 
1871 	if (!writeback_throttling_sane(sc))
1872 		return false;
1873 
1874 	if (file) {
1875 		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1876 		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1877 	} else {
1878 		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1879 		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1880 	}
1881 
1882 	/*
1883 	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1884 	 * won't get blocked by normal direct-reclaimers, forming a circular
1885 	 * deadlock.
1886 	 */
1887 	if (gfp_has_io_fs(sc->gfp_mask))
1888 		inactive >>= 3;
1889 
1890 	too_many = isolated > inactive;
1891 
1892 	/* Wake up tasks throttled due to too_many_isolated. */
1893 	if (!too_many)
1894 		wake_throttle_isolated(pgdat);
1895 
1896 	return too_many;
1897 }
1898 
1899 /*
1900  * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
1901  *
1902  * Returns the number of pages moved to the given lruvec.
1903  */
1904 static unsigned int move_folios_to_lru(struct lruvec *lruvec,
1905 		struct list_head *list)
1906 {
1907 	int nr_pages, nr_moved = 0;
1908 	struct folio_batch free_folios;
1909 
1910 	folio_batch_init(&free_folios);
1911 	while (!list_empty(list)) {
1912 		struct folio *folio = lru_to_folio(list);
1913 
1914 		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
1915 		list_del(&folio->lru);
1916 		if (unlikely(!folio_evictable(folio))) {
1917 			spin_unlock_irq(&lruvec->lru_lock);
1918 			folio_putback_lru(folio);
1919 			spin_lock_irq(&lruvec->lru_lock);
1920 			continue;
1921 		}
1922 
1923 		/*
1924 		 * The folio_set_lru needs to be kept here for list integrity.
1925 		 * Otherwise:
1926 		 *   #0 move_folios_to_lru             #1 release_pages
1927 		 *   if (!folio_put_testzero())
1928 		 *				      if (folio_put_testzero())
1929 		 *				        !lru //skip lru_lock
1930 		 *     folio_set_lru()
1931 		 *     list_add(&folio->lru,)
1932 		 *                                        list_add(&folio->lru,)
1933 		 */
1934 		folio_set_lru(folio);
1935 
1936 		if (unlikely(folio_put_testzero(folio))) {
1937 			__folio_clear_lru_flags(folio);
1938 
1939 			folio_unqueue_deferred_split(folio);
1940 			if (folio_batch_add(&free_folios, folio) == 0) {
1941 				spin_unlock_irq(&lruvec->lru_lock);
1942 				mem_cgroup_uncharge_folios(&free_folios);
1943 				free_unref_folios(&free_folios);
1944 				spin_lock_irq(&lruvec->lru_lock);
1945 			}
1946 
1947 			continue;
1948 		}
1949 
1950 		/*
1951 		 * All pages were isolated from the same lruvec (and isolation
1952 		 * inhibits memcg migration).
1953 		 */
1954 		VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
1955 		lruvec_add_folio(lruvec, folio);
1956 		nr_pages = folio_nr_pages(folio);
1957 		nr_moved += nr_pages;
1958 		if (folio_test_active(folio))
1959 			workingset_age_nonresident(lruvec, nr_pages);
1960 	}
1961 
1962 	if (free_folios.nr) {
1963 		spin_unlock_irq(&lruvec->lru_lock);
1964 		mem_cgroup_uncharge_folios(&free_folios);
1965 		free_unref_folios(&free_folios);
1966 		spin_lock_irq(&lruvec->lru_lock);
1967 	}
1968 
1969 	return nr_moved;
1970 }
1971 
1972 /*
1973  * If a kernel thread (such as nfsd for loop-back mounts) services a backing
1974  * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
1975  * we should not throttle.  Otherwise it is safe to do so.
1976  */
1977 static int current_may_throttle(void)
1978 {
1979 	return !(current->flags & PF_LOCAL_THROTTLE);
1980 }
1981 
1982 /*
1983  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1984  * of reclaimed pages
1985  */
1986 static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
1987 		struct lruvec *lruvec, struct scan_control *sc,
1988 		enum lru_list lru)
1989 {
1990 	LIST_HEAD(folio_list);
1991 	unsigned long nr_scanned;
1992 	unsigned int nr_reclaimed = 0;
1993 	unsigned long nr_taken;
1994 	struct reclaim_stat stat;
1995 	bool file = is_file_lru(lru);
1996 	enum vm_event_item item;
1997 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1998 	bool stalled = false;
1999 
2000 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
2001 		if (stalled)
2002 			return 0;
2003 
2004 		/* wait a bit for the reclaimer. */
2005 		stalled = true;
2006 		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
2007 
2008 		/* We are about to die and free our memory. Return now. */
2009 		if (fatal_signal_pending(current))
2010 			return SWAP_CLUSTER_MAX;
2011 	}
2012 
2013 	lru_add_drain();
2014 
2015 	spin_lock_irq(&lruvec->lru_lock);
2016 
2017 	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
2018 				     &nr_scanned, sc, lru);
2019 
2020 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2021 	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
2022 	if (!cgroup_reclaim(sc))
2023 		__count_vm_events(item, nr_scanned);
2024 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
2025 	__count_vm_events(PGSCAN_ANON + file, nr_scanned);
2026 
2027 	spin_unlock_irq(&lruvec->lru_lock);
2028 
2029 	if (nr_taken == 0)
2030 		return 0;
2031 
2032 	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
2033 
2034 	spin_lock_irq(&lruvec->lru_lock);
2035 	move_folios_to_lru(lruvec, &folio_list);
2036 
2037 	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
2038 					stat.nr_demoted);
2039 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2040 	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
2041 	if (!cgroup_reclaim(sc))
2042 		__count_vm_events(item, nr_reclaimed);
2043 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2044 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
2045 	spin_unlock_irq(&lruvec->lru_lock);
2046 
2047 	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
2048 
2049 	/*
2050 	 * If dirty folios are scanned that are not queued for IO, it
2051 	 * implies that flushers are not doing their job. This can
2052 	 * happen when memory pressure pushes dirty folios to the end of
2053 	 * the LRU before the dirty limits are breached and the dirty
2054 	 * data has expired. It can also happen when the proportion of
2055 	 * dirty folios grows not through writes but through memory
2056 	 * pressure reclaiming all the clean cache. And in some cases,
2057 	 * the flushers simply cannot keep up with the allocation
2058 	 * rate. Nudge the flusher threads in case they are asleep.
2059 	 */
2060 	if (stat.nr_unqueued_dirty == nr_taken) {
2061 		wakeup_flusher_threads(WB_REASON_VMSCAN);
2062 		/*
2063 		 * For cgroupv1 dirty throttling is achieved by waking up
2064 		 * the kernel flusher here and later waiting on folios
2065 		 * which are in writeback to finish (see shrink_folio_list()).
2066 		 *
2067 		 * Flusher may not be able to issue writeback quickly
2068 		 * enough for cgroupv1 writeback throttling to work
2069 		 * on a large system.
2070 		 */
2071 		if (!writeback_throttling_sane(sc))
2072 			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
2073 	}
2074 
2075 	sc->nr.dirty += stat.nr_dirty;
2076 	sc->nr.congested += stat.nr_congested;
2077 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2078 	sc->nr.writeback += stat.nr_writeback;
2079 	sc->nr.immediate += stat.nr_immediate;
2080 	sc->nr.taken += nr_taken;
2081 	if (file)
2082 		sc->nr.file_taken += nr_taken;
2083 
2084 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2085 			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2086 	return nr_reclaimed;
2087 }
2088 
2089 /*
2090  * shrink_active_list() moves folios from the active LRU to the inactive LRU.
2091  *
2092  * We move them the other way if the folio is referenced by one or more
2093  * processes.
2094  *
2095  * If the folios are mostly unmapped, the processing is fast and it is
2096  * appropriate to hold lru_lock across the whole operation.  But if
2097  * the folios are mapped, the processing is slow (folio_referenced()), so
2098  * we should drop lru_lock around each folio.  It's impossible to balance
2099  * this, so instead we remove the folios from the LRU while processing them.
2100  * It is safe to rely on the active flag against the non-LRU folios in here
2101  * because nobody will play with that bit on a non-LRU folio.
2102  *
2103  * The downside is that we have to touch folio->_refcount against each folio.
2104  * But we had to alter folio->flags anyway.
2105  */
2106 static void shrink_active_list(unsigned long nr_to_scan,
2107 			       struct lruvec *lruvec,
2108 			       struct scan_control *sc,
2109 			       enum lru_list lru)
2110 {
2111 	unsigned long nr_taken;
2112 	unsigned long nr_scanned;
2113 	unsigned long vm_flags;
2114 	LIST_HEAD(l_hold);	/* The folios which were snipped off */
2115 	LIST_HEAD(l_active);
2116 	LIST_HEAD(l_inactive);
2117 	unsigned nr_deactivate, nr_activate;
2118 	unsigned nr_rotated = 0;
2119 	bool file = is_file_lru(lru);
2120 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2121 
2122 	lru_add_drain();
2123 
2124 	spin_lock_irq(&lruvec->lru_lock);
2125 
2126 	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
2127 				     &nr_scanned, sc, lru);
2128 
2129 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2130 
2131 	if (!cgroup_reclaim(sc))
2132 		__count_vm_events(PGREFILL, nr_scanned);
2133 	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2134 
2135 	spin_unlock_irq(&lruvec->lru_lock);
2136 
2137 	while (!list_empty(&l_hold)) {
2138 		struct folio *folio;
2139 
2140 		cond_resched();
2141 		folio = lru_to_folio(&l_hold);
2142 		list_del(&folio->lru);
2143 
2144 		if (unlikely(!folio_evictable(folio))) {
2145 			folio_putback_lru(folio);
2146 			continue;
2147 		}
2148 
2149 		if (unlikely(buffer_heads_over_limit)) {
2150 			if (folio_needs_release(folio) &&
2151 			    folio_trylock(folio)) {
2152 				filemap_release_folio(folio, 0);
2153 				folio_unlock(folio);
2154 			}
2155 		}
2156 
2157 		/* Referenced or rmap lock contention: rotate */
2158 		if (folio_referenced(folio, 0, sc->target_mem_cgroup,
2159 				     &vm_flags) != 0) {
2160 			/*
2161 			 * Identify referenced, file-backed active folios and
2162 			 * give them one more trip around the active list. So
2163 			 * that executable code get better chances to stay in
2164 			 * memory under moderate memory pressure.  Anon folios
2165 			 * are not likely to be evicted by use-once streaming
2166 			 * IO, plus JVM can create lots of anon VM_EXEC folios,
2167 			 * so we ignore them here.
2168 			 */
2169 			if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2170 				nr_rotated += folio_nr_pages(folio);
2171 				list_add(&folio->lru, &l_active);
2172 				continue;
2173 			}
2174 		}
2175 
2176 		folio_clear_active(folio);	/* we are de-activating */
2177 		folio_set_workingset(folio);
2178 		list_add(&folio->lru, &l_inactive);
2179 	}
2180 
2181 	/*
2182 	 * Move folios back to the lru list.
2183 	 */
2184 	spin_lock_irq(&lruvec->lru_lock);
2185 
2186 	nr_activate = move_folios_to_lru(lruvec, &l_active);
2187 	nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
2188 
2189 	__count_vm_events(PGDEACTIVATE, nr_deactivate);
2190 	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2191 
2192 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2193 	spin_unlock_irq(&lruvec->lru_lock);
2194 
2195 	if (nr_rotated)
2196 		lru_note_cost(lruvec, file, 0, nr_rotated);
2197 	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2198 			nr_deactivate, nr_rotated, sc->priority, file);
2199 }
2200 
2201 static unsigned int reclaim_folio_list(struct list_head *folio_list,
2202 				      struct pglist_data *pgdat)
2203 {
2204 	struct reclaim_stat stat;
2205 	unsigned int nr_reclaimed;
2206 	struct folio *folio;
2207 	struct scan_control sc = {
2208 		.gfp_mask = GFP_KERNEL,
2209 		.may_writepage = 1,
2210 		.may_unmap = 1,
2211 		.may_swap = 1,
2212 		.no_demotion = 1,
2213 	};
2214 
2215 	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
2216 	while (!list_empty(folio_list)) {
2217 		folio = lru_to_folio(folio_list);
2218 		list_del(&folio->lru);
2219 		folio_putback_lru(folio);
2220 	}
2221 	trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat);
2222 
2223 	return nr_reclaimed;
2224 }
2225 
2226 unsigned long reclaim_pages(struct list_head *folio_list)
2227 {
2228 	int nid;
2229 	unsigned int nr_reclaimed = 0;
2230 	LIST_HEAD(node_folio_list);
2231 	unsigned int noreclaim_flag;
2232 
2233 	if (list_empty(folio_list))
2234 		return nr_reclaimed;
2235 
2236 	noreclaim_flag = memalloc_noreclaim_save();
2237 
2238 	nid = folio_nid(lru_to_folio(folio_list));
2239 	do {
2240 		struct folio *folio = lru_to_folio(folio_list);
2241 
2242 		if (nid == folio_nid(folio)) {
2243 			folio_clear_active(folio);
2244 			list_move(&folio->lru, &node_folio_list);
2245 			continue;
2246 		}
2247 
2248 		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
2249 		nid = folio_nid(lru_to_folio(folio_list));
2250 	} while (!list_empty(folio_list));
2251 
2252 	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
2253 
2254 	memalloc_noreclaim_restore(noreclaim_flag);
2255 
2256 	return nr_reclaimed;
2257 }
2258 
2259 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2260 				 struct lruvec *lruvec, struct scan_control *sc)
2261 {
2262 	if (is_active_lru(lru)) {
2263 		if (sc->may_deactivate & (1 << is_file_lru(lru)))
2264 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
2265 		else
2266 			sc->skipped_deactivate = 1;
2267 		return 0;
2268 	}
2269 
2270 	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2271 }
2272 
2273 /*
2274  * The inactive anon list should be small enough that the VM never has
2275  * to do too much work.
2276  *
2277  * The inactive file list should be small enough to leave most memory
2278  * to the established workingset on the scan-resistant active list,
2279  * but large enough to avoid thrashing the aggregate readahead window.
2280  *
2281  * Both inactive lists should also be large enough that each inactive
2282  * folio has a chance to be referenced again before it is reclaimed.
2283  *
2284  * If that fails and refaulting is observed, the inactive list grows.
2285  *
2286  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
2287  * on this LRU, maintained by the pageout code. An inactive_ratio
2288  * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
2289  *
2290  * total     target    max
2291  * memory    ratio     inactive
2292  * -------------------------------------
2293  *   10MB       1         5MB
2294  *  100MB       1        50MB
2295  *    1GB       3       250MB
2296  *   10GB      10       0.9GB
2297  *  100GB      31         3GB
2298  *    1TB     101        10GB
2299  *   10TB     320        32GB
2300  */
2301 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2302 {
2303 	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2304 	unsigned long inactive, active;
2305 	unsigned long inactive_ratio;
2306 	unsigned long gb;
2307 
2308 	inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2309 	active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2310 
2311 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
2312 	if (gb)
2313 		inactive_ratio = int_sqrt(10 * gb);
2314 	else
2315 		inactive_ratio = 1;
2316 
2317 	return inactive * inactive_ratio < active;
2318 }
2319 
2320 enum scan_balance {
2321 	SCAN_EQUAL,
2322 	SCAN_FRACT,
2323 	SCAN_ANON,
2324 	SCAN_FILE,
2325 };
2326 
2327 static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
2328 {
2329 	unsigned long file;
2330 	struct lruvec *target_lruvec;
2331 
2332 	if (lru_gen_enabled())
2333 		return;
2334 
2335 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2336 
2337 	/*
2338 	 * Flush the memory cgroup stats in rate-limited way as we don't need
2339 	 * most accurate stats here. We may switch to regular stats flushing
2340 	 * in the future once it is cheap enough.
2341 	 */
2342 	mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup);
2343 
2344 	/*
2345 	 * Determine the scan balance between anon and file LRUs.
2346 	 */
2347 	spin_lock_irq(&target_lruvec->lru_lock);
2348 	sc->anon_cost = target_lruvec->anon_cost;
2349 	sc->file_cost = target_lruvec->file_cost;
2350 	spin_unlock_irq(&target_lruvec->lru_lock);
2351 
2352 	/*
2353 	 * Target desirable inactive:active list ratios for the anon
2354 	 * and file LRU lists.
2355 	 */
2356 	if (!sc->force_deactivate) {
2357 		unsigned long refaults;
2358 
2359 		/*
2360 		 * When refaults are being observed, it means a new
2361 		 * workingset is being established. Deactivate to get
2362 		 * rid of any stale active pages quickly.
2363 		 */
2364 		refaults = lruvec_page_state(target_lruvec,
2365 				WORKINGSET_ACTIVATE_ANON);
2366 		if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
2367 			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2368 			sc->may_deactivate |= DEACTIVATE_ANON;
2369 		else
2370 			sc->may_deactivate &= ~DEACTIVATE_ANON;
2371 
2372 		refaults = lruvec_page_state(target_lruvec,
2373 				WORKINGSET_ACTIVATE_FILE);
2374 		if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
2375 		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2376 			sc->may_deactivate |= DEACTIVATE_FILE;
2377 		else
2378 			sc->may_deactivate &= ~DEACTIVATE_FILE;
2379 	} else
2380 		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
2381 
2382 	/*
2383 	 * If we have plenty of inactive file pages that aren't
2384 	 * thrashing, try to reclaim those first before touching
2385 	 * anonymous pages.
2386 	 */
2387 	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2388 	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
2389 	    !sc->no_cache_trim_mode)
2390 		sc->cache_trim_mode = 1;
2391 	else
2392 		sc->cache_trim_mode = 0;
2393 
2394 	/*
2395 	 * Prevent the reclaimer from falling into the cache trap: as
2396 	 * cache pages start out inactive, every cache fault will tip
2397 	 * the scan balance towards the file LRU.  And as the file LRU
2398 	 * shrinks, so does the window for rotation from references.
2399 	 * This means we have a runaway feedback loop where a tiny
2400 	 * thrashing file LRU becomes infinitely more attractive than
2401 	 * anon pages.  Try to detect this based on file LRU size.
2402 	 */
2403 	if (!cgroup_reclaim(sc)) {
2404 		unsigned long total_high_wmark = 0;
2405 		unsigned long free, anon;
2406 		int z;
2407 		struct zone *zone;
2408 
2409 		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2410 		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2411 			   node_page_state(pgdat, NR_INACTIVE_FILE);
2412 
2413 		for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
2414 			total_high_wmark += high_wmark_pages(zone);
2415 		}
2416 
2417 		/*
2418 		 * Consider anon: if that's low too, this isn't a
2419 		 * runaway file reclaim problem, but rather just
2420 		 * extreme pressure. Reclaim as per usual then.
2421 		 */
2422 		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2423 
2424 		sc->file_is_tiny =
2425 			file + free <= total_high_wmark &&
2426 			!(sc->may_deactivate & DEACTIVATE_ANON) &&
2427 			anon >> sc->priority;
2428 	}
2429 }
2430 
2431 static inline void calculate_pressure_balance(struct scan_control *sc,
2432 			int swappiness, u64 *fraction, u64 *denominator)
2433 {
2434 	unsigned long anon_cost, file_cost, total_cost;
2435 	unsigned long ap, fp;
2436 
2437 	/*
2438 	 * Calculate the pressure balance between anon and file pages.
2439 	 *
2440 	 * The amount of pressure we put on each LRU is inversely
2441 	 * proportional to the cost of reclaiming each list, as
2442 	 * determined by the share of pages that are refaulting, times
2443 	 * the relative IO cost of bringing back a swapped out
2444 	 * anonymous page vs reloading a filesystem page (swappiness).
2445 	 *
2446 	 * Although we limit that influence to ensure no list gets
2447 	 * left behind completely: at least a third of the pressure is
2448 	 * applied, before swappiness.
2449 	 *
2450 	 * With swappiness at 100, anon and file have equal IO cost.
2451 	 */
2452 	total_cost = sc->anon_cost + sc->file_cost;
2453 	anon_cost = total_cost + sc->anon_cost;
2454 	file_cost = total_cost + sc->file_cost;
2455 	total_cost = anon_cost + file_cost;
2456 
2457 	ap = swappiness * (total_cost + 1);
2458 	ap /= anon_cost + 1;
2459 
2460 	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
2461 	fp /= file_cost + 1;
2462 
2463 	fraction[WORKINGSET_ANON] = ap;
2464 	fraction[WORKINGSET_FILE] = fp;
2465 	*denominator = ap + fp;
2466 }
2467 
2468 /*
2469  * Determine how aggressively the anon and file LRU lists should be
2470  * scanned.
2471  *
2472  * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
2473  * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
2474  */
2475 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2476 			   unsigned long *nr)
2477 {
2478 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2479 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2480 	int swappiness = sc_swappiness(sc, memcg);
2481 	u64 fraction[ANON_AND_FILE];
2482 	u64 denominator = 0;	/* gcc */
2483 	enum scan_balance scan_balance;
2484 	enum lru_list lru;
2485 
2486 	/* If we have no swap space, do not bother scanning anon folios. */
2487 	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
2488 		scan_balance = SCAN_FILE;
2489 		goto out;
2490 	}
2491 
2492 	/*
2493 	 * Global reclaim will swap to prevent OOM even with no
2494 	 * swappiness, but memcg users want to use this knob to
2495 	 * disable swapping for individual groups completely when
2496 	 * using the memory controller's swap limit feature would be
2497 	 * too expensive.
2498 	 */
2499 	if (cgroup_reclaim(sc) && !swappiness) {
2500 		scan_balance = SCAN_FILE;
2501 		goto out;
2502 	}
2503 
2504 	/*
2505 	 * Do not apply any pressure balancing cleverness when the
2506 	 * system is close to OOM, scan both anon and file equally
2507 	 * (unless the swappiness setting disagrees with swapping).
2508 	 */
2509 	if (!sc->priority && swappiness) {
2510 		scan_balance = SCAN_EQUAL;
2511 		goto out;
2512 	}
2513 
2514 	/*
2515 	 * If the system is almost out of file pages, force-scan anon.
2516 	 */
2517 	if (sc->file_is_tiny) {
2518 		scan_balance = SCAN_ANON;
2519 		goto out;
2520 	}
2521 
2522 	/*
2523 	 * If there is enough inactive page cache, we do not reclaim
2524 	 * anything from the anonymous working right now.
2525 	 */
2526 	if (sc->cache_trim_mode) {
2527 		scan_balance = SCAN_FILE;
2528 		goto out;
2529 	}
2530 
2531 	scan_balance = SCAN_FRACT;
2532 	calculate_pressure_balance(sc, swappiness, fraction, &denominator);
2533 
2534 out:
2535 	for_each_evictable_lru(lru) {
2536 		bool file = is_file_lru(lru);
2537 		unsigned long lruvec_size;
2538 		unsigned long low, min;
2539 		unsigned long scan;
2540 
2541 		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2542 		mem_cgroup_protection(sc->target_mem_cgroup, memcg,
2543 				      &min, &low);
2544 
2545 		if (min || low) {
2546 			/*
2547 			 * Scale a cgroup's reclaim pressure by proportioning
2548 			 * its current usage to its memory.low or memory.min
2549 			 * setting.
2550 			 *
2551 			 * This is important, as otherwise scanning aggression
2552 			 * becomes extremely binary -- from nothing as we
2553 			 * approach the memory protection threshold, to totally
2554 			 * nominal as we exceed it.  This results in requiring
2555 			 * setting extremely liberal protection thresholds. It
2556 			 * also means we simply get no protection at all if we
2557 			 * set it too low, which is not ideal.
2558 			 *
2559 			 * If there is any protection in place, we reduce scan
2560 			 * pressure by how much of the total memory used is
2561 			 * within protection thresholds.
2562 			 *
2563 			 * There is one special case: in the first reclaim pass,
2564 			 * we skip over all groups that are within their low
2565 			 * protection. If that fails to reclaim enough pages to
2566 			 * satisfy the reclaim goal, we come back and override
2567 			 * the best-effort low protection. However, we still
2568 			 * ideally want to honor how well-behaved groups are in
2569 			 * that case instead of simply punishing them all
2570 			 * equally. As such, we reclaim them based on how much
2571 			 * memory they are using, reducing the scan pressure
2572 			 * again by how much of the total memory used is under
2573 			 * hard protection.
2574 			 */
2575 			unsigned long cgroup_size = mem_cgroup_size(memcg);
2576 			unsigned long protection;
2577 
2578 			/* memory.low scaling, make sure we retry before OOM */
2579 			if (!sc->memcg_low_reclaim && low > min) {
2580 				protection = low;
2581 				sc->memcg_low_skipped = 1;
2582 			} else {
2583 				protection = min;
2584 			}
2585 
2586 			/* Avoid TOCTOU with earlier protection check */
2587 			cgroup_size = max(cgroup_size, protection);
2588 
2589 			scan = lruvec_size - lruvec_size * protection /
2590 				(cgroup_size + 1);
2591 
2592 			/*
2593 			 * Minimally target SWAP_CLUSTER_MAX pages to keep
2594 			 * reclaim moving forwards, avoiding decrementing
2595 			 * sc->priority further than desirable.
2596 			 */
2597 			scan = max(scan, SWAP_CLUSTER_MAX);
2598 		} else {
2599 			scan = lruvec_size;
2600 		}
2601 
2602 		scan >>= sc->priority;
2603 
2604 		/*
2605 		 * If the cgroup's already been deleted, make sure to
2606 		 * scrape out the remaining cache.
2607 		 */
2608 		if (!scan && !mem_cgroup_online(memcg))
2609 			scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2610 
2611 		switch (scan_balance) {
2612 		case SCAN_EQUAL:
2613 			/* Scan lists relative to size */
2614 			break;
2615 		case SCAN_FRACT:
2616 			/*
2617 			 * Scan types proportional to swappiness and
2618 			 * their relative recent reclaim efficiency.
2619 			 * Make sure we don't miss the last page on
2620 			 * the offlined memory cgroups because of a
2621 			 * round-off error.
2622 			 */
2623 			scan = mem_cgroup_online(memcg) ?
2624 			       div64_u64(scan * fraction[file], denominator) :
2625 			       DIV64_U64_ROUND_UP(scan * fraction[file],
2626 						  denominator);
2627 			break;
2628 		case SCAN_FILE:
2629 		case SCAN_ANON:
2630 			/* Scan one type exclusively */
2631 			if ((scan_balance == SCAN_FILE) != file)
2632 				scan = 0;
2633 			break;
2634 		default:
2635 			/* Look ma, no brain */
2636 			BUG();
2637 		}
2638 
2639 		nr[lru] = scan;
2640 	}
2641 }
2642 
2643 /*
2644  * Anonymous LRU management is a waste if there is
2645  * ultimately no way to reclaim the memory.
2646  */
2647 static bool can_age_anon_pages(struct pglist_data *pgdat,
2648 			       struct scan_control *sc)
2649 {
2650 	/* Aging the anon LRU is valuable if swap is present: */
2651 	if (total_swap_pages > 0)
2652 		return true;
2653 
2654 	/* Also valuable if anon pages can be demoted: */
2655 	return can_demote(pgdat->node_id, sc);
2656 }
2657 
2658 #ifdef CONFIG_LRU_GEN
2659 
2660 #ifdef CONFIG_LRU_GEN_ENABLED
2661 DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
2662 #define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
2663 #else
2664 DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
2665 #define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
2666 #endif
2667 
2668 static bool should_walk_mmu(void)
2669 {
2670 	return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
2671 }
2672 
2673 static bool should_clear_pmd_young(void)
2674 {
2675 	return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
2676 }
2677 
2678 /******************************************************************************
2679  *                          shorthand helpers
2680  ******************************************************************************/
2681 
2682 #define DEFINE_MAX_SEQ(lruvec)						\
2683 	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
2684 
2685 #define DEFINE_MIN_SEQ(lruvec)						\
2686 	unsigned long min_seq[ANON_AND_FILE] = {			\
2687 		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),	\
2688 		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),	\
2689 	}
2690 
2691 #define evictable_min_seq(min_seq, swappiness)				\
2692 	min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS])
2693 
2694 #define for_each_gen_type_zone(gen, type, zone)				\
2695 	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
2696 		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
2697 			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
2698 
2699 #define for_each_evictable_type(type, swappiness)			\
2700 	for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++)
2701 
2702 #define get_memcg_gen(seq)	((seq) % MEMCG_NR_GENS)
2703 #define get_memcg_bin(bin)	((bin) % MEMCG_NR_BINS)
2704 
2705 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
2706 {
2707 	struct pglist_data *pgdat = NODE_DATA(nid);
2708 
2709 #ifdef CONFIG_MEMCG
2710 	if (memcg) {
2711 		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
2712 
2713 		/* see the comment in mem_cgroup_lruvec() */
2714 		if (!lruvec->pgdat)
2715 			lruvec->pgdat = pgdat;
2716 
2717 		return lruvec;
2718 	}
2719 #endif
2720 	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2721 
2722 	return &pgdat->__lruvec;
2723 }
2724 
2725 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
2726 {
2727 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2728 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2729 
2730 	if (!sc->may_swap)
2731 		return 0;
2732 
2733 	if (!can_demote(pgdat->node_id, sc) &&
2734 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
2735 		return 0;
2736 
2737 	return sc_swappiness(sc, memcg);
2738 }
2739 
2740 static int get_nr_gens(struct lruvec *lruvec, int type)
2741 {
2742 	return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
2743 }
2744 
2745 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
2746 {
2747 	int type;
2748 
2749 	for (type = 0; type < ANON_AND_FILE; type++) {
2750 		int n = get_nr_gens(lruvec, type);
2751 
2752 		if (n < MIN_NR_GENS || n > MAX_NR_GENS)
2753 			return false;
2754 	}
2755 
2756 	return true;
2757 }
2758 
2759 /******************************************************************************
2760  *                          Bloom filters
2761  ******************************************************************************/
2762 
2763 /*
2764  * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
2765  * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
2766  * bits in a bitmap, k is the number of hash functions and n is the number of
2767  * inserted items.
2768  *
2769  * Page table walkers use one of the two filters to reduce their search space.
2770  * To get rid of non-leaf entries that no longer have enough leaf entries, the
2771  * aging uses the double-buffering technique to flip to the other filter each
2772  * time it produces a new generation. For non-leaf entries that have enough
2773  * leaf entries, the aging carries them over to the next generation in
2774  * walk_pmd_range(); the eviction also report them when walking the rmap
2775  * in lru_gen_look_around().
2776  *
2777  * For future optimizations:
2778  * 1. It's not necessary to keep both filters all the time. The spare one can be
2779  *    freed after the RCU grace period and reallocated if needed again.
2780  * 2. And when reallocating, it's worth scaling its size according to the number
2781  *    of inserted entries in the other filter, to reduce the memory overhead on
2782  *    small systems and false positives on large systems.
2783  * 3. Jenkins' hash function is an alternative to Knuth's.
2784  */
2785 #define BLOOM_FILTER_SHIFT	15
2786 
2787 static inline int filter_gen_from_seq(unsigned long seq)
2788 {
2789 	return seq % NR_BLOOM_FILTERS;
2790 }
2791 
2792 static void get_item_key(void *item, int *key)
2793 {
2794 	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
2795 
2796 	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
2797 
2798 	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
2799 	key[1] = hash >> BLOOM_FILTER_SHIFT;
2800 }
2801 
2802 static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
2803 			      void *item)
2804 {
2805 	int key[2];
2806 	unsigned long *filter;
2807 	int gen = filter_gen_from_seq(seq);
2808 
2809 	filter = READ_ONCE(mm_state->filters[gen]);
2810 	if (!filter)
2811 		return true;
2812 
2813 	get_item_key(item, key);
2814 
2815 	return test_bit(key[0], filter) && test_bit(key[1], filter);
2816 }
2817 
2818 static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
2819 				void *item)
2820 {
2821 	int key[2];
2822 	unsigned long *filter;
2823 	int gen = filter_gen_from_seq(seq);
2824 
2825 	filter = READ_ONCE(mm_state->filters[gen]);
2826 	if (!filter)
2827 		return;
2828 
2829 	get_item_key(item, key);
2830 
2831 	if (!test_bit(key[0], filter))
2832 		set_bit(key[0], filter);
2833 	if (!test_bit(key[1], filter))
2834 		set_bit(key[1], filter);
2835 }
2836 
2837 static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq)
2838 {
2839 	unsigned long *filter;
2840 	int gen = filter_gen_from_seq(seq);
2841 
2842 	filter = mm_state->filters[gen];
2843 	if (filter) {
2844 		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
2845 		return;
2846 	}
2847 
2848 	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
2849 			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
2850 	WRITE_ONCE(mm_state->filters[gen], filter);
2851 }
2852 
2853 /******************************************************************************
2854  *                          mm_struct list
2855  ******************************************************************************/
2856 
2857 #ifdef CONFIG_LRU_GEN_WALKS_MMU
2858 
2859 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
2860 {
2861 	static struct lru_gen_mm_list mm_list = {
2862 		.fifo = LIST_HEAD_INIT(mm_list.fifo),
2863 		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
2864 	};
2865 
2866 #ifdef CONFIG_MEMCG
2867 	if (memcg)
2868 		return &memcg->mm_list;
2869 #endif
2870 	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2871 
2872 	return &mm_list;
2873 }
2874 
2875 static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
2876 {
2877 	return &lruvec->mm_state;
2878 }
2879 
2880 static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
2881 {
2882 	int key;
2883 	struct mm_struct *mm;
2884 	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
2885 	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
2886 
2887 	mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
2888 	key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
2889 
2890 	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
2891 		return NULL;
2892 
2893 	clear_bit(key, &mm->lru_gen.bitmap);
2894 
2895 	return mmget_not_zero(mm) ? mm : NULL;
2896 }
2897 
2898 void lru_gen_add_mm(struct mm_struct *mm)
2899 {
2900 	int nid;
2901 	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
2902 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
2903 
2904 	VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
2905 #ifdef CONFIG_MEMCG
2906 	VM_WARN_ON_ONCE(mm->lru_gen.memcg);
2907 	mm->lru_gen.memcg = memcg;
2908 #endif
2909 	spin_lock(&mm_list->lock);
2910 
2911 	for_each_node_state(nid, N_MEMORY) {
2912 		struct lruvec *lruvec = get_lruvec(memcg, nid);
2913 		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2914 
2915 		/* the first addition since the last iteration */
2916 		if (mm_state->tail == &mm_list->fifo)
2917 			mm_state->tail = &mm->lru_gen.list;
2918 	}
2919 
2920 	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
2921 
2922 	spin_unlock(&mm_list->lock);
2923 }
2924 
2925 void lru_gen_del_mm(struct mm_struct *mm)
2926 {
2927 	int nid;
2928 	struct lru_gen_mm_list *mm_list;
2929 	struct mem_cgroup *memcg = NULL;
2930 
2931 	if (list_empty(&mm->lru_gen.list))
2932 		return;
2933 
2934 #ifdef CONFIG_MEMCG
2935 	memcg = mm->lru_gen.memcg;
2936 #endif
2937 	mm_list = get_mm_list(memcg);
2938 
2939 	spin_lock(&mm_list->lock);
2940 
2941 	for_each_node(nid) {
2942 		struct lruvec *lruvec = get_lruvec(memcg, nid);
2943 		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2944 
2945 		/* where the current iteration continues after */
2946 		if (mm_state->head == &mm->lru_gen.list)
2947 			mm_state->head = mm_state->head->prev;
2948 
2949 		/* where the last iteration ended before */
2950 		if (mm_state->tail == &mm->lru_gen.list)
2951 			mm_state->tail = mm_state->tail->next;
2952 	}
2953 
2954 	list_del_init(&mm->lru_gen.list);
2955 
2956 	spin_unlock(&mm_list->lock);
2957 
2958 #ifdef CONFIG_MEMCG
2959 	mem_cgroup_put(mm->lru_gen.memcg);
2960 	mm->lru_gen.memcg = NULL;
2961 #endif
2962 }
2963 
2964 #ifdef CONFIG_MEMCG
2965 void lru_gen_migrate_mm(struct mm_struct *mm)
2966 {
2967 	struct mem_cgroup *memcg;
2968 	struct task_struct *task = rcu_dereference_protected(mm->owner, true);
2969 
2970 	VM_WARN_ON_ONCE(task->mm != mm);
2971 	lockdep_assert_held(&task->alloc_lock);
2972 
2973 	/* for mm_update_next_owner() */
2974 	if (mem_cgroup_disabled())
2975 		return;
2976 
2977 	/* migration can happen before addition */
2978 	if (!mm->lru_gen.memcg)
2979 		return;
2980 
2981 	rcu_read_lock();
2982 	memcg = mem_cgroup_from_task(task);
2983 	rcu_read_unlock();
2984 	if (memcg == mm->lru_gen.memcg)
2985 		return;
2986 
2987 	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
2988 
2989 	lru_gen_del_mm(mm);
2990 	lru_gen_add_mm(mm);
2991 }
2992 #endif
2993 
2994 #else /* !CONFIG_LRU_GEN_WALKS_MMU */
2995 
2996 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
2997 {
2998 	return NULL;
2999 }
3000 
3001 static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
3002 {
3003 	return NULL;
3004 }
3005 
3006 static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
3007 {
3008 	return NULL;
3009 }
3010 
3011 #endif
3012 
3013 static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
3014 {
3015 	int i;
3016 	int hist;
3017 	struct lruvec *lruvec = walk->lruvec;
3018 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3019 
3020 	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
3021 
3022 	hist = lru_hist_from_seq(walk->seq);
3023 
3024 	for (i = 0; i < NR_MM_STATS; i++) {
3025 		WRITE_ONCE(mm_state->stats[hist][i],
3026 			   mm_state->stats[hist][i] + walk->mm_stats[i]);
3027 		walk->mm_stats[i] = 0;
3028 	}
3029 
3030 	if (NR_HIST_GENS > 1 && last) {
3031 		hist = lru_hist_from_seq(walk->seq + 1);
3032 
3033 		for (i = 0; i < NR_MM_STATS; i++)
3034 			WRITE_ONCE(mm_state->stats[hist][i], 0);
3035 	}
3036 }
3037 
3038 static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter)
3039 {
3040 	bool first = false;
3041 	bool last = false;
3042 	struct mm_struct *mm = NULL;
3043 	struct lruvec *lruvec = walk->lruvec;
3044 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3045 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3046 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3047 
3048 	/*
3049 	 * mm_state->seq is incremented after each iteration of mm_list. There
3050 	 * are three interesting cases for this page table walker:
3051 	 * 1. It tries to start a new iteration with a stale max_seq: there is
3052 	 *    nothing left to do.
3053 	 * 2. It started the next iteration: it needs to reset the Bloom filter
3054 	 *    so that a fresh set of PTE tables can be recorded.
3055 	 * 3. It ended the current iteration: it needs to reset the mm stats
3056 	 *    counters and tell its caller to increment max_seq.
3057 	 */
3058 	spin_lock(&mm_list->lock);
3059 
3060 	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
3061 
3062 	if (walk->seq <= mm_state->seq)
3063 		goto done;
3064 
3065 	if (!mm_state->head)
3066 		mm_state->head = &mm_list->fifo;
3067 
3068 	if (mm_state->head == &mm_list->fifo)
3069 		first = true;
3070 
3071 	do {
3072 		mm_state->head = mm_state->head->next;
3073 		if (mm_state->head == &mm_list->fifo) {
3074 			WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3075 			last = true;
3076 			break;
3077 		}
3078 
3079 		/* force scan for those added after the last iteration */
3080 		if (!mm_state->tail || mm_state->tail == mm_state->head) {
3081 			mm_state->tail = mm_state->head->next;
3082 			walk->force_scan = true;
3083 		}
3084 	} while (!(mm = get_next_mm(walk)));
3085 done:
3086 	if (*iter || last)
3087 		reset_mm_stats(walk, last);
3088 
3089 	spin_unlock(&mm_list->lock);
3090 
3091 	if (mm && first)
3092 		reset_bloom_filter(mm_state, walk->seq + 1);
3093 
3094 	if (*iter)
3095 		mmput_async(*iter);
3096 
3097 	*iter = mm;
3098 
3099 	return last;
3100 }
3101 
3102 static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq)
3103 {
3104 	bool success = false;
3105 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3106 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3107 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3108 
3109 	spin_lock(&mm_list->lock);
3110 
3111 	VM_WARN_ON_ONCE(mm_state->seq + 1 < seq);
3112 
3113 	if (seq > mm_state->seq) {
3114 		mm_state->head = NULL;
3115 		mm_state->tail = NULL;
3116 		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3117 		success = true;
3118 	}
3119 
3120 	spin_unlock(&mm_list->lock);
3121 
3122 	return success;
3123 }
3124 
3125 /******************************************************************************
3126  *                          PID controller
3127  ******************************************************************************/
3128 
3129 /*
3130  * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
3131  *
3132  * The P term is refaulted/(evicted+protected) from a tier in the generation
3133  * currently being evicted; the I term is the exponential moving average of the
3134  * P term over the generations previously evicted, using the smoothing factor
3135  * 1/2; the D term isn't supported.
3136  *
3137  * The setpoint (SP) is always the first tier of one type; the process variable
3138  * (PV) is either any tier of the other type or any other tier of the same
3139  * type.
3140  *
3141  * The error is the difference between the SP and the PV; the correction is to
3142  * turn off protection when SP>PV or turn on protection when SP<PV.
3143  *
3144  * For future optimizations:
3145  * 1. The D term may discount the other two terms over time so that long-lived
3146  *    generations can resist stale information.
3147  */
3148 struct ctrl_pos {
3149 	unsigned long refaulted;
3150 	unsigned long total;
3151 	int gain;
3152 };
3153 
3154 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
3155 			  struct ctrl_pos *pos)
3156 {
3157 	int i;
3158 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3159 	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3160 
3161 	pos->gain = gain;
3162 	pos->refaulted = pos->total = 0;
3163 
3164 	for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
3165 		pos->refaulted += lrugen->avg_refaulted[type][i] +
3166 				  atomic_long_read(&lrugen->refaulted[hist][type][i]);
3167 		pos->total += lrugen->avg_total[type][i] +
3168 			      lrugen->protected[hist][type][i] +
3169 			      atomic_long_read(&lrugen->evicted[hist][type][i]);
3170 	}
3171 }
3172 
3173 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
3174 {
3175 	int hist, tier;
3176 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3177 	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
3178 	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
3179 
3180 	lockdep_assert_held(&lruvec->lru_lock);
3181 
3182 	if (!carryover && !clear)
3183 		return;
3184 
3185 	hist = lru_hist_from_seq(seq);
3186 
3187 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
3188 		if (carryover) {
3189 			unsigned long sum;
3190 
3191 			sum = lrugen->avg_refaulted[type][tier] +
3192 			      atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3193 			WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
3194 
3195 			sum = lrugen->avg_total[type][tier] +
3196 			      lrugen->protected[hist][type][tier] +
3197 			      atomic_long_read(&lrugen->evicted[hist][type][tier]);
3198 			WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
3199 		}
3200 
3201 		if (clear) {
3202 			atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
3203 			atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
3204 			WRITE_ONCE(lrugen->protected[hist][type][tier], 0);
3205 		}
3206 	}
3207 }
3208 
3209 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
3210 {
3211 	/*
3212 	 * Return true if the PV has a limited number of refaults or a lower
3213 	 * refaulted/total than the SP.
3214 	 */
3215 	return pv->refaulted < MIN_LRU_BATCH ||
3216 	       pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
3217 	       (sp->refaulted + 1) * pv->total * pv->gain;
3218 }
3219 
3220 /******************************************************************************
3221  *                          the aging
3222  ******************************************************************************/
3223 
3224 /* promote pages accessed through page tables */
3225 static int folio_update_gen(struct folio *folio, int gen)
3226 {
3227 	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3228 
3229 	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
3230 
3231 	/* see the comment on LRU_REFS_FLAGS */
3232 	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
3233 		set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
3234 		return -1;
3235 	}
3236 
3237 	do {
3238 		/* lru_gen_del_folio() has isolated this page? */
3239 		if (!(old_flags & LRU_GEN_MASK))
3240 			return -1;
3241 
3242 		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
3243 		new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset);
3244 	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3245 
3246 	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3247 }
3248 
3249 /* protect pages accessed multiple times through file descriptors */
3250 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
3251 {
3252 	int type = folio_is_file_lru(folio);
3253 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3254 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3255 	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3256 
3257 	VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
3258 
3259 	do {
3260 		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3261 		/* folio_update_gen() has promoted this page? */
3262 		if (new_gen >= 0 && new_gen != old_gen)
3263 			return new_gen;
3264 
3265 		new_gen = (old_gen + 1) % MAX_NR_GENS;
3266 
3267 		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS);
3268 		new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
3269 		/* for folio_end_writeback() */
3270 		if (reclaiming)
3271 			new_flags |= BIT(PG_reclaim);
3272 	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3273 
3274 	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3275 
3276 	return new_gen;
3277 }
3278 
3279 static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
3280 			      int old_gen, int new_gen)
3281 {
3282 	int type = folio_is_file_lru(folio);
3283 	int zone = folio_zonenum(folio);
3284 	int delta = folio_nr_pages(folio);
3285 
3286 	VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
3287 	VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
3288 
3289 	walk->batched++;
3290 
3291 	walk->nr_pages[old_gen][type][zone] -= delta;
3292 	walk->nr_pages[new_gen][type][zone] += delta;
3293 }
3294 
3295 static void reset_batch_size(struct lru_gen_mm_walk *walk)
3296 {
3297 	int gen, type, zone;
3298 	struct lruvec *lruvec = walk->lruvec;
3299 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3300 
3301 	walk->batched = 0;
3302 
3303 	for_each_gen_type_zone(gen, type, zone) {
3304 		enum lru_list lru = type * LRU_INACTIVE_FILE;
3305 		int delta = walk->nr_pages[gen][type][zone];
3306 
3307 		if (!delta)
3308 			continue;
3309 
3310 		walk->nr_pages[gen][type][zone] = 0;
3311 		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
3312 			   lrugen->nr_pages[gen][type][zone] + delta);
3313 
3314 		if (lru_gen_is_active(lruvec, gen))
3315 			lru += LRU_ACTIVE;
3316 		__update_lru_size(lruvec, lru, zone, delta);
3317 	}
3318 }
3319 
3320 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
3321 {
3322 	struct address_space *mapping;
3323 	struct vm_area_struct *vma = args->vma;
3324 	struct lru_gen_mm_walk *walk = args->private;
3325 
3326 	if (!vma_is_accessible(vma))
3327 		return true;
3328 
3329 	if (is_vm_hugetlb_page(vma))
3330 		return true;
3331 
3332 	if (!vma_has_recency(vma))
3333 		return true;
3334 
3335 	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
3336 		return true;
3337 
3338 	if (vma == get_gate_vma(vma->vm_mm))
3339 		return true;
3340 
3341 	if (vma_is_anonymous(vma))
3342 		return !walk->swappiness;
3343 
3344 	if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
3345 		return true;
3346 
3347 	mapping = vma->vm_file->f_mapping;
3348 	if (mapping_unevictable(mapping))
3349 		return true;
3350 
3351 	if (shmem_mapping(mapping))
3352 		return !walk->swappiness;
3353 
3354 	if (walk->swappiness > MAX_SWAPPINESS)
3355 		return true;
3356 
3357 	/* to exclude special mappings like dax, etc. */
3358 	return !mapping->a_ops->read_folio;
3359 }
3360 
3361 /*
3362  * Some userspace memory allocators map many single-page VMAs. Instead of
3363  * returning back to the PGD table for each of such VMAs, finish an entire PMD
3364  * table to reduce zigzags and improve cache performance.
3365  */
3366 static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
3367 			 unsigned long *vm_start, unsigned long *vm_end)
3368 {
3369 	unsigned long start = round_up(*vm_end, size);
3370 	unsigned long end = (start | ~mask) + 1;
3371 	VMA_ITERATOR(vmi, args->mm, start);
3372 
3373 	VM_WARN_ON_ONCE(mask & size);
3374 	VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
3375 
3376 	for_each_vma(vmi, args->vma) {
3377 		if (end && end <= args->vma->vm_start)
3378 			return false;
3379 
3380 		if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
3381 			continue;
3382 
3383 		*vm_start = max(start, args->vma->vm_start);
3384 		*vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
3385 
3386 		return true;
3387 	}
3388 
3389 	return false;
3390 }
3391 
3392 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
3393 				 struct pglist_data *pgdat)
3394 {
3395 	unsigned long pfn = pte_pfn(pte);
3396 
3397 	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3398 
3399 	if (!pte_present(pte) || is_zero_pfn(pfn))
3400 		return -1;
3401 
3402 	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
3403 		return -1;
3404 
3405 	if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
3406 		return -1;
3407 
3408 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
3409 		return -1;
3410 
3411 	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3412 		return -1;
3413 
3414 	return pfn;
3415 }
3416 
3417 static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
3418 				 struct pglist_data *pgdat)
3419 {
3420 	unsigned long pfn = pmd_pfn(pmd);
3421 
3422 	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3423 
3424 	if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
3425 		return -1;
3426 
3427 	if (WARN_ON_ONCE(pmd_devmap(pmd)))
3428 		return -1;
3429 
3430 	if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
3431 		return -1;
3432 
3433 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
3434 		return -1;
3435 
3436 	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3437 		return -1;
3438 
3439 	return pfn;
3440 }
3441 
3442 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
3443 				   struct pglist_data *pgdat)
3444 {
3445 	struct folio *folio = pfn_folio(pfn);
3446 
3447 	if (folio_lru_gen(folio) < 0)
3448 		return NULL;
3449 
3450 	if (folio_nid(folio) != pgdat->node_id)
3451 		return NULL;
3452 
3453 	if (folio_memcg(folio) != memcg)
3454 		return NULL;
3455 
3456 	return folio;
3457 }
3458 
3459 static bool suitable_to_scan(int total, int young)
3460 {
3461 	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
3462 
3463 	/* suitable if the average number of young PTEs per cacheline is >=1 */
3464 	return young * n >= total;
3465 }
3466 
3467 static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio,
3468 			      int new_gen, bool dirty)
3469 {
3470 	int old_gen;
3471 
3472 	if (!folio)
3473 		return;
3474 
3475 	if (dirty && !folio_test_dirty(folio) &&
3476 	    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3477 	      !folio_test_swapcache(folio)))
3478 		folio_mark_dirty(folio);
3479 
3480 	if (walk) {
3481 		old_gen = folio_update_gen(folio, new_gen);
3482 		if (old_gen >= 0 && old_gen != new_gen)
3483 			update_batch_size(walk, folio, old_gen, new_gen);
3484 	} else if (lru_gen_set_refs(folio)) {
3485 		old_gen = folio_lru_gen(folio);
3486 		if (old_gen >= 0 && old_gen != new_gen)
3487 			folio_activate(folio);
3488 	}
3489 }
3490 
3491 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
3492 			   struct mm_walk *args)
3493 {
3494 	int i;
3495 	bool dirty;
3496 	pte_t *pte;
3497 	spinlock_t *ptl;
3498 	unsigned long addr;
3499 	int total = 0;
3500 	int young = 0;
3501 	struct folio *last = NULL;
3502 	struct lru_gen_mm_walk *walk = args->private;
3503 	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3504 	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3505 	DEFINE_MAX_SEQ(walk->lruvec);
3506 	int gen = lru_gen_from_seq(max_seq);
3507 	pmd_t pmdval;
3508 
3509 	pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
3510 	if (!pte)
3511 		return false;
3512 
3513 	if (!spin_trylock(ptl)) {
3514 		pte_unmap(pte);
3515 		return true;
3516 	}
3517 
3518 	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
3519 		pte_unmap_unlock(pte, ptl);
3520 		return false;
3521 	}
3522 
3523 	arch_enter_lazy_mmu_mode();
3524 restart:
3525 	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
3526 		unsigned long pfn;
3527 		struct folio *folio;
3528 		pte_t ptent = ptep_get(pte + i);
3529 
3530 		total++;
3531 		walk->mm_stats[MM_LEAF_TOTAL]++;
3532 
3533 		pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
3534 		if (pfn == -1)
3535 			continue;
3536 
3537 		folio = get_pfn_folio(pfn, memcg, pgdat);
3538 		if (!folio)
3539 			continue;
3540 
3541 		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
3542 			continue;
3543 
3544 		if (last != folio) {
3545 			walk_update_folio(walk, last, gen, dirty);
3546 
3547 			last = folio;
3548 			dirty = false;
3549 		}
3550 
3551 		if (pte_dirty(ptent))
3552 			dirty = true;
3553 
3554 		young++;
3555 		walk->mm_stats[MM_LEAF_YOUNG]++;
3556 	}
3557 
3558 	walk_update_folio(walk, last, gen, dirty);
3559 	last = NULL;
3560 
3561 	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
3562 		goto restart;
3563 
3564 	arch_leave_lazy_mmu_mode();
3565 	pte_unmap_unlock(pte, ptl);
3566 
3567 	return suitable_to_scan(total, young);
3568 }
3569 
3570 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
3571 				  struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
3572 {
3573 	int i;
3574 	bool dirty;
3575 	pmd_t *pmd;
3576 	spinlock_t *ptl;
3577 	struct folio *last = NULL;
3578 	struct lru_gen_mm_walk *walk = args->private;
3579 	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3580 	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3581 	DEFINE_MAX_SEQ(walk->lruvec);
3582 	int gen = lru_gen_from_seq(max_seq);
3583 
3584 	VM_WARN_ON_ONCE(pud_leaf(*pud));
3585 
3586 	/* try to batch at most 1+MIN_LRU_BATCH+1 entries */
3587 	if (*first == -1) {
3588 		*first = addr;
3589 		bitmap_zero(bitmap, MIN_LRU_BATCH);
3590 		return;
3591 	}
3592 
3593 	i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
3594 	if (i && i <= MIN_LRU_BATCH) {
3595 		__set_bit(i - 1, bitmap);
3596 		return;
3597 	}
3598 
3599 	pmd = pmd_offset(pud, *first);
3600 
3601 	ptl = pmd_lockptr(args->mm, pmd);
3602 	if (!spin_trylock(ptl))
3603 		goto done;
3604 
3605 	arch_enter_lazy_mmu_mode();
3606 
3607 	do {
3608 		unsigned long pfn;
3609 		struct folio *folio;
3610 
3611 		/* don't round down the first address */
3612 		addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
3613 
3614 		if (!pmd_present(pmd[i]))
3615 			goto next;
3616 
3617 		if (!pmd_trans_huge(pmd[i])) {
3618 			if (!walk->force_scan && should_clear_pmd_young() &&
3619 			    !mm_has_notifiers(args->mm))
3620 				pmdp_test_and_clear_young(vma, addr, pmd + i);
3621 			goto next;
3622 		}
3623 
3624 		pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
3625 		if (pfn == -1)
3626 			goto next;
3627 
3628 		folio = get_pfn_folio(pfn, memcg, pgdat);
3629 		if (!folio)
3630 			goto next;
3631 
3632 		if (!pmdp_clear_young_notify(vma, addr, pmd + i))
3633 			goto next;
3634 
3635 		if (last != folio) {
3636 			walk_update_folio(walk, last, gen, dirty);
3637 
3638 			last = folio;
3639 			dirty = false;
3640 		}
3641 
3642 		if (pmd_dirty(pmd[i]))
3643 			dirty = true;
3644 
3645 		walk->mm_stats[MM_LEAF_YOUNG]++;
3646 next:
3647 		i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
3648 	} while (i <= MIN_LRU_BATCH);
3649 
3650 	walk_update_folio(walk, last, gen, dirty);
3651 
3652 	arch_leave_lazy_mmu_mode();
3653 	spin_unlock(ptl);
3654 done:
3655 	*first = -1;
3656 }
3657 
3658 static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
3659 			   struct mm_walk *args)
3660 {
3661 	int i;
3662 	pmd_t *pmd;
3663 	unsigned long next;
3664 	unsigned long addr;
3665 	struct vm_area_struct *vma;
3666 	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
3667 	unsigned long first = -1;
3668 	struct lru_gen_mm_walk *walk = args->private;
3669 	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
3670 
3671 	VM_WARN_ON_ONCE(pud_leaf(*pud));
3672 
3673 	/*
3674 	 * Finish an entire PMD in two passes: the first only reaches to PTE
3675 	 * tables to avoid taking the PMD lock; the second, if necessary, takes
3676 	 * the PMD lock to clear the accessed bit in PMD entries.
3677 	 */
3678 	pmd = pmd_offset(pud, start & PUD_MASK);
3679 restart:
3680 	/* walk_pte_range() may call get_next_vma() */
3681 	vma = args->vma;
3682 	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
3683 		pmd_t val = pmdp_get_lockless(pmd + i);
3684 
3685 		next = pmd_addr_end(addr, end);
3686 
3687 		if (!pmd_present(val) || is_huge_zero_pmd(val)) {
3688 			walk->mm_stats[MM_LEAF_TOTAL]++;
3689 			continue;
3690 		}
3691 
3692 		if (pmd_trans_huge(val)) {
3693 			struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3694 			unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
3695 
3696 			walk->mm_stats[MM_LEAF_TOTAL]++;
3697 
3698 			if (pfn != -1)
3699 				walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3700 			continue;
3701 		}
3702 
3703 		if (!walk->force_scan && should_clear_pmd_young() &&
3704 		    !mm_has_notifiers(args->mm)) {
3705 			if (!pmd_young(val))
3706 				continue;
3707 
3708 			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3709 		}
3710 
3711 		if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
3712 			continue;
3713 
3714 		walk->mm_stats[MM_NONLEAF_FOUND]++;
3715 
3716 		if (!walk_pte_range(&val, addr, next, args))
3717 			continue;
3718 
3719 		walk->mm_stats[MM_NONLEAF_ADDED]++;
3720 
3721 		/* carry over to the next generation */
3722 		update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
3723 	}
3724 
3725 	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
3726 
3727 	if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
3728 		goto restart;
3729 }
3730 
3731 static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
3732 			  struct mm_walk *args)
3733 {
3734 	int i;
3735 	pud_t *pud;
3736 	unsigned long addr;
3737 	unsigned long next;
3738 	struct lru_gen_mm_walk *walk = args->private;
3739 
3740 	VM_WARN_ON_ONCE(p4d_leaf(*p4d));
3741 
3742 	pud = pud_offset(p4d, start & P4D_MASK);
3743 restart:
3744 	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
3745 		pud_t val = READ_ONCE(pud[i]);
3746 
3747 		next = pud_addr_end(addr, end);
3748 
3749 		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
3750 			continue;
3751 
3752 		walk_pmd_range(&val, addr, next, args);
3753 
3754 		if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
3755 			end = (addr | ~PUD_MASK) + 1;
3756 			goto done;
3757 		}
3758 	}
3759 
3760 	if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
3761 		goto restart;
3762 
3763 	end = round_up(end, P4D_SIZE);
3764 done:
3765 	if (!end || !args->vma)
3766 		return 1;
3767 
3768 	walk->next_addr = max(end, args->vma->vm_start);
3769 
3770 	return -EAGAIN;
3771 }
3772 
3773 static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
3774 {
3775 	static const struct mm_walk_ops mm_walk_ops = {
3776 		.test_walk = should_skip_vma,
3777 		.p4d_entry = walk_pud_range,
3778 		.walk_lock = PGWALK_RDLOCK,
3779 	};
3780 	int err;
3781 	struct lruvec *lruvec = walk->lruvec;
3782 
3783 	walk->next_addr = FIRST_USER_ADDRESS;
3784 
3785 	do {
3786 		DEFINE_MAX_SEQ(lruvec);
3787 
3788 		err = -EBUSY;
3789 
3790 		/* another thread might have called inc_max_seq() */
3791 		if (walk->seq != max_seq)
3792 			break;
3793 
3794 		/* the caller might be holding the lock for write */
3795 		if (mmap_read_trylock(mm)) {
3796 			err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
3797 
3798 			mmap_read_unlock(mm);
3799 		}
3800 
3801 		if (walk->batched) {
3802 			spin_lock_irq(&lruvec->lru_lock);
3803 			reset_batch_size(walk);
3804 			spin_unlock_irq(&lruvec->lru_lock);
3805 		}
3806 
3807 		cond_resched();
3808 	} while (err == -EAGAIN);
3809 }
3810 
3811 static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
3812 {
3813 	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3814 
3815 	if (pgdat && current_is_kswapd()) {
3816 		VM_WARN_ON_ONCE(walk);
3817 
3818 		walk = &pgdat->mm_walk;
3819 	} else if (!walk && force_alloc) {
3820 		VM_WARN_ON_ONCE(current_is_kswapd());
3821 
3822 		walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
3823 	}
3824 
3825 	current->reclaim_state->mm_walk = walk;
3826 
3827 	return walk;
3828 }
3829 
3830 static void clear_mm_walk(void)
3831 {
3832 	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3833 
3834 	VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
3835 	VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
3836 
3837 	current->reclaim_state->mm_walk = NULL;
3838 
3839 	if (!current_is_kswapd())
3840 		kfree(walk);
3841 }
3842 
3843 static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness)
3844 {
3845 	int zone;
3846 	int remaining = MAX_LRU_BATCH;
3847 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3848 	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3849 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3850 
3851 	if (type ? swappiness > MAX_SWAPPINESS : !swappiness)
3852 		goto done;
3853 
3854 	/* prevent cold/hot inversion if the type is evictable */
3855 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3856 		struct list_head *head = &lrugen->folios[old_gen][type][zone];
3857 
3858 		while (!list_empty(head)) {
3859 			struct folio *folio = lru_to_folio(head);
3860 			int refs = folio_lru_refs(folio);
3861 			bool workingset = folio_test_workingset(folio);
3862 
3863 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
3864 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
3865 			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
3866 			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
3867 
3868 			new_gen = folio_inc_gen(lruvec, folio, false);
3869 			list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
3870 
3871 			/* don't count the workingset being lazily promoted */
3872 			if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
3873 				int tier = lru_tier_from_refs(refs, workingset);
3874 				int delta = folio_nr_pages(folio);
3875 
3876 				WRITE_ONCE(lrugen->protected[hist][type][tier],
3877 					   lrugen->protected[hist][type][tier] + delta);
3878 			}
3879 
3880 			if (!--remaining)
3881 				return false;
3882 		}
3883 	}
3884 done:
3885 	reset_ctrl_pos(lruvec, type, true);
3886 	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
3887 
3888 	return true;
3889 }
3890 
3891 static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
3892 {
3893 	int gen, type, zone;
3894 	bool success = false;
3895 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3896 	DEFINE_MIN_SEQ(lruvec);
3897 
3898 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3899 
3900 	/* find the oldest populated generation */
3901 	for_each_evictable_type(type, swappiness) {
3902 		while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
3903 			gen = lru_gen_from_seq(min_seq[type]);
3904 
3905 			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3906 				if (!list_empty(&lrugen->folios[gen][type][zone]))
3907 					goto next;
3908 			}
3909 
3910 			min_seq[type]++;
3911 		}
3912 next:
3913 		;
3914 	}
3915 
3916 	/* see the comment on lru_gen_folio */
3917 	if (swappiness && swappiness <= MAX_SWAPPINESS) {
3918 		unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
3919 
3920 		if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
3921 			min_seq[LRU_GEN_ANON] = seq;
3922 		else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
3923 			min_seq[LRU_GEN_FILE] = seq;
3924 	}
3925 
3926 	for_each_evictable_type(type, swappiness) {
3927 		if (min_seq[type] <= lrugen->min_seq[type])
3928 			continue;
3929 
3930 		reset_ctrl_pos(lruvec, type, true);
3931 		WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
3932 		success = true;
3933 	}
3934 
3935 	return success;
3936 }
3937 
3938 static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness)
3939 {
3940 	bool success;
3941 	int prev, next;
3942 	int type, zone;
3943 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3944 restart:
3945 	if (seq < READ_ONCE(lrugen->max_seq))
3946 		return false;
3947 
3948 	spin_lock_irq(&lruvec->lru_lock);
3949 
3950 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3951 
3952 	success = seq == lrugen->max_seq;
3953 	if (!success)
3954 		goto unlock;
3955 
3956 	for (type = 0; type < ANON_AND_FILE; type++) {
3957 		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
3958 			continue;
3959 
3960 		if (inc_min_seq(lruvec, type, swappiness))
3961 			continue;
3962 
3963 		spin_unlock_irq(&lruvec->lru_lock);
3964 		cond_resched();
3965 		goto restart;
3966 	}
3967 
3968 	/*
3969 	 * Update the active/inactive LRU sizes for compatibility. Both sides of
3970 	 * the current max_seq need to be covered, since max_seq+1 can overlap
3971 	 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
3972 	 * overlap, cold/hot inversion happens.
3973 	 */
3974 	prev = lru_gen_from_seq(lrugen->max_seq - 1);
3975 	next = lru_gen_from_seq(lrugen->max_seq + 1);
3976 
3977 	for (type = 0; type < ANON_AND_FILE; type++) {
3978 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3979 			enum lru_list lru = type * LRU_INACTIVE_FILE;
3980 			long delta = lrugen->nr_pages[prev][type][zone] -
3981 				     lrugen->nr_pages[next][type][zone];
3982 
3983 			if (!delta)
3984 				continue;
3985 
3986 			__update_lru_size(lruvec, lru, zone, delta);
3987 			__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
3988 		}
3989 	}
3990 
3991 	for (type = 0; type < ANON_AND_FILE; type++)
3992 		reset_ctrl_pos(lruvec, type, false);
3993 
3994 	WRITE_ONCE(lrugen->timestamps[next], jiffies);
3995 	/* make sure preceding modifications appear */
3996 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
3997 unlock:
3998 	spin_unlock_irq(&lruvec->lru_lock);
3999 
4000 	return success;
4001 }
4002 
4003 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
4004 			       int swappiness, bool force_scan)
4005 {
4006 	bool success;
4007 	struct lru_gen_mm_walk *walk;
4008 	struct mm_struct *mm = NULL;
4009 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4010 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4011 
4012 	VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
4013 
4014 	if (!mm_state)
4015 		return inc_max_seq(lruvec, seq, swappiness);
4016 
4017 	/* see the comment in iterate_mm_list() */
4018 	if (seq <= READ_ONCE(mm_state->seq))
4019 		return false;
4020 
4021 	/*
4022 	 * If the hardware doesn't automatically set the accessed bit, fallback
4023 	 * to lru_gen_look_around(), which only clears the accessed bit in a
4024 	 * handful of PTEs. Spreading the work out over a period of time usually
4025 	 * is less efficient, but it avoids bursty page faults.
4026 	 */
4027 	if (!should_walk_mmu()) {
4028 		success = iterate_mm_list_nowalk(lruvec, seq);
4029 		goto done;
4030 	}
4031 
4032 	walk = set_mm_walk(NULL, true);
4033 	if (!walk) {
4034 		success = iterate_mm_list_nowalk(lruvec, seq);
4035 		goto done;
4036 	}
4037 
4038 	walk->lruvec = lruvec;
4039 	walk->seq = seq;
4040 	walk->swappiness = swappiness;
4041 	walk->force_scan = force_scan;
4042 
4043 	do {
4044 		success = iterate_mm_list(walk, &mm);
4045 		if (mm)
4046 			walk_mm(mm, walk);
4047 	} while (mm);
4048 done:
4049 	if (success) {
4050 		success = inc_max_seq(lruvec, seq, swappiness);
4051 		WARN_ON_ONCE(!success);
4052 	}
4053 
4054 	return success;
4055 }
4056 
4057 /******************************************************************************
4058  *                          working set protection
4059  ******************************************************************************/
4060 
4061 static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
4062 {
4063 	int priority;
4064 	unsigned long reclaimable;
4065 
4066 	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
4067 		return;
4068 	/*
4069 	 * Determine the initial priority based on
4070 	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
4071 	 * where reclaimed_to_scanned_ratio = inactive / total.
4072 	 */
4073 	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
4074 	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
4075 		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
4076 
4077 	/* round down reclaimable and round up sc->nr_to_reclaim */
4078 	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
4079 
4080 	/*
4081 	 * The estimation is based on LRU pages only, so cap it to prevent
4082 	 * overshoots of shrinker objects by large margins.
4083 	 */
4084 	sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
4085 }
4086 
4087 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
4088 {
4089 	int gen, type, zone;
4090 	unsigned long total = 0;
4091 	int swappiness = get_swappiness(lruvec, sc);
4092 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4093 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4094 	DEFINE_MAX_SEQ(lruvec);
4095 	DEFINE_MIN_SEQ(lruvec);
4096 
4097 	for_each_evictable_type(type, swappiness) {
4098 		unsigned long seq;
4099 
4100 		for (seq = min_seq[type]; seq <= max_seq; seq++) {
4101 			gen = lru_gen_from_seq(seq);
4102 
4103 			for (zone = 0; zone < MAX_NR_ZONES; zone++)
4104 				total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
4105 		}
4106 	}
4107 
4108 	/* whether the size is big enough to be helpful */
4109 	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
4110 }
4111 
4112 static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
4113 				  unsigned long min_ttl)
4114 {
4115 	int gen;
4116 	unsigned long birth;
4117 	int swappiness = get_swappiness(lruvec, sc);
4118 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4119 	DEFINE_MIN_SEQ(lruvec);
4120 
4121 	if (mem_cgroup_below_min(NULL, memcg))
4122 		return false;
4123 
4124 	if (!lruvec_is_sizable(lruvec, sc))
4125 		return false;
4126 
4127 	gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
4128 	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
4129 
4130 	return time_is_before_jiffies(birth + min_ttl);
4131 }
4132 
4133 /* to protect the working set of the last N jiffies */
4134 static unsigned long lru_gen_min_ttl __read_mostly;
4135 
4136 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
4137 {
4138 	struct mem_cgroup *memcg;
4139 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
4140 	bool reclaimable = !min_ttl;
4141 
4142 	VM_WARN_ON_ONCE(!current_is_kswapd());
4143 
4144 	set_initial_priority(pgdat, sc);
4145 
4146 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
4147 	do {
4148 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4149 
4150 		mem_cgroup_calculate_protection(NULL, memcg);
4151 
4152 		if (!reclaimable)
4153 			reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
4154 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
4155 
4156 	/*
4157 	 * The main goal is to OOM kill if every generation from all memcgs is
4158 	 * younger than min_ttl. However, another possibility is all memcgs are
4159 	 * either too small or below min.
4160 	 */
4161 	if (!reclaimable && mutex_trylock(&oom_lock)) {
4162 		struct oom_control oc = {
4163 			.gfp_mask = sc->gfp_mask,
4164 		};
4165 
4166 		out_of_memory(&oc);
4167 
4168 		mutex_unlock(&oom_lock);
4169 	}
4170 }
4171 
4172 /******************************************************************************
4173  *                          rmap/PT walk feedback
4174  ******************************************************************************/
4175 
4176 /*
4177  * This function exploits spatial locality when shrink_folio_list() walks the
4178  * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
4179  * the scan was done cacheline efficiently, it adds the PMD entry pointing to
4180  * the PTE table to the Bloom filter. This forms a feedback loop between the
4181  * eviction and the aging.
4182  */
4183 bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4184 {
4185 	int i;
4186 	bool dirty;
4187 	unsigned long start;
4188 	unsigned long end;
4189 	struct lru_gen_mm_walk *walk;
4190 	struct folio *last = NULL;
4191 	int young = 1;
4192 	pte_t *pte = pvmw->pte;
4193 	unsigned long addr = pvmw->address;
4194 	struct vm_area_struct *vma = pvmw->vma;
4195 	struct folio *folio = pfn_folio(pvmw->pfn);
4196 	struct mem_cgroup *memcg = folio_memcg(folio);
4197 	struct pglist_data *pgdat = folio_pgdat(folio);
4198 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4199 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4200 	DEFINE_MAX_SEQ(lruvec);
4201 	int gen = lru_gen_from_seq(max_seq);
4202 
4203 	lockdep_assert_held(pvmw->ptl);
4204 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
4205 
4206 	if (!ptep_clear_young_notify(vma, addr, pte))
4207 		return false;
4208 
4209 	if (spin_is_contended(pvmw->ptl))
4210 		return true;
4211 
4212 	/* exclude special VMAs containing anon pages from COW */
4213 	if (vma->vm_flags & VM_SPECIAL)
4214 		return true;
4215 
4216 	/* avoid taking the LRU lock under the PTL when possible */
4217 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
4218 
4219 	start = max(addr & PMD_MASK, vma->vm_start);
4220 	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
4221 
4222 	if (end - start == PAGE_SIZE)
4223 		return true;
4224 
4225 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
4226 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
4227 			end = start + MIN_LRU_BATCH * PAGE_SIZE;
4228 		else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
4229 			start = end - MIN_LRU_BATCH * PAGE_SIZE;
4230 		else {
4231 			start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
4232 			end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
4233 		}
4234 	}
4235 
4236 	arch_enter_lazy_mmu_mode();
4237 
4238 	pte -= (addr - start) / PAGE_SIZE;
4239 
4240 	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
4241 		unsigned long pfn;
4242 		pte_t ptent = ptep_get(pte + i);
4243 
4244 		pfn = get_pte_pfn(ptent, vma, addr, pgdat);
4245 		if (pfn == -1)
4246 			continue;
4247 
4248 		folio = get_pfn_folio(pfn, memcg, pgdat);
4249 		if (!folio)
4250 			continue;
4251 
4252 		if (!ptep_clear_young_notify(vma, addr, pte + i))
4253 			continue;
4254 
4255 		if (last != folio) {
4256 			walk_update_folio(walk, last, gen, dirty);
4257 
4258 			last = folio;
4259 			dirty = false;
4260 		}
4261 
4262 		if (pte_dirty(ptent))
4263 			dirty = true;
4264 
4265 		young++;
4266 	}
4267 
4268 	walk_update_folio(walk, last, gen, dirty);
4269 
4270 	arch_leave_lazy_mmu_mode();
4271 
4272 	/* feedback from rmap walkers to page table walkers */
4273 	if (mm_state && suitable_to_scan(i, young))
4274 		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
4275 
4276 	return true;
4277 }
4278 
4279 /******************************************************************************
4280  *                          memcg LRU
4281  ******************************************************************************/
4282 
4283 /* see the comment on MEMCG_NR_GENS */
4284 enum {
4285 	MEMCG_LRU_NOP,
4286 	MEMCG_LRU_HEAD,
4287 	MEMCG_LRU_TAIL,
4288 	MEMCG_LRU_OLD,
4289 	MEMCG_LRU_YOUNG,
4290 };
4291 
4292 static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
4293 {
4294 	int seg;
4295 	int old, new;
4296 	unsigned long flags;
4297 	int bin = get_random_u32_below(MEMCG_NR_BINS);
4298 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4299 
4300 	spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
4301 
4302 	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
4303 
4304 	seg = 0;
4305 	new = old = lruvec->lrugen.gen;
4306 
4307 	/* see the comment on MEMCG_NR_GENS */
4308 	if (op == MEMCG_LRU_HEAD)
4309 		seg = MEMCG_LRU_HEAD;
4310 	else if (op == MEMCG_LRU_TAIL)
4311 		seg = MEMCG_LRU_TAIL;
4312 	else if (op == MEMCG_LRU_OLD)
4313 		new = get_memcg_gen(pgdat->memcg_lru.seq);
4314 	else if (op == MEMCG_LRU_YOUNG)
4315 		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
4316 	else
4317 		VM_WARN_ON_ONCE(true);
4318 
4319 	WRITE_ONCE(lruvec->lrugen.seg, seg);
4320 	WRITE_ONCE(lruvec->lrugen.gen, new);
4321 
4322 	hlist_nulls_del_rcu(&lruvec->lrugen.list);
4323 
4324 	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
4325 		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4326 	else
4327 		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4328 
4329 	pgdat->memcg_lru.nr_memcgs[old]--;
4330 	pgdat->memcg_lru.nr_memcgs[new]++;
4331 
4332 	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
4333 		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
4334 
4335 	spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
4336 }
4337 
4338 #ifdef CONFIG_MEMCG
4339 
4340 void lru_gen_online_memcg(struct mem_cgroup *memcg)
4341 {
4342 	int gen;
4343 	int nid;
4344 	int bin = get_random_u32_below(MEMCG_NR_BINS);
4345 
4346 	for_each_node(nid) {
4347 		struct pglist_data *pgdat = NODE_DATA(nid);
4348 		struct lruvec *lruvec = get_lruvec(memcg, nid);
4349 
4350 		spin_lock_irq(&pgdat->memcg_lru.lock);
4351 
4352 		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
4353 
4354 		gen = get_memcg_gen(pgdat->memcg_lru.seq);
4355 
4356 		lruvec->lrugen.gen = gen;
4357 
4358 		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
4359 		pgdat->memcg_lru.nr_memcgs[gen]++;
4360 
4361 		spin_unlock_irq(&pgdat->memcg_lru.lock);
4362 	}
4363 }
4364 
4365 void lru_gen_offline_memcg(struct mem_cgroup *memcg)
4366 {
4367 	int nid;
4368 
4369 	for_each_node(nid) {
4370 		struct lruvec *lruvec = get_lruvec(memcg, nid);
4371 
4372 		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
4373 	}
4374 }
4375 
4376 void lru_gen_release_memcg(struct mem_cgroup *memcg)
4377 {
4378 	int gen;
4379 	int nid;
4380 
4381 	for_each_node(nid) {
4382 		struct pglist_data *pgdat = NODE_DATA(nid);
4383 		struct lruvec *lruvec = get_lruvec(memcg, nid);
4384 
4385 		spin_lock_irq(&pgdat->memcg_lru.lock);
4386 
4387 		if (hlist_nulls_unhashed(&lruvec->lrugen.list))
4388 			goto unlock;
4389 
4390 		gen = lruvec->lrugen.gen;
4391 
4392 		hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
4393 		pgdat->memcg_lru.nr_memcgs[gen]--;
4394 
4395 		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
4396 			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
4397 unlock:
4398 		spin_unlock_irq(&pgdat->memcg_lru.lock);
4399 	}
4400 }
4401 
4402 void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
4403 {
4404 	struct lruvec *lruvec = get_lruvec(memcg, nid);
4405 
4406 	/* see the comment on MEMCG_NR_GENS */
4407 	if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
4408 		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
4409 }
4410 
4411 #endif /* CONFIG_MEMCG */
4412 
4413 /******************************************************************************
4414  *                          the eviction
4415  ******************************************************************************/
4416 
4417 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
4418 		       int tier_idx)
4419 {
4420 	bool success;
4421 	bool dirty, writeback;
4422 	int gen = folio_lru_gen(folio);
4423 	int type = folio_is_file_lru(folio);
4424 	int zone = folio_zonenum(folio);
4425 	int delta = folio_nr_pages(folio);
4426 	int refs = folio_lru_refs(folio);
4427 	bool workingset = folio_test_workingset(folio);
4428 	int tier = lru_tier_from_refs(refs, workingset);
4429 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4430 
4431 	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
4432 
4433 	/* unevictable */
4434 	if (!folio_evictable(folio)) {
4435 		success = lru_gen_del_folio(lruvec, folio, true);
4436 		VM_WARN_ON_ONCE_FOLIO(!success, folio);
4437 		folio_set_unevictable(folio);
4438 		lruvec_add_folio(lruvec, folio);
4439 		__count_vm_events(UNEVICTABLE_PGCULLED, delta);
4440 		return true;
4441 	}
4442 
4443 	/* promoted */
4444 	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
4445 		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4446 		return true;
4447 	}
4448 
4449 	/* protected */
4450 	if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) {
4451 		gen = folio_inc_gen(lruvec, folio, false);
4452 		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4453 
4454 		/* don't count the workingset being lazily promoted */
4455 		if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) {
4456 			int hist = lru_hist_from_seq(lrugen->min_seq[type]);
4457 
4458 			WRITE_ONCE(lrugen->protected[hist][type][tier],
4459 				   lrugen->protected[hist][type][tier] + delta);
4460 		}
4461 		return true;
4462 	}
4463 
4464 	/* ineligible */
4465 	if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
4466 		gen = folio_inc_gen(lruvec, folio, false);
4467 		list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
4468 		return true;
4469 	}
4470 
4471 	dirty = folio_test_dirty(folio);
4472 	writeback = folio_test_writeback(folio);
4473 	if (type == LRU_GEN_FILE && dirty) {
4474 		sc->nr.file_taken += delta;
4475 		if (!writeback)
4476 			sc->nr.unqueued_dirty += delta;
4477 	}
4478 
4479 	/* waiting for writeback */
4480 	if (writeback || (type == LRU_GEN_FILE && dirty)) {
4481 		gen = folio_inc_gen(lruvec, folio, true);
4482 		list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4483 		return true;
4484 	}
4485 
4486 	return false;
4487 }
4488 
4489 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
4490 {
4491 	bool success;
4492 
4493 	/* swap constrained */
4494 	if (!(sc->gfp_mask & __GFP_IO) &&
4495 	    (folio_test_dirty(folio) ||
4496 	     (folio_test_anon(folio) && !folio_test_swapcache(folio))))
4497 		return false;
4498 
4499 	/* raced with release_pages() */
4500 	if (!folio_try_get(folio))
4501 		return false;
4502 
4503 	/* raced with another isolation */
4504 	if (!folio_test_clear_lru(folio)) {
4505 		folio_put(folio);
4506 		return false;
4507 	}
4508 
4509 	/* see the comment on LRU_REFS_FLAGS */
4510 	if (!folio_test_referenced(folio))
4511 		set_mask_bits(&folio->flags, LRU_REFS_MASK, 0);
4512 
4513 	/* for shrink_folio_list() */
4514 	folio_clear_reclaim(folio);
4515 
4516 	success = lru_gen_del_folio(lruvec, folio, true);
4517 	VM_WARN_ON_ONCE_FOLIO(!success, folio);
4518 
4519 	return true;
4520 }
4521 
4522 static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
4523 		       int type, int tier, struct list_head *list)
4524 {
4525 	int i;
4526 	int gen;
4527 	enum vm_event_item item;
4528 	int sorted = 0;
4529 	int scanned = 0;
4530 	int isolated = 0;
4531 	int skipped = 0;
4532 	int remaining = MAX_LRU_BATCH;
4533 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4534 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4535 
4536 	VM_WARN_ON_ONCE(!list_empty(list));
4537 
4538 	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
4539 		return 0;
4540 
4541 	gen = lru_gen_from_seq(lrugen->min_seq[type]);
4542 
4543 	for (i = MAX_NR_ZONES; i > 0; i--) {
4544 		LIST_HEAD(moved);
4545 		int skipped_zone = 0;
4546 		int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
4547 		struct list_head *head = &lrugen->folios[gen][type][zone];
4548 
4549 		while (!list_empty(head)) {
4550 			struct folio *folio = lru_to_folio(head);
4551 			int delta = folio_nr_pages(folio);
4552 
4553 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4554 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
4555 			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4556 			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
4557 
4558 			scanned += delta;
4559 
4560 			if (sort_folio(lruvec, folio, sc, tier))
4561 				sorted += delta;
4562 			else if (isolate_folio(lruvec, folio, sc)) {
4563 				list_add(&folio->lru, list);
4564 				isolated += delta;
4565 			} else {
4566 				list_move(&folio->lru, &moved);
4567 				skipped_zone += delta;
4568 			}
4569 
4570 			if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH)
4571 				break;
4572 		}
4573 
4574 		if (skipped_zone) {
4575 			list_splice(&moved, head);
4576 			__count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone);
4577 			skipped += skipped_zone;
4578 		}
4579 
4580 		if (!remaining || isolated >= MIN_LRU_BATCH)
4581 			break;
4582 	}
4583 
4584 	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
4585 	if (!cgroup_reclaim(sc)) {
4586 		__count_vm_events(item, isolated);
4587 		__count_vm_events(PGREFILL, sorted);
4588 	}
4589 	__count_memcg_events(memcg, item, isolated);
4590 	__count_memcg_events(memcg, PGREFILL, sorted);
4591 	__count_vm_events(PGSCAN_ANON + type, isolated);
4592 	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
4593 				scanned, skipped, isolated,
4594 				type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4595 	if (type == LRU_GEN_FILE)
4596 		sc->nr.file_taken += isolated;
4597 	/*
4598 	 * There might not be eligible folios due to reclaim_idx. Check the
4599 	 * remaining to prevent livelock if it's not making progress.
4600 	 */
4601 	return isolated || !remaining ? scanned : 0;
4602 }
4603 
4604 static int get_tier_idx(struct lruvec *lruvec, int type)
4605 {
4606 	int tier;
4607 	struct ctrl_pos sp, pv;
4608 
4609 	/*
4610 	 * To leave a margin for fluctuations, use a larger gain factor (2:3).
4611 	 * This value is chosen because any other tier would have at least twice
4612 	 * as many refaults as the first tier.
4613 	 */
4614 	read_ctrl_pos(lruvec, type, 0, 2, &sp);
4615 	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
4616 		read_ctrl_pos(lruvec, type, tier, 3, &pv);
4617 		if (!positive_ctrl_err(&sp, &pv))
4618 			break;
4619 	}
4620 
4621 	return tier - 1;
4622 }
4623 
4624 static int get_type_to_scan(struct lruvec *lruvec, int swappiness)
4625 {
4626 	struct ctrl_pos sp, pv;
4627 
4628 	if (swappiness <= MIN_SWAPPINESS + 1)
4629 		return LRU_GEN_FILE;
4630 
4631 	if (swappiness >= MAX_SWAPPINESS)
4632 		return LRU_GEN_ANON;
4633 	/*
4634 	 * Compare the sum of all tiers of anon with that of file to determine
4635 	 * which type to scan.
4636 	 */
4637 	read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
4638 	read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
4639 
4640 	return positive_ctrl_err(&sp, &pv);
4641 }
4642 
4643 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
4644 			  int *type_scanned, struct list_head *list)
4645 {
4646 	int i;
4647 	int type = get_type_to_scan(lruvec, swappiness);
4648 
4649 	for_each_evictable_type(i, swappiness) {
4650 		int scanned;
4651 		int tier = get_tier_idx(lruvec, type);
4652 
4653 		*type_scanned = type;
4654 
4655 		scanned = scan_folios(lruvec, sc, type, tier, list);
4656 		if (scanned)
4657 			return scanned;
4658 
4659 		type = !type;
4660 	}
4661 
4662 	return 0;
4663 }
4664 
4665 static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
4666 {
4667 	int type;
4668 	int scanned;
4669 	int reclaimed;
4670 	LIST_HEAD(list);
4671 	LIST_HEAD(clean);
4672 	struct folio *folio;
4673 	struct folio *next;
4674 	enum vm_event_item item;
4675 	struct reclaim_stat stat;
4676 	struct lru_gen_mm_walk *walk;
4677 	bool skip_retry = false;
4678 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4679 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4680 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4681 
4682 	spin_lock_irq(&lruvec->lru_lock);
4683 
4684 	scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
4685 
4686 	scanned += try_to_inc_min_seq(lruvec, swappiness);
4687 
4688 	if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
4689 		scanned = 0;
4690 
4691 	spin_unlock_irq(&lruvec->lru_lock);
4692 
4693 	if (list_empty(&list))
4694 		return scanned;
4695 retry:
4696 	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
4697 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
4698 	sc->nr_reclaimed += reclaimed;
4699 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
4700 			scanned, reclaimed, &stat, sc->priority,
4701 			type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4702 
4703 	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
4704 		DEFINE_MIN_SEQ(lruvec);
4705 
4706 		if (!folio_evictable(folio)) {
4707 			list_del(&folio->lru);
4708 			folio_putback_lru(folio);
4709 			continue;
4710 		}
4711 
4712 		/* retry folios that may have missed folio_rotate_reclaimable() */
4713 		if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
4714 		    !folio_test_dirty(folio) && !folio_test_writeback(folio)) {
4715 			list_move(&folio->lru, &clean);
4716 			continue;
4717 		}
4718 
4719 		/* don't add rejected folios to the oldest generation */
4720 		if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
4721 			set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active));
4722 	}
4723 
4724 	spin_lock_irq(&lruvec->lru_lock);
4725 
4726 	move_folios_to_lru(lruvec, &list);
4727 
4728 	walk = current->reclaim_state->mm_walk;
4729 	if (walk && walk->batched) {
4730 		walk->lruvec = lruvec;
4731 		reset_batch_size(walk);
4732 	}
4733 
4734 	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
4735 					stat.nr_demoted);
4736 
4737 	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
4738 	if (!cgroup_reclaim(sc))
4739 		__count_vm_events(item, reclaimed);
4740 	__count_memcg_events(memcg, item, reclaimed);
4741 	__count_vm_events(PGSTEAL_ANON + type, reclaimed);
4742 
4743 	spin_unlock_irq(&lruvec->lru_lock);
4744 
4745 	list_splice_init(&clean, &list);
4746 
4747 	if (!list_empty(&list)) {
4748 		skip_retry = true;
4749 		goto retry;
4750 	}
4751 
4752 	return scanned;
4753 }
4754 
4755 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
4756 			     int swappiness, unsigned long *nr_to_scan)
4757 {
4758 	int gen, type, zone;
4759 	unsigned long size = 0;
4760 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4761 	DEFINE_MIN_SEQ(lruvec);
4762 
4763 	*nr_to_scan = 0;
4764 	/* have to run aging, since eviction is not possible anymore */
4765 	if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
4766 		return true;
4767 
4768 	for_each_evictable_type(type, swappiness) {
4769 		unsigned long seq;
4770 
4771 		for (seq = min_seq[type]; seq <= max_seq; seq++) {
4772 			gen = lru_gen_from_seq(seq);
4773 
4774 			for (zone = 0; zone < MAX_NR_ZONES; zone++)
4775 				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
4776 		}
4777 	}
4778 
4779 	*nr_to_scan = size;
4780 	/* better to run aging even though eviction is still possible */
4781 	return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
4782 }
4783 
4784 /*
4785  * For future optimizations:
4786  * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
4787  *    reclaim.
4788  */
4789 static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
4790 {
4791 	bool success;
4792 	unsigned long nr_to_scan;
4793 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4794 	DEFINE_MAX_SEQ(lruvec);
4795 
4796 	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
4797 		return -1;
4798 
4799 	success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
4800 
4801 	/* try to scrape all its memory if this memcg was deleted */
4802 	if (nr_to_scan && !mem_cgroup_online(memcg))
4803 		return nr_to_scan;
4804 
4805 	/* try to get away with not aging at the default priority */
4806 	if (!success || sc->priority == DEF_PRIORITY)
4807 		return nr_to_scan >> sc->priority;
4808 
4809 	/* stop scanning this lruvec as it's low on cold folios */
4810 	return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0;
4811 }
4812 
4813 static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
4814 {
4815 	int i;
4816 	enum zone_watermarks mark;
4817 
4818 	/* don't abort memcg reclaim to ensure fairness */
4819 	if (!root_reclaim(sc))
4820 		return false;
4821 
4822 	if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
4823 		return true;
4824 
4825 	/* check the order to exclude compaction-induced reclaim */
4826 	if (!current_is_kswapd() || sc->order)
4827 		return false;
4828 
4829 	mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
4830 	       WMARK_PROMO : WMARK_HIGH;
4831 
4832 	for (i = 0; i <= sc->reclaim_idx; i++) {
4833 		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
4834 		unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
4835 
4836 		if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
4837 			return false;
4838 	}
4839 
4840 	/* kswapd should abort if all eligible zones are safe */
4841 	return true;
4842 }
4843 
4844 static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
4845 {
4846 	long nr_to_scan;
4847 	unsigned long scanned = 0;
4848 	int swappiness = get_swappiness(lruvec, sc);
4849 
4850 	while (true) {
4851 		int delta;
4852 
4853 		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
4854 		if (nr_to_scan <= 0)
4855 			break;
4856 
4857 		delta = evict_folios(lruvec, sc, swappiness);
4858 		if (!delta)
4859 			break;
4860 
4861 		scanned += delta;
4862 		if (scanned >= nr_to_scan)
4863 			break;
4864 
4865 		if (should_abort_scan(lruvec, sc))
4866 			break;
4867 
4868 		cond_resched();
4869 	}
4870 
4871 	/*
4872 	 * If too many file cache in the coldest generation can't be evicted
4873 	 * due to being dirty, wake up the flusher.
4874 	 */
4875 	if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
4876 		wakeup_flusher_threads(WB_REASON_VMSCAN);
4877 
4878 	/* whether this lruvec should be rotated */
4879 	return nr_to_scan < 0;
4880 }
4881 
4882 static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
4883 {
4884 	bool success;
4885 	unsigned long scanned = sc->nr_scanned;
4886 	unsigned long reclaimed = sc->nr_reclaimed;
4887 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4888 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4889 
4890 	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
4891 	if (mem_cgroup_below_min(NULL, memcg))
4892 		return MEMCG_LRU_YOUNG;
4893 
4894 	if (mem_cgroup_below_low(NULL, memcg)) {
4895 		/* see the comment on MEMCG_NR_GENS */
4896 		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
4897 			return MEMCG_LRU_TAIL;
4898 
4899 		memcg_memory_event(memcg, MEMCG_LOW);
4900 	}
4901 
4902 	success = try_to_shrink_lruvec(lruvec, sc);
4903 
4904 	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
4905 
4906 	if (!sc->proactive)
4907 		vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
4908 			   sc->nr_reclaimed - reclaimed);
4909 
4910 	flush_reclaim_state(sc);
4911 
4912 	if (success && mem_cgroup_online(memcg))
4913 		return MEMCG_LRU_YOUNG;
4914 
4915 	if (!success && lruvec_is_sizable(lruvec, sc))
4916 		return 0;
4917 
4918 	/* one retry if offlined or too small */
4919 	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
4920 	       MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
4921 }
4922 
4923 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
4924 {
4925 	int op;
4926 	int gen;
4927 	int bin;
4928 	int first_bin;
4929 	struct lruvec *lruvec;
4930 	struct lru_gen_folio *lrugen;
4931 	struct mem_cgroup *memcg;
4932 	struct hlist_nulls_node *pos;
4933 
4934 	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
4935 	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
4936 restart:
4937 	op = 0;
4938 	memcg = NULL;
4939 
4940 	rcu_read_lock();
4941 
4942 	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
4943 		if (op) {
4944 			lru_gen_rotate_memcg(lruvec, op);
4945 			op = 0;
4946 		}
4947 
4948 		mem_cgroup_put(memcg);
4949 		memcg = NULL;
4950 
4951 		if (gen != READ_ONCE(lrugen->gen))
4952 			continue;
4953 
4954 		lruvec = container_of(lrugen, struct lruvec, lrugen);
4955 		memcg = lruvec_memcg(lruvec);
4956 
4957 		if (!mem_cgroup_tryget(memcg)) {
4958 			lru_gen_release_memcg(memcg);
4959 			memcg = NULL;
4960 			continue;
4961 		}
4962 
4963 		rcu_read_unlock();
4964 
4965 		op = shrink_one(lruvec, sc);
4966 
4967 		rcu_read_lock();
4968 
4969 		if (should_abort_scan(lruvec, sc))
4970 			break;
4971 	}
4972 
4973 	rcu_read_unlock();
4974 
4975 	if (op)
4976 		lru_gen_rotate_memcg(lruvec, op);
4977 
4978 	mem_cgroup_put(memcg);
4979 
4980 	if (!is_a_nulls(pos))
4981 		return;
4982 
4983 	/* restart if raced with lru_gen_rotate_memcg() */
4984 	if (gen != get_nulls_value(pos))
4985 		goto restart;
4986 
4987 	/* try the rest of the bins of the current generation */
4988 	bin = get_memcg_bin(bin + 1);
4989 	if (bin != first_bin)
4990 		goto restart;
4991 }
4992 
4993 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
4994 {
4995 	struct blk_plug plug;
4996 
4997 	VM_WARN_ON_ONCE(root_reclaim(sc));
4998 	VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
4999 
5000 	lru_add_drain();
5001 
5002 	blk_start_plug(&plug);
5003 
5004 	set_mm_walk(NULL, sc->proactive);
5005 
5006 	if (try_to_shrink_lruvec(lruvec, sc))
5007 		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
5008 
5009 	clear_mm_walk();
5010 
5011 	blk_finish_plug(&plug);
5012 }
5013 
5014 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
5015 {
5016 	struct blk_plug plug;
5017 	unsigned long reclaimed = sc->nr_reclaimed;
5018 
5019 	VM_WARN_ON_ONCE(!root_reclaim(sc));
5020 
5021 	/*
5022 	 * Unmapped clean folios are already prioritized. Scanning for more of
5023 	 * them is likely futile and can cause high reclaim latency when there
5024 	 * is a large number of memcgs.
5025 	 */
5026 	if (!sc->may_writepage || !sc->may_unmap)
5027 		goto done;
5028 
5029 	lru_add_drain();
5030 
5031 	blk_start_plug(&plug);
5032 
5033 	set_mm_walk(pgdat, sc->proactive);
5034 
5035 	set_initial_priority(pgdat, sc);
5036 
5037 	if (current_is_kswapd())
5038 		sc->nr_reclaimed = 0;
5039 
5040 	if (mem_cgroup_disabled())
5041 		shrink_one(&pgdat->__lruvec, sc);
5042 	else
5043 		shrink_many(pgdat, sc);
5044 
5045 	if (current_is_kswapd())
5046 		sc->nr_reclaimed += reclaimed;
5047 
5048 	clear_mm_walk();
5049 
5050 	blk_finish_plug(&plug);
5051 done:
5052 	if (sc->nr_reclaimed > reclaimed)
5053 		pgdat->kswapd_failures = 0;
5054 }
5055 
5056 /******************************************************************************
5057  *                          state change
5058  ******************************************************************************/
5059 
5060 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
5061 {
5062 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5063 
5064 	if (lrugen->enabled) {
5065 		enum lru_list lru;
5066 
5067 		for_each_evictable_lru(lru) {
5068 			if (!list_empty(&lruvec->lists[lru]))
5069 				return false;
5070 		}
5071 	} else {
5072 		int gen, type, zone;
5073 
5074 		for_each_gen_type_zone(gen, type, zone) {
5075 			if (!list_empty(&lrugen->folios[gen][type][zone]))
5076 				return false;
5077 		}
5078 	}
5079 
5080 	return true;
5081 }
5082 
5083 static bool fill_evictable(struct lruvec *lruvec)
5084 {
5085 	enum lru_list lru;
5086 	int remaining = MAX_LRU_BATCH;
5087 
5088 	for_each_evictable_lru(lru) {
5089 		int type = is_file_lru(lru);
5090 		bool active = is_active_lru(lru);
5091 		struct list_head *head = &lruvec->lists[lru];
5092 
5093 		while (!list_empty(head)) {
5094 			bool success;
5095 			struct folio *folio = lru_to_folio(head);
5096 
5097 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5098 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
5099 			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5100 			VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
5101 
5102 			lruvec_del_folio(lruvec, folio);
5103 			success = lru_gen_add_folio(lruvec, folio, false);
5104 			VM_WARN_ON_ONCE(!success);
5105 
5106 			if (!--remaining)
5107 				return false;
5108 		}
5109 	}
5110 
5111 	return true;
5112 }
5113 
5114 static bool drain_evictable(struct lruvec *lruvec)
5115 {
5116 	int gen, type, zone;
5117 	int remaining = MAX_LRU_BATCH;
5118 
5119 	for_each_gen_type_zone(gen, type, zone) {
5120 		struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
5121 
5122 		while (!list_empty(head)) {
5123 			bool success;
5124 			struct folio *folio = lru_to_folio(head);
5125 
5126 			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5127 			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
5128 			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5129 			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
5130 
5131 			success = lru_gen_del_folio(lruvec, folio, false);
5132 			VM_WARN_ON_ONCE(!success);
5133 			lruvec_add_folio(lruvec, folio);
5134 
5135 			if (!--remaining)
5136 				return false;
5137 		}
5138 	}
5139 
5140 	return true;
5141 }
5142 
5143 static void lru_gen_change_state(bool enabled)
5144 {
5145 	static DEFINE_MUTEX(state_mutex);
5146 
5147 	struct mem_cgroup *memcg;
5148 
5149 	cgroup_lock();
5150 	cpus_read_lock();
5151 	get_online_mems();
5152 	mutex_lock(&state_mutex);
5153 
5154 	if (enabled == lru_gen_enabled())
5155 		goto unlock;
5156 
5157 	if (enabled)
5158 		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5159 	else
5160 		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5161 
5162 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
5163 	do {
5164 		int nid;
5165 
5166 		for_each_node(nid) {
5167 			struct lruvec *lruvec = get_lruvec(memcg, nid);
5168 
5169 			spin_lock_irq(&lruvec->lru_lock);
5170 
5171 			VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
5172 			VM_WARN_ON_ONCE(!state_is_valid(lruvec));
5173 
5174 			lruvec->lrugen.enabled = enabled;
5175 
5176 			while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
5177 				spin_unlock_irq(&lruvec->lru_lock);
5178 				cond_resched();
5179 				spin_lock_irq(&lruvec->lru_lock);
5180 			}
5181 
5182 			spin_unlock_irq(&lruvec->lru_lock);
5183 		}
5184 
5185 		cond_resched();
5186 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5187 unlock:
5188 	mutex_unlock(&state_mutex);
5189 	put_online_mems();
5190 	cpus_read_unlock();
5191 	cgroup_unlock();
5192 }
5193 
5194 /******************************************************************************
5195  *                          sysfs interface
5196  ******************************************************************************/
5197 
5198 static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
5199 {
5200 	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
5201 }
5202 
5203 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5204 static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
5205 				const char *buf, size_t len)
5206 {
5207 	unsigned int msecs;
5208 
5209 	if (kstrtouint(buf, 0, &msecs))
5210 		return -EINVAL;
5211 
5212 	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
5213 
5214 	return len;
5215 }
5216 
5217 static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
5218 
5219 static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
5220 {
5221 	unsigned int caps = 0;
5222 
5223 	if (get_cap(LRU_GEN_CORE))
5224 		caps |= BIT(LRU_GEN_CORE);
5225 
5226 	if (should_walk_mmu())
5227 		caps |= BIT(LRU_GEN_MM_WALK);
5228 
5229 	if (should_clear_pmd_young())
5230 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
5231 
5232 	return sysfs_emit(buf, "0x%04x\n", caps);
5233 }
5234 
5235 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5236 static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
5237 			     const char *buf, size_t len)
5238 {
5239 	int i;
5240 	unsigned int caps;
5241 
5242 	if (tolower(*buf) == 'n')
5243 		caps = 0;
5244 	else if (tolower(*buf) == 'y')
5245 		caps = -1;
5246 	else if (kstrtouint(buf, 0, &caps))
5247 		return -EINVAL;
5248 
5249 	for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
5250 		bool enabled = caps & BIT(i);
5251 
5252 		if (i == LRU_GEN_CORE)
5253 			lru_gen_change_state(enabled);
5254 		else if (enabled)
5255 			static_branch_enable(&lru_gen_caps[i]);
5256 		else
5257 			static_branch_disable(&lru_gen_caps[i]);
5258 	}
5259 
5260 	return len;
5261 }
5262 
5263 static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
5264 
5265 static struct attribute *lru_gen_attrs[] = {
5266 	&lru_gen_min_ttl_attr.attr,
5267 	&lru_gen_enabled_attr.attr,
5268 	NULL
5269 };
5270 
5271 static const struct attribute_group lru_gen_attr_group = {
5272 	.name = "lru_gen",
5273 	.attrs = lru_gen_attrs,
5274 };
5275 
5276 /******************************************************************************
5277  *                          debugfs interface
5278  ******************************************************************************/
5279 
5280 static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
5281 {
5282 	struct mem_cgroup *memcg;
5283 	loff_t nr_to_skip = *pos;
5284 
5285 	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
5286 	if (!m->private)
5287 		return ERR_PTR(-ENOMEM);
5288 
5289 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
5290 	do {
5291 		int nid;
5292 
5293 		for_each_node_state(nid, N_MEMORY) {
5294 			if (!nr_to_skip--)
5295 				return get_lruvec(memcg, nid);
5296 		}
5297 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5298 
5299 	return NULL;
5300 }
5301 
5302 static void lru_gen_seq_stop(struct seq_file *m, void *v)
5303 {
5304 	if (!IS_ERR_OR_NULL(v))
5305 		mem_cgroup_iter_break(NULL, lruvec_memcg(v));
5306 
5307 	kvfree(m->private);
5308 	m->private = NULL;
5309 }
5310 
5311 static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
5312 {
5313 	int nid = lruvec_pgdat(v)->node_id;
5314 	struct mem_cgroup *memcg = lruvec_memcg(v);
5315 
5316 	++*pos;
5317 
5318 	nid = next_memory_node(nid);
5319 	if (nid == MAX_NUMNODES) {
5320 		memcg = mem_cgroup_iter(NULL, memcg, NULL);
5321 		if (!memcg)
5322 			return NULL;
5323 
5324 		nid = first_memory_node;
5325 	}
5326 
5327 	return get_lruvec(memcg, nid);
5328 }
5329 
5330 static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
5331 				  unsigned long max_seq, unsigned long *min_seq,
5332 				  unsigned long seq)
5333 {
5334 	int i;
5335 	int type, tier;
5336 	int hist = lru_hist_from_seq(seq);
5337 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5338 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5339 
5340 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
5341 		seq_printf(m, "            %10d", tier);
5342 		for (type = 0; type < ANON_AND_FILE; type++) {
5343 			const char *s = "xxx";
5344 			unsigned long n[3] = {};
5345 
5346 			if (seq == max_seq) {
5347 				s = "RTx";
5348 				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
5349 				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
5350 			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
5351 				s = "rep";
5352 				n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
5353 				n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
5354 				n[2] = READ_ONCE(lrugen->protected[hist][type][tier]);
5355 			}
5356 
5357 			for (i = 0; i < 3; i++)
5358 				seq_printf(m, " %10lu%c", n[i], s[i]);
5359 		}
5360 		seq_putc(m, '\n');
5361 	}
5362 
5363 	if (!mm_state)
5364 		return;
5365 
5366 	seq_puts(m, "                      ");
5367 	for (i = 0; i < NR_MM_STATS; i++) {
5368 		const char *s = "xxxx";
5369 		unsigned long n = 0;
5370 
5371 		if (seq == max_seq && NR_HIST_GENS == 1) {
5372 			s = "TYFA";
5373 			n = READ_ONCE(mm_state->stats[hist][i]);
5374 		} else if (seq != max_seq && NR_HIST_GENS > 1) {
5375 			s = "tyfa";
5376 			n = READ_ONCE(mm_state->stats[hist][i]);
5377 		}
5378 
5379 		seq_printf(m, " %10lu%c", n, s[i]);
5380 	}
5381 	seq_putc(m, '\n');
5382 }
5383 
5384 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5385 static int lru_gen_seq_show(struct seq_file *m, void *v)
5386 {
5387 	unsigned long seq;
5388 	bool full = !debugfs_real_fops(m->file)->write;
5389 	struct lruvec *lruvec = v;
5390 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5391 	int nid = lruvec_pgdat(lruvec)->node_id;
5392 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5393 	DEFINE_MAX_SEQ(lruvec);
5394 	DEFINE_MIN_SEQ(lruvec);
5395 
5396 	if (nid == first_memory_node) {
5397 		const char *path = memcg ? m->private : "";
5398 
5399 #ifdef CONFIG_MEMCG
5400 		if (memcg)
5401 			cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
5402 #endif
5403 		seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
5404 	}
5405 
5406 	seq_printf(m, " node %5d\n", nid);
5407 
5408 	if (!full)
5409 		seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2);
5410 	else if (max_seq >= MAX_NR_GENS)
5411 		seq = max_seq - MAX_NR_GENS + 1;
5412 	else
5413 		seq = 0;
5414 
5415 	for (; seq <= max_seq; seq++) {
5416 		int type, zone;
5417 		int gen = lru_gen_from_seq(seq);
5418 		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
5419 
5420 		seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
5421 
5422 		for (type = 0; type < ANON_AND_FILE; type++) {
5423 			unsigned long size = 0;
5424 			char mark = full && seq < min_seq[type] ? 'x' : ' ';
5425 
5426 			for (zone = 0; zone < MAX_NR_ZONES; zone++)
5427 				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
5428 
5429 			seq_printf(m, " %10lu%c", size, mark);
5430 		}
5431 
5432 		seq_putc(m, '\n');
5433 
5434 		if (full)
5435 			lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
5436 	}
5437 
5438 	return 0;
5439 }
5440 
5441 static const struct seq_operations lru_gen_seq_ops = {
5442 	.start = lru_gen_seq_start,
5443 	.stop = lru_gen_seq_stop,
5444 	.next = lru_gen_seq_next,
5445 	.show = lru_gen_seq_show,
5446 };
5447 
5448 static int run_aging(struct lruvec *lruvec, unsigned long seq,
5449 		     int swappiness, bool force_scan)
5450 {
5451 	DEFINE_MAX_SEQ(lruvec);
5452 
5453 	if (seq > max_seq)
5454 		return -EINVAL;
5455 
5456 	return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 0 : -EEXIST;
5457 }
5458 
5459 static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
5460 			int swappiness, unsigned long nr_to_reclaim)
5461 {
5462 	DEFINE_MAX_SEQ(lruvec);
5463 
5464 	if (seq + MIN_NR_GENS > max_seq)
5465 		return -EINVAL;
5466 
5467 	sc->nr_reclaimed = 0;
5468 
5469 	while (!signal_pending(current)) {
5470 		DEFINE_MIN_SEQ(lruvec);
5471 
5472 		if (seq < evictable_min_seq(min_seq, swappiness))
5473 			return 0;
5474 
5475 		if (sc->nr_reclaimed >= nr_to_reclaim)
5476 			return 0;
5477 
5478 		if (!evict_folios(lruvec, sc, swappiness))
5479 			return 0;
5480 
5481 		cond_resched();
5482 	}
5483 
5484 	return -EINTR;
5485 }
5486 
5487 static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
5488 		   struct scan_control *sc, int swappiness, unsigned long opt)
5489 {
5490 	struct lruvec *lruvec;
5491 	int err = -EINVAL;
5492 	struct mem_cgroup *memcg = NULL;
5493 
5494 	if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
5495 		return -EINVAL;
5496 
5497 	if (!mem_cgroup_disabled()) {
5498 		rcu_read_lock();
5499 
5500 		memcg = mem_cgroup_from_id(memcg_id);
5501 		if (!mem_cgroup_tryget(memcg))
5502 			memcg = NULL;
5503 
5504 		rcu_read_unlock();
5505 
5506 		if (!memcg)
5507 			return -EINVAL;
5508 	}
5509 
5510 	if (memcg_id != mem_cgroup_id(memcg))
5511 		goto done;
5512 
5513 	lruvec = get_lruvec(memcg, nid);
5514 
5515 	if (swappiness < MIN_SWAPPINESS)
5516 		swappiness = get_swappiness(lruvec, sc);
5517 	else if (swappiness > MAX_SWAPPINESS + 1)
5518 		goto done;
5519 
5520 	switch (cmd) {
5521 	case '+':
5522 		err = run_aging(lruvec, seq, swappiness, opt);
5523 		break;
5524 	case '-':
5525 		err = run_eviction(lruvec, seq, sc, swappiness, opt);
5526 		break;
5527 	}
5528 done:
5529 	mem_cgroup_put(memcg);
5530 
5531 	return err;
5532 }
5533 
5534 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5535 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
5536 				 size_t len, loff_t *pos)
5537 {
5538 	void *buf;
5539 	char *cur, *next;
5540 	unsigned int flags;
5541 	struct blk_plug plug;
5542 	int err = -EINVAL;
5543 	struct scan_control sc = {
5544 		.may_writepage = true,
5545 		.may_unmap = true,
5546 		.may_swap = true,
5547 		.reclaim_idx = MAX_NR_ZONES - 1,
5548 		.gfp_mask = GFP_KERNEL,
5549 	};
5550 
5551 	buf = kvmalloc(len + 1, GFP_KERNEL);
5552 	if (!buf)
5553 		return -ENOMEM;
5554 
5555 	if (copy_from_user(buf, src, len)) {
5556 		kvfree(buf);
5557 		return -EFAULT;
5558 	}
5559 
5560 	set_task_reclaim_state(current, &sc.reclaim_state);
5561 	flags = memalloc_noreclaim_save();
5562 	blk_start_plug(&plug);
5563 	if (!set_mm_walk(NULL, true)) {
5564 		err = -ENOMEM;
5565 		goto done;
5566 	}
5567 
5568 	next = buf;
5569 	next[len] = '\0';
5570 
5571 	while ((cur = strsep(&next, ",;\n"))) {
5572 		int n;
5573 		int end;
5574 		char cmd;
5575 		unsigned int memcg_id;
5576 		unsigned int nid;
5577 		unsigned long seq;
5578 		unsigned int swappiness = -1;
5579 		unsigned long opt = -1;
5580 
5581 		cur = skip_spaces(cur);
5582 		if (!*cur)
5583 			continue;
5584 
5585 		n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
5586 			   &seq, &end, &swappiness, &end, &opt, &end);
5587 		if (n < 4 || cur[end]) {
5588 			err = -EINVAL;
5589 			break;
5590 		}
5591 
5592 		err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
5593 		if (err)
5594 			break;
5595 	}
5596 done:
5597 	clear_mm_walk();
5598 	blk_finish_plug(&plug);
5599 	memalloc_noreclaim_restore(flags);
5600 	set_task_reclaim_state(current, NULL);
5601 
5602 	kvfree(buf);
5603 
5604 	return err ? : len;
5605 }
5606 
5607 static int lru_gen_seq_open(struct inode *inode, struct file *file)
5608 {
5609 	return seq_open(file, &lru_gen_seq_ops);
5610 }
5611 
5612 static const struct file_operations lru_gen_rw_fops = {
5613 	.open = lru_gen_seq_open,
5614 	.read = seq_read,
5615 	.write = lru_gen_seq_write,
5616 	.llseek = seq_lseek,
5617 	.release = seq_release,
5618 };
5619 
5620 static const struct file_operations lru_gen_ro_fops = {
5621 	.open = lru_gen_seq_open,
5622 	.read = seq_read,
5623 	.llseek = seq_lseek,
5624 	.release = seq_release,
5625 };
5626 
5627 /******************************************************************************
5628  *                          initialization
5629  ******************************************************************************/
5630 
5631 void lru_gen_init_pgdat(struct pglist_data *pgdat)
5632 {
5633 	int i, j;
5634 
5635 	spin_lock_init(&pgdat->memcg_lru.lock);
5636 
5637 	for (i = 0; i < MEMCG_NR_GENS; i++) {
5638 		for (j = 0; j < MEMCG_NR_BINS; j++)
5639 			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
5640 	}
5641 }
5642 
5643 void lru_gen_init_lruvec(struct lruvec *lruvec)
5644 {
5645 	int i;
5646 	int gen, type, zone;
5647 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5648 	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5649 
5650 	lrugen->max_seq = MIN_NR_GENS + 1;
5651 	lrugen->enabled = lru_gen_enabled();
5652 
5653 	for (i = 0; i <= MIN_NR_GENS + 1; i++)
5654 		lrugen->timestamps[i] = jiffies;
5655 
5656 	for_each_gen_type_zone(gen, type, zone)
5657 		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
5658 
5659 	if (mm_state)
5660 		mm_state->seq = MIN_NR_GENS;
5661 }
5662 
5663 #ifdef CONFIG_MEMCG
5664 
5665 void lru_gen_init_memcg(struct mem_cgroup *memcg)
5666 {
5667 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5668 
5669 	if (!mm_list)
5670 		return;
5671 
5672 	INIT_LIST_HEAD(&mm_list->fifo);
5673 	spin_lock_init(&mm_list->lock);
5674 }
5675 
5676 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
5677 {
5678 	int i;
5679 	int nid;
5680 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5681 
5682 	VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
5683 
5684 	for_each_node(nid) {
5685 		struct lruvec *lruvec = get_lruvec(memcg, nid);
5686 		struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5687 
5688 		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
5689 					   sizeof(lruvec->lrugen.nr_pages)));
5690 
5691 		lruvec->lrugen.list.next = LIST_POISON1;
5692 
5693 		if (!mm_state)
5694 			continue;
5695 
5696 		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
5697 			bitmap_free(mm_state->filters[i]);
5698 			mm_state->filters[i] = NULL;
5699 		}
5700 	}
5701 }
5702 
5703 #endif /* CONFIG_MEMCG */
5704 
5705 static int __init init_lru_gen(void)
5706 {
5707 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
5708 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
5709 
5710 	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
5711 		pr_err("lru_gen: failed to create sysfs group\n");
5712 
5713 	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
5714 	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
5715 
5716 	return 0;
5717 };
5718 late_initcall(init_lru_gen);
5719 
5720 #else /* !CONFIG_LRU_GEN */
5721 
5722 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
5723 {
5724 	BUILD_BUG();
5725 }
5726 
5727 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5728 {
5729 	BUILD_BUG();
5730 }
5731 
5732 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
5733 {
5734 	BUILD_BUG();
5735 }
5736 
5737 #endif /* CONFIG_LRU_GEN */
5738 
5739 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5740 {
5741 	unsigned long nr[NR_LRU_LISTS];
5742 	unsigned long targets[NR_LRU_LISTS];
5743 	unsigned long nr_to_scan;
5744 	enum lru_list lru;
5745 	unsigned long nr_reclaimed = 0;
5746 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
5747 	bool proportional_reclaim;
5748 	struct blk_plug plug;
5749 
5750 	if (lru_gen_enabled() && !root_reclaim(sc)) {
5751 		lru_gen_shrink_lruvec(lruvec, sc);
5752 		return;
5753 	}
5754 
5755 	get_scan_count(lruvec, sc, nr);
5756 
5757 	/* Record the original scan target for proportional adjustments later */
5758 	memcpy(targets, nr, sizeof(nr));
5759 
5760 	/*
5761 	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
5762 	 * event that can occur when there is little memory pressure e.g.
5763 	 * multiple streaming readers/writers. Hence, we do not abort scanning
5764 	 * when the requested number of pages are reclaimed when scanning at
5765 	 * DEF_PRIORITY on the assumption that the fact we are direct
5766 	 * reclaiming implies that kswapd is not keeping up and it is best to
5767 	 * do a batch of work at once. For memcg reclaim one check is made to
5768 	 * abort proportional reclaim if either the file or anon lru has already
5769 	 * dropped to zero at the first pass.
5770 	 */
5771 	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
5772 				sc->priority == DEF_PRIORITY);
5773 
5774 	blk_start_plug(&plug);
5775 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
5776 					nr[LRU_INACTIVE_FILE]) {
5777 		unsigned long nr_anon, nr_file, percentage;
5778 		unsigned long nr_scanned;
5779 
5780 		for_each_evictable_lru(lru) {
5781 			if (nr[lru]) {
5782 				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
5783 				nr[lru] -= nr_to_scan;
5784 
5785 				nr_reclaimed += shrink_list(lru, nr_to_scan,
5786 							    lruvec, sc);
5787 			}
5788 		}
5789 
5790 		cond_resched();
5791 
5792 		if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
5793 			continue;
5794 
5795 		/*
5796 		 * For kswapd and memcg, reclaim at least the number of pages
5797 		 * requested. Ensure that the anon and file LRUs are scanned
5798 		 * proportionally what was requested by get_scan_count(). We
5799 		 * stop reclaiming one LRU and reduce the amount scanning
5800 		 * proportional to the original scan target.
5801 		 */
5802 		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
5803 		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
5804 
5805 		/*
5806 		 * It's just vindictive to attack the larger once the smaller
5807 		 * has gone to zero.  And given the way we stop scanning the
5808 		 * smaller below, this makes sure that we only make one nudge
5809 		 * towards proportionality once we've got nr_to_reclaim.
5810 		 */
5811 		if (!nr_file || !nr_anon)
5812 			break;
5813 
5814 		if (nr_file > nr_anon) {
5815 			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
5816 						targets[LRU_ACTIVE_ANON] + 1;
5817 			lru = LRU_BASE;
5818 			percentage = nr_anon * 100 / scan_target;
5819 		} else {
5820 			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
5821 						targets[LRU_ACTIVE_FILE] + 1;
5822 			lru = LRU_FILE;
5823 			percentage = nr_file * 100 / scan_target;
5824 		}
5825 
5826 		/* Stop scanning the smaller of the LRU */
5827 		nr[lru] = 0;
5828 		nr[lru + LRU_ACTIVE] = 0;
5829 
5830 		/*
5831 		 * Recalculate the other LRU scan count based on its original
5832 		 * scan target and the percentage scanning already complete
5833 		 */
5834 		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
5835 		nr_scanned = targets[lru] - nr[lru];
5836 		nr[lru] = targets[lru] * (100 - percentage) / 100;
5837 		nr[lru] -= min(nr[lru], nr_scanned);
5838 
5839 		lru += LRU_ACTIVE;
5840 		nr_scanned = targets[lru] - nr[lru];
5841 		nr[lru] = targets[lru] * (100 - percentage) / 100;
5842 		nr[lru] -= min(nr[lru], nr_scanned);
5843 	}
5844 	blk_finish_plug(&plug);
5845 	sc->nr_reclaimed += nr_reclaimed;
5846 
5847 	/*
5848 	 * Even if we did not try to evict anon pages at all, we want to
5849 	 * rebalance the anon lru active/inactive ratio.
5850 	 */
5851 	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
5852 	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
5853 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
5854 				   sc, LRU_ACTIVE_ANON);
5855 }
5856 
5857 /* Use reclaim/compaction for costly allocs or under memory pressure */
5858 static bool in_reclaim_compaction(struct scan_control *sc)
5859 {
5860 	if (gfp_compaction_allowed(sc->gfp_mask) && sc->order &&
5861 			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
5862 			 sc->priority < DEF_PRIORITY - 2))
5863 		return true;
5864 
5865 	return false;
5866 }
5867 
5868 /*
5869  * Reclaim/compaction is used for high-order allocation requests. It reclaims
5870  * order-0 pages before compacting the zone. should_continue_reclaim() returns
5871  * true if more pages should be reclaimed such that when the page allocator
5872  * calls try_to_compact_pages() that it will have enough free pages to succeed.
5873  * It will give up earlier than that if there is difficulty reclaiming pages.
5874  */
5875 static inline bool should_continue_reclaim(struct pglist_data *pgdat,
5876 					unsigned long nr_reclaimed,
5877 					struct scan_control *sc)
5878 {
5879 	unsigned long pages_for_compaction;
5880 	unsigned long inactive_lru_pages;
5881 	int z;
5882 	struct zone *zone;
5883 
5884 	/* If not in reclaim/compaction mode, stop */
5885 	if (!in_reclaim_compaction(sc))
5886 		return false;
5887 
5888 	/*
5889 	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
5890 	 * number of pages that were scanned. This will return to the caller
5891 	 * with the risk reclaim/compaction and the resulting allocation attempt
5892 	 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
5893 	 * allocations through requiring that the full LRU list has been scanned
5894 	 * first, by assuming that zero delta of sc->nr_scanned means full LRU
5895 	 * scan, but that approximation was wrong, and there were corner cases
5896 	 * where always a non-zero amount of pages were scanned.
5897 	 */
5898 	if (!nr_reclaimed)
5899 		return false;
5900 
5901 	/* If compaction would go ahead or the allocation would succeed, stop */
5902 	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
5903 		unsigned long watermark = min_wmark_pages(zone);
5904 
5905 		/* Allocation can already succeed, nothing to do */
5906 		if (zone_watermark_ok(zone, sc->order, watermark,
5907 				      sc->reclaim_idx, 0))
5908 			return false;
5909 
5910 		if (compaction_suitable(zone, sc->order, watermark,
5911 					sc->reclaim_idx))
5912 			return false;
5913 	}
5914 
5915 	/*
5916 	 * If we have not reclaimed enough pages for compaction and the
5917 	 * inactive lists are large enough, continue reclaiming
5918 	 */
5919 	pages_for_compaction = compact_gap(sc->order);
5920 	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
5921 	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
5922 		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
5923 
5924 	return inactive_lru_pages > pages_for_compaction;
5925 }
5926 
5927 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
5928 {
5929 	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
5930 	struct mem_cgroup_reclaim_cookie reclaim = {
5931 		.pgdat = pgdat,
5932 	};
5933 	struct mem_cgroup_reclaim_cookie *partial = &reclaim;
5934 	struct mem_cgroup *memcg;
5935 
5936 	/*
5937 	 * In most cases, direct reclaimers can do partial walks
5938 	 * through the cgroup tree, using an iterator state that
5939 	 * persists across invocations. This strikes a balance between
5940 	 * fairness and allocation latency.
5941 	 *
5942 	 * For kswapd, reliable forward progress is more important
5943 	 * than a quick return to idle. Always do full walks.
5944 	 */
5945 	if (current_is_kswapd() || sc->memcg_full_walk)
5946 		partial = NULL;
5947 
5948 	memcg = mem_cgroup_iter(target_memcg, NULL, partial);
5949 	do {
5950 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
5951 		unsigned long reclaimed;
5952 		unsigned long scanned;
5953 
5954 		/*
5955 		 * This loop can become CPU-bound when target memcgs
5956 		 * aren't eligible for reclaim - either because they
5957 		 * don't have any reclaimable pages, or because their
5958 		 * memory is explicitly protected. Avoid soft lockups.
5959 		 */
5960 		cond_resched();
5961 
5962 		mem_cgroup_calculate_protection(target_memcg, memcg);
5963 
5964 		if (mem_cgroup_below_min(target_memcg, memcg)) {
5965 			/*
5966 			 * Hard protection.
5967 			 * If there is no reclaimable memory, OOM.
5968 			 */
5969 			continue;
5970 		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
5971 			/*
5972 			 * Soft protection.
5973 			 * Respect the protection only as long as
5974 			 * there is an unprotected supply
5975 			 * of reclaimable memory from other cgroups.
5976 			 */
5977 			if (!sc->memcg_low_reclaim) {
5978 				sc->memcg_low_skipped = 1;
5979 				continue;
5980 			}
5981 			memcg_memory_event(memcg, MEMCG_LOW);
5982 		}
5983 
5984 		reclaimed = sc->nr_reclaimed;
5985 		scanned = sc->nr_scanned;
5986 
5987 		shrink_lruvec(lruvec, sc);
5988 
5989 		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
5990 			    sc->priority);
5991 
5992 		/* Record the group's reclaim efficiency */
5993 		if (!sc->proactive)
5994 			vmpressure(sc->gfp_mask, memcg, false,
5995 				   sc->nr_scanned - scanned,
5996 				   sc->nr_reclaimed - reclaimed);
5997 
5998 		/* If partial walks are allowed, bail once goal is reached */
5999 		if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
6000 			mem_cgroup_iter_break(target_memcg, memcg);
6001 			break;
6002 		}
6003 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
6004 }
6005 
6006 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
6007 {
6008 	unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
6009 	struct lruvec *target_lruvec;
6010 	bool reclaimable = false;
6011 
6012 	if (lru_gen_enabled() && root_reclaim(sc)) {
6013 		memset(&sc->nr, 0, sizeof(sc->nr));
6014 		lru_gen_shrink_node(pgdat, sc);
6015 		return;
6016 	}
6017 
6018 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
6019 
6020 again:
6021 	memset(&sc->nr, 0, sizeof(sc->nr));
6022 
6023 	nr_reclaimed = sc->nr_reclaimed;
6024 	nr_scanned = sc->nr_scanned;
6025 
6026 	prepare_scan_control(pgdat, sc);
6027 
6028 	shrink_node_memcgs(pgdat, sc);
6029 
6030 	flush_reclaim_state(sc);
6031 
6032 	nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed;
6033 
6034 	/* Record the subtree's reclaim efficiency */
6035 	if (!sc->proactive)
6036 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
6037 			   sc->nr_scanned - nr_scanned, nr_node_reclaimed);
6038 
6039 	if (nr_node_reclaimed)
6040 		reclaimable = true;
6041 
6042 	if (current_is_kswapd()) {
6043 		/*
6044 		 * If reclaim is isolating dirty pages under writeback,
6045 		 * it implies that the long-lived page allocation rate
6046 		 * is exceeding the page laundering rate. Either the
6047 		 * global limits are not being effective at throttling
6048 		 * processes due to the page distribution throughout
6049 		 * zones or there is heavy usage of a slow backing
6050 		 * device. The only option is to throttle from reclaim
6051 		 * context which is not ideal as there is no guarantee
6052 		 * the dirtying process is throttled in the same way
6053 		 * balance_dirty_pages() manages.
6054 		 *
6055 		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
6056 		 * count the number of pages under pages flagged for
6057 		 * immediate reclaim and stall if any are encountered
6058 		 * in the nr_immediate check below.
6059 		 */
6060 		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
6061 			set_bit(PGDAT_WRITEBACK, &pgdat->flags);
6062 
6063 		/* Allow kswapd to start writing pages during reclaim.*/
6064 		if (sc->nr.unqueued_dirty &&
6065 			sc->nr.unqueued_dirty == sc->nr.file_taken)
6066 			set_bit(PGDAT_DIRTY, &pgdat->flags);
6067 
6068 		/*
6069 		 * If kswapd scans pages marked for immediate
6070 		 * reclaim and under writeback (nr_immediate), it
6071 		 * implies that pages are cycling through the LRU
6072 		 * faster than they are written so forcibly stall
6073 		 * until some pages complete writeback.
6074 		 */
6075 		if (sc->nr.immediate)
6076 			reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
6077 	}
6078 
6079 	/*
6080 	 * Tag a node/memcg as congested if all the dirty pages were marked
6081 	 * for writeback and immediate reclaim (counted in nr.congested).
6082 	 *
6083 	 * Legacy memcg will stall in page writeback so avoid forcibly
6084 	 * stalling in reclaim_throttle().
6085 	 */
6086 	if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
6087 		if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
6088 			set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
6089 
6090 		if (current_is_kswapd())
6091 			set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
6092 	}
6093 
6094 	/*
6095 	 * Stall direct reclaim for IO completions if the lruvec is
6096 	 * node is congested. Allow kswapd to continue until it
6097 	 * starts encountering unqueued dirty pages or cycling through
6098 	 * the LRU too quickly.
6099 	 */
6100 	if (!current_is_kswapd() && current_may_throttle() &&
6101 	    !sc->hibernation_mode &&
6102 	    (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
6103 	     test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
6104 		reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
6105 
6106 	if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc))
6107 		goto again;
6108 
6109 	/*
6110 	 * Kswapd gives up on balancing particular nodes after too
6111 	 * many failures to reclaim anything from them and goes to
6112 	 * sleep. On reclaim progress, reset the failure counter. A
6113 	 * successful direct reclaim run will revive a dormant kswapd.
6114 	 */
6115 	if (reclaimable)
6116 		pgdat->kswapd_failures = 0;
6117 	else if (sc->cache_trim_mode)
6118 		sc->cache_trim_mode_failed = 1;
6119 }
6120 
6121 /*
6122  * Returns true if compaction should go ahead for a costly-order request, or
6123  * the allocation would already succeed without compaction. Return false if we
6124  * should reclaim first.
6125  */
6126 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
6127 {
6128 	unsigned long watermark;
6129 
6130 	if (!gfp_compaction_allowed(sc->gfp_mask))
6131 		return false;
6132 
6133 	/* Allocation can already succeed, nothing to do */
6134 	if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
6135 			      sc->reclaim_idx, 0))
6136 		return true;
6137 
6138 	/*
6139 	 * Direct reclaim usually targets the min watermark, but compaction
6140 	 * takes time to run and there are potentially other callers using the
6141 	 * pages just freed. So target a higher buffer to give compaction a
6142 	 * reasonable chance of completing and allocating the pages.
6143 	 *
6144 	 * Note that we won't actually reclaim the whole buffer in one attempt
6145 	 * as the target watermark in should_continue_reclaim() is lower. But if
6146 	 * we are already above the high+gap watermark, don't reclaim at all.
6147 	 */
6148 	watermark = high_wmark_pages(zone);
6149 	if (compaction_suitable(zone, sc->order, watermark, sc->reclaim_idx))
6150 		return true;
6151 
6152 	return false;
6153 }
6154 
6155 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
6156 {
6157 	/*
6158 	 * If reclaim is making progress greater than 12% efficiency then
6159 	 * wake all the NOPROGRESS throttled tasks.
6160 	 */
6161 	if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
6162 		wait_queue_head_t *wqh;
6163 
6164 		wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
6165 		if (waitqueue_active(wqh))
6166 			wake_up(wqh);
6167 
6168 		return;
6169 	}
6170 
6171 	/*
6172 	 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
6173 	 * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
6174 	 * under writeback and marked for immediate reclaim at the tail of the
6175 	 * LRU.
6176 	 */
6177 	if (current_is_kswapd() || cgroup_reclaim(sc))
6178 		return;
6179 
6180 	/* Throttle if making no progress at high prioities. */
6181 	if (sc->priority == 1 && !sc->nr_reclaimed)
6182 		reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
6183 }
6184 
6185 /*
6186  * This is the direct reclaim path, for page-allocating processes.  We only
6187  * try to reclaim pages from zones which will satisfy the caller's allocation
6188  * request.
6189  *
6190  * If a zone is deemed to be full of pinned pages then just give it a light
6191  * scan then give up on it.
6192  */
6193 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
6194 {
6195 	struct zoneref *z;
6196 	struct zone *zone;
6197 	unsigned long nr_soft_reclaimed;
6198 	unsigned long nr_soft_scanned;
6199 	gfp_t orig_mask;
6200 	pg_data_t *last_pgdat = NULL;
6201 	pg_data_t *first_pgdat = NULL;
6202 
6203 	/*
6204 	 * If the number of buffer_heads in the machine exceeds the maximum
6205 	 * allowed level, force direct reclaim to scan the highmem zone as
6206 	 * highmem pages could be pinning lowmem pages storing buffer_heads
6207 	 */
6208 	orig_mask = sc->gfp_mask;
6209 	if (buffer_heads_over_limit) {
6210 		sc->gfp_mask |= __GFP_HIGHMEM;
6211 		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
6212 	}
6213 
6214 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6215 					sc->reclaim_idx, sc->nodemask) {
6216 		/*
6217 		 * Take care memory controller reclaiming has small influence
6218 		 * to global LRU.
6219 		 */
6220 		if (!cgroup_reclaim(sc)) {
6221 			if (!cpuset_zone_allowed(zone,
6222 						 GFP_KERNEL | __GFP_HARDWALL))
6223 				continue;
6224 
6225 			/*
6226 			 * If we already have plenty of memory free for
6227 			 * compaction in this zone, don't free any more.
6228 			 * Even though compaction is invoked for any
6229 			 * non-zero order, only frequent costly order
6230 			 * reclamation is disruptive enough to become a
6231 			 * noticeable problem, like transparent huge
6232 			 * page allocations.
6233 			 */
6234 			if (IS_ENABLED(CONFIG_COMPACTION) &&
6235 			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
6236 			    compaction_ready(zone, sc)) {
6237 				sc->compaction_ready = true;
6238 				continue;
6239 			}
6240 
6241 			/*
6242 			 * Shrink each node in the zonelist once. If the
6243 			 * zonelist is ordered by zone (not the default) then a
6244 			 * node may be shrunk multiple times but in that case
6245 			 * the user prefers lower zones being preserved.
6246 			 */
6247 			if (zone->zone_pgdat == last_pgdat)
6248 				continue;
6249 
6250 			/*
6251 			 * This steals pages from memory cgroups over softlimit
6252 			 * and returns the number of reclaimed pages and
6253 			 * scanned pages. This works for global memory pressure
6254 			 * and balancing, not for a memcg's limit.
6255 			 */
6256 			nr_soft_scanned = 0;
6257 			nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat,
6258 								      sc->order, sc->gfp_mask,
6259 								      &nr_soft_scanned);
6260 			sc->nr_reclaimed += nr_soft_reclaimed;
6261 			sc->nr_scanned += nr_soft_scanned;
6262 			/* need some check for avoid more shrink_zone() */
6263 		}
6264 
6265 		if (!first_pgdat)
6266 			first_pgdat = zone->zone_pgdat;
6267 
6268 		/* See comment about same check for global reclaim above */
6269 		if (zone->zone_pgdat == last_pgdat)
6270 			continue;
6271 		last_pgdat = zone->zone_pgdat;
6272 		shrink_node(zone->zone_pgdat, sc);
6273 	}
6274 
6275 	if (first_pgdat)
6276 		consider_reclaim_throttle(first_pgdat, sc);
6277 
6278 	/*
6279 	 * Restore to original mask to avoid the impact on the caller if we
6280 	 * promoted it to __GFP_HIGHMEM.
6281 	 */
6282 	sc->gfp_mask = orig_mask;
6283 }
6284 
6285 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
6286 {
6287 	struct lruvec *target_lruvec;
6288 	unsigned long refaults;
6289 
6290 	if (lru_gen_enabled())
6291 		return;
6292 
6293 	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
6294 	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
6295 	target_lruvec->refaults[WORKINGSET_ANON] = refaults;
6296 	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
6297 	target_lruvec->refaults[WORKINGSET_FILE] = refaults;
6298 }
6299 
6300 /*
6301  * This is the main entry point to direct page reclaim.
6302  *
6303  * If a full scan of the inactive list fails to free enough memory then we
6304  * are "out of memory" and something needs to be killed.
6305  *
6306  * If the caller is !__GFP_FS then the probability of a failure is reasonably
6307  * high - the zone may be full of dirty or under-writeback pages, which this
6308  * caller can't do much about.  We kick the writeback threads and take explicit
6309  * naps in the hope that some of these pages can be written.  But if the
6310  * allocating task holds filesystem locks which prevent writeout this might not
6311  * work, and the allocation attempt will fail.
6312  *
6313  * returns:	0, if no pages reclaimed
6314  * 		else, the number of pages reclaimed
6315  */
6316 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
6317 					  struct scan_control *sc)
6318 {
6319 	int initial_priority = sc->priority;
6320 	pg_data_t *last_pgdat;
6321 	struct zoneref *z;
6322 	struct zone *zone;
6323 retry:
6324 	delayacct_freepages_start();
6325 
6326 	if (!cgroup_reclaim(sc))
6327 		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
6328 
6329 	do {
6330 		if (!sc->proactive)
6331 			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
6332 					sc->priority);
6333 		sc->nr_scanned = 0;
6334 		shrink_zones(zonelist, sc);
6335 
6336 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
6337 			break;
6338 
6339 		if (sc->compaction_ready)
6340 			break;
6341 
6342 		/*
6343 		 * If we're getting trouble reclaiming, start doing
6344 		 * writepage even in laptop mode.
6345 		 */
6346 		if (sc->priority < DEF_PRIORITY - 2)
6347 			sc->may_writepage = 1;
6348 	} while (--sc->priority >= 0);
6349 
6350 	last_pgdat = NULL;
6351 	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
6352 					sc->nodemask) {
6353 		if (zone->zone_pgdat == last_pgdat)
6354 			continue;
6355 		last_pgdat = zone->zone_pgdat;
6356 
6357 		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
6358 
6359 		if (cgroup_reclaim(sc)) {
6360 			struct lruvec *lruvec;
6361 
6362 			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
6363 						   zone->zone_pgdat);
6364 			clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
6365 		}
6366 	}
6367 
6368 	delayacct_freepages_end();
6369 
6370 	if (sc->nr_reclaimed)
6371 		return sc->nr_reclaimed;
6372 
6373 	/* Aborted reclaim to try compaction? don't OOM, then */
6374 	if (sc->compaction_ready)
6375 		return 1;
6376 
6377 	/*
6378 	 * In most cases, direct reclaimers can do partial walks
6379 	 * through the cgroup tree to meet the reclaim goal while
6380 	 * keeping latency low. Since the iterator state is shared
6381 	 * among all direct reclaim invocations (to retain fairness
6382 	 * among cgroups), though, high concurrency can result in
6383 	 * individual threads not seeing enough cgroups to make
6384 	 * meaningful forward progress. Avoid false OOMs in this case.
6385 	 */
6386 	if (!sc->memcg_full_walk) {
6387 		sc->priority = initial_priority;
6388 		sc->memcg_full_walk = 1;
6389 		goto retry;
6390 	}
6391 
6392 	/*
6393 	 * We make inactive:active ratio decisions based on the node's
6394 	 * composition of memory, but a restrictive reclaim_idx or a
6395 	 * memory.low cgroup setting can exempt large amounts of
6396 	 * memory from reclaim. Neither of which are very common, so
6397 	 * instead of doing costly eligibility calculations of the
6398 	 * entire cgroup subtree up front, we assume the estimates are
6399 	 * good, and retry with forcible deactivation if that fails.
6400 	 */
6401 	if (sc->skipped_deactivate) {
6402 		sc->priority = initial_priority;
6403 		sc->force_deactivate = 1;
6404 		sc->skipped_deactivate = 0;
6405 		goto retry;
6406 	}
6407 
6408 	/* Untapped cgroup reserves?  Don't OOM, retry. */
6409 	if (sc->memcg_low_skipped) {
6410 		sc->priority = initial_priority;
6411 		sc->force_deactivate = 0;
6412 		sc->memcg_low_reclaim = 1;
6413 		sc->memcg_low_skipped = 0;
6414 		goto retry;
6415 	}
6416 
6417 	return 0;
6418 }
6419 
6420 static bool allow_direct_reclaim(pg_data_t *pgdat)
6421 {
6422 	struct zone *zone;
6423 	unsigned long pfmemalloc_reserve = 0;
6424 	unsigned long free_pages = 0;
6425 	int i;
6426 	bool wmark_ok;
6427 
6428 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6429 		return true;
6430 
6431 	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
6432 		if (!zone_reclaimable_pages(zone))
6433 			continue;
6434 
6435 		pfmemalloc_reserve += min_wmark_pages(zone);
6436 		free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES);
6437 	}
6438 
6439 	/* If there are no reserves (unexpected config) then do not throttle */
6440 	if (!pfmemalloc_reserve)
6441 		return true;
6442 
6443 	wmark_ok = free_pages > pfmemalloc_reserve / 2;
6444 
6445 	/* kswapd must be awake if processes are being throttled */
6446 	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
6447 		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
6448 			WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
6449 
6450 		wake_up_interruptible(&pgdat->kswapd_wait);
6451 	}
6452 
6453 	return wmark_ok;
6454 }
6455 
6456 /*
6457  * Throttle direct reclaimers if backing storage is backed by the network
6458  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
6459  * depleted. kswapd will continue to make progress and wake the processes
6460  * when the low watermark is reached.
6461  *
6462  * Returns true if a fatal signal was delivered during throttling. If this
6463  * happens, the page allocator should not consider triggering the OOM killer.
6464  */
6465 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
6466 					nodemask_t *nodemask)
6467 {
6468 	struct zoneref *z;
6469 	struct zone *zone;
6470 	pg_data_t *pgdat = NULL;
6471 
6472 	/*
6473 	 * Kernel threads should not be throttled as they may be indirectly
6474 	 * responsible for cleaning pages necessary for reclaim to make forward
6475 	 * progress. kjournald for example may enter direct reclaim while
6476 	 * committing a transaction where throttling it could forcing other
6477 	 * processes to block on log_wait_commit().
6478 	 */
6479 	if (current->flags & PF_KTHREAD)
6480 		goto out;
6481 
6482 	/*
6483 	 * If a fatal signal is pending, this process should not throttle.
6484 	 * It should return quickly so it can exit and free its memory
6485 	 */
6486 	if (fatal_signal_pending(current))
6487 		goto out;
6488 
6489 	/*
6490 	 * Check if the pfmemalloc reserves are ok by finding the first node
6491 	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
6492 	 * GFP_KERNEL will be required for allocating network buffers when
6493 	 * swapping over the network so ZONE_HIGHMEM is unusable.
6494 	 *
6495 	 * Throttling is based on the first usable node and throttled processes
6496 	 * wait on a queue until kswapd makes progress and wakes them. There
6497 	 * is an affinity then between processes waking up and where reclaim
6498 	 * progress has been made assuming the process wakes on the same node.
6499 	 * More importantly, processes running on remote nodes will not compete
6500 	 * for remote pfmemalloc reserves and processes on different nodes
6501 	 * should make reasonable progress.
6502 	 */
6503 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6504 					gfp_zone(gfp_mask), nodemask) {
6505 		if (zone_idx(zone) > ZONE_NORMAL)
6506 			continue;
6507 
6508 		/* Throttle based on the first usable node */
6509 		pgdat = zone->zone_pgdat;
6510 		if (allow_direct_reclaim(pgdat))
6511 			goto out;
6512 		break;
6513 	}
6514 
6515 	/* If no zone was usable by the allocation flags then do not throttle */
6516 	if (!pgdat)
6517 		goto out;
6518 
6519 	/* Account for the throttling */
6520 	count_vm_event(PGSCAN_DIRECT_THROTTLE);
6521 
6522 	/*
6523 	 * If the caller cannot enter the filesystem, it's possible that it
6524 	 * is due to the caller holding an FS lock or performing a journal
6525 	 * transaction in the case of a filesystem like ext[3|4]. In this case,
6526 	 * it is not safe to block on pfmemalloc_wait as kswapd could be
6527 	 * blocked waiting on the same lock. Instead, throttle for up to a
6528 	 * second before continuing.
6529 	 */
6530 	if (!(gfp_mask & __GFP_FS))
6531 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
6532 			allow_direct_reclaim(pgdat), HZ);
6533 	else
6534 		/* Throttle until kswapd wakes the process */
6535 		wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
6536 			allow_direct_reclaim(pgdat));
6537 
6538 	if (fatal_signal_pending(current))
6539 		return true;
6540 
6541 out:
6542 	return false;
6543 }
6544 
6545 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
6546 				gfp_t gfp_mask, nodemask_t *nodemask)
6547 {
6548 	unsigned long nr_reclaimed;
6549 	struct scan_control sc = {
6550 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
6551 		.gfp_mask = current_gfp_context(gfp_mask),
6552 		.reclaim_idx = gfp_zone(gfp_mask),
6553 		.order = order,
6554 		.nodemask = nodemask,
6555 		.priority = DEF_PRIORITY,
6556 		.may_writepage = !laptop_mode,
6557 		.may_unmap = 1,
6558 		.may_swap = 1,
6559 	};
6560 
6561 	/*
6562 	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
6563 	 * Confirm they are large enough for max values.
6564 	 */
6565 	BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
6566 	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
6567 	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
6568 
6569 	/*
6570 	 * Do not enter reclaim if fatal signal was delivered while throttled.
6571 	 * 1 is returned so that the page allocator does not OOM kill at this
6572 	 * point.
6573 	 */
6574 	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
6575 		return 1;
6576 
6577 	set_task_reclaim_state(current, &sc.reclaim_state);
6578 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
6579 
6580 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
6581 
6582 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
6583 	set_task_reclaim_state(current, NULL);
6584 
6585 	return nr_reclaimed;
6586 }
6587 
6588 #ifdef CONFIG_MEMCG
6589 
6590 /* Only used by soft limit reclaim. Do not reuse for anything else. */
6591 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
6592 						gfp_t gfp_mask, bool noswap,
6593 						pg_data_t *pgdat,
6594 						unsigned long *nr_scanned)
6595 {
6596 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
6597 	struct scan_control sc = {
6598 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
6599 		.target_mem_cgroup = memcg,
6600 		.may_writepage = !laptop_mode,
6601 		.may_unmap = 1,
6602 		.reclaim_idx = MAX_NR_ZONES - 1,
6603 		.may_swap = !noswap,
6604 	};
6605 
6606 	WARN_ON_ONCE(!current->reclaim_state);
6607 
6608 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
6609 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
6610 
6611 	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
6612 						      sc.gfp_mask);
6613 
6614 	/*
6615 	 * NOTE: Although we can get the priority field, using it
6616 	 * here is not a good idea, since it limits the pages we can scan.
6617 	 * if we don't reclaim here, the shrink_node from balance_pgdat
6618 	 * will pick up pages from other mem cgroup's as well. We hack
6619 	 * the priority and make it zero.
6620 	 */
6621 	shrink_lruvec(lruvec, &sc);
6622 
6623 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
6624 
6625 	*nr_scanned = sc.nr_scanned;
6626 
6627 	return sc.nr_reclaimed;
6628 }
6629 
6630 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
6631 					   unsigned long nr_pages,
6632 					   gfp_t gfp_mask,
6633 					   unsigned int reclaim_options,
6634 					   int *swappiness)
6635 {
6636 	unsigned long nr_reclaimed;
6637 	unsigned int noreclaim_flag;
6638 	struct scan_control sc = {
6639 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
6640 		.proactive_swappiness = swappiness,
6641 		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
6642 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
6643 		.reclaim_idx = MAX_NR_ZONES - 1,
6644 		.target_mem_cgroup = memcg,
6645 		.priority = DEF_PRIORITY,
6646 		.may_writepage = !laptop_mode,
6647 		.may_unmap = 1,
6648 		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
6649 		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
6650 	};
6651 	/*
6652 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
6653 	 * equal pressure on all the nodes. This is based on the assumption that
6654 	 * the reclaim does not bail out early.
6655 	 */
6656 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
6657 
6658 	set_task_reclaim_state(current, &sc.reclaim_state);
6659 	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
6660 	noreclaim_flag = memalloc_noreclaim_save();
6661 
6662 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
6663 
6664 	memalloc_noreclaim_restore(noreclaim_flag);
6665 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
6666 	set_task_reclaim_state(current, NULL);
6667 
6668 	return nr_reclaimed;
6669 }
6670 #endif
6671 
6672 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
6673 {
6674 	struct mem_cgroup *memcg;
6675 	struct lruvec *lruvec;
6676 
6677 	if (lru_gen_enabled()) {
6678 		lru_gen_age_node(pgdat, sc);
6679 		return;
6680 	}
6681 
6682 	if (!can_age_anon_pages(pgdat, sc))
6683 		return;
6684 
6685 	lruvec = mem_cgroup_lruvec(NULL, pgdat);
6686 	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
6687 		return;
6688 
6689 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
6690 	do {
6691 		lruvec = mem_cgroup_lruvec(memcg, pgdat);
6692 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
6693 				   sc, LRU_ACTIVE_ANON);
6694 		memcg = mem_cgroup_iter(NULL, memcg, NULL);
6695 	} while (memcg);
6696 }
6697 
6698 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
6699 {
6700 	int i;
6701 	struct zone *zone;
6702 
6703 	/*
6704 	 * Check for watermark boosts top-down as the higher zones
6705 	 * are more likely to be boosted. Both watermarks and boosts
6706 	 * should not be checked at the same time as reclaim would
6707 	 * start prematurely when there is no boosting and a lower
6708 	 * zone is balanced.
6709 	 */
6710 	for (i = highest_zoneidx; i >= 0; i--) {
6711 		zone = pgdat->node_zones + i;
6712 		if (!managed_zone(zone))
6713 			continue;
6714 
6715 		if (zone->watermark_boost)
6716 			return true;
6717 	}
6718 
6719 	return false;
6720 }
6721 
6722 /*
6723  * Returns true if there is an eligible zone balanced for the request order
6724  * and highest_zoneidx
6725  */
6726 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
6727 {
6728 	int i;
6729 	unsigned long mark = -1;
6730 	struct zone *zone;
6731 
6732 	/*
6733 	 * Check watermarks bottom-up as lower zones are more likely to
6734 	 * meet watermarks.
6735 	 */
6736 	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6737 		enum zone_stat_item item;
6738 		unsigned long free_pages;
6739 
6740 		if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
6741 			mark = promo_wmark_pages(zone);
6742 		else
6743 			mark = high_wmark_pages(zone);
6744 
6745 		/*
6746 		 * In defrag_mode, watermarks must be met in whole
6747 		 * blocks to avoid polluting allocator fallbacks.
6748 		 *
6749 		 * However, kswapd usually cannot accomplish this on
6750 		 * its own and needs kcompactd support. Once it's
6751 		 * reclaimed a compaction gap, and kswapd_shrink_node
6752 		 * has dropped order, simply ensure there are enough
6753 		 * base pages for compaction, wake kcompactd & sleep.
6754 		 */
6755 		if (defrag_mode && order)
6756 			item = NR_FREE_PAGES_BLOCKS;
6757 		else
6758 			item = NR_FREE_PAGES;
6759 
6760 		/*
6761 		 * When there is a high number of CPUs in the system,
6762 		 * the cumulative error from the vmstat per-cpu cache
6763 		 * can blur the line between the watermarks. In that
6764 		 * case, be safe and get an accurate snapshot.
6765 		 *
6766 		 * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
6767 		 * pageblock_nr_pages, while the vmstat pcp threshold
6768 		 * is limited to 125. On many configurations that
6769 		 * counter won't actually be per-cpu cached. But keep
6770 		 * things simple for now; revisit when somebody cares.
6771 		 */
6772 		free_pages = zone_page_state(zone, item);
6773 		if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
6774 			free_pages = zone_page_state_snapshot(zone, item);
6775 
6776 		if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
6777 					0, free_pages))
6778 			return true;
6779 	}
6780 
6781 	/*
6782 	 * If a node has no managed zone within highest_zoneidx, it does not
6783 	 * need balancing by definition. This can happen if a zone-restricted
6784 	 * allocation tries to wake a remote kswapd.
6785 	 */
6786 	if (mark == -1)
6787 		return true;
6788 
6789 	return false;
6790 }
6791 
6792 /* Clear pgdat state for congested, dirty or under writeback. */
6793 static void clear_pgdat_congested(pg_data_t *pgdat)
6794 {
6795 	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
6796 
6797 	clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
6798 	clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
6799 	clear_bit(PGDAT_DIRTY, &pgdat->flags);
6800 	clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
6801 }
6802 
6803 /*
6804  * Prepare kswapd for sleeping. This verifies that there are no processes
6805  * waiting in throttle_direct_reclaim() and that watermarks have been met.
6806  *
6807  * Returns true if kswapd is ready to sleep
6808  */
6809 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
6810 				int highest_zoneidx)
6811 {
6812 	/*
6813 	 * The throttled processes are normally woken up in balance_pgdat() as
6814 	 * soon as allow_direct_reclaim() is true. But there is a potential
6815 	 * race between when kswapd checks the watermarks and a process gets
6816 	 * throttled. There is also a potential race if processes get
6817 	 * throttled, kswapd wakes, a large process exits thereby balancing the
6818 	 * zones, which causes kswapd to exit balance_pgdat() before reaching
6819 	 * the wake up checks. If kswapd is going to sleep, no process should
6820 	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
6821 	 * the wake up is premature, processes will wake kswapd and get
6822 	 * throttled again. The difference from wake ups in balance_pgdat() is
6823 	 * that here we are under prepare_to_wait().
6824 	 */
6825 	if (waitqueue_active(&pgdat->pfmemalloc_wait))
6826 		wake_up_all(&pgdat->pfmemalloc_wait);
6827 
6828 	/* Hopeless node, leave it to direct reclaim */
6829 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6830 		return true;
6831 
6832 	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
6833 		clear_pgdat_congested(pgdat);
6834 		return true;
6835 	}
6836 
6837 	return false;
6838 }
6839 
6840 /*
6841  * kswapd shrinks a node of pages that are at or below the highest usable
6842  * zone that is currently unbalanced.
6843  *
6844  * Returns true if kswapd scanned at least the requested number of pages to
6845  * reclaim or if the lack of progress was due to pages under writeback.
6846  * This is used to determine if the scanning priority needs to be raised.
6847  */
6848 static bool kswapd_shrink_node(pg_data_t *pgdat,
6849 			       struct scan_control *sc)
6850 {
6851 	struct zone *zone;
6852 	int z;
6853 	unsigned long nr_reclaimed = sc->nr_reclaimed;
6854 
6855 	/* Reclaim a number of pages proportional to the number of zones */
6856 	sc->nr_to_reclaim = 0;
6857 	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
6858 		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
6859 	}
6860 
6861 	/*
6862 	 * Historically care was taken to put equal pressure on all zones but
6863 	 * now pressure is applied based on node LRU order.
6864 	 */
6865 	shrink_node(pgdat, sc);
6866 
6867 	/*
6868 	 * Fragmentation may mean that the system cannot be rebalanced for
6869 	 * high-order allocations. If twice the allocation size has been
6870 	 * reclaimed then recheck watermarks only at order-0 to prevent
6871 	 * excessive reclaim. Assume that a process requested a high-order
6872 	 * can direct reclaim/compact.
6873 	 */
6874 	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
6875 		sc->order = 0;
6876 
6877 	/* account for progress from mm_account_reclaimed_pages() */
6878 	return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
6879 }
6880 
6881 /* Page allocator PCP high watermark is lowered if reclaim is active. */
6882 static inline void
6883 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
6884 {
6885 	int i;
6886 	struct zone *zone;
6887 
6888 	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6889 		if (active)
6890 			set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
6891 		else
6892 			clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
6893 	}
6894 }
6895 
6896 static inline void
6897 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
6898 {
6899 	update_reclaim_active(pgdat, highest_zoneidx, true);
6900 }
6901 
6902 static inline void
6903 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
6904 {
6905 	update_reclaim_active(pgdat, highest_zoneidx, false);
6906 }
6907 
6908 /*
6909  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
6910  * that are eligible for use by the caller until at least one zone is
6911  * balanced.
6912  *
6913  * Returns the order kswapd finished reclaiming at.
6914  *
6915  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
6916  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
6917  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
6918  * or lower is eligible for reclaim until at least one usable zone is
6919  * balanced.
6920  */
6921 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
6922 {
6923 	int i;
6924 	unsigned long nr_soft_reclaimed;
6925 	unsigned long nr_soft_scanned;
6926 	unsigned long pflags;
6927 	unsigned long nr_boost_reclaim;
6928 	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
6929 	bool boosted;
6930 	struct zone *zone;
6931 	struct scan_control sc = {
6932 		.gfp_mask = GFP_KERNEL,
6933 		.order = order,
6934 		.may_unmap = 1,
6935 	};
6936 
6937 	set_task_reclaim_state(current, &sc.reclaim_state);
6938 	psi_memstall_enter(&pflags);
6939 	__fs_reclaim_acquire(_THIS_IP_);
6940 
6941 	count_vm_event(PAGEOUTRUN);
6942 
6943 	/*
6944 	 * Account for the reclaim boost. Note that the zone boost is left in
6945 	 * place so that parallel allocations that are near the watermark will
6946 	 * stall or direct reclaim until kswapd is finished.
6947 	 */
6948 	nr_boost_reclaim = 0;
6949 	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6950 		nr_boost_reclaim += zone->watermark_boost;
6951 		zone_boosts[i] = zone->watermark_boost;
6952 	}
6953 	boosted = nr_boost_reclaim;
6954 
6955 restart:
6956 	set_reclaim_active(pgdat, highest_zoneidx);
6957 	sc.priority = DEF_PRIORITY;
6958 	do {
6959 		unsigned long nr_reclaimed = sc.nr_reclaimed;
6960 		bool raise_priority = true;
6961 		bool balanced;
6962 		bool ret;
6963 		bool was_frozen;
6964 
6965 		sc.reclaim_idx = highest_zoneidx;
6966 
6967 		/*
6968 		 * If the number of buffer_heads exceeds the maximum allowed
6969 		 * then consider reclaiming from all zones. This has a dual
6970 		 * purpose -- on 64-bit systems it is expected that
6971 		 * buffer_heads are stripped during active rotation. On 32-bit
6972 		 * systems, highmem pages can pin lowmem memory and shrinking
6973 		 * buffers can relieve lowmem pressure. Reclaim may still not
6974 		 * go ahead if all eligible zones for the original allocation
6975 		 * request are balanced to avoid excessive reclaim from kswapd.
6976 		 */
6977 		if (buffer_heads_over_limit) {
6978 			for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
6979 				zone = pgdat->node_zones + i;
6980 				if (!managed_zone(zone))
6981 					continue;
6982 
6983 				sc.reclaim_idx = i;
6984 				break;
6985 			}
6986 		}
6987 
6988 		/*
6989 		 * If the pgdat is imbalanced then ignore boosting and preserve
6990 		 * the watermarks for a later time and restart. Note that the
6991 		 * zone watermarks will be still reset at the end of balancing
6992 		 * on the grounds that the normal reclaim should be enough to
6993 		 * re-evaluate if boosting is required when kswapd next wakes.
6994 		 */
6995 		balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
6996 		if (!balanced && nr_boost_reclaim) {
6997 			nr_boost_reclaim = 0;
6998 			goto restart;
6999 		}
7000 
7001 		/*
7002 		 * If boosting is not active then only reclaim if there are no
7003 		 * eligible zones. Note that sc.reclaim_idx is not used as
7004 		 * buffer_heads_over_limit may have adjusted it.
7005 		 */
7006 		if (!nr_boost_reclaim && balanced)
7007 			goto out;
7008 
7009 		/* Limit the priority of boosting to avoid reclaim writeback */
7010 		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
7011 			raise_priority = false;
7012 
7013 		/*
7014 		 * Do not writeback or swap pages for boosted reclaim. The
7015 		 * intent is to relieve pressure not issue sub-optimal IO
7016 		 * from reclaim context. If no pages are reclaimed, the
7017 		 * reclaim will be aborted.
7018 		 */
7019 		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
7020 		sc.may_swap = !nr_boost_reclaim;
7021 
7022 		/*
7023 		 * Do some background aging, to give pages a chance to be
7024 		 * referenced before reclaiming. All pages are rotated
7025 		 * regardless of classzone as this is about consistent aging.
7026 		 */
7027 		kswapd_age_node(pgdat, &sc);
7028 
7029 		/*
7030 		 * If we're getting trouble reclaiming, start doing writepage
7031 		 * even in laptop mode.
7032 		 */
7033 		if (sc.priority < DEF_PRIORITY - 2)
7034 			sc.may_writepage = 1;
7035 
7036 		/* Call soft limit reclaim before calling shrink_node. */
7037 		sc.nr_scanned = 0;
7038 		nr_soft_scanned = 0;
7039 		nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order,
7040 							      sc.gfp_mask, &nr_soft_scanned);
7041 		sc.nr_reclaimed += nr_soft_reclaimed;
7042 
7043 		/*
7044 		 * There should be no need to raise the scanning priority if
7045 		 * enough pages are already being scanned that that high
7046 		 * watermark would be met at 100% efficiency.
7047 		 */
7048 		if (kswapd_shrink_node(pgdat, &sc))
7049 			raise_priority = false;
7050 
7051 		/*
7052 		 * If the low watermark is met there is no need for processes
7053 		 * to be throttled on pfmemalloc_wait as they should not be
7054 		 * able to safely make forward progress. Wake them
7055 		 */
7056 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
7057 				allow_direct_reclaim(pgdat))
7058 			wake_up_all(&pgdat->pfmemalloc_wait);
7059 
7060 		/* Check if kswapd should be suspending */
7061 		__fs_reclaim_release(_THIS_IP_);
7062 		ret = kthread_freezable_should_stop(&was_frozen);
7063 		__fs_reclaim_acquire(_THIS_IP_);
7064 		if (was_frozen || ret)
7065 			break;
7066 
7067 		/*
7068 		 * Raise priority if scanning rate is too low or there was no
7069 		 * progress in reclaiming pages
7070 		 */
7071 		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
7072 		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
7073 
7074 		/*
7075 		 * If reclaim made no progress for a boost, stop reclaim as
7076 		 * IO cannot be queued and it could be an infinite loop in
7077 		 * extreme circumstances.
7078 		 */
7079 		if (nr_boost_reclaim && !nr_reclaimed)
7080 			break;
7081 
7082 		if (raise_priority || !nr_reclaimed)
7083 			sc.priority--;
7084 	} while (sc.priority >= 1);
7085 
7086 	/*
7087 	 * Restart only if it went through the priority loop all the way,
7088 	 * but cache_trim_mode didn't work.
7089 	 */
7090 	if (!sc.nr_reclaimed && sc.priority < 1 &&
7091 	    !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) {
7092 		sc.no_cache_trim_mode = 1;
7093 		goto restart;
7094 	}
7095 
7096 	if (!sc.nr_reclaimed)
7097 		pgdat->kswapd_failures++;
7098 
7099 out:
7100 	clear_reclaim_active(pgdat, highest_zoneidx);
7101 
7102 	/* If reclaim was boosted, account for the reclaim done in this pass */
7103 	if (boosted) {
7104 		unsigned long flags;
7105 
7106 		for (i = 0; i <= highest_zoneidx; i++) {
7107 			if (!zone_boosts[i])
7108 				continue;
7109 
7110 			/* Increments are under the zone lock */
7111 			zone = pgdat->node_zones + i;
7112 			spin_lock_irqsave(&zone->lock, flags);
7113 			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
7114 			spin_unlock_irqrestore(&zone->lock, flags);
7115 		}
7116 
7117 		/*
7118 		 * As there is now likely space, wakeup kcompact to defragment
7119 		 * pageblocks.
7120 		 */
7121 		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
7122 	}
7123 
7124 	snapshot_refaults(NULL, pgdat);
7125 	__fs_reclaim_release(_THIS_IP_);
7126 	psi_memstall_leave(&pflags);
7127 	set_task_reclaim_state(current, NULL);
7128 
7129 	/*
7130 	 * Return the order kswapd stopped reclaiming at as
7131 	 * prepare_kswapd_sleep() takes it into account. If another caller
7132 	 * entered the allocator slow path while kswapd was awake, order will
7133 	 * remain at the higher level.
7134 	 */
7135 	return sc.order;
7136 }
7137 
7138 /*
7139  * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7140  * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
7141  * not a valid index then either kswapd runs for first time or kswapd couldn't
7142  * sleep after previous reclaim attempt (node is still unbalanced). In that
7143  * case return the zone index of the previous kswapd reclaim cycle.
7144  */
7145 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
7146 					   enum zone_type prev_highest_zoneidx)
7147 {
7148 	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7149 
7150 	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
7151 }
7152 
7153 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
7154 				unsigned int highest_zoneidx)
7155 {
7156 	long remaining = 0;
7157 	DEFINE_WAIT(wait);
7158 
7159 	if (freezing(current) || kthread_should_stop())
7160 		return;
7161 
7162 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
7163 
7164 	/*
7165 	 * Try to sleep for a short interval. Note that kcompactd will only be
7166 	 * woken if it is possible to sleep for a short interval. This is
7167 	 * deliberate on the assumption that if reclaim cannot keep an
7168 	 * eligible zone balanced that it's also unlikely that compaction will
7169 	 * succeed.
7170 	 */
7171 	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
7172 		/*
7173 		 * Compaction records what page blocks it recently failed to
7174 		 * isolate pages from and skips them in the future scanning.
7175 		 * When kswapd is going to sleep, it is reasonable to assume
7176 		 * that pages and compaction may succeed so reset the cache.
7177 		 */
7178 		reset_isolation_suitable(pgdat);
7179 
7180 		/*
7181 		 * We have freed the memory, now we should compact it to make
7182 		 * allocation of the requested order possible.
7183 		 */
7184 		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
7185 
7186 		remaining = schedule_timeout(HZ/10);
7187 
7188 		/*
7189 		 * If woken prematurely then reset kswapd_highest_zoneidx and
7190 		 * order. The values will either be from a wakeup request or
7191 		 * the previous request that slept prematurely.
7192 		 */
7193 		if (remaining) {
7194 			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
7195 					kswapd_highest_zoneidx(pgdat,
7196 							highest_zoneidx));
7197 
7198 			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
7199 				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
7200 		}
7201 
7202 		finish_wait(&pgdat->kswapd_wait, &wait);
7203 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
7204 	}
7205 
7206 	/*
7207 	 * After a short sleep, check if it was a premature sleep. If not, then
7208 	 * go fully to sleep until explicitly woken up.
7209 	 */
7210 	if (!remaining &&
7211 	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
7212 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
7213 
7214 		/*
7215 		 * vmstat counters are not perfectly accurate and the estimated
7216 		 * value for counters such as NR_FREE_PAGES can deviate from the
7217 		 * true value by nr_online_cpus * threshold. To avoid the zone
7218 		 * watermarks being breached while under pressure, we reduce the
7219 		 * per-cpu vmstat threshold while kswapd is awake and restore
7220 		 * them before going back to sleep.
7221 		 */
7222 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
7223 
7224 		if (!kthread_should_stop())
7225 			schedule();
7226 
7227 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
7228 	} else {
7229 		if (remaining)
7230 			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
7231 		else
7232 			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
7233 	}
7234 	finish_wait(&pgdat->kswapd_wait, &wait);
7235 }
7236 
7237 /*
7238  * The background pageout daemon, started as a kernel thread
7239  * from the init process.
7240  *
7241  * This basically trickles out pages so that we have _some_
7242  * free memory available even if there is no other activity
7243  * that frees anything up. This is needed for things like routing
7244  * etc, where we otherwise might have all activity going on in
7245  * asynchronous contexts that cannot page things out.
7246  *
7247  * If there are applications that are active memory-allocators
7248  * (most normal use), this basically shouldn't matter.
7249  */
7250 static int kswapd(void *p)
7251 {
7252 	unsigned int alloc_order, reclaim_order;
7253 	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
7254 	pg_data_t *pgdat = (pg_data_t *)p;
7255 	struct task_struct *tsk = current;
7256 
7257 	/*
7258 	 * Tell the memory management that we're a "memory allocator",
7259 	 * and that if we need more memory we should get access to it
7260 	 * regardless (see "__alloc_pages()"). "kswapd" should
7261 	 * never get caught in the normal page freeing logic.
7262 	 *
7263 	 * (Kswapd normally doesn't need memory anyway, but sometimes
7264 	 * you need a small amount of memory in order to be able to
7265 	 * page out something else, and this flag essentially protects
7266 	 * us from recursively trying to free more memory as we're
7267 	 * trying to free the first piece of memory in the first place).
7268 	 */
7269 	tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
7270 	set_freezable();
7271 
7272 	WRITE_ONCE(pgdat->kswapd_order, 0);
7273 	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7274 	atomic_set(&pgdat->nr_writeback_throttled, 0);
7275 	for ( ; ; ) {
7276 		bool was_frozen;
7277 
7278 		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
7279 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7280 							highest_zoneidx);
7281 
7282 kswapd_try_sleep:
7283 		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
7284 					highest_zoneidx);
7285 
7286 		/* Read the new order and highest_zoneidx */
7287 		alloc_order = READ_ONCE(pgdat->kswapd_order);
7288 		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7289 							highest_zoneidx);
7290 		WRITE_ONCE(pgdat->kswapd_order, 0);
7291 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7292 
7293 		if (kthread_freezable_should_stop(&was_frozen))
7294 			break;
7295 
7296 		/*
7297 		 * We can speed up thawing tasks if we don't call balance_pgdat
7298 		 * after returning from the refrigerator
7299 		 */
7300 		if (was_frozen)
7301 			continue;
7302 
7303 		/*
7304 		 * Reclaim begins at the requested order but if a high-order
7305 		 * reclaim fails then kswapd falls back to reclaiming for
7306 		 * order-0. If that happens, kswapd will consider sleeping
7307 		 * for the order it finished reclaiming at (reclaim_order)
7308 		 * but kcompactd is woken to compact for the original
7309 		 * request (alloc_order).
7310 		 */
7311 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
7312 						alloc_order);
7313 		reclaim_order = balance_pgdat(pgdat, alloc_order,
7314 						highest_zoneidx);
7315 		if (reclaim_order < alloc_order)
7316 			goto kswapd_try_sleep;
7317 	}
7318 
7319 	tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
7320 
7321 	return 0;
7322 }
7323 
7324 /*
7325  * A zone is low on free memory or too fragmented for high-order memory.  If
7326  * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
7327  * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
7328  * has failed or is not needed, still wake up kcompactd if only compaction is
7329  * needed.
7330  */
7331 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
7332 		   enum zone_type highest_zoneidx)
7333 {
7334 	pg_data_t *pgdat;
7335 	enum zone_type curr_idx;
7336 
7337 	if (!managed_zone(zone))
7338 		return;
7339 
7340 	if (!cpuset_zone_allowed(zone, gfp_flags))
7341 		return;
7342 
7343 	pgdat = zone->zone_pgdat;
7344 	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7345 
7346 	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
7347 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
7348 
7349 	if (READ_ONCE(pgdat->kswapd_order) < order)
7350 		WRITE_ONCE(pgdat->kswapd_order, order);
7351 
7352 	if (!waitqueue_active(&pgdat->kswapd_wait))
7353 		return;
7354 
7355 	/* Hopeless node, leave it to direct reclaim if possible */
7356 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
7357 	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
7358 	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
7359 		/*
7360 		 * There may be plenty of free memory available, but it's too
7361 		 * fragmented for high-order allocations.  Wake up kcompactd
7362 		 * and rely on compaction_suitable() to determine if it's
7363 		 * needed.  If it fails, it will defer subsequent attempts to
7364 		 * ratelimit its work.
7365 		 */
7366 		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
7367 			wakeup_kcompactd(pgdat, order, highest_zoneidx);
7368 		return;
7369 	}
7370 
7371 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
7372 				      gfp_flags);
7373 	wake_up_interruptible(&pgdat->kswapd_wait);
7374 }
7375 
7376 #ifdef CONFIG_HIBERNATION
7377 /*
7378  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
7379  * freed pages.
7380  *
7381  * Rather than trying to age LRUs the aim is to preserve the overall
7382  * LRU order by reclaiming preferentially
7383  * inactive > active > active referenced > active mapped
7384  */
7385 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
7386 {
7387 	struct scan_control sc = {
7388 		.nr_to_reclaim = nr_to_reclaim,
7389 		.gfp_mask = GFP_HIGHUSER_MOVABLE,
7390 		.reclaim_idx = MAX_NR_ZONES - 1,
7391 		.priority = DEF_PRIORITY,
7392 		.may_writepage = 1,
7393 		.may_unmap = 1,
7394 		.may_swap = 1,
7395 		.hibernation_mode = 1,
7396 	};
7397 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
7398 	unsigned long nr_reclaimed;
7399 	unsigned int noreclaim_flag;
7400 
7401 	fs_reclaim_acquire(sc.gfp_mask);
7402 	noreclaim_flag = memalloc_noreclaim_save();
7403 	set_task_reclaim_state(current, &sc.reclaim_state);
7404 
7405 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
7406 
7407 	set_task_reclaim_state(current, NULL);
7408 	memalloc_noreclaim_restore(noreclaim_flag);
7409 	fs_reclaim_release(sc.gfp_mask);
7410 
7411 	return nr_reclaimed;
7412 }
7413 #endif /* CONFIG_HIBERNATION */
7414 
7415 /*
7416  * This kswapd start function will be called by init and node-hot-add.
7417  */
7418 void __meminit kswapd_run(int nid)
7419 {
7420 	pg_data_t *pgdat = NODE_DATA(nid);
7421 
7422 	pgdat_kswapd_lock(pgdat);
7423 	if (!pgdat->kswapd) {
7424 		pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid);
7425 		if (IS_ERR(pgdat->kswapd)) {
7426 			/* failure at boot is fatal */
7427 			pr_err("Failed to start kswapd on node %d,ret=%ld\n",
7428 				   nid, PTR_ERR(pgdat->kswapd));
7429 			BUG_ON(system_state < SYSTEM_RUNNING);
7430 			pgdat->kswapd = NULL;
7431 		} else {
7432 			wake_up_process(pgdat->kswapd);
7433 		}
7434 	}
7435 	pgdat_kswapd_unlock(pgdat);
7436 }
7437 
7438 /*
7439  * Called by memory hotplug when all memory in a node is offlined.  Caller must
7440  * be holding mem_hotplug_begin/done().
7441  */
7442 void __meminit kswapd_stop(int nid)
7443 {
7444 	pg_data_t *pgdat = NODE_DATA(nid);
7445 	struct task_struct *kswapd;
7446 
7447 	pgdat_kswapd_lock(pgdat);
7448 	kswapd = pgdat->kswapd;
7449 	if (kswapd) {
7450 		kthread_stop(kswapd);
7451 		pgdat->kswapd = NULL;
7452 	}
7453 	pgdat_kswapd_unlock(pgdat);
7454 }
7455 
7456 static const struct ctl_table vmscan_sysctl_table[] = {
7457 	{
7458 		.procname	= "swappiness",
7459 		.data		= &vm_swappiness,
7460 		.maxlen		= sizeof(vm_swappiness),
7461 		.mode		= 0644,
7462 		.proc_handler	= proc_dointvec_minmax,
7463 		.extra1		= SYSCTL_ZERO,
7464 		.extra2		= SYSCTL_TWO_HUNDRED,
7465 	},
7466 #ifdef CONFIG_NUMA
7467 	{
7468 		.procname	= "zone_reclaim_mode",
7469 		.data		= &node_reclaim_mode,
7470 		.maxlen		= sizeof(node_reclaim_mode),
7471 		.mode		= 0644,
7472 		.proc_handler	= proc_dointvec_minmax,
7473 		.extra1		= SYSCTL_ZERO,
7474 	}
7475 #endif
7476 };
7477 
7478 static int __init kswapd_init(void)
7479 {
7480 	int nid;
7481 
7482 	swap_setup();
7483 	for_each_node_state(nid, N_MEMORY)
7484  		kswapd_run(nid);
7485 	register_sysctl_init("vm", vmscan_sysctl_table);
7486 	return 0;
7487 }
7488 
7489 module_init(kswapd_init)
7490 
7491 #ifdef CONFIG_NUMA
7492 /*
7493  * Node reclaim mode
7494  *
7495  * If non-zero call node_reclaim when the number of free pages falls below
7496  * the watermarks.
7497  */
7498 int node_reclaim_mode __read_mostly;
7499 
7500 /*
7501  * Priority for NODE_RECLAIM. This determines the fraction of pages
7502  * of a node considered for each zone_reclaim. 4 scans 1/16th of
7503  * a zone.
7504  */
7505 #define NODE_RECLAIM_PRIORITY 4
7506 
7507 /*
7508  * Percentage of pages in a zone that must be unmapped for node_reclaim to
7509  * occur.
7510  */
7511 int sysctl_min_unmapped_ratio = 1;
7512 
7513 /*
7514  * If the number of slab pages in a zone grows beyond this percentage then
7515  * slab reclaim needs to occur.
7516  */
7517 int sysctl_min_slab_ratio = 5;
7518 
7519 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
7520 {
7521 	unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
7522 	unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
7523 		node_page_state(pgdat, NR_ACTIVE_FILE);
7524 
7525 	/*
7526 	 * It's possible for there to be more file mapped pages than
7527 	 * accounted for by the pages on the file LRU lists because
7528 	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
7529 	 */
7530 	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
7531 }
7532 
7533 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
7534 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
7535 {
7536 	unsigned long nr_pagecache_reclaimable;
7537 	unsigned long delta = 0;
7538 
7539 	/*
7540 	 * If RECLAIM_UNMAP is set, then all file pages are considered
7541 	 * potentially reclaimable. Otherwise, we have to worry about
7542 	 * pages like swapcache and node_unmapped_file_pages() provides
7543 	 * a better estimate
7544 	 */
7545 	if (node_reclaim_mode & RECLAIM_UNMAP)
7546 		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
7547 	else
7548 		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
7549 
7550 	/* If we can't clean pages, remove dirty pages from consideration */
7551 	if (!(node_reclaim_mode & RECLAIM_WRITE))
7552 		delta += node_page_state(pgdat, NR_FILE_DIRTY);
7553 
7554 	/* Watch for any possible underflows due to delta */
7555 	if (unlikely(delta > nr_pagecache_reclaimable))
7556 		delta = nr_pagecache_reclaimable;
7557 
7558 	return nr_pagecache_reclaimable - delta;
7559 }
7560 
7561 /*
7562  * Try to free up some pages from this node through reclaim.
7563  */
7564 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
7565 {
7566 	/* Minimum pages needed in order to stay on node */
7567 	const unsigned long nr_pages = 1 << order;
7568 	struct task_struct *p = current;
7569 	unsigned int noreclaim_flag;
7570 	struct scan_control sc = {
7571 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7572 		.gfp_mask = current_gfp_context(gfp_mask),
7573 		.order = order,
7574 		.priority = NODE_RECLAIM_PRIORITY,
7575 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
7576 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
7577 		.may_swap = 1,
7578 		.reclaim_idx = gfp_zone(gfp_mask),
7579 	};
7580 	unsigned long pflags;
7581 
7582 	trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
7583 					   sc.gfp_mask);
7584 
7585 	cond_resched();
7586 	psi_memstall_enter(&pflags);
7587 	delayacct_freepages_start();
7588 	fs_reclaim_acquire(sc.gfp_mask);
7589 	/*
7590 	 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
7591 	 */
7592 	noreclaim_flag = memalloc_noreclaim_save();
7593 	set_task_reclaim_state(p, &sc.reclaim_state);
7594 
7595 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
7596 	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
7597 		/*
7598 		 * Free memory by calling shrink node with increasing
7599 		 * priorities until we have enough memory freed.
7600 		 */
7601 		do {
7602 			shrink_node(pgdat, &sc);
7603 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
7604 	}
7605 
7606 	set_task_reclaim_state(p, NULL);
7607 	memalloc_noreclaim_restore(noreclaim_flag);
7608 	fs_reclaim_release(sc.gfp_mask);
7609 	psi_memstall_leave(&pflags);
7610 	delayacct_freepages_end();
7611 
7612 	trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
7613 
7614 	return sc.nr_reclaimed >= nr_pages;
7615 }
7616 
7617 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
7618 {
7619 	int ret;
7620 
7621 	/*
7622 	 * Node reclaim reclaims unmapped file backed pages and
7623 	 * slab pages if we are over the defined limits.
7624 	 *
7625 	 * A small portion of unmapped file backed pages is needed for
7626 	 * file I/O otherwise pages read by file I/O will be immediately
7627 	 * thrown out if the node is overallocated. So we do not reclaim
7628 	 * if less than a specified percentage of the node is used by
7629 	 * unmapped file backed pages.
7630 	 */
7631 	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
7632 	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
7633 	    pgdat->min_slab_pages)
7634 		return NODE_RECLAIM_FULL;
7635 
7636 	/*
7637 	 * Do not scan if the allocation should not be delayed.
7638 	 */
7639 	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
7640 		return NODE_RECLAIM_NOSCAN;
7641 
7642 	/*
7643 	 * Only run node reclaim on the local node or on nodes that do not
7644 	 * have associated processors. This will favor the local processor
7645 	 * over remote processors and spread off node memory allocations
7646 	 * as wide as possible.
7647 	 */
7648 	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
7649 		return NODE_RECLAIM_NOSCAN;
7650 
7651 	if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
7652 		return NODE_RECLAIM_NOSCAN;
7653 
7654 	ret = __node_reclaim(pgdat, gfp_mask, order);
7655 	clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
7656 
7657 	if (ret)
7658 		count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
7659 	else
7660 		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
7661 
7662 	return ret;
7663 }
7664 #endif
7665 
7666 /**
7667  * check_move_unevictable_folios - Move evictable folios to appropriate zone
7668  * lru list
7669  * @fbatch: Batch of lru folios to check.
7670  *
7671  * Checks folios for evictability, if an evictable folio is in the unevictable
7672  * lru list, moves it to the appropriate evictable lru list. This function
7673  * should be only used for lru folios.
7674  */
7675 void check_move_unevictable_folios(struct folio_batch *fbatch)
7676 {
7677 	struct lruvec *lruvec = NULL;
7678 	int pgscanned = 0;
7679 	int pgrescued = 0;
7680 	int i;
7681 
7682 	for (i = 0; i < fbatch->nr; i++) {
7683 		struct folio *folio = fbatch->folios[i];
7684 		int nr_pages = folio_nr_pages(folio);
7685 
7686 		pgscanned += nr_pages;
7687 
7688 		/* block memcg migration while the folio moves between lrus */
7689 		if (!folio_test_clear_lru(folio))
7690 			continue;
7691 
7692 		lruvec = folio_lruvec_relock_irq(folio, lruvec);
7693 		if (folio_evictable(folio) && folio_test_unevictable(folio)) {
7694 			lruvec_del_folio(lruvec, folio);
7695 			folio_clear_unevictable(folio);
7696 			lruvec_add_folio(lruvec, folio);
7697 			pgrescued += nr_pages;
7698 		}
7699 		folio_set_lru(folio);
7700 	}
7701 
7702 	if (lruvec) {
7703 		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
7704 		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
7705 		unlock_page_lruvec_irq(lruvec);
7706 	} else if (pgscanned) {
7707 		count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
7708 	}
7709 }
7710 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
7711