xref: /linux/mm/workingset.c (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Workingset detection
4  *
5  * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
6  */
7 
8 #include <linux/memcontrol.h>
9 #include <linux/mm_inline.h>
10 #include <linux/writeback.h>
11 #include <linux/shmem_fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/atomic.h>
14 #include <linux/module.h>
15 #include <linux/swap.h>
16 #include <linux/dax.h>
17 #include <linux/fs.h>
18 #include <linux/mm.h>
19 #include "swap_table.h"
20 #include "internal.h"
21 
22 /*
23  *		Double CLOCK lists
24  *
25  * Per node, two clock lists are maintained for file pages: the
26  * inactive and the active list.  Freshly faulted pages start out at
27  * the head of the inactive list and page reclaim scans pages from the
28  * tail.  Pages that are accessed multiple times on the inactive list
29  * are promoted to the active list, to protect them from reclaim,
30  * whereas active pages are demoted to the inactive list when the
31  * active list grows too big.
32  *
33  *   fault ------------------------+
34  *                                 |
35  *              +--------------+   |            +-------------+
36  *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
37  *              +--------------+                +-------------+    |
38  *                     |                                           |
39  *                     +-------------- promotion ------------------+
40  *
41  *
42  *		Access frequency and refault distance
43  *
44  * A workload is thrashing when its pages are frequently used but they
45  * are evicted from the inactive list every time before another access
46  * would have promoted them to the active list.
47  *
48  * In cases where the average access distance between thrashing pages
49  * is bigger than the size of memory there is nothing that can be
50  * done - the thrashing set could never fit into memory under any
51  * circumstance.
52  *
53  * However, the average access distance could be bigger than the
54  * inactive list, yet smaller than the size of memory.  In this case,
55  * the set could fit into memory if it weren't for the currently
56  * active pages - which may be used more, hopefully less frequently:
57  *
58  *      +-memory available to cache-+
59  *      |                           |
60  *      +-inactive------+-active----+
61  *  a b | c d e f g h i | J K L M N |
62  *      +---------------+-----------+
63  *
64  * It is prohibitively expensive to accurately track access frequency
65  * of pages.  But a reasonable approximation can be made to measure
66  * thrashing on the inactive list, after which refaulting pages can be
67  * activated optimistically to compete with the existing active pages.
68  *
69  * Approximating inactive page access frequency - Observations:
70  *
71  * 1. When a page is accessed for the first time, it is added to the
72  *    head of the inactive list, slides every existing inactive page
73  *    towards the tail by one slot, and pushes the current tail page
74  *    out of memory.
75  *
76  * 2. When a page is accessed for the second time, it is promoted to
77  *    the active list, shrinking the inactive list by one slot.  This
78  *    also slides all inactive pages that were faulted into the cache
79  *    more recently than the activated page towards the tail of the
80  *    inactive list.
81  *
82  * Thus:
83  *
84  * 1. The sum of evictions and activations between any two points in
85  *    time indicate the minimum number of inactive pages accessed in
86  *    between.
87  *
88  * 2. Moving one inactive page N page slots towards the tail of the
89  *    list requires at least N inactive page accesses.
90  *
91  * Combining these:
92  *
93  * 1. When a page is finally evicted from memory, the number of
94  *    inactive pages accessed while the page was in cache is at least
95  *    the number of page slots on the inactive list.
96  *
97  * 2. In addition, measuring the sum of evictions and activations (E)
98  *    at the time of a page's eviction, and comparing it to another
99  *    reading (R) at the time the page faults back into memory tells
100  *    the minimum number of accesses while the page was not cached.
101  *    This is called the refault distance.
102  *
103  * Because the first access of the page was the fault and the second
104  * access the refault, we combine the in-cache distance with the
105  * out-of-cache distance to get the complete minimum access distance
106  * of this page:
107  *
108  *      NR_inactive + (R - E)
109  *
110  * And knowing the minimum access distance of a page, we can easily
111  * tell if the page would be able to stay in cache assuming all page
112  * slots in the cache were available:
113  *
114  *   NR_inactive + (R - E) <= NR_inactive + NR_active
115  *
116  * If we have swap we should consider about NR_inactive_anon and
117  * NR_active_anon, so for page cache and anonymous respectively:
118  *
119  *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
120  *   + NR_inactive_anon + NR_active_anon
121  *
122  *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
123  *   + NR_inactive_file + NR_active_file
124  *
125  * Which can be further simplified to:
126  *
127  *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
128  *
129  *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
130  *
131  * Put into words, the refault distance (out-of-cache) can be seen as
132  * a deficit in inactive list space (in-cache).  If the inactive list
133  * had (R - E) more page slots, the page would not have been evicted
134  * in between accesses, but activated instead.  And on a full system,
135  * the only thing eating into inactive list space is active pages.
136  *
137  *
138  *		Refaulting inactive pages
139  *
140  * All that is known about the active list is that the pages have been
141  * accessed more than once in the past.  This means that at any given
142  * time there is actually a good chance that pages on the active list
143  * are no longer in active use.
144  *
145  * So when a refault distance of (R - E) is observed and there are at
146  * least (R - E) pages in the userspace workingset, the refaulting page
147  * is activated optimistically in the hope that (R - E) pages are actually
148  * used less frequently than the refaulting page - or even not used at
149  * all anymore.
150  *
151  * That means if inactive cache is refaulting with a suitable refault
152  * distance, we assume the cache workingset is transitioning and put
153  * pressure on the current workingset.
154  *
155  * If this is wrong and demotion kicks in, the pages which are truly
156  * used more frequently will be reactivated while the less frequently
157  * used once will be evicted from memory.
158  *
159  * But if this is right, the stale pages will be pushed out of memory
160  * and the used pages get to stay in cache.
161  *
162  *		Refaulting active pages
163  *
164  * If on the other hand the refaulting pages have recently been
165  * deactivated, it means that the active list is no longer protecting
166  * actively used cache from reclaim. The cache is NOT transitioning to
167  * a different workingset; the existing workingset is thrashing in the
168  * space allocated to the page cache.
169  *
170  *
171  *		Implementation
172  *
173  * For each node's LRU lists, a counter for inactive evictions and
174  * activations is maintained (node->nonresident_age).
175  *
176  * On eviction, a snapshot of this counter (along with some bits to
177  * identify the node) is stored in the now empty page cache
178  * slot of the evicted page.  This is called a shadow entry.
179  *
180  * On cache misses for which there are shadow entries, an eligible
181  * refault distance will immediately activate the refaulting page.
182  */
183 
184 #define WORKINGSET_SHIFT 1
185 #define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
186 			 WORKINGSET_SHIFT + NODES_SHIFT + \
187 			 MEM_CGROUP_ID_SHIFT)
188 #define EVICTION_SHIFT_ANON	(EVICTION_SHIFT + SWAP_COUNT_SHIFT)
189 #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
190 #define EVICTION_MASK_ANON	(~0UL >> EVICTION_SHIFT_ANON)
191 
192 /*
193  * Eviction timestamps need to be able to cover the full range of
194  * actionable refaults. However, bits are tight in the xarray
195  * entry, and after storing the identifier for the lruvec there might
196  * not be enough left to represent every single actionable refault. In
197  * that case, we have to sacrifice granularity for distance, and group
198  * evictions into coarser buckets by shaving off lower timestamp bits.
199  */
200 static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
201 
202 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
203 			 bool workingset, bool file)
204 {
205 	eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
206 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
207 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
208 	eviction = (eviction << WORKINGSET_SHIFT) | workingset;
209 
210 	return xa_mk_value(eviction);
211 }
212 
213 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
214 			  unsigned long *evictionp, bool *workingsetp)
215 {
216 	unsigned long entry = xa_to_value(shadow);
217 	int memcgid, nid;
218 	bool workingset;
219 
220 	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
221 	entry >>= WORKINGSET_SHIFT;
222 	nid = entry & ((1UL << NODES_SHIFT) - 1);
223 	entry >>= NODES_SHIFT;
224 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
225 	entry >>= MEM_CGROUP_ID_SHIFT;
226 
227 	*memcgidp = memcgid;
228 	*pgdat = NODE_DATA(nid);
229 	*evictionp = entry;
230 	*workingsetp = workingset;
231 }
232 
233 #ifdef CONFIG_LRU_GEN
234 
235 static void *lru_gen_eviction(struct folio *folio)
236 {
237 	int hist;
238 	unsigned long token;
239 	unsigned long min_seq;
240 	struct lruvec *lruvec;
241 	struct lru_gen_folio *lrugen;
242 	int type = folio_is_file_lru(folio);
243 	int delta = folio_nr_pages(folio);
244 	int refs = folio_lru_refs(folio);
245 	bool workingset = folio_test_workingset(folio);
246 	int tier = lru_tier_from_refs(refs, workingset);
247 	struct mem_cgroup *memcg;
248 	struct pglist_data *pgdat = folio_pgdat(folio);
249 	unsigned short memcg_id;
250 
251 	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
252 		     BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
253 
254 	rcu_read_lock();
255 	memcg = folio_memcg(folio);
256 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
257 	lrugen = &lruvec->lrugen;
258 	min_seq = READ_ONCE(lrugen->min_seq[type]);
259 	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
260 
261 	hist = lru_hist_from_seq(min_seq);
262 	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
263 	memcg_id = mem_cgroup_private_id(memcg);
264 	rcu_read_unlock();
265 
266 	return pack_shadow(memcg_id, pgdat, token, workingset, type);
267 }
268 
269 /*
270  * Tests if the shadow entry is for a folio that was recently evicted.
271  * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
272  */
273 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
274 				unsigned long *token, bool *workingset, bool file)
275 {
276 	int memcg_id;
277 	unsigned long max_seq;
278 	struct mem_cgroup *memcg;
279 	struct pglist_data *pgdat;
280 
281 	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
282 
283 	memcg = mem_cgroup_from_private_id(memcg_id);
284 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
285 
286 	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
287 	max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;
288 
289 	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
290 }
291 
292 static void lru_gen_refault(struct folio *folio, void *shadow)
293 {
294 	bool recent;
295 	int hist, tier, refs;
296 	bool workingset;
297 	unsigned long token;
298 	struct lruvec *lruvec;
299 	struct lru_gen_folio *lrugen;
300 	int type = folio_is_file_lru(folio);
301 	int delta = folio_nr_pages(folio);
302 
303 	rcu_read_lock();
304 
305 	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
306 	if (lruvec != folio_lruvec(folio))
307 		goto unlock;
308 
309 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
310 
311 	if (!recent)
312 		goto unlock;
313 
314 	lrugen = &lruvec->lrugen;
315 
316 	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
317 	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
318 	tier = lru_tier_from_refs(refs, workingset);
319 
320 	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
321 
322 	if (workingset) {
323 		/*
324 		 * see folio_add_lru(), where folio_set_active() is
325 		 * called for workingset folios
326 		 */
327 		if (lru_gen_in_fault())
328 			mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
329 		folio_set_workingset(folio);
330 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
331 	} else
332 		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
333 unlock:
334 	rcu_read_unlock();
335 }
336 
337 #else /* !CONFIG_LRU_GEN */
338 
339 static void *lru_gen_eviction(struct folio *folio)
340 {
341 	return NULL;
342 }
343 
344 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
345 				unsigned long *token, bool *workingset, bool file)
346 {
347 	return false;
348 }
349 
350 static void lru_gen_refault(struct folio *folio, void *shadow)
351 {
352 }
353 
354 #endif /* CONFIG_LRU_GEN */
355 
356 /**
357  * workingset_age_nonresident - age non-resident entries as LRU ages
358  * @lruvec: the lruvec that was aged
359  * @nr_pages: the number of pages to count
360  *
361  * As in-memory pages are aged, non-resident pages need to be aged as
362  * well, in order for the refault distances later on to be comparable
363  * to the in-memory dimensions. This function allows reclaim and LRU
364  * operations to drive the non-resident aging along in parallel.
365  */
366 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
367 {
368 	/*
369 	 * Reclaiming a cgroup means reclaiming all its children in a
370 	 * round-robin fashion. That means that each cgroup has an LRU
371 	 * order that is composed of the LRU orders of its child
372 	 * cgroups; and every page has an LRU position not just in the
373 	 * cgroup that owns it, but in all of that group's ancestors.
374 	 *
375 	 * So when the physical inactive list of a leaf cgroup ages,
376 	 * the virtual inactive lists of all its parents, including
377 	 * the root cgroup's, age as well.
378 	 */
379 	do {
380 		atomic_long_add(nr_pages, &lruvec->nonresident_age);
381 	} while ((lruvec = parent_lruvec(lruvec)));
382 }
383 
384 /**
385  * workingset_eviction - note the eviction of a folio from memory
386  * @target_memcg: the cgroup that is causing the reclaim
387  * @folio: the folio being evicted
388  *
389  * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
390  * of the evicted @folio so that a later refault can be detected.
391  */
392 void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
393 {
394 	struct pglist_data *pgdat = folio_pgdat(folio);
395 	int file = folio_is_file_lru(folio);
396 	unsigned long eviction;
397 	struct lruvec *lruvec;
398 	int memcgid;
399 
400 	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
401 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
402 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
403 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
404 
405 	if (lru_gen_enabled())
406 		return lru_gen_eviction(folio);
407 
408 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
409 	/* XXX: target_memcg can be NULL, go through lruvec */
410 	memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
411 	eviction = atomic_long_read(&lruvec->nonresident_age);
412 	eviction >>= bucket_order[file];
413 	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
414 	return pack_shadow(memcgid, pgdat, eviction,
415 			   folio_test_workingset(folio), file);
416 }
417 
418 /**
419  * workingset_test_recent - tests if the shadow entry is for a folio that was
420  * recently evicted. Also fills in @workingset with the value unpacked from
421  * shadow.
422  * @shadow: the shadow entry to be tested.
423  * @file: whether the corresponding folio is from the file lru.
424  * @workingset: where the workingset value unpacked from shadow should
425  * be stored.
426  * @flush: whether to flush cgroup rstat.
427  *
428  * Return: true if the shadow is for a recently evicted folio; false otherwise.
429  */
430 bool workingset_test_recent(void *shadow, bool file, bool *workingset,
431 				bool flush)
432 {
433 	struct mem_cgroup *eviction_memcg;
434 	struct lruvec *eviction_lruvec;
435 	unsigned long refault_distance;
436 	unsigned long workingset_size;
437 	unsigned long refault;
438 	int memcgid;
439 	struct pglist_data *pgdat;
440 	unsigned long eviction;
441 
442 	if (lru_gen_enabled()) {
443 		bool recent;
444 
445 		rcu_read_lock();
446 		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
447 					     workingset, file);
448 		rcu_read_unlock();
449 		return recent;
450 	}
451 
452 	rcu_read_lock();
453 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
454 	eviction <<= bucket_order[file];
455 
456 	/*
457 	 * Look up the memcg associated with the stored ID. It might
458 	 * have been deleted since the folio's eviction.
459 	 *
460 	 * Note that in rare events the ID could have been recycled
461 	 * for a new cgroup that refaults a shared folio. This is
462 	 * impossible to tell from the available data. However, this
463 	 * should be a rare and limited disturbance, and activations
464 	 * are always speculative anyway. Ultimately, it's the aging
465 	 * algorithm's job to shake out the minimum access frequency
466 	 * for the active cache.
467 	 *
468 	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
469 	 * would be better if the root_mem_cgroup existed in all
470 	 * configurations instead.
471 	 */
472 	eviction_memcg = mem_cgroup_from_private_id(memcgid);
473 	if (!mem_cgroup_tryget(eviction_memcg))
474 		eviction_memcg = NULL;
475 	rcu_read_unlock();
476 
477 	if (!mem_cgroup_disabled() && !eviction_memcg)
478 		return false;
479 	/*
480 	 * Flush stats (and potentially sleep) outside the RCU read section.
481 	 *
482 	 * Note that workingset_test_recent() itself might be called in RCU read
483 	 * section (for e.g, in cachestat) - these callers need to skip flushing
484 	 * stats (via the flush argument).
485 	 *
486 	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
487 	 * still needed here?
488 	 */
489 	if (flush)
490 		mem_cgroup_flush_stats_ratelimited(eviction_memcg);
491 
492 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
493 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
494 
495 	/*
496 	 * Calculate the refault distance
497 	 *
498 	 * The unsigned subtraction here gives an accurate distance
499 	 * across nonresident_age overflows in most cases. There is a
500 	 * special case: usually, shadow entries have a short lifetime
501 	 * and are either refaulted or reclaimed along with the inode
502 	 * before they get too old.  But it is not impossible for the
503 	 * nonresident_age to lap a shadow entry in the field, which
504 	 * can then result in a false small refault distance, leading
505 	 * to a false activation should this old entry actually
506 	 * refault again.  However, earlier kernels used to deactivate
507 	 * unconditionally with *every* reclaim invocation for the
508 	 * longest time, so the occasional inappropriate activation
509 	 * leading to pressure on the active list is not a problem.
510 	 */
511 	refault_distance = ((refault - eviction) &
512 			    (file ? EVICTION_MASK : EVICTION_MASK_ANON));
513 
514 	/*
515 	 * Compare the distance to the existing workingset size. We
516 	 * don't activate pages that couldn't stay resident even if
517 	 * all the memory was available to the workingset. Whether
518 	 * workingset competition needs to consider anon or not depends
519 	 * on having free swap space.
520 	 */
521 	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
522 	if (!file) {
523 		workingset_size += lruvec_page_state(eviction_lruvec,
524 						     NR_INACTIVE_FILE);
525 	}
526 	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
527 		workingset_size += lruvec_page_state(eviction_lruvec,
528 						     NR_ACTIVE_ANON);
529 		if (file) {
530 			workingset_size += lruvec_page_state(eviction_lruvec,
531 						     NR_INACTIVE_ANON);
532 		}
533 	}
534 
535 	mem_cgroup_put(eviction_memcg);
536 	return refault_distance <= workingset_size;
537 }
538 
539 /**
540  * workingset_refault - Evaluate the refault of a previously evicted folio.
541  * @folio: The freshly allocated replacement folio.
542  * @shadow: Shadow entry of the evicted folio.
543  *
544  * Calculates and evaluates the refault distance of the previously
545  * evicted folio in the context of the node and the memcg whose memory
546  * pressure caused the eviction.
547  */
548 void workingset_refault(struct folio *folio, void *shadow)
549 {
550 	bool file = folio_is_file_lru(folio);
551 	struct mem_cgroup *memcg;
552 	struct lruvec *lruvec;
553 	bool workingset;
554 	long nr;
555 
556 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
557 
558 	if (lru_gen_enabled()) {
559 		lru_gen_refault(folio, shadow);
560 		return;
561 	}
562 
563 	/*
564 	 * The activation decision for this folio is made at the level
565 	 * where the eviction occurred, as that is where the LRU order
566 	 * during folio reclaim is being determined.
567 	 *
568 	 * However, the cgroup that will own the folio is the one that
569 	 * is actually experiencing the refault event. Make sure the folio is
570 	 * locked to guarantee folio_memcg() stability throughout.
571 	 */
572 	nr = folio_nr_pages(folio);
573 	memcg = get_mem_cgroup_from_folio(folio);
574 	lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));
575 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
576 
577 	if (!workingset_test_recent(shadow, file, &workingset, true))
578 		goto out;
579 
580 	folio_set_active(folio);
581 	workingset_age_nonresident(lruvec, nr);
582 	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
583 
584 	/* Folio was active prior to eviction */
585 	if (workingset) {
586 		folio_set_workingset(folio);
587 		/*
588 		 * XXX: Move to folio_add_lru() when it supports new vs
589 		 * putback
590 		 */
591 		lru_note_cost_refault(folio);
592 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
593 	}
594 out:
595 	mem_cgroup_put(memcg);
596 }
597 
598 /**
599  * workingset_activation - note a page activation
600  * @folio: Folio that is being activated.
601  */
602 void workingset_activation(struct folio *folio)
603 {
604 	/*
605 	 * Filter non-memcg pages here, e.g. unmap can call
606 	 * mark_page_accessed() on VDSO pages.
607 	 */
608 	if (mem_cgroup_disabled() || folio_memcg_charged(folio)) {
609 		rcu_read_lock();
610 		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
611 		rcu_read_unlock();
612 	}
613 }
614 
615 /*
616  * Shadow entries reflect the share of the working set that does not
617  * fit into memory, so their number depends on the access pattern of
618  * the workload.  In most cases, they will refault or get reclaimed
619  * along with the inode, but a (malicious) workload that streams
620  * through files with a total size several times that of available
621  * memory, while preventing the inodes from being reclaimed, can
622  * create excessive amounts of shadow nodes.  To keep a lid on this,
623  * track shadow nodes and reclaim them when they grow way past the
624  * point where they would still be useful.
625  */
626 
627 struct list_lru shadow_nodes;
628 
629 void workingset_update_node(struct xa_node *node)
630 {
631 	struct page *page = virt_to_page(node);
632 
633 	/*
634 	 * Track non-empty nodes that contain only shadow entries;
635 	 * unlink those that contain pages or are being freed.
636 	 *
637 	 * Avoid acquiring the list_lru lock when the nodes are
638 	 * already where they should be. The list_empty() test is safe
639 	 * as node->private_list is protected by the i_pages lock.
640 	 */
641 	lockdep_assert_held(&node->array->xa_lock);
642 
643 	if (node->count && node->count == node->nr_values) {
644 		if (list_empty(&node->private_list)) {
645 			list_lru_add_obj(&shadow_nodes, &node->private_list);
646 			__inc_node_page_state(page, WORKINGSET_NODES);
647 		}
648 	} else {
649 		if (!list_empty(&node->private_list)) {
650 			list_lru_del_obj(&shadow_nodes, &node->private_list);
651 			__dec_node_page_state(page, WORKINGSET_NODES);
652 		}
653 	}
654 }
655 
656 static unsigned long count_shadow_nodes(struct shrinker *shrinker,
657 					struct shrink_control *sc)
658 {
659 	unsigned long max_nodes;
660 	unsigned long nodes;
661 	unsigned long pages;
662 
663 	nodes = list_lru_shrink_count(&shadow_nodes, sc);
664 	if (!nodes)
665 		return SHRINK_EMPTY;
666 
667 	/*
668 	 * Approximate a reasonable limit for the nodes
669 	 * containing shadow entries. We don't need to keep more
670 	 * shadow entries than possible pages on the active list,
671 	 * since refault distances bigger than that are dismissed.
672 	 *
673 	 * The size of the active list converges toward 100% of
674 	 * overall page cache as memory grows, with only a tiny
675 	 * inactive list. Assume the total cache size for that.
676 	 *
677 	 * Nodes might be sparsely populated, with only one shadow
678 	 * entry in the extreme case. Obviously, we cannot keep one
679 	 * node for every eligible shadow entry, so compromise on a
680 	 * worst-case density of 1/8th. Below that, not all eligible
681 	 * refaults can be detected anymore.
682 	 *
683 	 * On 64-bit with 7 xa_nodes per page and 64 slots
684 	 * each, this will reclaim shadow entries when they consume
685 	 * ~1.8% of available memory:
686 	 *
687 	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
688 	 */
689 #ifdef CONFIG_MEMCG
690 	if (sc->memcg) {
691 		struct lruvec *lruvec;
692 		int i;
693 
694 		mem_cgroup_flush_stats_ratelimited(sc->memcg);
695 		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
696 
697 		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
698 			pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1);
699 
700 		pages += lruvec_page_state_local(
701 			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
702 		pages += lruvec_page_state_local(
703 			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
704 	} else
705 #endif
706 		pages = node_present_pages(sc->nid);
707 
708 	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
709 
710 	if (nodes <= max_nodes)
711 		return 0;
712 	return nodes - max_nodes;
713 }
714 
715 static enum lru_status shadow_lru_isolate(struct list_head *item,
716 					  struct list_lru_one *lru,
717 					  void *arg) __must_hold(lru->lock)
718 {
719 	struct xa_node *node = container_of(item, struct xa_node, private_list);
720 	struct address_space *mapping;
721 	int ret;
722 
723 	/*
724 	 * Page cache insertions and deletions synchronously maintain
725 	 * the shadow node LRU under the i_pages lock and the
726 	 * &lru->lock. Because the page cache tree is emptied before
727 	 * the inode can be destroyed, holding the &lru->lock pins any
728 	 * address_space that has nodes on the LRU.
729 	 *
730 	 * We can then safely transition to the i_pages lock to
731 	 * pin only the address_space of the particular node we want
732 	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
733 	 */
734 
735 	mapping = container_of(node->array, struct address_space, i_pages);
736 
737 	/* Coming from the list, invert the lock order */
738 	if (!xa_trylock(&mapping->i_pages)) {
739 		spin_unlock_irq(&lru->lock);
740 		ret = LRU_RETRY;
741 		goto out;
742 	}
743 
744 	/* For page cache we need to hold i_lock */
745 	if (mapping->host != NULL) {
746 		if (!spin_trylock(&mapping->host->i_lock)) {
747 			xa_unlock(&mapping->i_pages);
748 			spin_unlock_irq(&lru->lock);
749 			ret = LRU_RETRY;
750 			goto out;
751 		}
752 	}
753 
754 	list_lru_isolate(lru, item);
755 	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
756 
757 	spin_unlock(&lru->lock);
758 
759 	/*
760 	 * The nodes should only contain one or more shadow entries,
761 	 * no pages, so we expect to be able to remove them all and
762 	 * delete and free the empty node afterwards.
763 	 */
764 	if (WARN_ON_ONCE(!node->nr_values))
765 		goto out_invalid;
766 	if (WARN_ON_ONCE(node->count != node->nr_values))
767 		goto out_invalid;
768 	xa_delete_node(node, workingset_update_node);
769 	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
770 
771 out_invalid:
772 	xa_unlock_irq(&mapping->i_pages);
773 	if (mapping->host != NULL) {
774 		if (mapping_shrinkable(mapping))
775 			inode_lru_list_add(mapping->host);
776 		spin_unlock(&mapping->host->i_lock);
777 	}
778 	ret = LRU_REMOVED_RETRY;
779 out:
780 	cond_resched();
781 	return ret;
782 }
783 
784 static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
785 				       struct shrink_control *sc)
786 {
787 	/* list_lru lock nests inside the IRQ-safe i_pages lock */
788 	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
789 					NULL);
790 }
791 
792 /*
793  * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
794  * i_pages lock.
795  */
796 static struct lock_class_key shadow_nodes_key;
797 
798 static int __init workingset_init(void)
799 {
800 	unsigned int timestamp_bits, timestamp_bits_anon;
801 	struct shrinker *workingset_shadow_shrinker;
802 	unsigned int max_order;
803 	int ret = -ENOMEM;
804 
805 	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
806 	/*
807 	 * Calculate the eviction bucket size to cover the longest
808 	 * actionable refault distance, which is currently half of
809 	 * memory (totalram_pages/2). However, memory hotplug may add
810 	 * some more pages at runtime, so keep working with up to
811 	 * double the initial memory by using totalram_pages as-is.
812 	 */
813 	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
814 	timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
815 	max_order = fls_long(totalram_pages() - 1);
816 	if (max_order > (BITS_PER_LONG - EVICTION_SHIFT))
817 		bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
818 	if (max_order > timestamp_bits_anon)
819 		bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
820 	pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
821 		timestamp_bits, timestamp_bits_anon, max_order,
822 		bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
823 
824 	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
825 						    SHRINKER_MEMCG_AWARE,
826 						    "mm-shadow");
827 	if (!workingset_shadow_shrinker)
828 		goto err;
829 
830 	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
831 				      &shadow_nodes_key);
832 	if (ret)
833 		goto err_list_lru;
834 
835 	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
836 	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
837 	/* ->count reports only fully expendable nodes */
838 	workingset_shadow_shrinker->seeks = 0;
839 
840 	shrinker_register(workingset_shadow_shrinker);
841 	return 0;
842 err_list_lru:
843 	shrinker_free(workingset_shadow_shrinker);
844 err:
845 	return ret;
846 }
847 module_init(workingset_init);
848