xref: /linux/mm/workingset.c (revision 5ea5880764cbb164afb17a62e76ca75dc371409d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Workingset detection
4  *
5  * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
6  */
7 
8 #include <linux/memcontrol.h>
9 #include <linux/mm_inline.h>
10 #include <linux/writeback.h>
11 #include <linux/shmem_fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/atomic.h>
14 #include <linux/module.h>
15 #include <linux/swap.h>
16 #include <linux/dax.h>
17 #include <linux/fs.h>
18 #include <linux/mm.h>
19 #include "swap_table.h"
20 #include "internal.h"
21 
22 /*
23  *		Double CLOCK lists
24  *
25  * Per node, two clock lists are maintained for file pages: the
26  * inactive and the active list.  Freshly faulted pages start out at
27  * the head of the inactive list and page reclaim scans pages from the
28  * tail.  Pages that are accessed multiple times on the inactive list
29  * are promoted to the active list, to protect them from reclaim,
30  * whereas active pages are demoted to the inactive list when the
31  * active list grows too big.
32  *
33  *   fault ------------------------+
34  *                                 |
35  *              +--------------+   |            +-------------+
36  *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
37  *              +--------------+                +-------------+    |
38  *                     |                                           |
39  *                     +-------------- promotion ------------------+
40  *
41  *
42  *		Access frequency and refault distance
43  *
44  * A workload is thrashing when its pages are frequently used but they
45  * are evicted from the inactive list every time before another access
46  * would have promoted them to the active list.
47  *
48  * In cases where the average access distance between thrashing pages
49  * is bigger than the size of memory there is nothing that can be
50  * done - the thrashing set could never fit into memory under any
51  * circumstance.
52  *
53  * However, the average access distance could be bigger than the
54  * inactive list, yet smaller than the size of memory.  In this case,
55  * the set could fit into memory if it weren't for the currently
56  * active pages - which may be used more frequently, but hopefully less:
57  *
58  *      +-memory available to cache-+
59  *      |                           |
60  *      +-inactive------+-active----+
61  *  a b | c d e f g h i | J K L M N |
62  *      +---------------+-----------+
63  *
64  * It is prohibitively expensive to accurately track access frequency
65  * of pages.  But a reasonable approximation can be made to measure
66  * thrashing on the inactive list, after which refaulting pages can be
67  * activated optimistically to compete with the existing active pages.
68  *
69  * Approximating inactive page access frequency - Observations:
70  *
71  * 1. When a page is accessed for the first time, it is added to the
72  *    head of the inactive list, slides every existing inactive page
73  *    towards the tail by one slot, and pushes the current tail page
74  *    out of memory.
75  *
76  * 2. When a page is accessed for the second time, it is promoted to
77  *    the active list, shrinking the inactive list by one slot.  This
78  *    also slides all inactive pages that were faulted into the cache
79  *    more recently than the activated page towards the tail of the
80  *    inactive list.
81  *
82  * Thus:
83  *
84  * 1. The sum of evictions and activations between any two points in
85  *    time indicates the minimum number of inactive pages accessed in
86  *    between.
87  *
88  * 2. Moving one inactive page N page slots towards the tail of the
89  *    list requires at least N inactive page accesses.
90  *
91  * Combining these:
92  *
93  * 1. When a page is finally evicted from memory, the number of
94  *    inactive pages accessed while the page was in cache is at least
95  *    the number of page slots on the inactive list.
96  *
97  * 2. In addition, measuring the sum of evictions and activations (E)
98  *    at the time of a page's eviction, and comparing it to another
99  *    reading (R) at the time the page faults back into memory tells
100  *    the minimum number of accesses while the page was not cached.
101  *    This is called the refault distance.
102  *
103  * Because the first access of the page was the fault and the second
104  * access was the refault, we combine the in-cache distance with the
105  * out-of-cache distance to get the complete minimum access distance
106  * of this page:
107  *
108  *      NR_inactive + (R - E)
109  *
110  * And knowing the minimum access distance of a page, we can easily
111  * tell if the page would be able to stay in cache assuming all page
112  * slots in the cache were available:
113  *
114  *   NR_inactive + (R - E) <= NR_inactive + NR_active
115  *
116  * If we have swap, we should also consider NR_inactive_anon and
117  * NR_active_anon, so for page cache and anonymous pages respectively:
118  *
119  *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
120  *   + NR_inactive_anon + NR_active_anon
121  *
122  *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
123  *   + NR_inactive_file + NR_active_file
124  *
125  * Which can be further simplified to:
126  *
127  *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
128  *
129  *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
130  *
131  * Put into words, the refault distance (out-of-cache) can be seen as
132  * a deficit in inactive list space (in-cache).  If the inactive list
133  * had (R - E) more page slots, the page would not have been evicted
134  * in between accesses, but activated instead.  And on a full system,
135  * the only thing eating into inactive list space is active pages.
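 *
 * An illustrative example with made-up numbers: suppose a file page
 * is evicted when the eviction counter reads E = 1000 and refaults
 * when it reads R = 1250, giving a refault distance of 250.  With
 * NR_active_file = 300 (and no swap), 250 <= 300 means the page
 * could have stayed resident had the inactive list been bigger, so
 * the refault is treated as thrashing and the page is activated;
 * a distance of, say, 5000 would be dismissed instead.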
136  *
137  *
138  *		Refaulting inactive pages
139  *
140  * All that is known about the active list is that the pages have been
141  * accessed more than once in the past.  This means that at any given
142  * time there is actually a good chance that pages on the active list
143  * are no longer in active use.
144  *
145  * So when a refault distance of (R - E) is observed and there are at
146  * least (R - E) pages in the userspace workingset, the refaulting page
147  * is activated optimistically in the hope that (R - E) pages are actually
148  * used less frequently than the refaulting page - or even not used at
149  * all anymore.
150  *
151  * That means if inactive cache is refaulting with a suitable refault
152  * distance, we assume the cache workingset is transitioning and put
153  * pressure on the current workingset.
154  *
155  * If this is wrong and demotion kicks in, the pages which are truly
156  * used more frequently will be reactivated while the less frequently
157  * used ones will be evicted from memory.
158  *
159  * But if this is right, the stale pages will be pushed out of memory
160  * and the used pages get to stay in cache.
161  *
162  *		Refaulting active pages
163  *
164  * If on the other hand the refaulting pages have recently been
165  * deactivated, it means that the active list is no longer protecting
166  * actively used cache from reclaim. The cache is NOT transitioning to
167  * a different workingset; the existing workingset is thrashing in the
168  * space allocated to the page cache.
169  *
170  *
171  *		Implementation
172  *
173  * For each lruvec (a memcg's LRU lists on a node), a counter for inactive
174  * evictions and activations is maintained (lruvec->nonresident_age).
175  *
176  * On eviction, a snapshot of this counter (along with some bits to
177  * identify the node) is stored in the now empty page cache
178  * slot of the evicted page.  This is called a shadow entry.
179  *
180  * On cache misses for which there are shadow entries, an eligible
181  * refault distance will immediately activate the refaulting page.
182  */
183 
184 #define WORKINGSET_SHIFT 1
185 #define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
186 			 WORKINGSET_SHIFT + NODES_SHIFT + \
187 			 MEM_CGROUP_ID_SHIFT)
188 #define EVICTION_SHIFT_ANON	(EVICTION_SHIFT + SWAP_COUNT_SHIFT)
189 #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
190 #define EVICTION_MASK_ANON	(~0UL >> EVICTION_SHIFT_ANON)
191 
192 /*
193  * Eviction timestamps need to be able to cover the full range of
194  * actionable refaults. However, bits are tight in the xarray
195  * entry, and after storing the identifier for the lruvec there might
196  * not be enough left to represent every single actionable refault. In
197  * that case, we have to sacrifice granularity for distance, and group
198  * evictions into coarser buckets by shaving off lower timestamp bits.
199  */
200 static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
201 
202 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
203 			 bool workingset, bool file)
204 {
205 	eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
206 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
207 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
208 	eviction = (eviction << WORKINGSET_SHIFT) | workingset;
209 
210 	return xa_mk_value(eviction);
211 }
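
/*
 * An illustrative sketch of the resulting layout on 64-bit, assuming
 * NODES_SHIFT == 10 and MEM_CGROUP_ID_SHIFT == 16 (the real widths are
 * configuration dependent), from least to most significant bit:
 *
 *   bit  0        xarray value tag (added by xa_mk_value())
 *   bit  1        workingset flag
 *   bits 2 - 11   node id
 *   bits 12 - 27  memcg id
 *   bits 28 - 63  eviction timestamp (EVICTION_MASK); anon entries keep
 *                 SWAP_COUNT_SHIFT fewer timestamp bits (EVICTION_MASK_ANON)
 */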
212 
213 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
214 			  unsigned long *evictionp, bool *workingsetp)
215 {
216 	unsigned long entry = xa_to_value(shadow);
217 	int memcgid, nid;
218 	bool workingset;
219 
220 	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
221 	entry >>= WORKINGSET_SHIFT;
222 	nid = entry & ((1UL << NODES_SHIFT) - 1);
223 	entry >>= NODES_SHIFT;
224 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
225 	entry >>= MEM_CGROUP_ID_SHIFT;
226 
227 	*memcgidp = memcgid;
228 	*pgdat = NODE_DATA(nid);
229 	*evictionp = entry;
230 	*workingsetp = workingset;
231 }
232 
233 #ifdef CONFIG_LRU_GEN
234 
235 static void *lru_gen_eviction(struct folio *folio)
236 {
237 	int hist;
238 	unsigned long token;
239 	unsigned long min_seq;
240 	struct lruvec *lruvec;
241 	struct lru_gen_folio *lrugen;
242 	int type = folio_is_file_lru(folio);
243 	int delta = folio_nr_pages(folio);
244 	int refs = folio_lru_refs(folio);
245 	bool workingset = folio_test_workingset(folio);
246 	int tier = lru_tier_from_refs(refs, workingset);
247 	struct mem_cgroup *memcg;
248 	struct pglist_data *pgdat = folio_pgdat(folio);
249 	unsigned short memcg_id;
250 
251 	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
252 		     BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
253 
254 	rcu_read_lock();
255 	memcg = folio_memcg(folio);
256 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
257 	lrugen = &lruvec->lrugen;
258 	min_seq = READ_ONCE(lrugen->min_seq[type]);
259 	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
260 
261 	hist = lru_hist_from_seq(min_seq);
262 	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
263 	memcg_id = mem_cgroup_private_id(memcg);
264 	rcu_read_unlock();
265 
266 	return pack_shadow(memcg_id, pgdat, token, workingset, type);
267 }
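
/*
 * A made-up illustration, assuming LRU_REFS_WIDTH == 2: a folio leaving
 * min_seq == 5 with refs == 3 is packed as token == (5 << 2) | 2.  The
 * token takes the place of the nonresident_age timestamp in the shadow
 * entry, so the generation number can be recovered on refault.
 */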
268 
269 /*
270  * Tests if the shadow entry is for a folio that was recently evicted.
271  * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
272  */
273 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
274 				unsigned long *token, bool *workingset, bool file)
275 {
276 	int memcg_id;
277 	unsigned long max_seq;
278 	struct mem_cgroup *memcg;
279 	struct pglist_data *pgdat;
280 
281 	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
282 
283 	memcg = mem_cgroup_from_private_id(memcg_id);
284 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
285 
286 	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
287 	max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;
288 
289 	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
290 }
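
/*
 * A made-up illustration, assuming MAX_NR_GENS == 4: with max_seq == 9,
 * a token carrying sequence 6 (difference 3) still counts as a recent
 * eviction, whereas a token carrying sequence 4 (difference 5) is
 * considered too old to draw conclusions from.
 */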
291 
292 static void lru_gen_refault(struct folio *folio, void *shadow)
293 {
294 	bool recent;
295 	int hist, tier, refs;
296 	bool workingset;
297 	unsigned long token;
298 	struct lruvec *lruvec;
299 	struct lru_gen_folio *lrugen;
300 	int type = folio_is_file_lru(folio);
301 	int delta = folio_nr_pages(folio);
302 
303 	rcu_read_lock();
304 
305 	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
306 	if (lruvec != folio_lruvec(folio))
307 		goto unlock;
308 
309 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
310 
311 	if (!recent)
312 		goto unlock;
313 
314 	lrugen = &lruvec->lrugen;
315 
316 	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
317 	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
318 	tier = lru_tier_from_refs(refs, workingset);
319 
320 	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
321 
322 	/* see folio_add_lru() where folio_set_active() will be called */
323 	if (lru_gen_in_fault())
324 		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
325 
326 	if (workingset) {
327 		folio_set_workingset(folio);
328 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
329 	} else
330 		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
331 unlock:
332 	rcu_read_unlock();
333 }
334 
335 #else /* !CONFIG_LRU_GEN */
336 
337 static void *lru_gen_eviction(struct folio *folio)
338 {
339 	return NULL;
340 }
341 
342 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
343 				unsigned long *token, bool *workingset, bool file)
344 {
345 	return false;
346 }
347 
348 static void lru_gen_refault(struct folio *folio, void *shadow)
349 {
350 }
351 
352 #endif /* CONFIG_LRU_GEN */
353 
354 /**
355  * workingset_age_nonresident - age non-resident entries as LRU ages
356  * @lruvec: the lruvec that was aged
357  * @nr_pages: the number of pages to count
358  *
359  * As in-memory pages are aged, non-resident pages need to be aged as
360  * well, in order for the refault distances later on to be comparable
361  * to the in-memory dimensions. This function allows reclaim and LRU
362  * operations to drive the non-resident aging along in parallel.
363  */
364 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
365 {
366 	/*
367 	 * Reclaiming a cgroup means reclaiming all its children in a
368 	 * round-robin fashion. That means that each cgroup has an LRU
369 	 * order that is composed of the LRU orders of its child
370 	 * cgroups; and every page has an LRU position not just in the
371 	 * cgroup that owns it, but in all of that group's ancestors.
372 	 *
373 	 * So when the physical inactive list of a leaf cgroup ages,
374 	 * the virtual inactive lists of all its parents, including
375 	 * the root cgroup's, age as well.
376 	 */
377 	do {
378 		atomic_long_add(nr_pages, &lruvec->nonresident_age);
379 	} while ((lruvec = parent_lruvec(lruvec)));
380 }
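
/*
 * For example, evicting or activating four pages in a leaf cgroup adds
 * four to nonresident_age in that cgroup's lruvec and in the lruvecs of
 * all of its ancestors up to the root, so shadow entries recorded at any
 * level of the hierarchy age consistently.
 */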
381 
382 /**
383  * workingset_eviction - note the eviction of a folio from memory
384  * @folio: the folio being evicted
385  * @target_memcg: the cgroup that is causing the reclaim
386  *
387  * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
388  * of the evicted @folio so that a later refault can be detected.
389  */
390 void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
391 {
392 	struct pglist_data *pgdat = folio_pgdat(folio);
393 	int file = folio_is_file_lru(folio);
394 	unsigned long eviction;
395 	struct lruvec *lruvec;
396 	int memcgid;
397 
398 	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
399 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
400 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
401 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
402 
403 	if (lru_gen_enabled())
404 		return lru_gen_eviction(folio);
405 
406 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
407 	/* XXX: target_memcg can be NULL, go through lruvec */
408 	memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
409 	eviction = atomic_long_read(&lruvec->nonresident_age);
410 	eviction >>= bucket_order[file];
411 	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
412 	return pack_shadow(memcgid, pgdat, eviction,
413 			   folio_test_workingset(folio), file);
414 }
415 
416 /**
417  * workingset_test_recent - tests if the shadow entry is for a folio that was
418  * recently evicted. Also fills in @workingset with the value unpacked from
419  * shadow.
420  * @shadow: the shadow entry to be tested.
421  * @file: whether the corresponding folio is from the file lru.
422  * @workingset: where the workingset value unpacked from shadow should
423  * be stored.
424  * @flush: whether to flush cgroup rstat.
425  *
426  * Return: true if the shadow is for a recently evicted folio; false otherwise.
427  */
428 bool workingset_test_recent(void *shadow, bool file, bool *workingset,
429 				bool flush)
430 {
431 	struct mem_cgroup *eviction_memcg;
432 	struct lruvec *eviction_lruvec;
433 	unsigned long refault_distance;
434 	unsigned long workingset_size;
435 	unsigned long refault;
436 	int memcgid;
437 	struct pglist_data *pgdat;
438 	unsigned long eviction;
439 
440 	if (lru_gen_enabled()) {
441 		bool recent;
442 
443 		rcu_read_lock();
444 		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
445 					     workingset, file);
446 		rcu_read_unlock();
447 		return recent;
448 	}
449 
450 	rcu_read_lock();
451 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
452 	eviction <<= bucket_order[file];
453 
454 	/*
455 	 * Look up the memcg associated with the stored ID. It might
456 	 * have been deleted since the folio's eviction.
457 	 *
458 	 * Note that in rare events the ID could have been recycled
459 	 * for a new cgroup that refaults a shared folio. This is
460 	 * impossible to tell from the available data. However, this
461 	 * should be a rare and limited disturbance, and activations
462 	 * are always speculative anyway. Ultimately, it's the aging
463 	 * algorithm's job to shake out the minimum access frequency
464 	 * for the active cache.
465 	 *
466 	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
467 	 * would be better if the root_mem_cgroup existed in all
468 	 * configurations instead.
469 	 */
470 	eviction_memcg = mem_cgroup_from_private_id(memcgid);
471 	if (!mem_cgroup_tryget(eviction_memcg))
472 		eviction_memcg = NULL;
473 	rcu_read_unlock();
474 
475 	if (!mem_cgroup_disabled() && !eviction_memcg)
476 		return false;
477 	/*
478 	 * Flush stats (and potentially sleep) outside the RCU read section.
479 	 *
480 	 * Note that workingset_test_recent() itself might be called in an RCU read
481 	 * section (e.g., in cachestat) - these callers need to skip flushing
482 	 * stats (via the flush argument).
483 	 *
484 	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
485 	 * still needed here?
486 	 */
487 	if (flush)
488 		mem_cgroup_flush_stats_ratelimited(eviction_memcg);
489 
490 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
491 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
492 
493 	/*
494 	 * Calculate the refault distance
495 	 *
496 	 * The unsigned subtraction here gives an accurate distance
497 	 * across nonresident_age overflows in most cases. There is a
498 	 * special case: usually, shadow entries have a short lifetime
499 	 * and are either refaulted or reclaimed along with the inode
500 	 * before they get too old.  But it is not impossible for the
501 	 * nonresident_age to lap a shadow entry in the field, which
502 	 * can then result in a false small refault distance, leading
503 	 * to a false activation should this old entry actually
504 	 * refault again.  However, earlier kernels used to deactivate
505 	 * unconditionally with *every* reclaim invocation for the
506 	 * longest time, so the occasional inappropriate activation
507 	 * leading to pressure on the active list is not a problem.
508 	 */
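	/*
	 * For illustration, with a hypothetical 4-bit counter (mask 0xf):
	 * a page evicted at eviction == 14 that refaults after the counter
	 * has wrapped around to refault == 3 still yields the correct
	 * distance, (3 - 14) & 0xf == 5.
	 */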
509 	refault_distance = ((refault - eviction) &
510 			    (file ? EVICTION_MASK : EVICTION_MASK_ANON));
511 
512 	/*
513 	 * Compare the distance to the existing workingset size. We
514 	 * don't activate pages that couldn't stay resident even if
515 	 * all the memory was available to the workingset. Whether
516 	 * workingset competition needs to consider anon or not depends
517 	 * on having free swap space.
518 	 */
519 	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
520 	if (!file) {
521 		workingset_size += lruvec_page_state(eviction_lruvec,
522 						     NR_INACTIVE_FILE);
523 	}
524 	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
525 		workingset_size += lruvec_page_state(eviction_lruvec,
526 						     NR_ACTIVE_ANON);
527 		if (file) {
528 			workingset_size += lruvec_page_state(eviction_lruvec,
529 						     NR_INACTIVE_ANON);
530 		}
531 	}
532 
533 	mem_cgroup_put(eviction_memcg);
534 	return refault_distance <= workingset_size;
535 }
536 
537 /**
538  * workingset_refault - Evaluate the refault of a previously evicted folio.
539  * @folio: The freshly allocated replacement folio.
540  * @shadow: Shadow entry of the evicted folio.
541  *
542  * Calculates and evaluates the refault distance of the previously
543  * evicted folio in the context of the node and the memcg whose memory
544  * pressure caused the eviction.
545  */
546 void workingset_refault(struct folio *folio, void *shadow)
547 {
548 	bool file = folio_is_file_lru(folio);
549 	struct mem_cgroup *memcg;
550 	struct lruvec *lruvec;
551 	bool workingset;
552 	long nr;
553 
554 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
555 
556 	if (lru_gen_enabled()) {
557 		lru_gen_refault(folio, shadow);
558 		return;
559 	}
560 
561 	/*
562 	 * The activation decision for this folio is made at the level
563 	 * where the eviction occurred, as that is where the LRU order
564 	 * during folio reclaim is being determined.
565 	 *
566 	 * However, the cgroup that will own the folio is the one that
567 	 * is actually experiencing the refault event. Make sure the folio is
568 	 * locked to guarantee folio_memcg() stability throughout.
569 	 */
570 	nr = folio_nr_pages(folio);
571 	memcg = get_mem_cgroup_from_folio(folio);
572 	lruvec = mem_cgroup_lruvec(memcg, folio_pgdat(folio));
573 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
574 
575 	if (!workingset_test_recent(shadow, file, &workingset, true))
576 		goto out;
577 
578 	folio_set_active(folio);
579 	workingset_age_nonresident(lruvec, nr);
580 	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
581 
582 	/* Folio was active prior to eviction */
583 	if (workingset) {
584 		folio_set_workingset(folio);
585 		/*
586 		 * XXX: Move to folio_add_lru() when it supports new vs
587 		 * putback
588 		 */
589 		lru_note_cost_refault(folio);
590 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
591 	}
592 out:
593 	mem_cgroup_put(memcg);
594 }
595 
596 /**
597  * workingset_activation - note a folio activation
598  * @folio: Folio that is being activated.
599  */
600 void workingset_activation(struct folio *folio)
601 {
602 	/*
603 	 * Filter non-memcg pages here, e.g. unmap can call
604 	 * mark_page_accessed() on VDSO pages.
605 	 */
606 	if (mem_cgroup_disabled() || folio_memcg_charged(folio)) {
607 		rcu_read_lock();
608 		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
609 		rcu_read_unlock();
610 	}
611 }
612 
613 /*
614  * Shadow entries reflect the share of the working set that does not
615  * fit into memory, so their number depends on the access pattern of
616  * the workload.  In most cases, they will refault or get reclaimed
617  * along with the inode, but a (malicious) workload that streams
618  * through files with a total size several times that of available
619  * memory, while preventing the inodes from being reclaimed, can
620  * create excessive amounts of shadow nodes.  To keep a lid on this,
621  * track shadow nodes and reclaim them when they grow way past the
622  * point where they would still be useful.
623  */
624 
625 struct list_lru shadow_nodes;
626 
627 void workingset_update_node(struct xa_node *node)
628 {
629 	struct page *page = virt_to_page(node);
630 
631 	/*
632 	 * Track non-empty nodes that contain only shadow entries;
633 	 * unlink those that contain pages or are being freed.
634 	 *
635 	 * Avoid acquiring the list_lru lock when the nodes are
636 	 * already where they should be. The list_empty() test is safe
637 	 * as node->private_list is protected by the i_pages lock.
638 	 */
639 	lockdep_assert_held(&node->array->xa_lock);
640 
641 	if (node->count && node->count == node->nr_values) {
642 		if (list_empty(&node->private_list)) {
643 			list_lru_add_obj(&shadow_nodes, &node->private_list);
644 			__inc_node_page_state(page, WORKINGSET_NODES);
645 		}
646 	} else {
647 		if (!list_empty(&node->private_list)) {
648 			list_lru_del_obj(&shadow_nodes, &node->private_list);
649 			__dec_node_page_state(page, WORKINGSET_NODES);
650 		}
651 	}
652 }
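
/*
 * For example, a node holding three shadow entries and no pages
 * (count == nr_values == 3) goes onto shadow_nodes; once a real page
 * is stored in it (count 4, nr_values 3), or its last shadow entry is
 * cleared (count 0), it is taken back off the list.
 */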
653 
654 static unsigned long count_shadow_nodes(struct shrinker *shrinker,
655 					struct shrink_control *sc)
656 {
657 	unsigned long max_nodes;
658 	unsigned long nodes;
659 	unsigned long pages;
660 
661 	nodes = list_lru_shrink_count(&shadow_nodes, sc);
662 	if (!nodes)
663 		return SHRINK_EMPTY;
664 
665 	/*
666 	 * Approximate a reasonable limit for the nodes
667 	 * containing shadow entries. We don't need to keep more
668 	 * shadow entries than possible pages on the active list,
669 	 * since refault distances bigger than that are dismissed.
670 	 *
671 	 * The size of the active list converges toward 100% of
672 	 * overall page cache as memory grows, with only a tiny
673 	 * inactive list. Assume the total cache size for that.
674 	 *
675 	 * Nodes might be sparsely populated, with only one shadow
676 	 * entry in the extreme case. Obviously, we cannot keep one
677 	 * node for every eligible shadow entry, so compromise on a
678 	 * worst-case density of 1/8th. Below that, not all eligible
679 	 * refaults can be detected anymore.
680 	 *
681 	 * On 64-bit with 7 xa_nodes per page and 64 slots
682 	 * each, this will reclaim shadow entries when they consume
683 	 * ~1.8% of available memory:
684 	 *
685 	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
686 	 */
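	/*
	 * Worked through with the numbers above: a page of xa_nodes holds
	 * 7 * 64 == 448 slots, or 56 shadow entries at the assumed 1/8
	 * density.  Allowing pages >> (XA_CHUNK_SHIFT - 3) == pages / 8
	 * nodes therefore caps node memory at roughly pages / 56, which
	 * is where the ~1.8% above comes from.
	 */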
687 #ifdef CONFIG_MEMCG
688 	if (sc->memcg) {
689 		struct lruvec *lruvec;
690 		int i;
691 
692 		mem_cgroup_flush_stats_ratelimited(sc->memcg);
693 		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
694 
695 		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
696 			pages += lruvec_lru_size(lruvec, i, MAX_NR_ZONES - 1);
697 
698 		pages += lruvec_page_state_local(
699 			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
700 		pages += lruvec_page_state_local(
701 			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
702 	} else
703 #endif
704 		pages = node_present_pages(sc->nid);
705 
706 	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
707 
708 	if (nodes <= max_nodes)
709 		return 0;
710 	return nodes - max_nodes;
711 }
712 
713 static enum lru_status shadow_lru_isolate(struct list_head *item,
714 					  struct list_lru_one *lru,
715 					  void *arg) __must_hold(lru->lock)
716 {
717 	struct xa_node *node = container_of(item, struct xa_node, private_list);
718 	struct address_space *mapping;
719 	int ret;
720 
721 	/*
722 	 * Page cache insertions and deletions synchronously maintain
723 	 * the shadow node LRU under the i_pages lock and the
724 	 * &lru->lock. Because the page cache tree is emptied before
725 	 * the inode can be destroyed, holding the &lru->lock pins any
726 	 * address_space that has nodes on the LRU.
727 	 *
728 	 * We can then safely transition to the i_pages lock to
729 	 * pin only the address_space of the particular node we want
730 	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
731 	 */
732 
733 	mapping = container_of(node->array, struct address_space, i_pages);
734 
735 	/* Coming from the list, invert the lock order */
736 	if (!xa_trylock(&mapping->i_pages)) {
737 		spin_unlock_irq(&lru->lock);
738 		ret = LRU_RETRY;
739 		goto out;
740 	}
741 
742 	/* For page cache we need to hold i_lock */
743 	if (mapping->host != NULL) {
744 		if (!spin_trylock(&mapping->host->i_lock)) {
745 			xa_unlock(&mapping->i_pages);
746 			spin_unlock_irq(&lru->lock);
747 			ret = LRU_RETRY;
748 			goto out;
749 		}
750 	}
751 
752 	list_lru_isolate(lru, item);
753 	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
754 
755 	spin_unlock(&lru->lock);
756 
757 	/*
758 	 * The nodes should only contain one or more shadow entries,
759 	 * no pages, so we expect to be able to remove them all and
760 	 * delete and free the empty node afterwards.
761 	 */
762 	if (WARN_ON_ONCE(!node->nr_values))
763 		goto out_invalid;
764 	if (WARN_ON_ONCE(node->count != node->nr_values))
765 		goto out_invalid;
766 	xa_delete_node(node, workingset_update_node);
767 	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
768 
769 out_invalid:
770 	xa_unlock_irq(&mapping->i_pages);
771 	if (mapping->host != NULL) {
772 		if (mapping_shrinkable(mapping))
773 			inode_lru_list_add(mapping->host);
774 		spin_unlock(&mapping->host->i_lock);
775 	}
776 	ret = LRU_REMOVED_RETRY;
777 out:
778 	cond_resched();
779 	return ret;
780 }
781 
782 static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
783 				       struct shrink_control *sc)
784 {
785 	/* list_lru lock nests inside the IRQ-safe i_pages lock */
786 	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
787 					NULL);
788 }
789 
790 /*
791  * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
792  * i_pages lock.
793  */
794 static struct lock_class_key shadow_nodes_key;
795 
796 static int __init workingset_init(void)
797 {
798 	unsigned int timestamp_bits, timestamp_bits_anon;
799 	struct shrinker *workingset_shadow_shrinker;
800 	unsigned int max_order;
801 	int ret = -ENOMEM;
802 
803 	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
804 	/*
805 	 * Calculate the eviction bucket size to cover the longest
806 	 * actionable refault distance, which is currently half of
807 	 * memory (totalram_pages/2). However, memory hotplug may add
808 	 * some more pages at runtime, so keep working with up to
809 	 * double the initial memory by using totalram_pages as-is.
810 	 */
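	/*
	 * Illustrative numbers only (assuming 36 usable file timestamp
	 * bits on 64-bit): a machine with 2^32 pages of RAM (16TiB at 4K
	 * pages) has max_order == 32, which fits, so
	 * bucket_order[WORKINGSET_FILE] stays 0.  Only when max_order
	 * exceeds the available timestamp bits are evictions grouped into
	 * buckets of 2^bucket_order pages.
	 */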
811 	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
812 	timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
813 	max_order = fls_long(totalram_pages() - 1);
814 	if (max_order > timestamp_bits)
815 		bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
816 	if (max_order > timestamp_bits_anon)
817 		bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
818 	pr_info("workingset: timestamp_bits=%u (anon: %u) max_order=%u bucket_order=%u (anon: %u)\n",
819 		timestamp_bits, timestamp_bits_anon, max_order,
820 		bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
821 
822 	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
823 						    SHRINKER_MEMCG_AWARE,
824 						    "mm-shadow");
825 	if (!workingset_shadow_shrinker)
826 		goto err;
827 
828 	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
829 				      &shadow_nodes_key);
830 	if (ret)
831 		goto err_list_lru;
832 
833 	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
834 	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
835 	/* ->count reports only fully expendable nodes */
836 	workingset_shadow_shrinker->seeks = 0;
837 
838 	shrinker_register(workingset_shadow_shrinker);
839 	return 0;
840 err_list_lru:
841 	shrinker_free(workingset_shadow_shrinker);
842 err:
843 	return ret;
844 }
845 module_init(workingset_init);
846