xref: /linux/mm/workingset.c (revision aec2f682d47c54ef434b2d440992626d80b1ebdc)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Workingset detection
4  *
5  * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
6  */
7 
8 #include <linux/memcontrol.h>
9 #include <linux/mm_inline.h>
10 #include <linux/writeback.h>
11 #include <linux/shmem_fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/atomic.h>
14 #include <linux/module.h>
15 #include <linux/swap.h>
16 #include <linux/dax.h>
17 #include <linux/fs.h>
18 #include <linux/mm.h>
19 #include "swap_table.h"
20 #include "internal.h"
21 
22 /*
23  *		Double CLOCK lists
24  *
25  * Per node, two clock lists are maintained for file pages: the
26  * inactive and the active list.  Freshly faulted pages start out at
27  * the head of the inactive list and page reclaim scans pages from the
28  * tail.  Pages that are accessed multiple times on the inactive list
29  * are promoted to the active list, to protect them from reclaim,
30  * whereas active pages are demoted to the inactive list when the
31  * active list grows too big.
32  *
33  *   fault ------------------------+
34  *                                 |
35  *              +--------------+   |            +-------------+
36  *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
37  *              +--------------+                +-------------+    |
38  *                     |                                           |
39  *                     +-------------- promotion ------------------+
40  *
41  *
42  *		Access frequency and refault distance
43  *
44  * A workload is thrashing when its pages are frequently used but they
45  * are evicted from the inactive list every time before another access
46  * would have promoted them to the active list.
47  *
48  * In cases where the average access distance between thrashing pages
49  * is bigger than the size of memory, there is nothing that can be
50  * done - the thrashing set could never fit into memory under any
51  * circumstance.
52  *
53  * However, the average access distance could be bigger than the
54  * inactive list, yet smaller than the size of memory.  In this case,
55  * the set could fit into memory if it weren't for the currently
56  * active pages - which may be used more, but hopefully less, frequently:
57  *
58  *      +-memory available to cache-+
59  *      |                           |
60  *      +-inactive------+-active----+
61  *  a b | c d e f g h i | J K L M N |
62  *      +---------------+-----------+
63  *
64  * It is prohibitively expensive to accurately track access frequency
65  * of pages.  But a reasonable approximation can be made to measure
66  * thrashing on the inactive list, after which refaulting pages can be
67  * activated optimistically to compete with the existing active pages.
68  *
69  * Approximating inactive page access frequency - Observations:
70  *
71  * 1. When a page is accessed for the first time, it is added to the
72  *    head of the inactive list, slides every existing inactive page
73  *    towards the tail by one slot, and pushes the current tail page
74  *    out of memory.
75  *
76  * 2. When a page is accessed for the second time, it is promoted to
77  *    the active list, shrinking the inactive list by one slot.  This
78  *    also slides all inactive pages that were faulted into the cache
79  *    more recently than the activated page towards the tail of the
80  *    inactive list.
81  *
82  * Thus:
83  *
84  * 1. The sum of evictions and activations between any two points in
85  *    time indicates the minimum number of inactive pages accessed in
86  *    between.
87  *
88  * 2. Moving one inactive page N page slots towards the tail of the
89  *    list requires at least N inactive page accesses.
90  *
91  * Combining these:
92  *
93  * 1. When a page is finally evicted from memory, the number of
94  *    inactive pages accessed while the page was in cache is at least
95  *    the number of page slots on the inactive list.
96  *
97  * 2. In addition, measuring the sum of evictions and activations (E)
98  *    at the time of a page's eviction, and comparing it to another
99  *    reading (R) at the time the page faults back into memory tells
100  *    the minimum number of accesses while the page was not cached.
101  *    This is called the refault distance.
102  *
103  * Because the first access of the page was the fault and the second
104  * access the refault, we combine the in-cache distance with the
105  * out-of-cache distance to get the complete minimum access distance
106  * of this page:
107  *
108  *      NR_inactive + (R - E)
109  *
110  * And knowing the minimum access distance of a page, we can easily
111  * tell if the page would be able to stay in cache assuming all page
112  * slots in the cache were available:
113  *
114  *   NR_inactive + (R - E) <= NR_inactive + NR_active
115  *
116  * If we have swap, we also need to consider NR_inactive_anon and
117  * NR_active_anon, so for page cache and anonymous pages respectively:
118  *
119  *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
120  *   + NR_inactive_anon + NR_active_anon
121  *
122  *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
123  *   + NR_inactive_file + NR_active_file
124  *
125  * Which can be further simplified to:
126  *
127  *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
128  *
129  *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
130  *
131  * Put into words, the refault distance (out-of-cache) can be seen as
132  * a deficit in inactive list space (in-cache).  If the inactive list
133  * had (R - E) more page slots, the page would not have been evicted
134  * in between accesses, but activated instead.  And on a full system,
135  * the only thing eating into inactive list space is active pages.
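 *
 * For example (illustrative numbers, ignoring anon/swap): with
 * NR_inactive_file = 100 and NR_active_file = 100, a file page
 * refaulting with R - E = 150 has a minimum access distance of
 * 100 + 150 = 250 and could not have stayed resident in the 200
 * available slots, so it is left on the inactive list like any other
 * fresh fault; with R - E = 50 the distance of 150 fits, and the page
 * is activated.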
136  *
137  *
138  *		Refaulting inactive pages
139  *
140  * All that is known about the active list is that the pages have been
141  * accessed more than once in the past.  This means that at any given
142  * time there is actually a good chance that pages on the active list
143  * are no longer in active use.
144  *
145  * So when a refault distance of (R - E) is observed and there are at
146  * least (R - E) pages in the userspace workingset, the refaulting page
147  * is activated optimistically in the hope that (R - E) pages are actually
148  * used less frequently than the refaulting page - or even not used at
149  * all anymore.
150  *
151  * That means if inactive cache is refaulting with a suitable refault
152  * distance, we assume the cache workingset is transitioning and put
153  * pressure on the current workingset.
154  *
155  * If this is wrong and demotion kicks in, the pages which are truly
156  * used more frequently will be reactivated while the less frequently
157  * used ones will be evicted from memory.
158  *
159  * But if this is right, the stale pages will be pushed out of memory
160  * and the used pages get to stay in cache.
161  *
162  *		Refaulting active pages
163  *
164  * If on the other hand the refaulting pages have recently been
165  * deactivated, it means that the active list is no longer protecting
166  * actively used cache from reclaim. The cache is NOT transitioning to
167  * a different workingset; the existing workingset is thrashing in the
168  * space allocated to the page cache.
169  *
170  *
171  *		Implementation
172  *
173  * For each node's LRU lists, a counter for inactive evictions and
174  * activations is maintained (lruvec->nonresident_age).
175  *
176  * On eviction, a snapshot of this counter (along with some bits to
177  * identify the lruvec) is stored in the now empty page cache
178  * slot of the evicted page.  This is called a shadow entry.
179  *
180  * On cache misses for which there are shadow entries, an eligible
181  * refault distance will immediately activate the refaulting page.
182  */
183 
184 #define WORKINGSET_SHIFT 1
185 #define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
186 			 WORKINGSET_SHIFT + NODES_SHIFT + \
187 			 MEM_CGROUP_ID_SHIFT)
188 #define EVICTION_SHIFT_ANON	(EVICTION_SHIFT + SWAP_COUNT_SHIFT)
189 #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
190 #define EVICTION_MASK_ANON	(~0UL >> EVICTION_SHIFT_ANON)
191 
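/*
 * Resulting shadow entry layout, from least to most significant bits:
 * the workingset flag, the node ID, the memcg ID and the eviction
 * timestamp, with one bit consumed by the xarray value tag.  Anonymous
 * entries additionally reserve SWAP_COUNT_SHIFT bits, which leaves
 * them a correspondingly narrower timestamp.
 */
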
192 /*
193  * Eviction timestamps need to be able to cover the full range of
194  * actionable refaults. However, bits are tight in the xarray
195  * entry, and after storing the identifier for the lruvec there might
196  * not be enough left to represent every single actionable refault. In
197  * that case, we have to sacrifice granularity for distance, and group
198  * evictions into coarser buckets by shaving off lower timestamp bits.
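 *
 * For example, with a bucket_order of 2 the nonresident_age snapshot is
 * shifted right by two bits at eviction time and shifted back on
 * refault, so refault distances are only resolved in buckets of four
 * events.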
199  */
200 static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
201 
202 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
203 			 bool workingset, bool file)
204 {
205 	eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
206 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
207 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
208 	eviction = (eviction << WORKINGSET_SHIFT) | workingset;
209 
210 	return xa_mk_value(eviction);
211 }
212 
213 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
214 			  unsigned long *evictionp, bool *workingsetp)
215 {
216 	unsigned long entry = xa_to_value(shadow);
217 	int memcgid, nid;
218 	bool workingset;
219 
220 	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
221 	entry >>= WORKINGSET_SHIFT;
222 	nid = entry & ((1UL << NODES_SHIFT) - 1);
223 	entry >>= NODES_SHIFT;
224 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
225 	entry >>= MEM_CGROUP_ID_SHIFT;
226 
227 	*memcgidp = memcgid;
228 	*pgdat = NODE_DATA(nid);
229 	*evictionp = entry;
230 	*workingsetp = workingset;
231 }
232 
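/*
 * Illustrative round trip (a sketch, not code that runs anywhere in the
 * kernel; the variable names are made up): an eviction packs its state
 * into an xarray value, and a later refault recovers everything except
 * the timestamp bits removed by the eviction mask.  Whether the entry
 * was file or anon is not stored; the caller knows that from the
 * refaulting folio.
 *
 *	void *shadow = pack_shadow(memcgid, pgdat, eviction, ws, file);
 *
 *	unpack_shadow(shadow, &memcgid2, &pgdat2, &eviction2, &ws2);
 *	// memcgid2 == memcgid, pgdat2 == pgdat, ws2 == ws
 *	// eviction2 == eviction & (file ? EVICTION_MASK : EVICTION_MASK_ANON)
 */
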
233 #ifdef CONFIG_LRU_GEN
234 
235 static void *lru_gen_eviction(struct folio *folio)
236 {
237 	int hist;
238 	unsigned long token;
239 	unsigned long min_seq;
240 	struct lruvec *lruvec;
241 	struct lru_gen_folio *lrugen;
242 	int type = folio_is_file_lru(folio);
243 	int delta = folio_nr_pages(folio);
244 	int refs = folio_lru_refs(folio);
245 	bool workingset = folio_test_workingset(folio);
246 	int tier = lru_tier_from_refs(refs, workingset);
247 	struct mem_cgroup *memcg = folio_memcg(folio);
248 	struct pglist_data *pgdat = folio_pgdat(folio);
249 
250 	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
251 		     BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
252 
253 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
254 	lrugen = &lruvec->lrugen;
255 	min_seq = READ_ONCE(lrugen->min_seq[type]);
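	/* token layout: min_seq in the high bits, (refs - 1) in the low LRU_REFS_WIDTH bits */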
256 	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
257 
258 	hist = lru_hist_from_seq(min_seq);
259 	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
260 
261 	return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type);
262 }
263 
264 /*
265  * Tests if the shadow entry is for a folio that was recently evicted.
266  * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
267  */
268 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
269 				unsigned long *token, bool *workingset, bool file)
270 {
271 	int memcg_id;
272 	unsigned long max_seq;
273 	struct mem_cgroup *memcg;
274 	struct pglist_data *pgdat;
275 
276 	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);
277 
278 	memcg = mem_cgroup_from_private_id(memcg_id);
279 	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
280 
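	/* compare only the seq bits that survive the shadow entry's timestamp mask */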
281 	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
282 	max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;
283 
284 	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
285 }
286 
287 static void lru_gen_refault(struct folio *folio, void *shadow)
288 {
289 	bool recent;
290 	int hist, tier, refs;
291 	bool workingset;
292 	unsigned long token;
293 	struct lruvec *lruvec;
294 	struct lru_gen_folio *lrugen;
295 	int type = folio_is_file_lru(folio);
296 	int delta = folio_nr_pages(folio);
297 
298 	rcu_read_lock();
299 
300 	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
301 	if (lruvec != folio_lruvec(folio))
302 		goto unlock;
303 
304 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
305 
306 	if (!recent)
307 		goto unlock;
308 
309 	lrugen = &lruvec->lrugen;
310 
311 	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
312 	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
313 	tier = lru_tier_from_refs(refs, workingset);
314 
315 	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
316 
317 	/* see folio_add_lru() where folio_set_active() will be called */
318 	if (lru_gen_in_fault())
319 		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
320 
321 	if (workingset) {
322 		folio_set_workingset(folio);
323 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
324 	} else
325 		set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
326 unlock:
327 	rcu_read_unlock();
328 }
329 
330 #else /* !CONFIG_LRU_GEN */
331 
332 static void *lru_gen_eviction(struct folio *folio)
333 {
334 	return NULL;
335 }
336 
337 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
338 				unsigned long *token, bool *workingset, bool file)
339 {
340 	return false;
341 }
342 
343 static void lru_gen_refault(struct folio *folio, void *shadow)
344 {
345 }
346 
347 #endif /* CONFIG_LRU_GEN */
348 
349 /**
350  * workingset_age_nonresident - age non-resident entries as LRU ages
351  * @lruvec: the lruvec that was aged
352  * @nr_pages: the number of pages to count
353  *
354  * As in-memory pages are aged, non-resident pages need to be aged as
355  * well, in order for the refault distances later on to be comparable
356  * to the in-memory dimensions. This function allows reclaim and LRU
357  * operations to drive the non-resident aging along in parallel.
358  */
359 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
360 {
361 	/*
362 	 * Reclaiming a cgroup means reclaiming all its children in a
363 	 * round-robin fashion. That means that each cgroup has an LRU
364 	 * order that is composed of the LRU orders of its child
365 	 * cgroups; and every page has an LRU position not just in the
366 	 * cgroup that owns it, but in all of that group's ancestors.
367 	 *
368 	 * So when the physical inactive list of a leaf cgroup ages,
369 	 * the virtual inactive lists of all its parents, including
370 	 * the root cgroup's, age as well.
371 	 */
372 	do {
373 		atomic_long_add(nr_pages, &lruvec->nonresident_age);
374 	} while ((lruvec = parent_lruvec(lruvec)));
375 }
376 
377 /**
378  * workingset_eviction - note the eviction of a folio from memory
379  * @folio: the folio being evicted
380  * @target_memcg: the cgroup that is causing the reclaim
381  *
382  * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
383  * of the evicted @folio so that a later refault can be detected.
384  */
385 void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
386 {
387 	struct pglist_data *pgdat = folio_pgdat(folio);
388 	int file = folio_is_file_lru(folio);
389 	unsigned long eviction;
390 	struct lruvec *lruvec;
391 	int memcgid;
392 
393 	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
394 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
395 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
396 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
397 
398 	if (lru_gen_enabled())
399 		return lru_gen_eviction(folio);
400 
401 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
402 	/* XXX: target_memcg can be NULL, go through lruvec */
403 	memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
404 	eviction = atomic_long_read(&lruvec->nonresident_age);
405 	eviction >>= bucket_order[file];
406 	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
407 	return pack_shadow(memcgid, pgdat, eviction,
408 			   folio_test_workingset(folio), file);
409 }
410 
411 /**
412  * workingset_test_recent - tests if the shadow entry is for a folio that was
413  * recently evicted. Also fills in @workingset with the value unpacked from
414  * shadow.
415  * @shadow: the shadow entry to be tested.
416  * @file: whether the corresponding folio is from the file lru.
417  * @workingset: where the workingset value unpacked from shadow should
418  * be stored.
419  * @flush: whether to flush cgroup rstat.
420  *
421  * Return: true if the shadow is for a recently evicted folio; false otherwise.
422  */
423 bool workingset_test_recent(void *shadow, bool file, bool *workingset,
424 				bool flush)
425 {
426 	struct mem_cgroup *eviction_memcg;
427 	struct lruvec *eviction_lruvec;
428 	unsigned long refault_distance;
429 	unsigned long workingset_size;
430 	unsigned long refault;
431 	int memcgid;
432 	struct pglist_data *pgdat;
433 	unsigned long eviction;
434 
435 	if (lru_gen_enabled()) {
436 		bool recent;
437 
438 		rcu_read_lock();
439 		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
440 					     workingset, file);
441 		rcu_read_unlock();
442 		return recent;
443 	}
444 
445 	rcu_read_lock();
446 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
447 	eviction <<= bucket_order[file];
448 
449 	/*
450 	 * Look up the memcg associated with the stored ID. It might
451 	 * have been deleted since the folio's eviction.
452 	 *
453 	 * Note that in rare events the ID could have been recycled
454 	 * for a new cgroup that refaults a shared folio. This is
455 	 * impossible to tell from the available data. However, this
456 	 * should be a rare and limited disturbance, and activations
457 	 * are always speculative anyway. Ultimately, it's the aging
458 	 * algorithm's job to shake out the minimum access frequency
459 	 * for the active cache.
460 	 *
461 	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
462 	 * would be better if the root_mem_cgroup existed in all
463 	 * configurations instead.
464 	 */
465 	eviction_memcg = mem_cgroup_from_private_id(memcgid);
466 	if (!mem_cgroup_tryget(eviction_memcg))
467 		eviction_memcg = NULL;
468 	rcu_read_unlock();
469 
470 	if (!mem_cgroup_disabled() && !eviction_memcg)
471 		return false;
472 	/*
473 	 * Flush stats (and potentially sleep) outside the RCU read section.
474 	 *
475 	 * Note that workingset_test_recent() itself might be called in an RCU
476 	 * read section (e.g. in cachestat) - these callers need to skip flushing
477 	 * stats (via the flush argument).
478 	 *
479 	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
480 	 * still needed here?
481 	 */
482 	if (flush)
483 		mem_cgroup_flush_stats_ratelimited(eviction_memcg);
484 
485 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
486 	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
487 
488 	/*
489 	 * Calculate the refault distance
490 	 *
491 	 * The unsigned subtraction here gives an accurate distance
492 	 * across nonresident_age overflows in most cases. There is a
493 	 * special case: usually, shadow entries have a short lifetime
494 	 * and are either refaulted or reclaimed along with the inode
495 	 * before they get too old.  But it is not impossible for the
496 	 * nonresident_age to lap a shadow entry in the field, which
497 	 * can then result in a false small refault distance, leading
498 	 * to a false activation should this old entry actually
499 	 * refault again.  However, earlier kernels used to deactivate
500 	 * unconditionally with *every* reclaim invocation for the
501 	 * longest time, so the occasional inappropriate activation
502 	 * leading to pressure on the active list is not a problem.
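	 *
	 * For example, with a (hypothetical) 16-bit timestamp space, an
	 * eviction snapshot of 0xfff0 and a refault reading whose low bits
	 * are 0x0010 still yield (0x0010 - 0xfff0) & 0xffff = 0x20, the
	 * correct distance of 32.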
503 	 */
504 	refault_distance = ((refault - eviction) &
505 			    (file ? EVICTION_MASK : EVICTION_MASK_ANON));
506 
507 	/*
508 	 * Compare the distance to the existing workingset size. We
509 	 * don't activate pages that couldn't stay resident even if
510 	 * all the memory was available to the workingset. Whether
511 	 * workingset competition needs to consider anon or not depends
512 	 * on having free swap space.
513 	 */
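	/* cf. the (R - E) <= ... inequalities derived at the top of this file */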
514 	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
515 	if (!file) {
516 		workingset_size += lruvec_page_state(eviction_lruvec,
517 						     NR_INACTIVE_FILE);
518 	}
519 	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
520 		workingset_size += lruvec_page_state(eviction_lruvec,
521 						     NR_ACTIVE_ANON);
522 		if (file) {
523 			workingset_size += lruvec_page_state(eviction_lruvec,
524 						     NR_INACTIVE_ANON);
525 		}
526 	}
527 
528 	mem_cgroup_put(eviction_memcg);
529 	return refault_distance <= workingset_size;
530 }
531 
532 /**
533  * workingset_refault - Evaluate the refault of a previously evicted folio.
534  * @folio: The freshly allocated replacement folio.
535  * @shadow: Shadow entry of the evicted folio.
536  *
537  * Calculates and evaluates the refault distance of the previously
538  * evicted folio in the context of the node and the memcg whose memory
539  * pressure caused the eviction.
540  */
541 void workingset_refault(struct folio *folio, void *shadow)
542 {
543 	bool file = folio_is_file_lru(folio);
544 	struct pglist_data *pgdat;
545 	struct mem_cgroup *memcg;
546 	struct lruvec *lruvec;
547 	bool workingset;
548 	long nr;
549 
550 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
551 
552 	if (lru_gen_enabled()) {
553 		lru_gen_refault(folio, shadow);
554 		return;
555 	}
556 
557 	/*
558 	 * The activation decision for this folio is made at the level
559 	 * where the eviction occurred, as that is where the LRU order
560 	 * during folio reclaim is being determined.
561 	 *
562 	 * However, the cgroup that will own the folio is the one that
563 	 * is actually experiencing the refault event. Make sure the folio is
564 	 * locked to guarantee folio_memcg() stability throughout.
565 	 */
566 	nr = folio_nr_pages(folio);
567 	memcg = folio_memcg(folio);
568 	pgdat = folio_pgdat(folio);
569 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
570 
571 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
572 
573 	if (!workingset_test_recent(shadow, file, &workingset, true))
574 		return;
575 
576 	folio_set_active(folio);
577 	workingset_age_nonresident(lruvec, nr);
578 	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
579 
580 	/* Folio was active prior to eviction */
581 	if (workingset) {
582 		folio_set_workingset(folio);
583 		/*
584 		 * XXX: Move to folio_add_lru() when it supports new vs
585 		 * putback
586 		 */
587 		lru_note_cost_refault(folio);
588 		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
589 	}
590 }
591 
592 /**
593  * workingset_activation - note a page activation
594  * @folio: Folio that is being activated.
595  */
596 void workingset_activation(struct folio *folio)
597 {
598 	/*
599 	 * Filter non-memcg pages here, e.g. unmap can call
600 	 * mark_page_accessed() on VDSO pages.
601 	 */
602 	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
603 		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
604 }
605 
606 /*
607  * Shadow entries reflect the share of the working set that does not
608  * fit into memory, so their number depends on the access pattern of
609  * the workload.  In most cases, they will refault or get reclaimed
610  * along with the inode, but a (malicious) workload that streams
611  * through files with a total size several times that of available
612  * memory, while preventing the inodes from being reclaimed, can
613  * create excessive amounts of shadow nodes.  To keep a lid on this,
614  * track shadow nodes and reclaim them when they grow way past the
615  * point where they would still be useful.
616  */
617 
618 struct list_lru shadow_nodes;
619 
620 void workingset_update_node(struct xa_node *node)
621 {
622 	struct page *page = virt_to_page(node);
623 
624 	/*
625 	 * Track non-empty nodes that contain only shadow entries;
626 	 * unlink those that contain pages or are being freed.
627 	 *
628 	 * Avoid acquiring the list_lru lock when the nodes are
629 	 * already where they should be. The list_empty() test is safe
630 	 * as node->private_list is protected by the i_pages lock.
631 	 */
632 	lockdep_assert_held(&node->array->xa_lock);
633 
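	/* count == nr_values means every present slot is a shadow (value) entry */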
634 	if (node->count && node->count == node->nr_values) {
635 		if (list_empty(&node->private_list)) {
636 			list_lru_add_obj(&shadow_nodes, &node->private_list);
637 			__inc_node_page_state(page, WORKINGSET_NODES);
638 		}
639 	} else {
640 		if (!list_empty(&node->private_list)) {
641 			list_lru_del_obj(&shadow_nodes, &node->private_list);
642 			__dec_node_page_state(page, WORKINGSET_NODES);
643 		}
644 	}
645 }
646 
647 static unsigned long count_shadow_nodes(struct shrinker *shrinker,
648 					struct shrink_control *sc)
649 {
650 	unsigned long max_nodes;
651 	unsigned long nodes;
652 	unsigned long pages;
653 
654 	nodes = list_lru_shrink_count(&shadow_nodes, sc);
655 	if (!nodes)
656 		return SHRINK_EMPTY;
657 
658 	/*
659 	 * Approximate a reasonable limit for the nodes
660 	 * containing shadow entries. We don't need to keep more
661 	 * shadow entries than possible pages on the active list,
662 	 * since refault distances bigger than that are dismissed.
663 	 *
664 	 * The size of the active list converges toward 100% of
665 	 * overall page cache as memory grows, with only a tiny
666 	 * inactive list. Assume the total cache size for that.
667 	 *
668 	 * Nodes might be sparsely populated, with only one shadow
669 	 * entry in the extreme case. Obviously, we cannot keep one
670 	 * node for every eligible shadow entry, so compromise on a
671 	 * worst-case density of 1/8th. Below that, not all eligible
672 	 * refaults can be detected anymore.
673 	 *
674 	 * On 64-bit with 7 xa_nodes per page and 64 slots
675 	 * each, this will reclaim shadow entries when they consume
676 	 * ~1.8% of available memory:
677 	 *
678 	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
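	 *
	 * i.e. one node allowed per 8 pages, each node occupying 1/7th of
	 * a page: 1/8 * 1/7 = 1/56 ~= 1.8%.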
679 	 */
680 #ifdef CONFIG_MEMCG
681 	if (sc->memcg) {
682 		struct lruvec *lruvec;
683 		int i;
684 
685 		mem_cgroup_flush_stats_ratelimited(sc->memcg);
686 		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
687 		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
688 			pages += lruvec_page_state_local(lruvec,
689 							 NR_LRU_BASE + i);
690 		pages += lruvec_page_state_local(
691 			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
692 		pages += lruvec_page_state_local(
693 			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
694 	} else
695 #endif
696 		pages = node_present_pages(sc->nid);
697 
698 	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
699 
700 	if (nodes <= max_nodes)
701 		return 0;
702 	return nodes - max_nodes;
703 }
704 
705 static enum lru_status shadow_lru_isolate(struct list_head *item,
706 					  struct list_lru_one *lru,
707 					  void *arg) __must_hold(lru->lock)
708 {
709 	struct xa_node *node = container_of(item, struct xa_node, private_list);
710 	struct address_space *mapping;
711 	int ret;
712 
713 	/*
714 	 * Page cache insertions and deletions synchronously maintain
715 	 * the shadow node LRU under the i_pages lock and the
716 	 * &lru->lock. Because the page cache tree is emptied before
717 	 * the inode can be destroyed, holding the &lru->lock pins any
718 	 * address_space that has nodes on the LRU.
719 	 *
720 	 * We can then safely transition to the i_pages lock to
721 	 * pin only the address_space of the particular node we want
722 	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
723 	 */
724 
725 	mapping = container_of(node->array, struct address_space, i_pages);
726 
727 	/* Coming from the list, invert the lock order */
728 	if (!xa_trylock(&mapping->i_pages)) {
729 		spin_unlock_irq(&lru->lock);
730 		ret = LRU_RETRY;
731 		goto out;
732 	}
733 
734 	/* For page cache we need to hold i_lock */
735 	if (mapping->host != NULL) {
736 		if (!spin_trylock(&mapping->host->i_lock)) {
737 			xa_unlock(&mapping->i_pages);
738 			spin_unlock_irq(&lru->lock);
739 			ret = LRU_RETRY;
740 			goto out;
741 		}
742 	}
743 
744 	list_lru_isolate(lru, item);
745 	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
746 
747 	spin_unlock(&lru->lock);
748 
749 	/*
750 	 * The nodes should only contain one or more shadow entries,
751 	 * no pages, so we expect to be able to remove them all and
752 	 * delete and free the empty node afterwards.
753 	 */
754 	if (WARN_ON_ONCE(!node->nr_values))
755 		goto out_invalid;
756 	if (WARN_ON_ONCE(node->count != node->nr_values))
757 		goto out_invalid;
758 	xa_delete_node(node, workingset_update_node);
759 	mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
760 
761 out_invalid:
762 	xa_unlock_irq(&mapping->i_pages);
763 	if (mapping->host != NULL) {
764 		if (mapping_shrinkable(mapping))
765 			inode_lru_list_add(mapping->host);
766 		spin_unlock(&mapping->host->i_lock);
767 	}
768 	ret = LRU_REMOVED_RETRY;
769 out:
770 	cond_resched();
771 	return ret;
772 }
773 
774 static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
775 				       struct shrink_control *sc)
776 {
777 	/* list_lru lock nests inside the IRQ-safe i_pages lock */
778 	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
779 					NULL);
780 }
781 
782 /*
783  * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
784  * i_pages lock.
785  */
786 static struct lock_class_key shadow_nodes_key;
787 
788 static int __init workingset_init(void)
789 {
790 	unsigned int timestamp_bits, timestamp_bits_anon;
791 	struct shrinker *workingset_shadow_shrinker;
792 	unsigned int max_order;
793 	int ret = -ENOMEM;
794 
795 	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
796 	/*
797 	 * Calculate the eviction bucket size to cover the longest
798 	 * actionable refault distance, which is currently half of
799 	 * memory (totalram_pages/2). However, memory hotplug may add
800 	 * some more pages at runtime, so keep working with up to
801 	 * double the initial memory by using totalram_pages as-is.
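	 *
	 * For example, if timestamp_bits came out as 36 on a machine with
	 * 2^38 pages, bucket_order would be 38 - 36 = 2 and evictions would
	 * be grouped into buckets of four (hypothetical numbers; the real
	 * widths depend on the shift values configured above).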
802 	 */
803 	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
804 	timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
805 	max_order = fls_long(totalram_pages() - 1);
806 	if (max_order > timestamp_bits)
807 		bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
808 	if (max_order > timestamp_bits_anon)
809 		bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
810 	pr_info("workingset: timestamp_bits=%u (anon: %u) max_order=%u bucket_order=%u (anon: %u)\n",
811 		timestamp_bits, timestamp_bits_anon, max_order,
812 		bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
813 
814 	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
815 						    SHRINKER_MEMCG_AWARE,
816 						    "mm-shadow");
817 	if (!workingset_shadow_shrinker)
818 		goto err;
819 
820 	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
821 				      &shadow_nodes_key);
822 	if (ret)
823 		goto err_list_lru;
824 
825 	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
826 	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
827 	/* ->count reports only fully expendable nodes */
828 	workingset_shadow_shrinker->seeks = 0;
829 
830 	shrinker_register(workingset_shadow_shrinker);
831 	return 0;
832 err_list_lru:
833 	shrinker_free(workingset_shadow_shrinker);
834 err:
835 	return ret;
836 }
837 module_init(workingset_init);
838