xref: /linux/mm/shrinker.c (revision 019fc36872374db6fd35e118c9e935374404bfbf)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/memcontrol.h>
3 #include <linux/rwsem.h>
4 #include <linux/shrinker.h>
5 #include <linux/rculist.h>
6 #include <trace/events/vmscan.h>
7 
8 #include "internal.h"
9 
10 LIST_HEAD(shrinker_list);
11 DEFINE_MUTEX(shrinker_mutex);
12 
13 #ifdef CONFIG_MEMCG
14 static int shrinker_nr_max;
15 
16 static inline int shrinker_unit_size(int nr_items)
17 {
18 	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
19 }
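
/*
 * For illustration, assuming SHRINKER_UNIT_BITS == BITS_PER_LONG (64 on a
 * 64-bit kernel): tracking 128 shrinker ids takes DIV_ROUND_UP(128, 64) == 2
 * unit pointers, so shrinker_unit_size(128) == 16 bytes appended to the
 * flexible array in struct shrinker_info.
 */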
20 
21 static inline void shrinker_unit_free(struct shrinker_info *info, int start)
22 {
23 	struct shrinker_info_unit **unit;
24 	int nr, i;
25 
26 	if (!info)
27 		return;
28 
29 	unit = info->unit;
30 	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
31 
32 	for (i = start; i < nr; i++) {
33 		if (!unit[i])
34 			break;
35 
36 		kfree(unit[i]);
37 		unit[i] = NULL;
38 	}
39 }
40 
41 static inline int shrinker_unit_alloc(struct shrinker_info *new,
42 				       struct shrinker_info *old, int nid)
43 {
44 	struct shrinker_info_unit *unit;
45 	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
46 	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
47 	int i;
48 
49 	for (i = start; i < nr; i++) {
50 		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
51 		if (!unit) {
52 			shrinker_unit_free(new, start);
53 			return -ENOMEM;
54 		}
55 
56 		new->unit[i] = unit;
57 	}
58 
59 	return 0;
60 }
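
/*
 * shrinker_unit_alloc() only fills slots from @start onwards: with old == NULL
 * (a brand-new shrinker_info) every slot is populated, while the call from
 * expand_one_shrinker_info() below only allocates the newly added tail slots,
 * since the pointers to the existing units are copied over by memcpy().
 */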
61 
62 void free_shrinker_info(struct mem_cgroup *memcg)
63 {
64 	struct mem_cgroup_per_node *pn;
65 	struct shrinker_info *info;
66 	int nid;
67 
68 	for_each_node(nid) {
69 		pn = memcg->nodeinfo[nid];
70 		info = rcu_dereference_protected(pn->shrinker_info, true);
71 		shrinker_unit_free(info, 0);
72 		kvfree(info);
73 		rcu_assign_pointer(pn->shrinker_info, NULL);
74 	}
75 }
76 
77 int alloc_shrinker_info(struct mem_cgroup *memcg)
78 {
79 	int nid, ret = 0;
80 	int array_size = 0;
81 
82 	mutex_lock(&shrinker_mutex);
83 	array_size = shrinker_unit_size(shrinker_nr_max);
84 	for_each_node(nid) {
85 		struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
86 							   GFP_KERNEL, nid);
87 		if (!info)
88 			goto err;
89 		info->map_nr_max = shrinker_nr_max;
90 		if (shrinker_unit_alloc(info, NULL, nid)) {
91 			kvfree(info);
92 			goto err;
93 		}
94 		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
95 	}
96 	mutex_unlock(&shrinker_mutex);
97 
98 	return ret;
99 
100 err:
101 	mutex_unlock(&shrinker_mutex);
102 	free_shrinker_info(memcg);
103 	return -ENOMEM;
104 }
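
/*
 * A shrinker_info is allocated for each node when a memcg comes online (see
 * mem_cgroup_css_online() in mm/memcontrol.c), sized for the current
 * shrinker_nr_max; later shrinker registrations grow it through
 * expand_shrinker_info() below.
 */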
105 
106 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
107 						     int nid)
108 {
109 	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
110 					 lockdep_is_held(&shrinker_mutex));
111 }
112 
113 static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
114 				    int old_size, int new_nr_max)
115 {
116 	struct shrinker_info *new, *old;
117 	struct mem_cgroup_per_node *pn;
118 	int nid;
119 
120 	for_each_node(nid) {
121 		pn = memcg->nodeinfo[nid];
122 		old = shrinker_info_protected(memcg, nid);
123 		/* Not yet online memcg */
124 		if (!old)
125 			return 0;
126 
127 		/* Already expanded this shrinker_info */
128 		if (new_nr_max <= old->map_nr_max)
129 			continue;
130 
131 		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
132 		if (!new)
133 			return -ENOMEM;
134 
135 		new->map_nr_max = new_nr_max;
136 
137 		memcpy(new->unit, old->unit, old_size);
138 		if (shrinker_unit_alloc(new, old, nid)) {
139 			kvfree(new);
140 			return -ENOMEM;
141 		}
142 
143 		rcu_assign_pointer(pn->shrinker_info, new);
144 		kvfree_rcu(old, rcu);
145 	}
146 
147 	return 0;
148 }
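
/*
 * Only struct shrinker_info itself is replaced and freed via RCU here; the
 * shrinker_info_unit objects are carried over by the memcpy() of the pointer
 * array, which is why shrink_slab_memcg() may keep using a unit after it has
 * dropped the RCU lock.
 */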
149 
150 static int expand_shrinker_info(int new_id)
151 {
152 	int ret = 0;
153 	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
154 	int new_size, old_size = 0;
155 	struct mem_cgroup *memcg;
156 
157 	if (!root_mem_cgroup)
158 		goto out;
159 
160 	lockdep_assert_held(&shrinker_mutex);
161 
162 	new_size = shrinker_unit_size(new_nr_max);
163 	old_size = shrinker_unit_size(shrinker_nr_max);
164 
165 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
166 	do {
167 		ret = expand_one_shrinker_info(memcg, new_size, old_size,
168 					       new_nr_max);
169 		if (ret) {
170 			mem_cgroup_iter_break(NULL, memcg);
171 			goto out;
172 		}
173 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
174 out:
175 	if (!ret)
176 		shrinker_nr_max = new_nr_max;
177 
178 	return ret;
179 }
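
/*
 * For illustration, assuming SHRINKER_UNIT_BITS == 64: when shrinker id 64 is
 * allocated, new_nr_max becomes round_up(65, 64) == 128, so each per-node
 * shrinker_info grows from one shrinker_info_unit pointer to two.
 */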
180 
181 static inline int shrinker_id_to_index(int shrinker_id)
182 {
183 	return shrinker_id / SHRINKER_UNIT_BITS;
184 }
185 
186 static inline int shrinker_id_to_offset(int shrinker_id)
187 {
188 	return shrinker_id % SHRINKER_UNIT_BITS;
189 }
190 
191 static inline int calc_shrinker_id(int index, int offset)
192 {
193 	return index * SHRINKER_UNIT_BITS + offset;
194 }
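
/*
 * For illustration, assuming SHRINKER_UNIT_BITS == 64: shrinker id 70 maps to
 * unit index 70 / 64 == 1 and bit offset 70 % 64 == 6, and
 * calc_shrinker_id(1, 6) == 1 * 64 + 6 == 70 recovers the id.
 */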
195 
196 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
197 {
198 	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
199 		struct shrinker_info *info;
200 		struct shrinker_info_unit *unit;
201 
202 		rcu_read_lock();
203 		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
204 		unit = info->unit[shrinker_id_to_index(shrinker_id)];
205 		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
206 			/* Pairs with smp_mb__after_atomic() in shrink_slab_memcg() */
207 			smp_mb__before_atomic();
208 			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
209 		}
210 		rcu_read_unlock();
211 	}
212 }
213 
214 static DEFINE_IDR(shrinker_idr);
215 
216 static int shrinker_memcg_alloc(struct shrinker *shrinker)
217 {
218 	int id, ret = -ENOMEM;
219 
220 	if (mem_cgroup_disabled())
221 		return -ENOSYS;
222 	if (mem_cgroup_kmem_disabled() && !(shrinker->flags & SHRINKER_NONSLAB))
223 		return -ENOSYS;
224 
225 	mutex_lock(&shrinker_mutex);
226 	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
227 	if (id < 0)
228 		goto unlock;
229 
230 	if (id >= shrinker_nr_max) {
231 		if (expand_shrinker_info(id)) {
232 			idr_remove(&shrinker_idr, id);
233 			goto unlock;
234 		}
235 	}
236 	shrinker->id = id;
237 	ret = 0;
238 unlock:
239 	mutex_unlock(&shrinker_mutex);
240 	return ret;
241 }
242 
243 static void shrinker_memcg_remove(struct shrinker *shrinker)
244 {
245 	int id = shrinker->id;
246 
247 	BUG_ON(id < 0);
248 
249 	lockdep_assert_held(&shrinker_mutex);
250 
251 	idr_remove(&shrinker_idr, id);
252 }
253 
254 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
255 				   struct mem_cgroup *memcg)
256 {
257 	struct shrinker_info *info;
258 	struct shrinker_info_unit *unit;
259 	long nr_deferred;
260 
261 	rcu_read_lock();
262 	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
263 	unit = info->unit[shrinker_id_to_index(shrinker->id)];
264 	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
265 	rcu_read_unlock();
266 
267 	return nr_deferred;
268 }
269 
270 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
271 				  struct mem_cgroup *memcg)
272 {
273 	struct shrinker_info *info;
274 	struct shrinker_info_unit *unit;
275 	long nr_deferred;
276 
277 	rcu_read_lock();
278 	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
279 	unit = info->unit[shrinker_id_to_index(shrinker->id)];
280 	nr_deferred =
281 		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
282 	rcu_read_unlock();
283 
284 	return nr_deferred;
285 }
286 
287 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
288 {
289 	int nid, index, offset;
290 	long nr;
291 	struct mem_cgroup *parent;
292 	struct shrinker_info *child_info, *parent_info;
293 	struct shrinker_info_unit *child_unit, *parent_unit;
294 
295 	parent = parent_mem_cgroup(memcg);
296 	if (!parent)
297 		parent = root_mem_cgroup;
298 
299 	/* Prevent concurrent shrinker_info expansion */
300 	mutex_lock(&shrinker_mutex);
301 	for_each_node(nid) {
302 		child_info = shrinker_info_protected(memcg, nid);
303 		parent_info = shrinker_info_protected(parent, nid);
304 		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
305 			child_unit = child_info->unit[index];
306 			parent_unit = parent_info->unit[index];
307 			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
308 				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
309 				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
310 			}
311 		}
312 	}
313 	mutex_unlock(&shrinker_mutex);
314 }
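
/*
 * reparent_shrinker_deferred() is meant to run when a memcg goes offline (see
 * mem_cgroup_css_offline() in mm/memcontrol.c), so that the deferred scan
 * counts the child accumulated are charged to its parent rather than lost.
 */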
315 #else
316 static int shrinker_memcg_alloc(struct shrinker *shrinker)
317 {
318 	return -ENOSYS;
319 }
320 
321 static void shrinker_memcg_remove(struct shrinker *shrinker)
322 {
323 }
324 
325 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
326 				   struct mem_cgroup *memcg)
327 {
328 	return 0;
329 }
330 
331 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
332 				  struct mem_cgroup *memcg)
333 {
334 	return 0;
335 }
336 #endif /* CONFIG_MEMCG */
337 
338 static long xchg_nr_deferred(struct shrinker *shrinker,
339 			     struct shrink_control *sc)
340 {
341 	int nid = sc->nid;
342 
343 	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
344 		nid = 0;
345 
346 	if (sc->memcg &&
347 	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
348 		return xchg_nr_deferred_memcg(nid, shrinker,
349 					      sc->memcg);
350 
351 	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
352 }
353 
354 
355 static long add_nr_deferred(long nr, struct shrinker *shrinker,
356 			    struct shrink_control *sc)
357 {
358 	int nid = sc->nid;
359 
360 	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
361 		nid = 0;
362 
363 	if (sc->memcg &&
364 	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
365 		return add_nr_deferred_memcg(nr, nid, shrinker,
366 					     sc->memcg);
367 
368 	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
369 }
370 
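/*
 * Default number of objects fed to ->scan_objects() per call when a shrinker
 * does not set ->batch.
 */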
371 #define SHRINK_BATCH 128
372 
373 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
374 				    struct shrinker *shrinker, int priority)
375 {
376 	unsigned long freed = 0;
377 	unsigned long long delta;
378 	long total_scan;
379 	long freeable;
380 	long nr;
381 	long new_nr;
382 	long batch_size = shrinker->batch ? shrinker->batch
383 					  : SHRINK_BATCH;
384 	long scanned = 0, next_deferred;
385 
386 	freeable = shrinker->count_objects(shrinker, shrinkctl);
387 	if (freeable == 0 || freeable == SHRINK_EMPTY)
388 		return freeable;
389 
390 	/*
391 	 * copy the current shrinker scan count into a local variable
392 	 * and zero it so that other concurrent shrinker invocations
393 	 * don't also do this scanning work.
394 	 */
395 	nr = xchg_nr_deferred(shrinker, shrinkctl);
396 
397 	if (shrinker->seeks) {
398 		delta = freeable >> priority;
399 		delta *= 4;
400 		do_div(delta, shrinker->seeks);
401 	} else {
402 		/*
403 		 * These objects don't require any IO to create. Trim
404 		 * them aggressively under memory pressure to keep
405 		 * them from causing refetches in the IO caches.
406 		 */
407 		delta = freeable / 2;
408 	}
409 
410 	total_scan = nr >> priority;
411 	total_scan += delta;
412 	total_scan = min(total_scan, (2 * freeable));
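	/*
	 * Worked example, assuming DEFAULT_SEEKS (2) and a low-pressure
	 * priority of 12: with freeable == 10000 and nr == 0, delta ==
	 * (10000 >> 12) * 4 / 2 == 4 and total_scan == 4, so only a handful
	 * of objects are scanned; at priority 0 up to 2 * freeable objects
	 * become eligible.
	 */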
413 
414 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
415 				   freeable, delta, total_scan, priority,
416 				   shrinkctl->memcg);
417 
418 	/*
419 	 * Normally, we should not scan less than batch_size objects in one
420 	 * pass to avoid too frequent shrinker calls, but if the slab has less
421 	 * than batch_size objects in total and we are really tight on memory,
422 	 * we will try to reclaim all available objects, otherwise we can end
423 	 * up failing allocations although there are plenty of reclaimable
424 	 * objects spread over several slabs with usage less than the
425 	 * batch_size.
426 	 *
427 	 * We detect the "tight on memory" situations by looking at the total
428 	 * number of objects we want to scan (total_scan). If it is greater
429 	 * than the total number of objects on slab (freeable), we must be
430 	 * scanning at high prio and therefore should try to reclaim as much as
431 	 * possible.
432 	 */
433 	while (total_scan >= batch_size ||
434 	       total_scan >= freeable) {
435 		unsigned long ret;
436 		unsigned long nr_to_scan = min(batch_size, total_scan);
437 
438 		shrinkctl->nr_to_scan = nr_to_scan;
439 		shrinkctl->nr_scanned = nr_to_scan;
440 		ret = shrinker->scan_objects(shrinker, shrinkctl);
441 		if (ret == SHRINK_STOP)
442 			break;
443 		freed += ret;
444 
445 		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
446 		total_scan -= shrinkctl->nr_scanned;
447 		scanned += shrinkctl->nr_scanned;
448 
449 		cond_resched();
450 	}
451 
452 	/*
453 	 * The deferred work is increased by any new work (delta) that wasn't
454 	 * done, decreased by old deferred work that was done now.
455 	 *
456 	 * And it is capped to two times of the freeable items.
457 	 */
458 	next_deferred = max_t(long, (nr + delta - scanned), 0);
459 	next_deferred = min(next_deferred, (2 * freeable));
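	/*
	 * For example, if nr == 100 deferred objects were carried in, delta ==
	 * 4 new work was added and scanned == 64 objects were scanned above,
	 * then next_deferred == 100 + 4 - 64 == 40 is handed back (capped at
	 * 2 * freeable) for a later invocation to pick up.
	 */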
460 
461 	/*
462 	 * move the unused scan count back into the shrinker in a
463 	 * manner that handles concurrent updates.
464 	 */
465 	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
466 
467 	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan,
468 				 shrinkctl->memcg);
469 	return freed;
470 }
471 
472 #ifdef CONFIG_MEMCG
473 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
474 			struct mem_cgroup *memcg, int priority)
475 {
476 	struct shrinker_info *info;
477 	unsigned long ret, freed = 0;
478 	int offset, index = 0;
479 
480 	if (!mem_cgroup_online(memcg))
481 		return 0;
482 
483 	/*
484 	 * lockless algorithm of memcg shrink.
485 	 *
486 	 * The shrinker_info may be freed asynchronously via RCU in the
487 	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
488 	 * to ensure the existence of the shrinker_info.
489 	 *
490 	 * The shrinker_info_unit is never freed unless its corresponding memcg
491 	 * is destroyed. Here we already hold the refcount of memcg, so the
492 	 * memcg will not be destroyed, and of course shrinker_info_unit will
493 	 * not be freed.
494 	 *
495 	 * So in the memcg shrink:
496 	 *  step 1: use rcu_read_lock() to guarantee existence of the
497 	 *          shrinker_info.
498 	 *  step 2: after getting shrinker_info_unit we can safely release the
499 	 *          RCU lock.
500 	 *  step 3: traverse the bitmap and calculate shrinker_id
501 	 *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
502 	 *  step 5: use shrinker_id to find the shrinker, then use
503 	 *          shrinker_try_get() to guarantee existence of the shrinker,
504 	 *          then we can release the RCU lock and call do_shrink_slab(),
505 	 *          which may sleep.
506 	 *  step 6: do shrinker_put() paired with step 5 to put the refcount,
507 	 *          if the refcount reaches 0, then wake up the waiter in
508 	 *          shrinker_free() by calling complete().
509 	 *          Note: unlike the global shrink, we don't need to reacquire
510 	 *                the RCU lock here to guarantee existence of the
511 	 *                shrinker, because we don't need this shrinker to
512 	 *                traverse to the next shrinker in the bitmap.
513 	 *  step 7: since we have already exited the RCU read-side critical
514 	 *          section before calling do_shrink_slab(), the shrinker_info
515 	 *          may have been released by expand_one_shrinker_info(), so go
516 	 *          back to step 1 to reacquire the shrinker_info.
517 	 */
518 again:
519 	rcu_read_lock();
520 	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
521 	if (unlikely(!info))
522 		goto unlock;
523 
524 	if (index < shrinker_id_to_index(info->map_nr_max)) {
525 		struct shrinker_info_unit *unit;
526 
527 		unit = info->unit[index];
528 
529 		rcu_read_unlock();
530 
531 		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
532 			struct shrink_control sc = {
533 				.gfp_mask = gfp_mask,
534 				.nid = nid,
535 				.memcg = memcg,
536 			};
537 			struct shrinker *shrinker;
538 			int shrinker_id = calc_shrinker_id(index, offset);
539 
540 			rcu_read_lock();
541 			shrinker = idr_find(&shrinker_idr, shrinker_id);
542 			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
543 				clear_bit(offset, unit->map);
544 				rcu_read_unlock();
545 				continue;
546 			}
547 			rcu_read_unlock();
548 
549 			/* Call non-slab shrinkers even though kmem is disabled */
550 			if (!memcg_kmem_online() &&
551 			    !(shrinker->flags & SHRINKER_NONSLAB)) {
552 				clear_bit(offset, unit->map);
553 				shrinker_put(shrinker);
554 				continue;
555 			}
556 
557 			ret = do_shrink_slab(&sc, shrinker, priority);
558 			if (ret == SHRINK_EMPTY) {
559 				clear_bit(offset, unit->map);
560 				/*
561 				 * After the shrinker reported that it had no objects to
562 				 * free, but before we cleared the corresponding bit in
563 				 * the memcg shrinker map, a new object might have been
564 				 * added. To make sure we keep the bit set in this
565 				 * case, we invoke the shrinker one more time and set
566 				 * the bit again if it reports that it is not empty anymore.
567 				 * The memory barrier here pairs with the barrier in
568 				 * set_shrinker_bit():
569 				 *
570 				 * list_lru_add()     shrink_slab_memcg()
571 				 *   list_add_tail()    clear_bit()
572 				 *   <MB>               <MB>
573 				 *   set_bit()          do_shrink_slab()
574 				 */
575 				smp_mb__after_atomic();
576 				ret = do_shrink_slab(&sc, shrinker, priority);
577 				if (ret == SHRINK_EMPTY)
578 					ret = 0;
579 				else
580 					set_shrinker_bit(memcg, nid, shrinker_id);
581 			}
582 			freed += ret;
583 			shrinker_put(shrinker);
584 		}
585 
586 		index++;
587 		goto again;
588 	}
589 unlock:
590 	rcu_read_unlock();
591 	return freed;
592 }
593 #else /* !CONFIG_MEMCG */
594 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
595 			struct mem_cgroup *memcg, int priority)
596 {
597 	return 0;
598 }
599 #endif /* CONFIG_MEMCG */
600 
601 /**
602  * shrink_slab - shrink slab caches
603  * @gfp_mask: allocation context
604  * @nid: node whose slab caches to target
605  * @memcg: memory cgroup whose slab caches to target
606  * @priority: the reclaim priority
607  *
608  * Call the shrink functions to age shrinkable caches.
609  *
610  * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
611  * unaware shrinkers will receive a node id of 0 instead.
612  *
613  * @memcg specifies the memory cgroup to target. Unaware shrinkers
614  * are called only if it is the root cgroup.
615  *
616  * @priority is sc->priority; the number of freeable objects is shifted right
617  * by @priority to derive the scan target.
618  *
619  * Returns the number of reclaimed slab objects.
620  */
621 unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
622 			  int priority)
623 {
624 	unsigned long ret, freed = 0;
625 	struct shrinker *shrinker;
626 
627 	/*
628 	 * The root memcg might be allocated even though memcg is disabled
629 	 * via "cgroup_disable=memory" boot parameter.  This could make
630 	 * mem_cgroup_is_root() return false, in which case only the memcg
631 	 * slab shrink would run and the global shrink would be skipped,
632 	 * possibly resulting in premature OOM.
633 	 */
634 	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
635 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
636 
637 	/*
638 	 * lockless algorithm of global shrink.
639 	 *
640 	 * In the unregistration step, the shrinker will be freed asynchronously
641 	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
642 	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
643 	 *
644 	 * So in the global shrink:
645 	 *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
646 	 *          and the validity of the shrinker_list walk.
647 	 *  step 2: use shrinker_try_get() to try get the refcount, if successful,
648 	 *          then the existence of the shrinker can also be guaranteed,
649 	 *          so we can release the RCU lock and call do_shrink_slab(),
650 	 *          which may sleep.
651 	 *  step 3: *MUST* reacquire the RCU lock before calling shrinker_put(),
652 	 *          which ensures that neither this shrinker nor the next shrinker
653 	 *          will be freed in the next traversal operation.
654 	 *  step 4: do shrinker_put() paired with step 2 to put the refcount,
655 	 *          if the refcount reaches 0, then wake up the waiter in
656 	 *          shrinker_free() by calling complete().
657 	 */
658 	rcu_read_lock();
659 	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
660 		struct shrink_control sc = {
661 			.gfp_mask = gfp_mask,
662 			.nid = nid,
663 			.memcg = memcg,
664 		};
665 
666 		if (!shrinker_try_get(shrinker))
667 			continue;
668 
669 		rcu_read_unlock();
670 
671 		ret = do_shrink_slab(&sc, shrinker, priority);
672 		if (ret == SHRINK_EMPTY)
673 			ret = 0;
674 		freed += ret;
675 
676 		rcu_read_lock();
677 		shrinker_put(shrinker);
678 	}
679 
680 	rcu_read_unlock();
681 	cond_resched();
682 	return freed;
683 }
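
/*
 * shrink_slab() is the entry point used by the reclaim paths (for example
 * shrink_node_memcgs() and drop_slab()) to age the shrinkable caches of a
 * given node/memcg at the current reclaim priority.
 */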
684 
685 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
686 {
687 	struct shrinker *shrinker;
688 	unsigned int size;
689 	va_list ap;
690 	int err;
691 
692 	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
693 	if (!shrinker)
694 		return NULL;
695 
696 	va_start(ap, fmt);
697 	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
698 	va_end(ap);
699 	if (err)
700 		goto err_name;
701 
702 	shrinker->flags = flags | SHRINKER_ALLOCATED;
703 	shrinker->seeks = DEFAULT_SEEKS;
704 
705 	if (flags & SHRINKER_MEMCG_AWARE) {
706 		err = shrinker_memcg_alloc(shrinker);
707 		if (err == -ENOSYS) {
708 			/* Memcg is not supported, fall back to a non-memcg-aware shrinker. */
709 			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
710 			goto non_memcg;
711 		}
712 
713 		if (err)
714 			goto err_flags;
715 
716 		return shrinker;
717 	}
718 
719 non_memcg:
720 	/*
721 	 * The nr_deferred counters are maintained per memcg for memcg-aware
722 	 * shrinkers, so only allocate nr_deferred here in the following cases:
723 	 *  - non-memcg-aware shrinkers
724 	 *  - !CONFIG_MEMCG
725 	 *  - memcg is disabled by kernel command line
726 	 *  - non-slab shrinkers: when memcg kmem is disabled
727 	 */
728 	size = sizeof(*shrinker->nr_deferred);
729 	if (flags & SHRINKER_NUMA_AWARE)
730 		size *= nr_node_ids;
731 
732 	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
733 	if (!shrinker->nr_deferred)
734 		goto err_flags;
735 
736 	return shrinker;
737 
738 err_flags:
739 	shrinker_debugfs_name_free(shrinker);
740 err_name:
741 	kfree(shrinker);
742 	return NULL;
743 }
744 EXPORT_SYMBOL_GPL(shrinker_alloc);
745 
746 void shrinker_register(struct shrinker *shrinker)
747 {
748 	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
749 		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker\n");
750 		return;
751 	}
752 
753 	mutex_lock(&shrinker_mutex);
754 	list_add_tail_rcu(&shrinker->list, &shrinker_list);
755 	shrinker->flags |= SHRINKER_REGISTERED;
756 	shrinker_debugfs_add(shrinker);
757 	mutex_unlock(&shrinker_mutex);
758 
759 	init_completion(&shrinker->done);
760 	/*
761 	 * Now the shrinker is fully set up; take the first reference to it to
762 	 * indicate that lookup operations are now allowed to use it via
763 	 * shrinker_try_get().
764 	 */
765 	refcount_set(&shrinker->refcount, 1);
766 }
767 EXPORT_SYMBOL_GPL(shrinker_register);
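
/*
 * Typical usage sketch for a hypothetical "foo" cache (error handling and the
 * foo_* callbacks are illustrative only):
 *
 *	struct shrinker *s;
 *
 *	s = shrinker_alloc(SHRINKER_NUMA_AWARE, "foo-cache");
 *	if (!s)
 *		return -ENOMEM;
 *	s->count_objects = foo_count_objects;
 *	s->scan_objects = foo_scan_objects;
 *	s->private_data = foo;
 *	shrinker_register(s);
 *
 * and later shrinker_free(s) tears it down again.
 */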
768 
769 static void shrinker_free_rcu_cb(struct rcu_head *head)
770 {
771 	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
772 
773 	kfree(shrinker->nr_deferred);
774 	kfree(shrinker);
775 }
776 
777 void shrinker_free(struct shrinker *shrinker)
778 {
779 	struct dentry *debugfs_entry = NULL;
780 	int debugfs_id;
781 
782 	if (!shrinker)
783 		return;
784 
785 	if (shrinker->flags & SHRINKER_REGISTERED) {
786 		/* drop the initial refcount */
787 		shrinker_put(shrinker);
788 		/*
789 		 * Wait for all lookups of the shrinker to complete. After that,
790 		 * no one is running or will run this shrinker again, so we can
791 		 * safely free it asynchronously via RCU and also safely free the
792 		 * structure that embeds the shrinker, such as a super_block.
793 		 */
794 		wait_for_completion(&shrinker->done);
795 	}
796 
797 	mutex_lock(&shrinker_mutex);
798 	if (shrinker->flags & SHRINKER_REGISTERED) {
799 		/*
800 		 * Now we can safely remove it from the shrinker_list and then
801 		 * free it.
802 		 */
803 		list_del_rcu(&shrinker->list);
804 		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
805 		shrinker->flags &= ~SHRINKER_REGISTERED;
806 	}
807 
808 	shrinker_debugfs_name_free(shrinker);
809 
810 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
811 		shrinker_memcg_remove(shrinker);
812 	mutex_unlock(&shrinker_mutex);
813 
814 	if (debugfs_entry)
815 		shrinker_debugfs_remove(debugfs_entry, debugfs_id);
816 
817 	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
818 }
819 EXPORT_SYMBOL_GPL(shrinker_free);
820