xref: /linux/mm/shrinker.c (revision cf79f291f985662150363b4a93d16f88f12643bc)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/memcontrol.h>
3 #include <linux/rwsem.h>
4 #include <linux/shrinker.h>
5 #include <linux/rculist.h>
6 #include <trace/events/vmscan.h>
7 
8 #include "internal.h"
9 
/*
 * shrinker_list links every registered shrinker. In this file it is walked
 * under rcu_read_lock() by shrink_slab() and is only modified with
 * shrinker_mutex held (shrinker_register(), shrinker_free()).
 *
 * shrinker_mutex is also taken around shrinker id allocation and around the
 * allocation, expansion and reparenting of the per-memcg shrinker_info.
 */
10 LIST_HEAD(shrinker_list);
11 DEFINE_MUTEX(shrinker_mutex);
12 
13 #ifdef CONFIG_MEMCG
14 static int shrinker_nr_max;
15 
shrinker_unit_size(int nr_items)16 static inline int shrinker_unit_size(int nr_items)
17 {
18 	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
19 }
20 
shrinker_unit_free(struct shrinker_info * info,int start)21 static inline void shrinker_unit_free(struct shrinker_info *info, int start)
22 {
23 	struct shrinker_info_unit **unit;
24 	int nr, i;
25 
26 	if (!info)
27 		return;
28 
29 	unit = info->unit;
30 	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);
31 
32 	for (i = start; i < nr; i++) {
33 		if (!unit[i])
34 			break;
35 
36 		kfree(unit[i]);
37 		unit[i] = NULL;
38 	}
39 }
40 
shrinker_unit_alloc(struct shrinker_info * new,struct shrinker_info * old,int nid)41 static inline int shrinker_unit_alloc(struct shrinker_info *new,
42 				       struct shrinker_info *old, int nid)
43 {
44 	struct shrinker_info_unit *unit;
45 	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
46 	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
47 	int i;
48 
49 	for (i = start; i < nr; i++) {
50 		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
51 		if (!unit) {
52 			shrinker_unit_free(new, start);
53 			return -ENOMEM;
54 		}
55 
56 		new->unit[i] = unit;
57 	}
58 
59 	return 0;
60 }
61 
free_shrinker_info(struct mem_cgroup * memcg)62 void free_shrinker_info(struct mem_cgroup *memcg)
63 {
64 	struct mem_cgroup_per_node *pn;
65 	struct shrinker_info *info;
66 	int nid;
67 
68 	for_each_node(nid) {
69 		pn = memcg->nodeinfo[nid];
70 		info = rcu_dereference_protected(pn->shrinker_info, true);
71 		shrinker_unit_free(info, 0);
72 		kvfree(info);
73 		rcu_assign_pointer(pn->shrinker_info, NULL);
74 	}
75 }
76 
alloc_shrinker_info(struct mem_cgroup * memcg)77 int alloc_shrinker_info(struct mem_cgroup *memcg)
78 {
79 	struct shrinker_info *info;
80 	int nid, ret = 0;
81 	int array_size = 0;
82 
83 	mutex_lock(&shrinker_mutex);
84 	array_size = shrinker_unit_size(shrinker_nr_max);
85 	for_each_node(nid) {
86 		info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
87 		if (!info)
88 			goto err;
89 		info->map_nr_max = shrinker_nr_max;
90 		if (shrinker_unit_alloc(info, NULL, nid))
91 			goto err;
92 		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
93 	}
94 	mutex_unlock(&shrinker_mutex);
95 
96 	return ret;
97 
98 err:
99 	mutex_unlock(&shrinker_mutex);
100 	free_shrinker_info(memcg);
101 	return -ENOMEM;
102 }
103 
shrinker_info_protected(struct mem_cgroup * memcg,int nid)104 static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
105 						     int nid)
106 {
107 	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
108 					 lockdep_is_held(&shrinker_mutex));
109 }
110 
expand_one_shrinker_info(struct mem_cgroup * memcg,int new_size,int old_size,int new_nr_max)111 static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
112 				    int old_size, int new_nr_max)
113 {
114 	struct shrinker_info *new, *old;
115 	struct mem_cgroup_per_node *pn;
116 	int nid;
117 
118 	for_each_node(nid) {
119 		pn = memcg->nodeinfo[nid];
120 		old = shrinker_info_protected(memcg, nid);
121 		/* Not yet online memcg */
122 		if (!old)
123 			return 0;
124 
125 		/* Already expanded this shrinker_info */
126 		if (new_nr_max <= old->map_nr_max)
127 			continue;
128 
129 		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
130 		if (!new)
131 			return -ENOMEM;
132 
133 		new->map_nr_max = new_nr_max;
134 
135 		memcpy(new->unit, old->unit, old_size);
136 		if (shrinker_unit_alloc(new, old, nid)) {
137 			kvfree(new);
138 			return -ENOMEM;
139 		}
140 
141 		rcu_assign_pointer(pn->shrinker_info, new);
142 		kvfree_rcu(old, rcu);
143 	}
144 
145 	return 0;
146 }
147 
expand_shrinker_info(int new_id)148 static int expand_shrinker_info(int new_id)
149 {
150 	int ret = 0;
151 	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
152 	int new_size, old_size = 0;
153 	struct mem_cgroup *memcg;
154 
155 	if (!root_mem_cgroup)
156 		goto out;
157 
158 	lockdep_assert_held(&shrinker_mutex);
159 
160 	new_size = shrinker_unit_size(new_nr_max);
161 	old_size = shrinker_unit_size(shrinker_nr_max);
162 
163 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
164 	do {
165 		ret = expand_one_shrinker_info(memcg, new_size, old_size,
166 					       new_nr_max);
167 		if (ret) {
168 			mem_cgroup_iter_break(NULL, memcg);
169 			goto out;
170 		}
171 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
172 out:
173 	if (!ret)
174 		shrinker_nr_max = new_nr_max;
175 
176 	return ret;
177 }
178 
shrinker_id_to_index(int shrinker_id)179 static inline int shrinker_id_to_index(int shrinker_id)
180 {
181 	return shrinker_id / SHRINKER_UNIT_BITS;
182 }
183 
shrinker_id_to_offset(int shrinker_id)184 static inline int shrinker_id_to_offset(int shrinker_id)
185 {
186 	return shrinker_id % SHRINKER_UNIT_BITS;
187 }
188 
calc_shrinker_id(int index,int offset)189 static inline int calc_shrinker_id(int index, int offset)
190 {
191 	return index * SHRINKER_UNIT_BITS + offset;
192 }
193 
set_shrinker_bit(struct mem_cgroup * memcg,int nid,int shrinker_id)194 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
195 {
196 	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
197 		struct shrinker_info *info;
198 		struct shrinker_info_unit *unit;
199 
200 		rcu_read_lock();
201 		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
202 		unit = info->unit[shrinker_id_to_index(shrinker_id)];
203 		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
204 			/* Pairs with smp mb in shrink_slab() */
205 			smp_mb__before_atomic();
206 			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
207 		}
208 		rcu_read_unlock();
209 	}
210 }
211 
212 static DEFINE_IDR(shrinker_idr);
213 
shrinker_memcg_alloc(struct shrinker * shrinker)214 static int shrinker_memcg_alloc(struct shrinker *shrinker)
215 {
216 	int id, ret = -ENOMEM;
217 
218 	if (mem_cgroup_disabled())
219 		return -ENOSYS;
220 
221 	mutex_lock(&shrinker_mutex);
222 	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
223 	if (id < 0)
224 		goto unlock;
225 
226 	if (id >= shrinker_nr_max) {
227 		if (expand_shrinker_info(id)) {
228 			idr_remove(&shrinker_idr, id);
229 			goto unlock;
230 		}
231 	}
232 	shrinker->id = id;
233 	ret = 0;
234 unlock:
235 	mutex_unlock(&shrinker_mutex);
236 	return ret;
237 }
238 
shrinker_memcg_remove(struct shrinker * shrinker)239 static void shrinker_memcg_remove(struct shrinker *shrinker)
240 {
241 	int id = shrinker->id;
242 
243 	BUG_ON(id < 0);
244 
245 	lockdep_assert_held(&shrinker_mutex);
246 
247 	idr_remove(&shrinker_idr, id);
248 }
249 
xchg_nr_deferred_memcg(int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)250 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
251 				   struct mem_cgroup *memcg)
252 {
253 	struct shrinker_info *info;
254 	struct shrinker_info_unit *unit;
255 	long nr_deferred;
256 
257 	rcu_read_lock();
258 	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
259 	unit = info->unit[shrinker_id_to_index(shrinker->id)];
260 	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
261 	rcu_read_unlock();
262 
263 	return nr_deferred;
264 }
265 
add_nr_deferred_memcg(long nr,int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)266 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
267 				  struct mem_cgroup *memcg)
268 {
269 	struct shrinker_info *info;
270 	struct shrinker_info_unit *unit;
271 	long nr_deferred;
272 
273 	rcu_read_lock();
274 	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
275 	unit = info->unit[shrinker_id_to_index(shrinker->id)];
276 	nr_deferred =
277 		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
278 	rcu_read_unlock();
279 
280 	return nr_deferred;
281 }
282 
reparent_shrinker_deferred(struct mem_cgroup * memcg)283 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
284 {
285 	int nid, index, offset;
286 	long nr;
287 	struct mem_cgroup *parent;
288 	struct shrinker_info *child_info, *parent_info;
289 	struct shrinker_info_unit *child_unit, *parent_unit;
290 
291 	parent = parent_mem_cgroup(memcg);
292 	if (!parent)
293 		parent = root_mem_cgroup;
294 
295 	/* Prevent from concurrent shrinker_info expand */
296 	mutex_lock(&shrinker_mutex);
297 	for_each_node(nid) {
298 		child_info = shrinker_info_protected(memcg, nid);
299 		parent_info = shrinker_info_protected(parent, nid);
300 		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
301 			child_unit = child_info->unit[index];
302 			parent_unit = parent_info->unit[index];
303 			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
304 				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
305 				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
306 			}
307 		}
308 	}
309 	mutex_unlock(&shrinker_mutex);
310 }
311 #else
shrinker_memcg_alloc(struct shrinker * shrinker)312 static int shrinker_memcg_alloc(struct shrinker *shrinker)
313 {
314 	return -ENOSYS;
315 }
316 
shrinker_memcg_remove(struct shrinker * shrinker)317 static void shrinker_memcg_remove(struct shrinker *shrinker)
318 {
319 }
320 
xchg_nr_deferred_memcg(int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)321 static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
322 				   struct mem_cgroup *memcg)
323 {
324 	return 0;
325 }
326 
add_nr_deferred_memcg(long nr,int nid,struct shrinker * shrinker,struct mem_cgroup * memcg)327 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
328 				  struct mem_cgroup *memcg)
329 {
330 	return 0;
331 }
332 #endif /* CONFIG_MEMCG */
333 
xchg_nr_deferred(struct shrinker * shrinker,struct shrink_control * sc)334 static long xchg_nr_deferred(struct shrinker *shrinker,
335 			     struct shrink_control *sc)
336 {
337 	int nid = sc->nid;
338 
339 	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
340 		nid = 0;
341 
342 	if (sc->memcg &&
343 	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
344 		return xchg_nr_deferred_memcg(nid, shrinker,
345 					      sc->memcg);
346 
347 	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
348 }
349 
350 
add_nr_deferred(long nr,struct shrinker * shrinker,struct shrink_control * sc)351 static long add_nr_deferred(long nr, struct shrinker *shrinker,
352 			    struct shrink_control *sc)
353 {
354 	int nid = sc->nid;
355 
356 	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
357 		nid = 0;
358 
359 	if (sc->memcg &&
360 	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
361 		return add_nr_deferred_memcg(nr, nid, shrinker,
362 					     sc->memcg);
363 
364 	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
365 }
366 
367 #define SHRINK_BATCH 128
368 
do_shrink_slab(struct shrink_control * shrinkctl,struct shrinker * shrinker,int priority)369 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
370 				    struct shrinker *shrinker, int priority)
371 {
372 	unsigned long freed = 0;
373 	unsigned long long delta;
374 	long total_scan;
375 	long freeable;
376 	long nr;
377 	long new_nr;
378 	long batch_size = shrinker->batch ? shrinker->batch
379 					  : SHRINK_BATCH;
380 	long scanned = 0, next_deferred;
381 
382 	freeable = shrinker->count_objects(shrinker, shrinkctl);
383 	if (freeable == 0 || freeable == SHRINK_EMPTY)
384 		return freeable;
385 
386 	/*
387 	 * copy the current shrinker scan count into a local variable
388 	 * and zero it so that other concurrent shrinker invocations
389 	 * don't also do this scanning work.
390 	 */
391 	nr = xchg_nr_deferred(shrinker, shrinkctl);
392 
393 	if (shrinker->seeks) {
394 		delta = freeable >> priority;
395 		delta *= 4;
396 		do_div(delta, shrinker->seeks);
397 	} else {
398 		/*
399 		 * These objects don't require any IO to create. Trim
400 		 * them aggressively under memory pressure to keep
401 		 * them from causing refetches in the IO caches.
402 		 */
403 		delta = freeable / 2;
404 	}
405 
406 	total_scan = nr >> priority;
407 	total_scan += delta;
408 	total_scan = min(total_scan, (2 * freeable));
409 
410 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
411 				   freeable, delta, total_scan, priority);
412 
413 	/*
414 	 * Normally, we should not scan less than batch_size objects in one
415 	 * pass to avoid too frequent shrinker calls, but if the slab has less
416 	 * than batch_size objects in total and we are really tight on memory,
417 	 * we will try to reclaim all available objects, otherwise we can end
418 	 * up failing allocations although there are plenty of reclaimable
419 	 * objects spread over several slabs with usage less than the
420 	 * batch_size.
421 	 *
422 	 * We detect the "tight on memory" situations by looking at the total
423 	 * number of objects we want to scan (total_scan). If it is greater
424 	 * than the total number of objects on slab (freeable), we must be
425 	 * scanning at high prio and therefore should try to reclaim as much as
426 	 * possible.
427 	 */
428 	while (total_scan >= batch_size ||
429 	       total_scan >= freeable) {
430 		unsigned long ret;
431 		unsigned long nr_to_scan = min(batch_size, total_scan);
432 
433 		shrinkctl->nr_to_scan = nr_to_scan;
434 		shrinkctl->nr_scanned = nr_to_scan;
435 		ret = shrinker->scan_objects(shrinker, shrinkctl);
436 		if (ret == SHRINK_STOP)
437 			break;
438 		freed += ret;
439 
440 		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
441 		total_scan -= shrinkctl->nr_scanned;
442 		scanned += shrinkctl->nr_scanned;
443 
444 		cond_resched();
445 	}
446 
447 	/*
448 	 * The deferred work is increased by any new work (delta) that wasn't
449 	 * done, decreased by old deferred work that was done now.
450 	 *
451 	 * And it is capped to two times of the freeable items.
452 	 */
453 	next_deferred = max_t(long, (nr + delta - scanned), 0);
454 	next_deferred = min(next_deferred, (2 * freeable));
455 
456 	/*
457 	 * move the unused scan count back into the shrinker in a
458 	 * manner that handles concurrent updates.
459 	 */
460 	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
461 
462 	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
463 	return freed;
464 }
465 
466 #ifdef CONFIG_MEMCG
/*
 * Call the shrinkers whose bit is set in the shrinker map of @memcg on node
 * @nid and return the number of objects they reported as freed. Returns 0
 * right away when @memcg is not online.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 *  step 1: use rcu_read_lock() to guarantee existence of the
	 *          shrinker_info.
	 *  step 2: after getting shrinker_info_unit we can safely release the
	 *          RCU lock.
	 *  step 3: traverse the bitmap and calculate shrinker_id
	 *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 *  step 5: use shrinker_id to find the shrinker, then use
	 *          shrinker_try_get() to guarantee existence of the shrinker,
	 *          then we can release the RCU lock to do do_shrink_slab() that
	 *          may sleep.
	 *  step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *          if the refcount reaches 0, then wake up the waiter in
	 *          shrinker_free() by calling complete().
	 *          Note: here is different from the global shrink, we don't
	 *                need to acquire the RCU lock to guarantee existence of
	 *                the shrinker, because we don't need to use this
	 *                shrinker to traverse the next shrinker in the bitmap.
	 *  step 7: we have already exited the read-side of rcu critical section
	 *          before calling do_shrink_slab(), the shrinker_info may be
	 *          released in expand_one_shrinker_info(), so go back to step 1
	 *          to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				/* no usable shrinker behind this bit any more */
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/*
				 * Drop the reference taken by
				 * shrinker_try_get() above; skipping the put
				 * would leave shrinker_free() waiting forever.
				 */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()     shrink_slab_memcg()
				 *   list_add_tail()    clear_bit()
				 *   <MB>               <MB>
				 *   set_bit()          do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
584 #else /* !CONFIG_MEMCG */
/* Stub for !CONFIG_MEMCG: there are no per-memcg shrinker maps, so 0 is freed. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
590 #endif /* CONFIG_MEMCG */
591 
592 /**
593  * shrink_slab - shrink slab caches
594  * @gfp_mask: allocation context
595  * @nid: node whose slab caches to target
596  * @memcg: memory cgroup whose slab caches to target
597  * @priority: the reclaim priority
598  *
599  * Call the shrink functions to age shrinkable caches.
600  *
601  * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
602  * unaware shrinkers will receive a node id of 0 instead.
603  *
604  * @memcg specifies the memory cgroup to target. Unaware shrinkers
605  * are called only if it is the root cgroup.
606  *
607  * @priority is sc->priority, we take the number of objects and >> by priority
608  * in order to get the scan target.
609  *
610  * Returns the number of reclaimed slab objects.
611  */
shrink_slab(gfp_t gfp_mask,int nid,struct mem_cgroup * memcg,int priority)612 unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
613 			  int priority)
614 {
615 	unsigned long ret, freed = 0;
616 	struct shrinker *shrinker;
617 
618 	/*
619 	 * The root memcg might be allocated even though memcg is disabled
620 	 * via "cgroup_disable=memory" boot parameter.  This could make
621 	 * mem_cgroup_is_root() return false, then just run memcg slab
622 	 * shrink, but skip global shrink.  This may result in premature
623 	 * oom.
624 	 */
625 	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
626 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
627 
628 	/*
629 	 * lockless algorithm of global shrink.
630 	 *
631 	 * In the unregistration setp, the shrinker will be freed asynchronously
632 	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
633 	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
634 	 *
635 	 * So in the global shrink:
636 	 *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
637 	 *          and the validity of the shrinker_list walk.
638 	 *  step 2: use shrinker_try_get() to try get the refcount, if successful,
639 	 *          then the existence of the shrinker can also be guaranteed,
640 	 *          so we can release the RCU lock to do do_shrink_slab() that
641 	 *          may sleep.
642 	 *  step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(),
643 	 *          which ensures that neither this shrinker nor the next shrinker
644 	 *          will be freed in the next traversal operation.
645 	 *  step 4: do shrinker_put() paired with step 2 to put the refcount,
646 	 *          if the refcount reaches 0, then wake up the waiter in
647 	 *          shrinker_free() by calling complete().
648 	 */
649 	rcu_read_lock();
650 	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
651 		struct shrink_control sc = {
652 			.gfp_mask = gfp_mask,
653 			.nid = nid,
654 			.memcg = memcg,
655 		};
656 
657 		if (!shrinker_try_get(shrinker))
658 			continue;
659 
660 		rcu_read_unlock();
661 
662 		ret = do_shrink_slab(&sc, shrinker, priority);
663 		if (ret == SHRINK_EMPTY)
664 			ret = 0;
665 		freed += ret;
666 
667 		rcu_read_lock();
668 		shrinker_put(shrinker);
669 	}
670 
671 	rcu_read_unlock();
672 	cond_resched();
673 	return freed;
674 }
675 
shrinker_alloc(unsigned int flags,const char * fmt,...)676 struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
677 {
678 	struct shrinker *shrinker;
679 	unsigned int size;
680 	va_list ap;
681 	int err;
682 
683 	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
684 	if (!shrinker)
685 		return NULL;
686 
687 	va_start(ap, fmt);
688 	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
689 	va_end(ap);
690 	if (err)
691 		goto err_name;
692 
693 	shrinker->flags = flags | SHRINKER_ALLOCATED;
694 	shrinker->seeks = DEFAULT_SEEKS;
695 
696 	if (flags & SHRINKER_MEMCG_AWARE) {
697 		err = shrinker_memcg_alloc(shrinker);
698 		if (err == -ENOSYS) {
699 			/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
700 			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
701 			goto non_memcg;
702 		}
703 
704 		if (err)
705 			goto err_flags;
706 
707 		return shrinker;
708 	}
709 
710 non_memcg:
711 	/*
712 	 * The nr_deferred is available on per memcg level for memcg aware
713 	 * shrinkers, so only allocate nr_deferred in the following cases:
714 	 *  - non-memcg-aware shrinkers
715 	 *  - !CONFIG_MEMCG
716 	 *  - memcg is disabled by kernel command line
717 	 */
718 	size = sizeof(*shrinker->nr_deferred);
719 	if (flags & SHRINKER_NUMA_AWARE)
720 		size *= nr_node_ids;
721 
722 	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
723 	if (!shrinker->nr_deferred)
724 		goto err_flags;
725 
726 	return shrinker;
727 
728 err_flags:
729 	shrinker_debugfs_name_free(shrinker);
730 err_name:
731 	kfree(shrinker);
732 	return NULL;
733 }
734 EXPORT_SYMBOL_GPL(shrinker_alloc);
735 
shrinker_register(struct shrinker * shrinker)736 void shrinker_register(struct shrinker *shrinker)
737 {
738 	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
739 		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
740 		return;
741 	}
742 
743 	mutex_lock(&shrinker_mutex);
744 	list_add_tail_rcu(&shrinker->list, &shrinker_list);
745 	shrinker->flags |= SHRINKER_REGISTERED;
746 	shrinker_debugfs_add(shrinker);
747 	mutex_unlock(&shrinker_mutex);
748 
749 	init_completion(&shrinker->done);
750 	/*
751 	 * Now the shrinker is fully set up, take the first reference to it to
752 	 * indicate that lookup operations are now allowed to use it via
753 	 * shrinker_try_get().
754 	 */
755 	refcount_set(&shrinker->refcount, 1);
756 }
757 EXPORT_SYMBOL_GPL(shrinker_register);
758 
shrinker_free_rcu_cb(struct rcu_head * head)759 static void shrinker_free_rcu_cb(struct rcu_head *head)
760 {
761 	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);
762 
763 	kfree(shrinker->nr_deferred);
764 	kfree(shrinker);
765 }
766 
shrinker_free(struct shrinker * shrinker)767 void shrinker_free(struct shrinker *shrinker)
768 {
769 	struct dentry *debugfs_entry = NULL;
770 	int debugfs_id;
771 
772 	if (!shrinker)
773 		return;
774 
775 	if (shrinker->flags & SHRINKER_REGISTERED) {
776 		/* drop the initial refcount */
777 		shrinker_put(shrinker);
778 		/*
779 		 * Wait for all lookups of the shrinker to complete, after that,
780 		 * no shrinker is running or will run again, then we can safely
781 		 * free it asynchronously via RCU and safely free the structure
782 		 * where the shrinker is located, such as super_block etc.
783 		 */
784 		wait_for_completion(&shrinker->done);
785 	}
786 
787 	mutex_lock(&shrinker_mutex);
788 	if (shrinker->flags & SHRINKER_REGISTERED) {
789 		/*
790 		 * Now we can safely remove it from the shrinker_list and then
791 		 * free it.
792 		 */
793 		list_del_rcu(&shrinker->list);
794 		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
795 		shrinker->flags &= ~SHRINKER_REGISTERED;
796 	}
797 
798 	shrinker_debugfs_name_free(shrinker);
799 
800 	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
801 		shrinker_memcg_remove(shrinker);
802 	mutex_unlock(&shrinker_mutex);
803 
804 	if (debugfs_entry)
805 		shrinker_debugfs_remove(debugfs_entry, debugfs_id);
806 
807 	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
808 }
809 EXPORT_SYMBOL_GPL(shrinker_free);
810