xref: /linux/mm/slab_common.c (revision 21e4543a2e2f8538373d1d19264c4bae6f13e798)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Slab allocator functions that are independent of the allocator strategy
4  *
5  * (C) 2012 Christoph Lameter <cl@linux.com>
6  */
7 #include <linux/slab.h>
8 
9 #include <linux/mm.h>
10 #include <linux/poison.h>
11 #include <linux/interrupt.h>
12 #include <linux/memory.h>
13 #include <linux/cache.h>
14 #include <linux/compiler.h>
15 #include <linux/kfence.h>
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/seq_file.h>
20 #include <linux/dma-mapping.h>
21 #include <linux/swiotlb.h>
22 #include <linux/proc_fs.h>
23 #include <linux/debugfs.h>
24 #include <linux/kmemleak.h>
25 #include <linux/kasan.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/page.h>
29 #include <linux/memcontrol.h>
30 #include <linux/stackdepot.h>
31 #include <trace/events/rcu.h>
32 
33 #include "../kernel/rcu/rcu.h"
34 #include "internal.h"
35 #include "slab.h"
36 
37 #define CREATE_TRACE_POINTS
38 #include <trace/events/kmem.h>
39 
40 enum slab_state slab_state;
41 LIST_HEAD(slab_caches);
42 DEFINE_MUTEX(slab_mutex);
43 struct kmem_cache *kmem_cache;
44 
45 /*
46  * Set of flags that will prevent slab merging
47  */
48 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
49 		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
50 		SLAB_FAILSLAB | SLAB_NO_MERGE)
51 
52 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
53 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
54 
55 /*
56  * Merge control. If this is set then no merging of slab caches will occur.
57  */
58 static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
59 
60 static int __init setup_slab_nomerge(char *str)
61 {
62 	slab_nomerge = true;
63 	return 1;
64 }
65 
66 static int __init setup_slab_merge(char *str)
67 {
68 	slab_nomerge = false;
69 	return 1;
70 }
71 
72 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
73 __setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
74 
75 __setup("slab_nomerge", setup_slab_nomerge);
76 __setup("slab_merge", setup_slab_merge);
77 
78 /*
79  * Determine the size of a slab object
80  */
81 unsigned int kmem_cache_size(struct kmem_cache *s)
82 {
83 	return s->object_size;
84 }
85 EXPORT_SYMBOL(kmem_cache_size);
86 
87 #ifdef CONFIG_DEBUG_VM
88 
89 static bool kmem_cache_is_duplicate_name(const char *name)
90 {
91 	struct kmem_cache *s;
92 
93 	list_for_each_entry(s, &slab_caches, list) {
94 		if (!strcmp(s->name, name))
95 			return true;
96 	}
97 
98 	return false;
99 }
100 
101 static int kmem_cache_sanity_check(const char *name, unsigned int size)
102 {
103 	if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
104 		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
105 		return -EINVAL;
106 	}
107 
108 	/* Duplicate names will confuse slabtop, et al */
109 	WARN(kmem_cache_is_duplicate_name(name),
110 			"kmem_cache of name '%s' already exists\n", name);
111 
112 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
113 	return 0;
114 }
115 #else
116 static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
117 {
118 	return 0;
119 }
120 #endif
121 
122 /*
123  * Figure out what the alignment of the objects will be given a set of
124  * flags, a user specified alignment and the size of the objects.
125  */
126 static unsigned int calculate_alignment(slab_flags_t flags,
127 		unsigned int align, unsigned int size)
128 {
129 	/*
130 	 * If the user wants hardware cache aligned objects then follow that
131 	 * suggestion if the object is sufficiently large.
132 	 *
133 	 * The hardware cache alignment cannot override the specified
134 	 * alignment though. If that is greater, then use it.
135 	 */
136 	if (flags & SLAB_HWCACHE_ALIGN) {
137 		unsigned int ralign;
138 
139 		ralign = cache_line_size();
140 		while (size <= ralign / 2)
141 			ralign /= 2;
142 		align = max(align, ralign);
143 	}
144 
145 	align = max(align, arch_slab_minalign());
146 
147 	return ALIGN(align, sizeof(void *));
148 }
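/*
 * Worked example (illustrative): with a 64-byte cache line, a 20-byte
 * object and SLAB_HWCACHE_ALIGN, ralign halves 64 -> 32 (20 <= 32 but
 * not <= 16), so such objects end up 32-byte aligned instead of each
 * consuming a whole cache line.
 */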
149 
150 /*
151  * Find a mergeable slab cache
152  */
153 int slab_unmergeable(struct kmem_cache *s)
154 {
155 	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
156 		return 1;
157 
158 	if (s->ctor)
159 		return 1;
160 
161 #ifdef CONFIG_HARDENED_USERCOPY
162 	if (s->usersize)
163 		return 1;
164 #endif
165 
166 	/*
167 	 * We may have set a slab to be unmergeable during bootstrap.
168 	 */
169 	if (s->refcount < 0)
170 		return 1;
171 
172 	return 0;
173 }
174 
175 struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
176 		slab_flags_t flags, const char *name, void (*ctor)(void *))
177 {
178 	struct kmem_cache *s;
179 
180 	if (slab_nomerge)
181 		return NULL;
182 
183 	if (ctor)
184 		return NULL;
185 
186 	flags = kmem_cache_flags(flags, name);
187 
188 	if (flags & SLAB_NEVER_MERGE)
189 		return NULL;
190 
191 	size = ALIGN(size, sizeof(void *));
192 	align = calculate_alignment(flags, align, size);
193 	size = ALIGN(size, align);
194 
195 	list_for_each_entry_reverse(s, &slab_caches, list) {
196 		if (slab_unmergeable(s))
197 			continue;
198 
199 		if (size > s->size)
200 			continue;
201 
202 		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
203 			continue;
204 		/*
205 		 * Check if alignment is compatible.
206 		 * Courtesy of Adrian Drzewiecki
207 		 */
208 		if ((s->size & ~(align - 1)) != s->size)
209 			continue;
210 
211 		if (s->size - size >= sizeof(void *))
212 			continue;
213 
214 		return s;
215 	}
216 	return NULL;
217 }
218 
219 static struct kmem_cache *create_cache(const char *name,
220 				       unsigned int object_size,
221 				       struct kmem_cache_args *args,
222 				       slab_flags_t flags)
223 {
224 	struct kmem_cache *s;
225 	int err;
226 
227 	/* If a custom freelist pointer is requested make sure it's sane. */
228 	err = -EINVAL;
229 	if (args->use_freeptr_offset &&
230 	    (args->freeptr_offset >= object_size ||
231 	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
232 	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
233 		goto out;
234 
235 	err = -ENOMEM;
236 	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
237 	if (!s)
238 		goto out;
239 	err = do_kmem_cache_create(s, name, object_size, args, flags);
240 	if (err)
241 		goto out_free_cache;
242 
243 	s->refcount = 1;
244 	list_add(&s->list, &slab_caches);
245 	return s;
246 
247 out_free_cache:
248 	kmem_cache_free(kmem_cache, s);
249 out:
250 	return ERR_PTR(err);
251 }
252 
253 /**
254  * __kmem_cache_create_args - Create a kmem cache.
255  * @name: A string which is used in /proc/slabinfo to identify this cache.
256  * @object_size: The size of objects to be created in this cache.
257  * @args: Additional arguments for the cache creation (see
258  *        &struct kmem_cache_args).
259  * @flags: See the descriptions of individual flags. The common ones are listed
260  *         in the description below.
261  *
262  * Not to be called directly, use the kmem_cache_create() wrapper with the same
263  * parameters.
264  *
265  * Commonly used @flags:
266  *
267  * &SLAB_ACCOUNT - Account allocations to memcg.
268  *
269  * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
270  *
271  * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
272  *
273  * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
274  * by a grace period - see the full description before using.
275  *
276  * Context: Cannot be called within an interrupt, but can be interrupted.
277  *
278  * Return: a pointer to the cache on success, NULL on failure.
279  */
280 struct kmem_cache *__kmem_cache_create_args(const char *name,
281 					    unsigned int object_size,
282 					    struct kmem_cache_args *args,
283 					    slab_flags_t flags)
284 {
285 	struct kmem_cache *s = NULL;
286 	const char *cache_name;
287 	int err;
288 
289 #ifdef CONFIG_SLUB_DEBUG
290 	/*
291 	 * If no slab_debug was enabled globally, the static key is not yet
292 	 * enabled by setup_slub_debug(). Enable it if the cache is being
293 	 * created with any of the debugging flags passed explicitly.
294 	 * It's also possible that this is the first cache created with
295 	 * SLAB_STORE_USER and we should init stack_depot for it.
296 	 */
297 	if (flags & SLAB_DEBUG_FLAGS)
298 		static_branch_enable(&slub_debug_enabled);
299 	if (flags & SLAB_STORE_USER)
300 		stack_depot_init();
301 #endif
302 
303 	mutex_lock(&slab_mutex);
304 
305 	err = kmem_cache_sanity_check(name, object_size);
306 	if (err) {
307 		goto out_unlock;
308 	}
309 
310 	/* Refuse requests with allocator specific flags */
311 	if (flags & ~SLAB_FLAGS_PERMITTED) {
312 		err = -EINVAL;
313 		goto out_unlock;
314 	}
315 
316 	/*
317 	 * Some allocators will constrain the set of valid flags to a subset
318 	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
319 	 * case, and we'll just provide them with a sanitized version of the
320 	 * passed flags.
321 	 */
322 	flags &= CACHE_CREATE_MASK;
323 
324 	/* Fail closed on bad usersize or useroffset values. */
325 	if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
326 	    WARN_ON(!args->usersize && args->useroffset) ||
327 	    WARN_ON(object_size < args->usersize ||
328 		    object_size - args->usersize < args->useroffset))
329 		args->usersize = args->useroffset = 0;
330 
331 	if (!args->usersize)
332 		s = __kmem_cache_alias(name, object_size, args->align, flags,
333 				       args->ctor);
334 	if (s)
335 		goto out_unlock;
336 
337 	cache_name = kstrdup_const(name, GFP_KERNEL);
338 	if (!cache_name) {
339 		err = -ENOMEM;
340 		goto out_unlock;
341 	}
342 
343 	args->align = calculate_alignment(flags, args->align, object_size);
344 	s = create_cache(cache_name, object_size, args, flags);
345 	if (IS_ERR(s)) {
346 		err = PTR_ERR(s);
347 		kfree_const(cache_name);
348 	}
349 
350 out_unlock:
351 	mutex_unlock(&slab_mutex);
352 
353 	if (err) {
354 		if (flags & SLAB_PANIC)
355 			panic("%s: Failed to create slab '%s'. Error %d\n",
356 				__func__, name, err);
357 		else {
358 			pr_warn("%s(%s) failed with error %d\n",
359 				__func__, name, err);
360 			dump_stack();
361 		}
362 		return NULL;
363 	}
364 	return s;
365 }
366 EXPORT_SYMBOL(__kmem_cache_create_args);
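/*
 * Minimal usage sketch (identifiers such as "struct foo" and foo_cache are
 * hypothetical, not part of this file). Callers normally go through the
 * kmem_cache_create() wrapper rather than calling this function directly:
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 *	if (!foo_cache)
 *		return -ENOMEM;
 *
 *	obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, obj);
 */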
367 
368 static struct kmem_cache *kmem_buckets_cache __ro_after_init;
369 
370 /**
371  * kmem_buckets_create - Create a set of caches that handle dynamic sized
372  *			 allocations via kmem_buckets_alloc()
373  * @name: A prefix string which is used in /proc/slabinfo to identify this
374  *	  cache. The individual caches will have their sizes as the suffix.
375  * @flags: SLAB flags (see kmem_cache_create() for details).
376  * @useroffset: Starting offset within an allocation that may be copied
377  *		to/from userspace.
378  * @usersize: How many bytes, starting at @useroffset, may be copied
379  *		to/from userspace.
380  * @ctor: A constructor for the objects, run when new allocations are made.
381  *
382  * Cannot be called within an interrupt, but can be interrupted.
383  *
384  * Return: a pointer to the cache on success, NULL on failure. When
385  * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
386  * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
387  * (i.e. callers only need to check for NULL on failure.)
388  */
389 kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
390 				  unsigned int useroffset,
391 				  unsigned int usersize,
392 				  void (*ctor)(void *))
393 {
394 	unsigned long mask = 0;
395 	unsigned int idx;
396 	kmem_buckets *b;
397 
398 	BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);
399 
400 	/*
401 	 * When the separate buckets API is not built in, just return
402 	 * a non-NULL value for the kmem_buckets pointer, which will be
403 	 * unused when performing allocations.
404 	 */
405 	if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
406 		return ZERO_SIZE_PTR;
407 
408 	if (WARN_ON(!kmem_buckets_cache))
409 		return NULL;
410 
411 	b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
412 	if (WARN_ON(!b))
413 		return NULL;
414 
415 	flags |= SLAB_NO_MERGE;
416 
417 	for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
418 		char *short_size, *cache_name;
419 		unsigned int cache_useroffset, cache_usersize;
420 		unsigned int size, aligned_idx;
421 
422 		if (!kmalloc_caches[KMALLOC_NORMAL][idx])
423 			continue;
424 
425 		size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
426 		if (!size)
427 			continue;
428 
429 		short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
430 		if (WARN_ON(!short_size))
431 			goto fail;
432 
433 		if (useroffset >= size) {
434 			cache_useroffset = 0;
435 			cache_usersize = 0;
436 		} else {
437 			cache_useroffset = useroffset;
438 			cache_usersize = min(size - cache_useroffset, usersize);
439 		}
440 
441 		aligned_idx = __kmalloc_index(size, false);
442 		if (!(*b)[aligned_idx]) {
443 			cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
444 			if (WARN_ON(!cache_name))
445 				goto fail;
446 			(*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
447 					0, flags, cache_useroffset,
448 					cache_usersize, ctor);
449 			kfree(cache_name);
450 			if (WARN_ON(!(*b)[aligned_idx]))
451 				goto fail;
452 			set_bit(aligned_idx, &mask);
453 		}
454 		if (idx != aligned_idx)
455 			(*b)[idx] = (*b)[aligned_idx];
456 	}
457 
458 	return b;
459 
460 fail:
461 	for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
462 		kmem_cache_destroy((*b)[idx]);
463 	kmem_cache_free(kmem_buckets_cache, b);
464 
465 	return NULL;
466 }
467 EXPORT_SYMBOL(kmem_buckets_create);
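/*
 * Usage sketch (hypothetical identifiers): a subsystem that allocates
 * variable-sized, userspace-influenced buffers can give them a dedicated
 * set of buckets instead of sharing the global kmalloc caches:
 *
 *	static kmem_buckets *foo_buckets;
 *
 *	foo_buckets = kmem_buckets_create("foo", 0, 0, 32, NULL);
 *	if (!foo_buckets)
 *		return -ENOMEM;
 *
 *	p = kmem_buckets_alloc(foo_buckets, len, GFP_KERNEL);
 *
 * With CONFIG_SLAB_BUCKETS=n the ZERO_SIZE_PTR returned above makes
 * kmem_buckets_alloc() fall back to plain kmalloc(), as documented.
 */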
468 
469 /*
470  * For a given kmem_cache, kmem_cache_destroy() should only be called
471  * once or there will be a use-after-free problem. The actual deletion
472  * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
473  * protection. So they are now done without holding those locks.
474  */
475 static void kmem_cache_release(struct kmem_cache *s)
476 {
477 	kfence_shutdown_cache(s);
478 	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
479 		sysfs_slab_release(s);
480 	else
481 		slab_kmem_cache_release(s);
482 }
483 
484 void slab_kmem_cache_release(struct kmem_cache *s)
485 {
486 	__kmem_cache_release(s);
487 	kfree_const(s->name);
488 	kmem_cache_free(kmem_cache, s);
489 }
490 
491 void kmem_cache_destroy(struct kmem_cache *s)
492 {
493 	int err;
494 
495 	if (unlikely(!s) || !kasan_check_byte(s))
496 		return;
497 
498 	/* in-flight kfree_rcu()'s may include objects from our cache */
499 	kvfree_rcu_barrier();
500 
501 	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
502 	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
503 		/*
504 		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
505 		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
506 		 * defer their freeing with call_rcu().
507 		 * Wait for such call_rcu() invocations here before actually
508 		 * destroying the cache.
509 		 *
510 		 * It doesn't matter that we haven't looked at the slab refcount
511 		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
512 		 * the refcount should be 1 here.
513 		 */
514 		rcu_barrier();
515 	}
516 
517 	cpus_read_lock();
518 	mutex_lock(&slab_mutex);
519 
520 	s->refcount--;
521 	if (s->refcount) {
522 		mutex_unlock(&slab_mutex);
523 		cpus_read_unlock();
524 		return;
525 	}
526 
527 	/* free asan quarantined objects */
528 	kasan_cache_shutdown(s);
529 
530 	err = __kmem_cache_shutdown(s);
531 	if (!slab_in_kunit_test())
532 		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
533 		     __func__, s->name, (void *)_RET_IP_);
534 
535 	list_del(&s->list);
536 
537 	mutex_unlock(&slab_mutex);
538 	cpus_read_unlock();
539 
540 	if (slab_state >= FULL)
541 		sysfs_slab_unlink(s);
542 	debugfs_slab_release(s);
543 
544 	if (err)
545 		return;
546 
547 	if (s->flags & SLAB_TYPESAFE_BY_RCU)
548 		rcu_barrier();
549 
550 	kmem_cache_release(s);
551 }
552 EXPORT_SYMBOL(kmem_cache_destroy);
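/*
 * Teardown sketch (hypothetical identifiers): every object must have been
 * freed back to the cache, and the destroy call happens exactly once,
 * typically from a module exit path:
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kmem_cache_free(foo_cache, obj);	// last outstanding object
 *		kmem_cache_destroy(foo_cache);
 *	}
 *
 * Destroying a cache that still has live objects triggers the WARN() above.
 */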
553 
554 /**
555  * kmem_cache_shrink - Shrink a cache.
556  * @cachep: The cache to shrink.
557  *
558  * Releases as many slabs as possible for a cache.
559  * To help debugging, a zero exit status indicates all slabs were released.
560  *
561  * Return: %0 if all slabs were released, non-zero otherwise
562  */
563 int kmem_cache_shrink(struct kmem_cache *cachep)
564 {
565 	kasan_cache_shrink(cachep);
566 
567 	return __kmem_cache_shrink(cachep);
568 }
569 EXPORT_SYMBOL(kmem_cache_shrink);
570 
571 bool slab_is_available(void)
572 {
573 	return slab_state >= UP;
574 }
575 
576 #ifdef CONFIG_PRINTK
577 static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
578 {
579 	if (__kfence_obj_info(kpp, object, slab))
580 		return;
581 	__kmem_obj_info(kpp, object, slab);
582 }
583 
584 /**
585  * kmem_dump_obj - Print available slab provenance information
586  * @object: slab object for which to find provenance information.
587  *
588  * This function uses pr_cont(), so that the caller is expected to have
589  * printed out whatever preamble is appropriate.  The provenance information
590  * depends on the type of object and on how much debugging is enabled.
591  * For a slab-cache object, the fact that it is a slab object is printed,
592  * and, if available, the slab name, return address, and stack trace from
593  * the allocation and last free path of that object.
594  *
595  * Return: %true if the pointer is to a not-yet-freed object from
596  * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
597  * is to an already-freed object, and %false otherwise.
598  */
599 bool kmem_dump_obj(void *object)
600 {
601 	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
602 	int i;
603 	struct slab *slab;
604 	unsigned long ptroffset;
605 	struct kmem_obj_info kp = { };
606 
607 	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
608 	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
609 		return false;
610 	slab = virt_to_slab(object);
611 	if (!slab)
612 		return false;
613 
614 	kmem_obj_info(&kp, object, slab);
615 	if (kp.kp_slab_cache)
616 		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
617 	else
618 		pr_cont(" slab%s", cp);
619 	if (is_kfence_address(object))
620 		pr_cont(" (kfence)");
621 	if (kp.kp_objp)
622 		pr_cont(" start %px", kp.kp_objp);
623 	if (kp.kp_data_offset)
624 		pr_cont(" data offset %lu", kp.kp_data_offset);
625 	if (kp.kp_objp) {
626 		ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
627 		pr_cont(" pointer offset %lu", ptroffset);
628 	}
629 	if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
630 		pr_cont(" size %u", kp.kp_slab_cache->object_size);
631 	if (kp.kp_ret)
632 		pr_cont(" allocated at %pS\n", kp.kp_ret);
633 	else
634 		pr_cont("\n");
635 	for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
636 		if (!kp.kp_stack[i])
637 			break;
638 		pr_info("    %pS\n", kp.kp_stack[i]);
639 	}
640 
641 	if (kp.kp_free_stack[0])
642 		pr_cont(" Free path:\n");
643 
644 	for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
645 		if (!kp.kp_free_stack[i])
646 			break;
647 		pr_info("    %pS\n", kp.kp_free_stack[i]);
648 	}
649 
650 	return true;
651 }
652 EXPORT_SYMBOL_GPL(kmem_dump_obj);
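/*
 * Caller sketch (illustrative): because kmem_dump_obj() prints with
 * pr_cont(), the caller emits the preamble and handles non-slab pointers
 * itself:
 *
 *	pr_info("object info:");
 *	if (!kmem_dump_obj(ptr))
 *		pr_cont(" not a slab object\n");
 */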
653 #endif
654 
655 /* Create a cache during boot when no slab services are available yet */
656 void __init create_boot_cache(struct kmem_cache *s, const char *name,
657 		unsigned int size, slab_flags_t flags,
658 		unsigned int useroffset, unsigned int usersize)
659 {
660 	int err;
661 	unsigned int align = ARCH_KMALLOC_MINALIGN;
662 	struct kmem_cache_args kmem_args = {};
663 
664 	/*
665 	 * kmalloc caches guarantee alignment of at least the largest
666 	 * power-of-two divisor of the size. For power-of-two sizes,
667 	 * it is the size itself.
668 	 */
669 	if (flags & SLAB_KMALLOC)
670 		align = max(align, 1U << (ffs(size) - 1));
671 	kmem_args.align = calculate_alignment(flags, align, size);
672 
673 #ifdef CONFIG_HARDENED_USERCOPY
674 	kmem_args.useroffset = useroffset;
675 	kmem_args.usersize = usersize;
676 #endif
677 
678 	err = do_kmem_cache_create(s, name, size, &kmem_args, flags);
679 
680 	if (err)
681 		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
682 					name, size, err);
683 
684 	s->refcount = -1;	/* Exempt from merging for now */
685 }
686 
687 static struct kmem_cache *__init create_kmalloc_cache(const char *name,
688 						      unsigned int size,
689 						      slab_flags_t flags)
690 {
691 	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
692 
693 	if (!s)
694 		panic("Out of memory when creating slab %s\n", name);
695 
696 	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
697 	list_add(&s->list, &slab_caches);
698 	s->refcount = 1;
699 	return s;
700 }
701 
702 kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
703 { /* initialization for https://llvm.org/pr42570 */ };
704 EXPORT_SYMBOL(kmalloc_caches);
705 
706 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
707 unsigned long random_kmalloc_seed __ro_after_init;
708 EXPORT_SYMBOL(random_kmalloc_seed);
709 #endif
710 
711 /*
712  * Conversion table for small slab sizes / 8 to the index in the
713  * kmalloc array. This is necessary for slabs < 192 since we have non power
714  * of two cache sizes there. The size of larger slabs can be determined using
715  * fls.
716  */
717 u8 kmalloc_size_index[24] __ro_after_init = {
718 	3,	/* 8 */
719 	4,	/* 16 */
720 	5,	/* 24 */
721 	5,	/* 32 */
722 	6,	/* 40 */
723 	6,	/* 48 */
724 	6,	/* 56 */
725 	6,	/* 64 */
726 	1,	/* 72 */
727 	1,	/* 80 */
728 	1,	/* 88 */
729 	1,	/* 96 */
730 	7,	/* 104 */
731 	7,	/* 112 */
732 	7,	/* 120 */
733 	7,	/* 128 */
734 	2,	/* 136 */
735 	2,	/* 144 */
736 	2,	/* 152 */
737 	2,	/* 160 */
738 	2,	/* 168 */
739 	2,	/* 176 */
740 	2,	/* 184 */
741 	2	/* 192 */
742 };
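/*
 * Worked example of the lookup (illustrative): for small requests (up to
 * 192 bytes) the array above is indexed by size_index_elem(size), i.e.
 * (size - 1) / 8, so kmalloc(72) -> element 8 -> index 1 -> the 96-byte
 * cache, and kmalloc(100) -> element 12 -> index 7 -> kmalloc-128.
 */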
743 
744 size_t kmalloc_size_roundup(size_t size)
745 {
746 	if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
747 		/*
748 		 * The flags don't matter since size_index is common to all.
749 		 * Neither does the caller for just getting ->object_size.
750 		 */
751 		return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
752 	}
753 
754 	/* Above the smaller buckets, size is a multiple of page size. */
755 	if (size && size <= KMALLOC_MAX_SIZE)
756 		return PAGE_SIZE << get_order(size);
757 
758 	/*
759 	 * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
760 	 * and very large size - kmalloc() may fail.
761 	 */
762 	return size;
763 
764 }
765 EXPORT_SYMBOL(kmalloc_size_roundup);
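/*
 * Usage sketch (illustrative): callers that intend to use the whole bucket
 * should round up before allocating rather than probing with ksize() later:
 *
 *	size_t full = kmalloc_size_roundup(count);	// e.g. 100 -> 128
 *	buf = kmalloc(full, GFP_KERNEL);
 *
 * All 'full' bytes may then be used without upsetting KASAN or FORTIFY.
 */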
766 
767 #ifdef CONFIG_ZONE_DMA
768 #define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
769 #else
770 #define KMALLOC_DMA_NAME(sz)
771 #endif
772 
773 #ifdef CONFIG_MEMCG
774 #define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
775 #else
776 #define KMALLOC_CGROUP_NAME(sz)
777 #endif
778 
779 #ifndef CONFIG_SLUB_TINY
780 #define KMALLOC_RCL_NAME(sz)	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
781 #else
782 #define KMALLOC_RCL_NAME(sz)
783 #endif
784 
785 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
786 #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
787 #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
788 #define KMA_RAND_1(sz)                  .name[KMALLOC_RANDOM_START +  1] = "kmalloc-rnd-01-" #sz,
789 #define KMA_RAND_2(sz)  KMA_RAND_1(sz)  .name[KMALLOC_RANDOM_START +  2] = "kmalloc-rnd-02-" #sz,
790 #define KMA_RAND_3(sz)  KMA_RAND_2(sz)  .name[KMALLOC_RANDOM_START +  3] = "kmalloc-rnd-03-" #sz,
791 #define KMA_RAND_4(sz)  KMA_RAND_3(sz)  .name[KMALLOC_RANDOM_START +  4] = "kmalloc-rnd-04-" #sz,
792 #define KMA_RAND_5(sz)  KMA_RAND_4(sz)  .name[KMALLOC_RANDOM_START +  5] = "kmalloc-rnd-05-" #sz,
793 #define KMA_RAND_6(sz)  KMA_RAND_5(sz)  .name[KMALLOC_RANDOM_START +  6] = "kmalloc-rnd-06-" #sz,
794 #define KMA_RAND_7(sz)  KMA_RAND_6(sz)  .name[KMALLOC_RANDOM_START +  7] = "kmalloc-rnd-07-" #sz,
795 #define KMA_RAND_8(sz)  KMA_RAND_7(sz)  .name[KMALLOC_RANDOM_START +  8] = "kmalloc-rnd-08-" #sz,
796 #define KMA_RAND_9(sz)  KMA_RAND_8(sz)  .name[KMALLOC_RANDOM_START +  9] = "kmalloc-rnd-09-" #sz,
797 #define KMA_RAND_10(sz) KMA_RAND_9(sz)  .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
798 #define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
799 #define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
800 #define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
801 #define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
802 #define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
803 #else // CONFIG_RANDOM_KMALLOC_CACHES
804 #define KMALLOC_RANDOM_NAME(N, sz)
805 #endif
806 
807 #define INIT_KMALLOC_INFO(__size, __short_size)			\
808 {								\
809 	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
810 	KMALLOC_RCL_NAME(__short_size)				\
811 	KMALLOC_CGROUP_NAME(__short_size)			\
812 	KMALLOC_DMA_NAME(__short_size)				\
813 	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
814 	.size = __size,						\
815 }
816 
817 /*
818  * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
819  * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
820  * kmalloc-2M.
821  */
822 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
823 	INIT_KMALLOC_INFO(0, 0),
824 	INIT_KMALLOC_INFO(96, 96),
825 	INIT_KMALLOC_INFO(192, 192),
826 	INIT_KMALLOC_INFO(8, 8),
827 	INIT_KMALLOC_INFO(16, 16),
828 	INIT_KMALLOC_INFO(32, 32),
829 	INIT_KMALLOC_INFO(64, 64),
830 	INIT_KMALLOC_INFO(128, 128),
831 	INIT_KMALLOC_INFO(256, 256),
832 	INIT_KMALLOC_INFO(512, 512),
833 	INIT_KMALLOC_INFO(1024, 1k),
834 	INIT_KMALLOC_INFO(2048, 2k),
835 	INIT_KMALLOC_INFO(4096, 4k),
836 	INIT_KMALLOC_INFO(8192, 8k),
837 	INIT_KMALLOC_INFO(16384, 16k),
838 	INIT_KMALLOC_INFO(32768, 32k),
839 	INIT_KMALLOC_INFO(65536, 64k),
840 	INIT_KMALLOC_INFO(131072, 128k),
841 	INIT_KMALLOC_INFO(262144, 256k),
842 	INIT_KMALLOC_INFO(524288, 512k),
843 	INIT_KMALLOC_INFO(1048576, 1M),
844 	INIT_KMALLOC_INFO(2097152, 2M)
845 };
846 
847 /*
848  * Patch up the size_index table if we have strange large alignment
849  * requirements for the kmalloc array. This is only the case for
850  * MIPS it seems. The standard arches will not generate any code here.
851  *
852  * Largest permitted alignment is 256 bytes due to the way we
853  * handle the index determination for the smaller caches.
854  *
855  * Make sure that nothing crazy happens if someone starts tinkering
856  * around with ARCH_KMALLOC_MINALIGN
857  */
858 void __init setup_kmalloc_cache_index_table(void)
859 {
860 	unsigned int i;
861 
862 	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
863 		!is_power_of_2(KMALLOC_MIN_SIZE));
864 
865 	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
866 		unsigned int elem = size_index_elem(i);
867 
868 		if (elem >= ARRAY_SIZE(kmalloc_size_index))
869 			break;
870 		kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
871 	}
872 
873 	if (KMALLOC_MIN_SIZE >= 64) {
874 		/*
875 		 * The 96 byte sized cache is not used if the alignment
876 		 * is 64 bytes.
877 		 */
878 		for (i = 64 + 8; i <= 96; i += 8)
879 			kmalloc_size_index[size_index_elem(i)] = 7;
880 
881 	}
882 
883 	if (KMALLOC_MIN_SIZE >= 128) {
884 		/*
885 		 * The 192 byte sized cache is not used if the alignment
886 		 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
887 		 * instead.
888 		 */
889 		for (i = 128 + 8; i <= 192; i += 8)
890 			kmalloc_size_index[size_index_elem(i)] = 8;
891 	}
892 }
893 
894 static unsigned int __kmalloc_minalign(void)
895 {
896 	unsigned int minalign = dma_get_cache_alignment();
897 
898 	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
899 	    is_swiotlb_allocated())
900 		minalign = ARCH_KMALLOC_MINALIGN;
901 
902 	return max(minalign, arch_slab_minalign());
903 }
904 
905 static void __init
906 new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
907 {
908 	slab_flags_t flags = 0;
909 	unsigned int minalign = __kmalloc_minalign();
910 	unsigned int aligned_size = kmalloc_info[idx].size;
911 	int aligned_idx = idx;
912 
913 	if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
914 		flags |= SLAB_RECLAIM_ACCOUNT;
915 	} else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
916 		if (mem_cgroup_kmem_disabled()) {
917 			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
918 			return;
919 		}
920 		flags |= SLAB_ACCOUNT;
921 	} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
922 		flags |= SLAB_CACHE_DMA;
923 	}
924 
925 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
926 	if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
927 		flags |= SLAB_NO_MERGE;
928 #endif
929 
930 	/*
931 	 * If CONFIG_MEMCG is enabled, disable cache merging for
932 	 * KMALLOC_NORMAL caches.
933 	 */
934 	if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
935 		flags |= SLAB_NO_MERGE;
936 
937 	if (minalign > ARCH_KMALLOC_MINALIGN) {
938 		aligned_size = ALIGN(aligned_size, minalign);
939 		aligned_idx = __kmalloc_index(aligned_size, false);
940 	}
941 
942 	if (!kmalloc_caches[type][aligned_idx])
943 		kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
944 					kmalloc_info[aligned_idx].name[type],
945 					aligned_size, flags);
946 	if (idx != aligned_idx)
947 		kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
948 }
949 
950 /*
951  * Create the kmalloc array. Some of the regular kmalloc arrays
952  * may already have been created because they were needed to
953  * enable allocations for slab creation.
954  */
955 void __init create_kmalloc_caches(void)
956 {
957 	int i;
958 	enum kmalloc_cache_type type;
959 
960 	/*
961 	 * Including KMALLOC_CGROUP if CONFIG_MEMCG is defined
962 	 */
963 	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
964 		/* Caches that are NOT of the two-to-the-power-of size. */
965 		if (KMALLOC_MIN_SIZE <= 32)
966 			new_kmalloc_cache(1, type);
967 		if (KMALLOC_MIN_SIZE <= 64)
968 			new_kmalloc_cache(2, type);
969 
970 		/* Caches that are of the two-to-the-power-of size. */
971 		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
972 			new_kmalloc_cache(i, type);
973 	}
974 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
975 	random_kmalloc_seed = get_random_u64();
976 #endif
977 
978 	/* Kmalloc array is now usable */
979 	slab_state = UP;
980 
981 	if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
982 		kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
983 						       sizeof(kmem_buckets),
984 						       0, SLAB_NO_MERGE, NULL);
985 }
986 
987 /**
988  * __ksize -- Report full size of underlying allocation
989  * @object: pointer to the object
990  *
991  * This should only be used internally to query the true size of allocations.
992  * It is not meant to be a way to discover the usable size of an allocation
993  * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
994  * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
995  * and/or FORTIFY_SOURCE.
996  *
997  * Return: size of the actual memory used by @object in bytes
998  */
999 size_t __ksize(const void *object)
1000 {
1001 	struct folio *folio;
1002 
1003 	if (unlikely(object == ZERO_SIZE_PTR))
1004 		return 0;
1005 
1006 	folio = virt_to_folio(object);
1007 
1008 	if (unlikely(!folio_test_slab(folio))) {
1009 		if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
1010 			return 0;
1011 		if (WARN_ON(object != folio_address(folio)))
1012 			return 0;
1013 		return folio_size(folio);
1014 	}
1015 
1016 #ifdef CONFIG_SLUB_DEBUG
1017 	skip_orig_size_check(folio_slab(folio)->slab_cache, object);
1018 #endif
1019 
1020 	return slab_ksize(folio_slab(folio)->slab_cache);
1021 }
1022 
1023 gfp_t kmalloc_fix_flags(gfp_t flags)
1024 {
1025 	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1026 
1027 	flags &= ~GFP_SLAB_BUG_MASK;
1028 	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1029 			invalid_mask, &invalid_mask, flags, &flags);
1030 	dump_stack();
1031 
1032 	return flags;
1033 }
1034 
1035 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1036 /* Randomize a generic freelist */
1037 static void freelist_randomize(unsigned int *list,
1038 			       unsigned int count)
1039 {
1040 	unsigned int rand;
1041 	unsigned int i;
1042 
1043 	for (i = 0; i < count; i++)
1044 		list[i] = i;
1045 
1046 	/* Fisher-Yates shuffle */
1047 	for (i = count - 1; i > 0; i--) {
1048 		rand = get_random_u32_below(i + 1);
1049 		swap(list[i], list[rand]);
1050 	}
1051 }
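/*
 * Illustrative walk-through: for count == 4 the loop visits i = 3, 2, 1 and
 * swaps list[i] with a slot chosen uniformly from [0, i], which produces an
 * unbiased random permutation of {0, 1, 2, 3} (classic Fisher-Yates).
 */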
1052 
1053 /* Create a random sequence per cache */
1054 int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
1055 				    gfp_t gfp)
1056 {
1057 
1058 	if (count < 2 || cachep->random_seq)
1059 		return 0;
1060 
1061 	cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
1062 	if (!cachep->random_seq)
1063 		return -ENOMEM;
1064 
1065 	freelist_randomize(cachep->random_seq, count);
1066 	return 0;
1067 }
1068 
1069 /* Destroy the per-cache random freelist sequence */
1070 void cache_random_seq_destroy(struct kmem_cache *cachep)
1071 {
1072 	kfree(cachep->random_seq);
1073 	cachep->random_seq = NULL;
1074 }
1075 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
1076 
1077 #ifdef CONFIG_SLUB_DEBUG
1078 #define SLABINFO_RIGHTS (0400)
1079 
1080 static void print_slabinfo_header(struct seq_file *m)
1081 {
1082 	/*
1083 	 * Output format version, so at least we can change it
1084 	 * without _too_ many complaints.
1085 	 */
1086 	seq_puts(m, "slabinfo - version: 2.1\n");
1087 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
1088 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
1089 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
1090 	seq_putc(m, '\n');
1091 }
1092 
1093 static void *slab_start(struct seq_file *m, loff_t *pos)
1094 {
1095 	mutex_lock(&slab_mutex);
1096 	return seq_list_start(&slab_caches, *pos);
1097 }
1098 
1099 static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
1100 {
1101 	return seq_list_next(p, &slab_caches, pos);
1102 }
1103 
1104 static void slab_stop(struct seq_file *m, void *p)
1105 {
1106 	mutex_unlock(&slab_mutex);
1107 }
1108 
1109 static void cache_show(struct kmem_cache *s, struct seq_file *m)
1110 {
1111 	struct slabinfo sinfo;
1112 
1113 	memset(&sinfo, 0, sizeof(sinfo));
1114 	get_slabinfo(s, &sinfo);
1115 
1116 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
1117 		   s->name, sinfo.active_objs, sinfo.num_objs, s->size,
1118 		   sinfo.objects_per_slab, (1 << sinfo.cache_order));
1119 
1120 	seq_printf(m, " : tunables %4u %4u %4u",
1121 		   sinfo.limit, sinfo.batchcount, sinfo.shared);
1122 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
1123 		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
1124 	seq_putc(m, '\n');
1125 }
1126 
1127 static int slab_show(struct seq_file *m, void *p)
1128 {
1129 	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
1130 
1131 	if (p == slab_caches.next)
1132 		print_slabinfo_header(m);
1133 	cache_show(s, m);
1134 	return 0;
1135 }
1136 
1137 void dump_unreclaimable_slab(void)
1138 {
1139 	struct kmem_cache *s;
1140 	struct slabinfo sinfo;
1141 
1142 	/*
1143 	 * Acquiring slab_mutex here is risky since we don't want to
1144 	 * sleep in the OOM path. But without holding the mutex, the
1145 	 * list traversal may race with cache destruction and crash.
1146 	 * Use mutex_trylock() to protect the traversal, and dump
1147 	 * nothing if the mutex cannot be acquired.
1148 	 */
1149 	if (!mutex_trylock(&slab_mutex)) {
1150 		pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1151 		return;
1152 	}
1153 
1154 	pr_info("Unreclaimable slab info:\n");
1155 	pr_info("Name                      Used          Total\n");
1156 
1157 	list_for_each_entry(s, &slab_caches, list) {
1158 		if (s->flags & SLAB_RECLAIM_ACCOUNT)
1159 			continue;
1160 
1161 		get_slabinfo(s, &sinfo);
1162 
1163 		if (sinfo.num_objs > 0)
1164 			pr_info("%-17s %10luKB %10luKB\n", s->name,
1165 				(sinfo.active_objs * s->size) / 1024,
1166 				(sinfo.num_objs * s->size) / 1024);
1167 	}
1168 	mutex_unlock(&slab_mutex);
1169 }
1170 
1171 /*
1172  * slabinfo_op - iterator that generates /proc/slabinfo
1173  *
1174  * Output layout:
1175  * cache-name
1176  * num-active-objs
1177  * total-objs
1178  * object size
1179  * num-active-slabs
1180  * total-slabs
1181  * num-pages-per-slab
1182  * + further values on SMP and with statistics enabled
1183  */
1184 static const struct seq_operations slabinfo_op = {
1185 	.start = slab_start,
1186 	.next = slab_next,
1187 	.stop = slab_stop,
1188 	.show = slab_show,
1189 };
1190 
1191 static int slabinfo_open(struct inode *inode, struct file *file)
1192 {
1193 	return seq_open(file, &slabinfo_op);
1194 }
1195 
1196 static const struct proc_ops slabinfo_proc_ops = {
1197 	.proc_flags	= PROC_ENTRY_PERMANENT,
1198 	.proc_open	= slabinfo_open,
1199 	.proc_read	= seq_read,
1200 	.proc_lseek	= seq_lseek,
1201 	.proc_release	= seq_release,
1202 };
1203 
1204 static int __init slab_proc_init(void)
1205 {
1206 	proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
1207 	return 0;
1208 }
1209 module_init(slab_proc_init);
1210 
1211 #endif /* CONFIG_SLUB_DEBUG */
1212 
1213 /**
1214  * kfree_sensitive - Clear sensitive information in memory before freeing
1215  * @p: object to free memory of
1216  *
1217  * The memory of the object @p points to is zeroed before freed.
1218  * If @p is %NULL, kfree_sensitive() does nothing.
1219  *
1220  * Note: this function zeroes the whole allocated buffer which can be a good
1221  * deal bigger than the requested buffer size passed to kmalloc(). So be
1222  * careful when using this function in performance sensitive code.
1223  */
1224 void kfree_sensitive(const void *p)
1225 {
1226 	size_t ks;
1227 	void *mem = (void *)p;
1228 
1229 	ks = ksize(mem);
1230 	if (ks) {
1231 		kasan_unpoison_range(mem, ks);
1232 		memzero_explicit(mem, ks);
1233 	}
1234 	kfree(mem);
1235 }
1236 EXPORT_SYMBOL(kfree_sensitive);
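/*
 * Usage sketch (hypothetical identifiers): typical callers hold key material
 * or other secrets:
 *
 *	key = kmalloc(key_len, GFP_KERNEL);
 *	...
 *	kfree_sensitive(key);	// zeroes the whole ksize() region, then frees
 *
 * Note the full bucket is cleared, not just the key_len bytes requested.
 */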
1237 
1238 size_t ksize(const void *objp)
1239 {
1240 	/*
1241 	 * We need to first check that the pointer to the object is valid.
1242 	 * The KASAN report printed from ksize() is more useful than when
1243 	 * it's printed later, when the behaviour could be undefined due to
1244 	 * a potential use-after-free or double-free.
1245 	 *
1246 	 * We use kasan_check_byte(), which is supported for the hardware
1247 	 * tag-based KASAN mode, unlike kasan_check_read/write().
1248 	 *
1249 	 * If the pointed to memory is invalid, we return 0 to avoid users of
1250 	 * ksize() writing to and potentially corrupting the memory region.
1251 	 *
1252 	 * We want to perform the check before __ksize(), to avoid potentially
1253 	 * crashing in __ksize() due to accessing invalid metadata.
1254 	 */
1255 	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
1256 		return 0;
1257 
1258 	return kfence_ksize(objp) ?: __ksize(objp);
1259 }
1260 EXPORT_SYMBOL(ksize);
1261 
1262 #ifdef CONFIG_BPF_SYSCALL
1263 #include <linux/btf.h>
1264 
1265 __bpf_kfunc_start_defs();
1266 
1267 __bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
1268 {
1269 	struct slab *slab;
1270 
1271 	if (!virt_addr_valid((void *)(long)addr))
1272 		return NULL;
1273 
1274 	slab = virt_to_slab((void *)(long)addr);
1275 	return slab ? slab->slab_cache : NULL;
1276 }
1277 
1278 __bpf_kfunc_end_defs();
1279 #endif /* CONFIG_BPF_SYSCALL */
1280 
1281 /* Tracepoints definitions. */
1282 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1283 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1284 EXPORT_TRACEPOINT_SYMBOL(kfree);
1285 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1286 
1287 /*
1288  * This rcu parameter is runtime-read-only. It reflects
1289  * a minimum allowed number of objects which can be cached
1290  * per-CPU. Object size is equal to one page. This value
1291  * can be changed at boot time.
1292  */
1293 static int rcu_min_cached_objs = 5;
1294 module_param(rcu_min_cached_objs, int, 0444);
1295 
1296 // A page shrinker can ask for pages to be freed to make them
1297 // available for other parts of the system. This usually happens
1298 // under low memory conditions, and in that case we should also
1299 // defer page-cache filling for a short time period.
1300 //
1301 // The default value is 5 seconds, which is long enough to reduce
1302 // interference with the shrinker while it asks other systems to
1303 // drain their caches.
1304 static int rcu_delay_page_cache_fill_msec = 5000;
1305 module_param(rcu_delay_page_cache_fill_msec, int, 0444);
1306 
1307 static struct workqueue_struct *rcu_reclaim_wq;
1308 
1309 /* Maximum number of jiffies to wait before draining a batch. */
1310 #define KFREE_DRAIN_JIFFIES (5 * HZ)
1311 #define KFREE_N_BATCHES 2
1312 #define FREE_N_CHANNELS 2
1313 
1314 /**
1315  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
1316  * @list: List node. All blocks are linked between each other
1317  * @gp_snap: Snapshot of RCU state for objects placed to this bulk
1318  * @nr_records: Number of active pointers in the array
1319  * @records: Array of the kvfree_rcu() pointers
1320  */
1321 struct kvfree_rcu_bulk_data {
1322 	struct list_head list;
1323 	struct rcu_gp_oldstate gp_snap;
1324 	unsigned long nr_records;
1325 	void *records[] __counted_by(nr_records);
1326 };
1327 
1328 /*
1329  * This macro defines how many entries the "records" array
1330  * will contain. It is sized so that a kvfree_rcu_bulk_data
1331  * structure, including its records, fills exactly one page.
1332  */
1333 #define KVFREE_BULK_MAX_ENTR \
1334 	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
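/*
 * Rough arithmetic (illustrative, assuming a 64-bit build with 4 KiB pages):
 * the header above is 16 (list) + 16 (gp_snap) + 8 (nr_records) = 40 bytes,
 * so one page holds (4096 - 40) / 8 = 507 pointers per bulk block.
 */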
1335 
1336 /**
1337  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
1338  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
1339  * @head_free: List of kfree_rcu() objects waiting for a grace period
1340  * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
1341  * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
1342  * @krcp: Pointer to @kfree_rcu_cpu structure
1343  */
1344 
1345 struct kfree_rcu_cpu_work {
1346 	struct rcu_work rcu_work;
1347 	struct rcu_head *head_free;
1348 	struct rcu_gp_oldstate head_free_gp_snap;
1349 	struct list_head bulk_head_free[FREE_N_CHANNELS];
1350 	struct kfree_rcu_cpu *krcp;
1351 };
1352 
1353 /**
1354  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
1355  * @head: List of kfree_rcu() objects not yet waiting for a grace period
1356  * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
1357  * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
1358  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
1359  * @lock: Synchronize access to this structure
1360  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
1361  * @initialized: The @rcu_work fields have been initialized
1362  * @head_count: Number of objects in rcu_head singular list
1363  * @bulk_count: Number of objects in bulk-list
1364  * @bkvcache:
1365  *	A simple cache list that contains objects for reuse purpose.
1366  *	In order to save some per-cpu space the list is singular.
1367  *	Even though it is lockless an access has to be protected by the
1368  *	per-cpu lock.
1369  * @page_cache_work: A work to refill the cache when it is empty
1370  * @backoff_page_cache_fill: Delay cache refills
1371  * @work_in_progress: Indicates that page_cache_work is running
1372  * @hrtimer: A hrtimer for scheduling a page_cache_work
1373  * @nr_bkv_objs: number of allocated objects at @bkvcache.
1374  *
1375  * This is a per-CPU structure.  The reason that it is not included in
1376  * the rcu_data structure is to permit this code to be extracted from
1377  * the RCU files.  Such extraction could allow further optimization of
1378  * the interactions with the slab allocators.
1379  */
1380 struct kfree_rcu_cpu {
1381 	// Objects queued on a linked list
1382 	// through their rcu_head structures.
1383 	struct rcu_head *head;
1384 	unsigned long head_gp_snap;
1385 	atomic_t head_count;
1386 
1387 	// Objects queued on a bulk-list.
1388 	struct list_head bulk_head[FREE_N_CHANNELS];
1389 	atomic_t bulk_count[FREE_N_CHANNELS];
1390 
1391 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
1392 	raw_spinlock_t lock;
1393 	struct delayed_work monitor_work;
1394 	bool initialized;
1395 
1396 	struct delayed_work page_cache_work;
1397 	atomic_t backoff_page_cache_fill;
1398 	atomic_t work_in_progress;
1399 	struct hrtimer hrtimer;
1400 
1401 	struct llist_head bkvcache;
1402 	int nr_bkv_objs;
1403 };
1404 
1405 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
1406 	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
1407 };
1408 
1409 static __always_inline void
1410 debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
1411 {
1412 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1413 	int i;
1414 
1415 	for (i = 0; i < bhead->nr_records; i++)
1416 		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
1417 #endif
1418 }
1419 
1420 static inline struct kfree_rcu_cpu *
1421 krc_this_cpu_lock(unsigned long *flags)
1422 {
1423 	struct kfree_rcu_cpu *krcp;
1424 
1425 	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
1426 	krcp = this_cpu_ptr(&krc);
1427 	raw_spin_lock(&krcp->lock);
1428 
1429 	return krcp;
1430 }
1431 
1432 static inline void
1433 krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
1434 {
1435 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1436 }
1437 
1438 static inline struct kvfree_rcu_bulk_data *
1439 get_cached_bnode(struct kfree_rcu_cpu *krcp)
1440 {
1441 	if (!krcp->nr_bkv_objs)
1442 		return NULL;
1443 
1444 	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
1445 	return (struct kvfree_rcu_bulk_data *)
1446 		llist_del_first(&krcp->bkvcache);
1447 }
1448 
1449 static inline bool
1450 put_cached_bnode(struct kfree_rcu_cpu *krcp,
1451 	struct kvfree_rcu_bulk_data *bnode)
1452 {
1453 	// Check the limit.
1454 	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
1455 		return false;
1456 
1457 	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
1458 	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
1459 	return true;
1460 }
1461 
1462 static int
1463 drain_page_cache(struct kfree_rcu_cpu *krcp)
1464 {
1465 	unsigned long flags;
1466 	struct llist_node *page_list, *pos, *n;
1467 	int freed = 0;
1468 
1469 	if (!rcu_min_cached_objs)
1470 		return 0;
1471 
1472 	raw_spin_lock_irqsave(&krcp->lock, flags);
1473 	page_list = llist_del_all(&krcp->bkvcache);
1474 	WRITE_ONCE(krcp->nr_bkv_objs, 0);
1475 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1476 
1477 	llist_for_each_safe(pos, n, page_list) {
1478 		free_page((unsigned long)pos);
1479 		freed++;
1480 	}
1481 
1482 	return freed;
1483 }
1484 
1485 static void
1486 kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
1487 	struct kvfree_rcu_bulk_data *bnode, int idx)
1488 {
1489 	unsigned long flags;
1490 	int i;
1491 
1492 	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
1493 		debug_rcu_bhead_unqueue(bnode);
1494 		rcu_lock_acquire(&rcu_callback_map);
1495 		if (idx == 0) { // kmalloc() / kfree().
1496 			trace_rcu_invoke_kfree_bulk_callback(
1497 				"slab", bnode->nr_records,
1498 				bnode->records);
1499 
1500 			kfree_bulk(bnode->nr_records, bnode->records);
1501 		} else { // vmalloc() / vfree().
1502 			for (i = 0; i < bnode->nr_records; i++) {
1503 				trace_rcu_invoke_kvfree_callback(
1504 					"slab", bnode->records[i], 0);
1505 
1506 				vfree(bnode->records[i]);
1507 			}
1508 		}
1509 		rcu_lock_release(&rcu_callback_map);
1510 	}
1511 
1512 	raw_spin_lock_irqsave(&krcp->lock, flags);
1513 	if (put_cached_bnode(krcp, bnode))
1514 		bnode = NULL;
1515 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1516 
1517 	if (bnode)
1518 		free_page((unsigned long) bnode);
1519 
1520 	cond_resched_tasks_rcu_qs();
1521 }
1522 
1523 static void
1524 kvfree_rcu_list(struct rcu_head *head)
1525 {
1526 	struct rcu_head *next;
1527 
1528 	for (; head; head = next) {
1529 		void *ptr = (void *) head->func;
1530 		unsigned long offset = (void *) head - ptr;
1531 
1532 		next = head->next;
1533 		debug_rcu_head_unqueue((struct rcu_head *)ptr);
1534 		rcu_lock_acquire(&rcu_callback_map);
1535 		trace_rcu_invoke_kvfree_callback("slab", head, offset);
1536 
1537 		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
1538 			kvfree(ptr);
1539 
1540 		rcu_lock_release(&rcu_callback_map);
1541 		cond_resched_tasks_rcu_qs();
1542 	}
1543 }
1544 
1545 /*
1546  * This function is invoked in workqueue context after a grace period.
1547  * It frees all the objects queued on ->bulk_head_free or ->head_free.
1548  */
1549 static void kfree_rcu_work(struct work_struct *work)
1550 {
1551 	unsigned long flags;
1552 	struct kvfree_rcu_bulk_data *bnode, *n;
1553 	struct list_head bulk_head[FREE_N_CHANNELS];
1554 	struct rcu_head *head;
1555 	struct kfree_rcu_cpu *krcp;
1556 	struct kfree_rcu_cpu_work *krwp;
1557 	struct rcu_gp_oldstate head_gp_snap;
1558 	int i;
1559 
1560 	krwp = container_of(to_rcu_work(work),
1561 		struct kfree_rcu_cpu_work, rcu_work);
1562 	krcp = krwp->krcp;
1563 
1564 	raw_spin_lock_irqsave(&krcp->lock, flags);
1565 	// Channels 1 and 2.
1566 	for (i = 0; i < FREE_N_CHANNELS; i++)
1567 		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
1568 
1569 	// Channel 3.
1570 	head = krwp->head_free;
1571 	krwp->head_free = NULL;
1572 	head_gp_snap = krwp->head_free_gp_snap;
1573 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1574 
1575 	// Handle the first two channels.
1576 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1577 		// Start from the tail page, so a GP is likely passed for it.
1578 		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
1579 			kvfree_rcu_bulk(krcp, bnode, i);
1580 	}
1581 
1582 	/*
1583 	 * This is used when the "bulk" path can not be used for the
1584 	 * double-argument of kvfree_rcu().  This happens when the
1585 	 * page-cache is empty, which means that objects are instead
1586 	 * queued on a linked list through their rcu_head structures.
1587 	 * This list is named "Channel 3".
1588 	 */
1589 	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
1590 		kvfree_rcu_list(head);
1591 }
1592 
1593 static bool
1594 need_offload_krc(struct kfree_rcu_cpu *krcp)
1595 {
1596 	int i;
1597 
1598 	for (i = 0; i < FREE_N_CHANNELS; i++)
1599 		if (!list_empty(&krcp->bulk_head[i]))
1600 			return true;
1601 
1602 	return !!READ_ONCE(krcp->head);
1603 }
1604 
1605 static bool
1606 need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
1607 {
1608 	int i;
1609 
1610 	for (i = 0; i < FREE_N_CHANNELS; i++)
1611 		if (!list_empty(&krwp->bulk_head_free[i]))
1612 			return true;
1613 
1614 	return !!krwp->head_free;
1615 }
1616 
1617 static int krc_count(struct kfree_rcu_cpu *krcp)
1618 {
1619 	int sum = atomic_read(&krcp->head_count);
1620 	int i;
1621 
1622 	for (i = 0; i < FREE_N_CHANNELS; i++)
1623 		sum += atomic_read(&krcp->bulk_count[i]);
1624 
1625 	return sum;
1626 }
1627 
1628 static void
1629 __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1630 {
1631 	long delay, delay_left;
1632 
1633 	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
1634 	if (delayed_work_pending(&krcp->monitor_work)) {
1635 		delay_left = krcp->monitor_work.timer.expires - jiffies;
1636 		if (delay < delay_left)
1637 			mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1638 		return;
1639 	}
1640 	queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1641 }
1642 
1643 static void
1644 schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1645 {
1646 	unsigned long flags;
1647 
1648 	raw_spin_lock_irqsave(&krcp->lock, flags);
1649 	__schedule_delayed_monitor_work(krcp);
1650 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1651 }
1652 
1653 static void
1654 kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
1655 {
1656 	struct list_head bulk_ready[FREE_N_CHANNELS];
1657 	struct kvfree_rcu_bulk_data *bnode, *n;
1658 	struct rcu_head *head_ready = NULL;
1659 	unsigned long flags;
1660 	int i;
1661 
1662 	raw_spin_lock_irqsave(&krcp->lock, flags);
1663 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1664 		INIT_LIST_HEAD(&bulk_ready[i]);
1665 
1666 		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
1667 			if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
1668 				break;
1669 
1670 			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
1671 			list_move(&bnode->list, &bulk_ready[i]);
1672 		}
1673 	}
1674 
1675 	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
1676 		head_ready = krcp->head;
1677 		atomic_set(&krcp->head_count, 0);
1678 		WRITE_ONCE(krcp->head, NULL);
1679 	}
1680 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1681 
1682 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1683 		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
1684 			kvfree_rcu_bulk(krcp, bnode, i);
1685 	}
1686 
1687 	if (head_ready)
1688 		kvfree_rcu_list(head_ready);
1689 }
1690 
1691 /*
1692  * Return: %true if a work is queued, %false otherwise.
1693  */
1694 static bool
1695 kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
1696 {
1697 	unsigned long flags;
1698 	bool queued = false;
1699 	int i, j;
1700 
1701 	raw_spin_lock_irqsave(&krcp->lock, flags);
1702 
1703 	// Attempt to start a new batch.
1704 	for (i = 0; i < KFREE_N_BATCHES; i++) {
1705 		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
1706 
1707 		// Try to detach bulk_head or head and attach it, but only
1708 		// when all channels are free. A busy channel means this krwp
1709 		// still has in-flight RCU work freeing a previous batch.
1710 		if (need_wait_for_krwp_work(krwp))
1711 			continue;
1712 
1713 		// kvfree_rcu_drain_ready() might handle this krcp; if so, give up.
1714 		if (need_offload_krc(krcp)) {
1715 			// Channel 1 corresponds to the SLAB-pointer bulk path.
1716 			// Channel 2 corresponds to the vmalloc-pointer bulk path.
1717 			for (j = 0; j < FREE_N_CHANNELS; j++) {
1718 				if (list_empty(&krwp->bulk_head_free[j])) {
1719 					atomic_set(&krcp->bulk_count[j], 0);
1720 					list_replace_init(&krcp->bulk_head[j],
1721 						&krwp->bulk_head_free[j]);
1722 				}
1723 			}
1724 
1725 			// Channel 3 corresponds to both SLAB and vmalloc
1726 			// objects queued on the linked list.
1727 			if (!krwp->head_free) {
1728 				krwp->head_free = krcp->head;
1729 				get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
1730 				atomic_set(&krcp->head_count, 0);
1731 				WRITE_ONCE(krcp->head, NULL);
1732 			}
1733 
1734 			// There is one work item per batch, and each batch
1735 			// handles all three "free channels". Break out of the
1736 			// loop since this CPU is done; queuing the RCU work
1737 			// always succeeds here.
1738 			queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
1739 			WARN_ON_ONCE(!queued);
1740 			break;
1741 		}
1742 	}
1743 
1744 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1745 	return queued;
1746 }
1747 
1748 /*
1749  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
1750  */
1751 static void kfree_rcu_monitor(struct work_struct *work)
1752 {
1753 	struct kfree_rcu_cpu *krcp = container_of(work,
1754 		struct kfree_rcu_cpu, monitor_work.work);
1755 
1756 	// Drain objects that are ready for reclaim.
1757 	kvfree_rcu_drain_ready(krcp);
1758 
1759 	// Queue a batch for the rest of the objects.
1760 	kvfree_rcu_queue_batch(krcp);
1761 
1762 	// If there is nothing to detach, it means that our job is
1763 	// successfully done here. If at least one of the channels
1764 	// is still busy, we should rearm the work to repeat the
1765 	// attempt, because previous batches are still in
1766 	// progress.
1767 	if (need_offload_krc(krcp))
1768 		schedule_delayed_monitor_work(krcp);
1769 }
1770 
1771 static void fill_page_cache_func(struct work_struct *work)
1772 {
1773 	struct kvfree_rcu_bulk_data *bnode;
1774 	struct kfree_rcu_cpu *krcp =
1775 		container_of(work, struct kfree_rcu_cpu,
1776 			page_cache_work.work);
1777 	unsigned long flags;
1778 	int nr_pages;
1779 	bool pushed;
1780 	int i;
1781 
1782 	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1783 		1 : rcu_min_cached_objs;
1784 
1785 	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
1786 		bnode = (struct kvfree_rcu_bulk_data *)
1787 			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1788 
1789 		if (!bnode)
1790 			break;
1791 
1792 		raw_spin_lock_irqsave(&krcp->lock, flags);
1793 		pushed = put_cached_bnode(krcp, bnode);
1794 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
1795 
1796 		if (!pushed) {
1797 			free_page((unsigned long) bnode);
1798 			break;
1799 		}
1800 	}
1801 
1802 	atomic_set(&krcp->work_in_progress, 0);
1803 	atomic_set(&krcp->backoff_page_cache_fill, 0);
1804 }
1805 
1806 // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
1807 // state specified by flags.  If can_alloc is true, the caller must
1808 // be schedulable and not be holding any locks or mutexes that might be
1809 // acquired by the memory allocator or anything that it might invoke.
1810 // Returns true if ptr was successfully recorded, else the caller must
1811 // use a fallback.
1812 static inline bool
1813 add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
1814 	unsigned long *flags, void *ptr, bool can_alloc)
1815 {
1816 	struct kvfree_rcu_bulk_data *bnode;
1817 	int idx;
1818 
1819 	*krcp = krc_this_cpu_lock(flags);
1820 	if (unlikely(!(*krcp)->initialized))
1821 		return false;
1822 
1823 	idx = !!is_vmalloc_addr(ptr);
1824 	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
1825 		struct kvfree_rcu_bulk_data, list);
1826 
1827 	/* Check if a new block is required. */
1828 	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
1829 		bnode = get_cached_bnode(*krcp);
1830 		if (!bnode && can_alloc) {
1831 			krc_this_cpu_unlock(*krcp, *flags);
1832 
1833 			// __GFP_NORETRY - allows only light-weight direct reclaim,
1834 			// which keeps the cost of hitting the fallback path low.
1835 			// It also forbids invoking the OOM killer, which is
1836 			// beneficial since we are about to release memory soon.
1837 			//
1838 			// __GFP_NOMEMALLOC - prevents consuming all of the
1839 			// memory reserves; note that we have a fallback path.
1840 			//
1841 			// __GFP_NOWARN - an allocation is expected to fail
1842 			// under low-memory or high memory-pressure scenarios,
1843 			// so do not warn about it.
1844 			bnode = (struct kvfree_rcu_bulk_data *)
1845 				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1846 			raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
1847 		}
1848 
1849 		if (!bnode)
1850 			return false;
1851 
1852 		// Initialize the new block and attach it.
1853 		bnode->nr_records = 0;
1854 		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
1855 	}
1856 
1857 	// Finally insert and update the GP for this page.
1858 	bnode->nr_records++;
1859 	bnode->records[bnode->nr_records - 1] = ptr;
1860 	get_state_synchronize_rcu_full(&bnode->gp_snap);
1861 	atomic_inc(&(*krcp)->bulk_count[idx]);
1862 
1863 	return true;
1864 }
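
/*
 * For reference, a minimal standalone sketch of the opportunistic
 * allocation pattern used above: grab a page cheaply, never trigger the
 * OOM killer or dip into reserves, and let the caller fall back to a
 * slower path on failure. This is an illustrative sketch, not part of
 * this file; the helper name try_get_scratch_page() is hypothetical.
 */
#include <linux/gfp.h>

static unsigned long try_get_scratch_page(void)
{
	/*
	 * GFP_KERNEL       - normal sleeping allocation context.
	 * __GFP_NORETRY    - only light-weight reclaim, no OOM killer.
	 * __GFP_NOMEMALLOC - do not consume emergency memory reserves.
	 * __GFP_NOWARN     - failure is expected and handled by a fallback.
	 */
	return __get_free_page(GFP_KERNEL | __GFP_NORETRY |
			       __GFP_NOMEMALLOC | __GFP_NOWARN);
}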
1865 
1866 #if !defined(CONFIG_TINY_RCU)
1867 
1868 static enum hrtimer_restart
1869 schedule_page_work_fn(struct hrtimer *t)
1870 {
1871 	struct kfree_rcu_cpu *krcp =
1872 		container_of(t, struct kfree_rcu_cpu, hrtimer);
1873 
1874 	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
1875 	return HRTIMER_NORESTART;
1876 }
1877 
1878 static void
1879 run_page_cache_worker(struct kfree_rcu_cpu *krcp)
1880 {
1881 	// If cache disabled, bail out.
1882 	if (!rcu_min_cached_objs)
1883 		return;
1884 
1885 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
1886 			!atomic_xchg(&krcp->work_in_progress, 1)) {
1887 		if (atomic_read(&krcp->backoff_page_cache_fill)) {
1888 			queue_delayed_work(rcu_reclaim_wq,
1889 				&krcp->page_cache_work,
1890 					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
1891 		} else {
1892 			hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1893 			krcp->hrtimer.function = schedule_page_work_fn;
1894 			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
1895 		}
1896 	}
1897 }
1898 
1899 void __init kfree_rcu_scheduler_running(void)
1900 {
1901 	int cpu;
1902 
1903 	for_each_possible_cpu(cpu) {
1904 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
1905 
1906 		if (need_offload_krc(krcp))
1907 			schedule_delayed_monitor_work(krcp);
1908 	}
1909 }
1910 
1911 /*
1912  * Queue a request for lazy invocation of the appropriate free routine
1913  * after a grace period.  Please note that three paths are maintained,
1914  * two for the common case using arrays of pointers and a third one that
1915  * is used only when the main paths cannot be used, for example, due to
1916  * memory pressure.
1917  *
1918  * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
1919  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
1920  * be freed in workqueue context. This allows us to batch requests together to
1921  * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
1922  */
1923 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1924 {
1925 	unsigned long flags;
1926 	struct kfree_rcu_cpu *krcp;
1927 	bool success;
1928 
1929 	/*
1930 	 * Please note there is a limitation for the head-less
1931 	 * variant, which is why there is a clear rule for such
1932 	 * objects: they can be used from a might_sleep() context
1933 	 * only. In other places, please embed an rcu_head into
1934 	 * your data.
1935 	 */
1936 	if (!head)
1937 		might_sleep();
1938 
1939 	// Queue the object but don't yet schedule the batch.
1940 	if (debug_rcu_head_queue(ptr)) {
1941 		// Probable double kfree_rcu(), just leak.
1942 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
1943 			  __func__, head);
1944 
1945 		// Mark as success and leave.
1946 		return;
1947 	}
1948 
1949 	kasan_record_aux_stack(ptr);
1950 	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
1951 	if (!success) {
1952 		run_page_cache_worker(krcp);
1953 
1954 		if (head == NULL)
1955 			// Inline freeing for a single-argument kvfree_rcu() call.
1956 			goto unlock_return;
1957 
1958 		head->func = ptr;
1959 		head->next = krcp->head;
1960 		WRITE_ONCE(krcp->head, head);
1961 		atomic_inc(&krcp->head_count);
1962 
1963 		// Take a snapshot for this krcp.
1964 		krcp->head_gp_snap = get_state_synchronize_rcu();
1965 		success = true;
1966 	}
1967 
1968 	/*
1969 	 * The kvfree_rcu() caller considers the pointer freed at this point
1970 	 * and likely removes any references to it. Since the actual slab
1971 	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
1972 	 * this object (no scanning or false positives reporting).
1973 	 */
1974 	kmemleak_ignore(ptr);
1975 
1976 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
1977 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
1978 		__schedule_delayed_monitor_work(krcp);
1979 
1980 unlock_return:
1981 	krc_this_cpu_unlock(krcp, flags);
1982 
1983 	/*
1984 	 * Inline kvfree() after synchronize_rcu(). We can only do
1985 	 * this from a might_sleep() context, so the current CPU
1986 	 * can pass through a quiescent state.
1987 	 */
1988 	if (!success) {
1989 		debug_rcu_head_unqueue((struct rcu_head *) ptr);
1990 		synchronize_rcu();
1991 		kvfree(ptr);
1992 	}
1993 }
1994 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
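
/*
 * Caller-side sketch of the paths described above, for illustration only
 * (not part of this file; struct foo and example_free_paths() are
 * hypothetical). The two-argument kvfree_rcu() requires an rcu_head
 * embedded in the object and uses the bulk channels, falling back to the
 * rcu_head list under memory pressure. The head-less
 * kvfree_rcu_mightsleep() may fall back to synchronize_rcu() + kvfree()
 * and therefore must be called from a context that is allowed to sleep.
 */
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void example_free_paths(struct foo *fp, void *buf)
{
	/* Two-argument form: "rcu" names the embedded rcu_head field. */
	kvfree_rcu(fp, rcu);

	/* Single-argument (head-less) form: may sleep on the slow path. */
	kvfree_rcu_mightsleep(buf);
}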
1995 
1996 /**
1997  * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
1998  *
1999  * Note that the single-argument form of kvfree_rcu() has a slow path that
2000  * triggers synchronize_rcu() followed by freeing the pointer, and this is
2001  * done before the function returns. Therefore, for any single-argument
2002  * call that will result in a kfree() to a cache that is to be destroyed
2003  * during module exit, it is the developer's responsibility to ensure that
2004  * all such calls have returned before the call to kmem_cache_destroy().
2005  */
2006 void kvfree_rcu_barrier(void)
2007 {
2008 	struct kfree_rcu_cpu_work *krwp;
2009 	struct kfree_rcu_cpu *krcp;
2010 	bool queued;
2011 	int i, cpu;
2012 
2013 	/*
2014 	 * First, we detach objects and queue them over an RCU batch
2015 	 * for all CPUs. Then the queued works are flushed for each CPU.
2016 	 *
2017 	 * Please note: if there are outstanding batches for a particular
2018 	 * CPU, those have to be finished first before a new one is queued.
2019 	 */
2020 	for_each_possible_cpu(cpu) {
2021 		krcp = per_cpu_ptr(&krc, cpu);
2022 
2023 		/*
2024 		 * Check if this CPU has any objects which have been queued for a
2025 		 * new GP completion. If not (nothing to detach), we are done with
2026 		 * it. If any batch is pending/running for this "krcp", the per-CPU
2027 		 * flush_rcu_work() below waits for its completion (see last step).
2028 		 */
2029 		if (!need_offload_krc(krcp))
2030 			continue;
2031 
2032 		while (1) {
2033 			/*
2034 			 * If we are not able to queue a new RCU work it means:
2035 			 * - batches for this CPU are still in flight and should
2036 			 *   be flushed first, after which we repeat;
2037 			 * - there are no objects to detach, due to concurrency.
2038 			 */
2039 			queued = kvfree_rcu_queue_batch(krcp);
2040 
2041 			/*
2042 			 * Bail out if there is no need to offload this "krcp"
2043 			 * anymore. As noted earlier, it can run concurrently.
2044 			 */
2045 			if (queued || !need_offload_krc(krcp))
2046 				break;
2047 
2048 			/* There are ongoing batches. */
2049 			for (i = 0; i < KFREE_N_BATCHES; i++) {
2050 				krwp = &(krcp->krw_arr[i]);
2051 				flush_rcu_work(&krwp->rcu_work);
2052 			}
2053 		}
2054 	}
2055 
2056 	/*
2057 	 * Now we guarantee that all objects are flushed.
2058 	 */
2059 	for_each_possible_cpu(cpu) {
2060 		krcp = per_cpu_ptr(&krc, cpu);
2061 
2062 		/*
2063 		 * A monitor work can drain ready-to-reclaim objects
2064 		 * directly. Wait for its completion if running or pending.
2065 		 */
2066 		cancel_delayed_work_sync(&krcp->monitor_work);
2067 
2068 		for (i = 0; i < KFREE_N_BATCHES; i++) {
2069 			krwp = &(krcp->krw_arr[i]);
2070 			flush_rcu_work(&krwp->rcu_work);
2071 		}
2072 	}
2073 }
2074 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
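
/*
 * Illustrative module-exit sketch for the requirement described above
 * (not part of this file; my_cache and example_module_exit() are
 * hypothetical). Waiting on kvfree_rcu_barrier() ensures that no queued
 * kvfree_rcu() request still references objects from the cache when it
 * is destroyed; single-argument calls must additionally have returned,
 * since their slow path frees inline after synchronize_rcu().
 */
#include <linux/slab.h>

static struct kmem_cache *my_cache;

static void example_module_exit(void)
{
	kvfree_rcu_barrier();
	kmem_cache_destroy(my_cache);
}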
2075 
2076 #endif /* #if !defined(CONFIG_TINY_RCU) */
2077 
2078 static unsigned long
2079 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2080 {
2081 	int cpu;
2082 	unsigned long count = 0;
2083 
2084 	/* Snapshot count of all CPUs */
2085 	for_each_possible_cpu(cpu) {
2086 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2087 
2088 		count += krc_count(krcp);
2089 		count += READ_ONCE(krcp->nr_bkv_objs);
2090 		atomic_set(&krcp->backoff_page_cache_fill, 1);
2091 	}
2092 
2093 	return count == 0 ? SHRINK_EMPTY : count;
2094 }
2095 
2096 static unsigned long
2097 kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2098 {
2099 	int cpu, freed = 0;
2100 
2101 	for_each_possible_cpu(cpu) {
2102 		int count;
2103 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2104 
2105 		count = krc_count(krcp);
2106 		count += drain_page_cache(krcp);
2107 		kfree_rcu_monitor(&krcp->monitor_work.work);
2108 
2109 		sc->nr_to_scan -= count;
2110 		freed += count;
2111 
2112 		if (sc->nr_to_scan <= 0)
2113 			break;
2114 	}
2115 
2116 	return freed == 0 ? SHRINK_STOP : freed;
2117 }
2118 
2119 void __init kvfree_rcu_init(void)
2120 {
2121 	int cpu;
2122 	int i, j;
2123 	struct shrinker *kfree_rcu_shrinker;
2124 
2125 	rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
2126 			WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
2127 	WARN_ON(!rcu_reclaim_wq);
2128 
2129 	/* Clamp it to [0:100] seconds interval. */
2130 	if (rcu_delay_page_cache_fill_msec < 0 ||
2131 		rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
2132 
2133 		rcu_delay_page_cache_fill_msec =
2134 			clamp(rcu_delay_page_cache_fill_msec, 0,
2135 				(int) (100 * MSEC_PER_SEC));
2136 
2137 		pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
2138 			rcu_delay_page_cache_fill_msec);
2139 	}
2140 
2141 	for_each_possible_cpu(cpu) {
2142 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2143 
2144 		for (i = 0; i < KFREE_N_BATCHES; i++) {
2145 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
2146 			krcp->krw_arr[i].krcp = krcp;
2147 
2148 			for (j = 0; j < FREE_N_CHANNELS; j++)
2149 				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
2150 		}
2151 
2152 		for (i = 0; i < FREE_N_CHANNELS; i++)
2153 			INIT_LIST_HEAD(&krcp->bulk_head[i]);
2154 
2155 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
2156 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
2157 		krcp->initialized = true;
2158 	}
2159 
2160 	kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
2161 	if (!kfree_rcu_shrinker) {
2162 		pr_err("Failed to allocate kfree_rcu() shrinker!\n");
2163 		return;
2164 	}
2165 
2166 	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
2167 	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
2168 
2169 	shrinker_register(kfree_rcu_shrinker);
2170 }
2171