xref: /linux/mm/slab_common.c (revision c4fb7f0a79771dfd18838bfc5015650a9730e9c0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Slab allocator functions that are independent of the allocator strategy
4  *
5  * (C) 2012 Christoph Lameter <cl@gentwo.org>
6  */
7 #include <linux/slab.h>
8 
9 #include <linux/mm.h>
10 #include <linux/poison.h>
11 #include <linux/interrupt.h>
12 #include <linux/memory.h>
13 #include <linux/cache.h>
14 #include <linux/compiler.h>
15 #include <linux/kfence.h>
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/seq_file.h>
20 #include <linux/dma-mapping.h>
21 #include <linux/swiotlb.h>
22 #include <linux/proc_fs.h>
23 #include <linux/debugfs.h>
24 #include <linux/kmemleak.h>
25 #include <linux/kasan.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/page.h>
29 #include <linux/memcontrol.h>
30 #include <linux/stackdepot.h>
31 #include <trace/events/rcu.h>
32 
33 #include "../kernel/rcu/rcu.h"
34 #include "internal.h"
35 #include "slab.h"
36 
37 #define CREATE_TRACE_POINTS
38 #include <trace/events/kmem.h>
39 
40 enum slab_state slab_state;
41 LIST_HEAD(slab_caches);
42 DEFINE_MUTEX(slab_mutex);
43 struct kmem_cache *kmem_cache;
44 
45 /*
46  * Set of flags that will prevent slab merging
47  */
48 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
49 		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
50 		SLAB_FAILSLAB | SLAB_NO_MERGE)
51 
52 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
53 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
54 
55 /*
56  * Merge control. If this is set then no merging of slab caches will occur.
57  */
58 static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
59 
/* "slab_nomerge"/"slub_nomerge" boot parameter: disable all cache merging. */
static int __init setup_slab_nomerge(char *str)
{
	slab_nomerge = true;
	return 1;	/* parameter consumed */
}
65 
/* "slab_merge"/"slub_merge" boot parameter: (re-)enable cache merging. */
static int __init setup_slab_merge(char *str)
{
	slab_nomerge = false;
	return 1;	/* parameter consumed */
}
71 
72 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
73 __setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
74 
75 __setup("slab_nomerge", setup_slab_nomerge);
76 __setup("slab_merge", setup_slab_merge);
77 
/*
 * Determine the size of a slab object
 *
 * Returns the object size originally requested for cache @s, i.e. the
 * caller-visible payload size, not including any allocator metadata.
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);
86 
87 #ifdef CONFIG_DEBUG_VM
88 
89 static bool kmem_cache_is_duplicate_name(const char *name)
90 {
91 	struct kmem_cache *s;
92 
93 	list_for_each_entry(s, &slab_caches, list) {
94 		if (!strcmp(s->name, name))
95 			return true;
96 	}
97 
98 	return false;
99 }
100 
/*
 * Basic validation of cache-creation arguments (CONFIG_DEBUG_VM only).
 * Returns 0 when @name/@size look sane, -EINVAL otherwise.
 */
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	/* No anonymous caches, no creation from interrupt context. */
	if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
		return -EINVAL;
	}

	/* Duplicate names will confuse slabtop, et al */
	WARN(kmem_cache_is_duplicate_name(name),
			"kmem_cache of name '%s' already exists\n", name);

	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	return 0;
}
115 #else
/* Without CONFIG_DEBUG_VM, skip the name/size sanity checks entirely. */
static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	return 0;
}
120 #endif
121 
122 /*
123  * Figure out what the alignment of the objects will be given a set of
124  * flags, a user specified alignment and the size of the objects.
125  */
126 static unsigned int calculate_alignment(slab_flags_t flags,
127 		unsigned int align, unsigned int size)
128 {
129 	/*
130 	 * If the user wants hardware cache aligned objects then follow that
131 	 * suggestion if the object is sufficiently large.
132 	 *
133 	 * The hardware cache alignment cannot override the specified
134 	 * alignment though. If that is greater then use it.
135 	 */
136 	if (flags & SLAB_HWCACHE_ALIGN) {
137 		unsigned int ralign;
138 
139 		ralign = cache_line_size();
140 		while (size <= ralign / 2)
141 			ralign /= 2;
142 		align = max(align, ralign);
143 	}
144 
145 	align = max(align, arch_slab_minalign());
146 
147 	return ALIGN(align, sizeof(void *));
148 }
149 
/*
 * Return 1 if cache @s must never be merged with (aliased to) another
 * cache, 0 if it is a merge candidate for find_mergeable().
 */
int slab_unmergeable(struct kmem_cache *s)
{
	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
		return 1;

	/* A constructor is per-cache state; merged caches can't share one. */
	if (s->ctor)
		return 1;

#ifdef CONFIG_HARDENED_USERCOPY
	/* Usercopy whitelist windows are per-cache as well. */
	if (s->usersize)
		return 1;
#endif

	/* Caches with per-CPU sheaves are kept separate. */
	if (s->cpu_sheaves)
		return 1;

	/*
	 * We may have set a slab to be unmergeable during bootstrap.
	 */
	if (s->refcount < 0)
		return 1;

	return 0;
}
177 
/*
 * Find a mergeable slab cache: an existing cache whose geometry and
 * flags are compatible with a would-be new cache of @size/@align/@flags,
 * so the new cache can alias it instead. Returns the candidate or NULL.
 */
struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
		slab_flags_t flags, const char *name, void (*ctor)(void *))
{
	struct kmem_cache *s;

	if (slab_nomerge)
		return NULL;

	/* Constructors are per-cache; a ctor rules merging out entirely. */
	if (ctor)
		return NULL;

	flags = kmem_cache_flags(flags, name);

	if (flags & SLAB_NEVER_MERGE)
		return NULL;

	/* Normalize to the geometry a newly created cache would get. */
	size = ALIGN(size, sizeof(void *));
	align = calculate_alignment(flags, align, size);
	size = ALIGN(size, align);

	list_for_each_entry_reverse(s, &slab_caches, list) {
		if (slab_unmergeable(s))
			continue;

		/* The candidate's objects must be large enough. */
		if (size > s->size)
			continue;

		/* Merge-relevant flags must match exactly. */
		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;

		/* Don't tolerate more than a pointer worth of waste. */
		if (s->size - size >= sizeof(void *))
			continue;

		return s;
	}
	return NULL;
}
221 
/*
 * Allocate and initialize a new struct kmem_cache and add it to the
 * global cache list. Called with slab_mutex held (see
 * __kmem_cache_create_args()). Returns the cache or an ERR_PTR().
 */
static struct kmem_cache *create_cache(const char *name,
				       unsigned int object_size,
				       struct kmem_cache_args *args,
				       slab_flags_t flags)
{
	struct kmem_cache *s;
	int err;

	/* If a custom freelist pointer is requested make sure it's sane. */
	err = -EINVAL;
	if (args->use_freeptr_offset &&
	    (args->freeptr_offset >= object_size ||
	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
		goto out;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out;
	err = do_kmem_cache_create(s, name, object_size, args, flags);
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	return s;

out_free_cache:
	kmem_cache_free(kmem_cache, s);
out:
	return ERR_PTR(err);
}
255 
/**
 * __kmem_cache_create_args - Create a kmem cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @object_size: The size of objects to be created in this cache.
 * @args: Additional arguments for the cache creation (see
 *        &struct kmem_cache_args).
 * @flags: See the descriptions of individual flags. The common ones are listed
 *         in the description below.
 *
 * Not to be called directly, use the kmem_cache_create() wrapper with the same
 * parameters.
 *
 * Commonly used @flags:
 *
 * &SLAB_ACCOUNT - Account allocations to memcg.
 *
 * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
 *
 * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
 *
 * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
 * by a grace period - see the full description before using.
 *
 * Context: Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
struct kmem_cache *__kmem_cache_create_args(const char *name,
					    unsigned int object_size,
					    struct kmem_cache_args *args,
					    slab_flags_t flags)
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * If no slab_debug was enabled globally, the static key is not yet
	 * enabled by setup_slub_debug(). Enable it if the cache is being
	 * created with any of the debugging flags passed explicitly.
	 * It's also possible that this is the first cache created with
	 * SLAB_STORE_USER and we should init stack_depot for it.
	 */
	if (flags & SLAB_DEBUG_FLAGS)
		static_branch_enable(&slub_debug_enabled);
	if (flags & SLAB_STORE_USER)
		stack_depot_init();
#else
	/* Debug flags are meaningless without CONFIG_SLUB_DEBUG; drop them. */
	flags &= ~SLAB_DEBUG_FLAGS;
#endif

	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(name, object_size);
	if (err) {
		goto out_unlock;
	}

	/* Reject flags callers are not permitted to request. */
	if (flags & ~SLAB_FLAGS_PERMITTED) {
		err = -EINVAL;
		goto out_unlock;
	}

	/* Fail closed on bad usersize or useroffset values. */
	if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
	    WARN_ON(!args->usersize && args->useroffset) ||
	    WARN_ON(object_size < args->usersize ||
		    object_size - args->usersize < args->useroffset))
		args->usersize = args->useroffset = 0;

	/*
	 * Try to alias an existing compatible cache before creating a
	 * new one; usercopy windows and sheaves rule aliasing out.
	 */
	if (!args->usersize && !args->sheaf_capacity)
		s = __kmem_cache_alias(name, object_size, args->align, flags,
				       args->ctor);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	args->align = calculate_alignment(flags, args->align, object_size);
	s = create_cache(cache_name, object_size, args, flags);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	/* Callers expect NULL on failure, never an error pointer. */
	if (err) {
		if (flags & SLAB_PANIC)
			panic("%s: Failed to create slab '%s'. Error %d\n",
				__func__, name, err);
		else {
			pr_warn("%s(%s) failed with error %d\n",
				__func__, name, err);
			dump_stack();
		}
		return NULL;
	}
	return s;
}
EXPORT_SYMBOL(__kmem_cache_create_args);
363 
/* Backing cache used to allocate the kmem_buckets arrays themselves. */
static struct kmem_cache *kmem_buckets_cache __ro_after_init;

/**
 * kmem_buckets_create - Create a set of caches that handle dynamic sized
 *			 allocations via kmem_buckets_alloc()
 * @name: A prefix string which is used in /proc/slabinfo to identify this
 *	  cache. The individual caches will have their sizes as the suffix.
 * @flags: SLAB flags (see kmem_cache_create() for details).
 * @useroffset: Starting offset within an allocation that may be copied
 *		to/from userspace.
 * @usersize: How many bytes, starting at @useroffset, may be copied
 *		to/from userspace.
 * @ctor: A constructor for the objects, run when new allocations are made.
 *
 * Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure. When
 * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
 * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
 * (i.e. callers only need to check for NULL on failure.)
 */
kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
				  unsigned int useroffset,
				  unsigned int usersize,
				  void (*ctor)(void *))
{
	unsigned long mask = 0;	/* bitmap of caches created here, for unwind */
	unsigned int idx;
	kmem_buckets *b;

	BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);

	/*
	 * When the separate buckets API is not built in, just return
	 * a non-NULL value for the kmem_buckets pointer, which will be
	 * unused when performing allocations.
	 */
	if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
		return ZERO_SIZE_PTR;

	if (WARN_ON(!kmem_buckets_cache))
		return NULL;

	b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
	if (WARN_ON(!b))
		return NULL;

	/* Bucket caches exist to stay distinct; never merge them. */
	flags |= SLAB_NO_MERGE;

	for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
		char *short_size, *cache_name;
		unsigned int cache_useroffset, cache_usersize;
		unsigned int size, aligned_idx;

		if (!kmalloc_caches[KMALLOC_NORMAL][idx])
			continue;

		size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
		if (!size)
			continue;

		/* Reuse the "<size>" suffix of the matching kmalloc cache's name. */
		short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
		if (WARN_ON(!short_size))
			goto fail;

		/* Clamp the usercopy window to what fits in this bucket. */
		if (useroffset >= size) {
			cache_useroffset = 0;
			cache_usersize = 0;
		} else {
			cache_useroffset = useroffset;
			cache_usersize = min(size - cache_useroffset, usersize);
		}

		aligned_idx = __kmalloc_index(size, false);
		if (!(*b)[aligned_idx]) {
			cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
			if (WARN_ON(!cache_name))
				goto fail;
			(*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
					0, flags, cache_useroffset,
					cache_usersize, ctor);
			/* The cache holds its own copy of the name (kstrdup_const). */
			kfree(cache_name);
			if (WARN_ON(!(*b)[aligned_idx]))
				goto fail;
			set_bit(aligned_idx, &mask);
		}
		/* Alias duplicate size classes to the canonical cache. */
		if (idx != aligned_idx)
			(*b)[idx] = (*b)[aligned_idx];
	}

	return b;

fail:
	/* Destroy only the caches created above; aliases share them. */
	for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
		kmem_cache_destroy((*b)[idx]);
	kmem_cache_free(kmem_buckets_cache, b);

	return NULL;
}
EXPORT_SYMBOL(kmem_buckets_create);
464 
/*
 * For a given kmem_cache, kmem_cache_destroy() should only be called
 * once or there will be a use-after-free problem. The actual deletion
 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
 * protection. So they are now done without holding those locks.
 */
static void kmem_cache_release(struct kmem_cache *s)
{
	kfence_shutdown_cache(s);
	/* With sysfs up, freeing is deferred to the kobject release path. */
	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
		sysfs_slab_release(s);
	else
		slab_kmem_cache_release(s);
}
479 
/* Final teardown: free internal state, the name, and the cache struct. */
void slab_kmem_cache_release(struct kmem_cache *s)
{
	__kmem_cache_release(s);
	kfree_const(s->name);	/* name was kstrdup_const()'d at creation */
	kmem_cache_free(kmem_cache, s);
}
486 
/*
 * kmem_cache_destroy - Destroy a cache created by kmem_cache_create().
 * @s: the cache to destroy; NULL is silently ignored.
 *
 * Must be called at most once per cache (see comment above
 * kmem_cache_release()). Destruction only proceeds when the refcount,
 * raised by cache aliasing/merging, drops to zero.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	/* kasan_check_byte() catches destroying an already-freed cache. */
	if (unlikely(!s) || !kasan_check_byte(s))
		return;

	/* in-flight kfree_rcu()'s may include objects from our cache */
	kvfree_rcu_barrier();

	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
		/*
		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
		 * defer their freeing with call_rcu().
		 * Wait for such call_rcu() invocations here before actually
		 * destroying the cache.
		 *
		 * It doesn't matter that we haven't looked at the slab refcount
		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
		 * the refcount should be 1 here.
		 */
		rcu_barrier();
	}

	cpus_read_lock();
	mutex_lock(&slab_mutex);

	s->refcount--;
	if (s->refcount) {
		/* Other aliases still use this cache; keep it alive. */
		mutex_unlock(&slab_mutex);
		cpus_read_unlock();
		return;
	}

	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	err = __kmem_cache_shutdown(s);
	if (!slab_in_kunit_test())
		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
		     __func__, s->name, (void *)_RET_IP_);

	list_del(&s->list);

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();

	/* sysfs/debugfs teardown happens outside slab_mutex on purpose. */
	if (slab_state >= FULL)
		sysfs_slab_unlink(s);
	debugfs_slab_release(s);

	/* Shutdown failed (live objects): leak the cache rather than free it. */
	if (err)
		return;

	if (s->flags & SLAB_TYPESAFE_BY_RCU)
		rcu_barrier();

	kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
549 
/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 *
 * Return: %0 if all slabs were released, non-zero otherwise
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	/* NOTE(review): presumably releases KASAN-quarantined objects of this
	 * cache first so their slabs can be freed — confirm in kasan code. */
	kasan_cache_shrink(cachep);

	return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
566 
/* True once bootstrap has progressed far enough that the kmalloc array
 * is usable (slab_state reaches UP in create_kmalloc_caches()). */
bool slab_is_available(void)
{
	return slab_state >= UP;
}
571 
572 #ifdef CONFIG_PRINTK
/*
 * Fill @kpp with provenance information for @object: prefer KFENCE's
 * metadata when the object is KFENCE-managed, otherwise query the
 * slab allocator proper.
 */
static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	if (__kfence_obj_info(kpp, object, slab))
		return;
	__kmem_obj_info(kpp, object, slab);
}
579 
/**
 * kmem_dump_obj - Print available slab provenance information
 * @object: slab object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For a slab-cache object, the fact that it is a slab object is printed,
 * and, if available, the slab name, return address, and stack trace from
 * the allocation and last free path of that object.
 *
 * Return: %true if the pointer is to a not-yet-freed object from
 * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
 * is to an already-freed object, and %false otherwise.
 */
bool kmem_dump_obj(void *object)
{
	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
	int i;
	struct slab *slab;
	unsigned long ptroffset;
	struct kmem_obj_info kp = { };

	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
		return false;
	slab = virt_to_slab(object);
	if (!slab)
		return false;	/* not slab-backed memory */

	kmem_obj_info(&kp, object, slab);
	if (kp.kp_slab_cache)
		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
	else
		pr_cont(" slab%s", cp);
	if (is_kfence_address(object))
		pr_cont(" (kfence)");
	if (kp.kp_objp)
		pr_cont(" start %px", kp.kp_objp);
	if (kp.kp_data_offset)
		pr_cont(" data offset %lu", kp.kp_data_offset);
	if (kp.kp_objp) {
		/* How far into the object's data area the pointer lands. */
		ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
		pr_cont(" pointer offset %lu", ptroffset);
	}
	if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
		pr_cont(" size %u", kp.kp_slab_cache->object_size);
	if (kp.kp_ret)
		pr_cont(" allocated at %pS\n", kp.kp_ret);
	else
		pr_cont("\n");
	/* Allocation stack trace, when recorded. */
	for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
		if (!kp.kp_stack[i])
			break;
		pr_info("    %pS\n", kp.kp_stack[i]);
	}

	/* Last-free stack trace, when recorded. */
	if (kp.kp_free_stack[0])
		pr_cont(" Free path:\n");

	for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
		if (!kp.kp_free_stack[i])
			break;
		pr_info("    %pS\n", kp.kp_free_stack[i]);
	}

	return true;
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
649 #endif
650 
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int err;
	unsigned int align = ARCH_KMALLOC_MINALIGN;
	struct kmem_cache_args kmem_args = {};

	/*
	 * kmalloc caches guarantee alignment of at least the largest
	 * power-of-two divisor of the size. For power-of-two sizes,
	 * it is the size itself.
	 */
	if (flags & SLAB_KMALLOC)
		align = max(align, 1U << (ffs(size) - 1));
	kmem_args.align = calculate_alignment(flags, align, size);

#ifdef CONFIG_HARDENED_USERCOPY
	kmem_args.useroffset = useroffset;
	kmem_args.usersize = usersize;
#endif

	err = do_kmem_cache_create(s, name, size, &kmem_args, flags);

	/* Boot caches are essential; there is no way to recover here. */
	if (err)
		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
					name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}
682 
/*
 * Boot-time creation of a single kmalloc cache; panics on failure
 * since boot cannot proceed without the kmalloc array.
 */
static struct kmem_cache *__init create_kmalloc_cache(const char *name,
						      unsigned int size,
						      slab_flags_t flags)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
	list_add(&s->list, &slab_caches);
	/* create_boot_cache() set refcount to -1; make this cache mergeable. */
	s->refcount = 1;
	return s;
}
697 
698 kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
699 { /* initialization for https://llvm.org/pr42570 */ };
700 EXPORT_SYMBOL(kmalloc_caches);
701 
702 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
703 unsigned long random_kmalloc_seed __ro_after_init;
704 EXPORT_SYMBOL(random_kmalloc_seed);
705 #endif
706 
707 /*
708  * Conversion table for small slabs sizes / 8 to the index in the
709  * kmalloc array. This is necessary for slabs < 192 since we have non power
710  * of two cache sizes there. The size of larger slabs can be determined using
711  * fls.
712  */
713 u8 kmalloc_size_index[24] __ro_after_init = {
714 	3,	/* 8 */
715 	4,	/* 16 */
716 	5,	/* 24 */
717 	5,	/* 32 */
718 	6,	/* 40 */
719 	6,	/* 48 */
720 	6,	/* 56 */
721 	6,	/* 64 */
722 	1,	/* 72 */
723 	1,	/* 80 */
724 	1,	/* 88 */
725 	1,	/* 96 */
726 	7,	/* 104 */
727 	7,	/* 112 */
728 	7,	/* 120 */
729 	7,	/* 128 */
730 	2,	/* 136 */
731 	2,	/* 144 */
732 	2,	/* 152 */
733 	2,	/* 160 */
734 	2,	/* 168 */
735 	2,	/* 176 */
736 	2,	/* 184 */
737 	2	/* 192 */
738 };
739 
/*
 * Round @size up to the full size kmalloc() would actually allocate for
 * it: the matching cache's object size within the kmalloc buckets, a
 * whole number of pages above that, and @size itself for 0 or
 * un-allocatable sizes.
 */
size_t kmalloc_size_roundup(size_t size)
{
	if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
		/*
		 * The flags don't matter since size_index is common to all.
		 * Neither does the caller for just getting ->object_size.
		 */
		return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
	}

	/* Above the smaller buckets, size is a multiple of page size. */
	if (size && size <= KMALLOC_MAX_SIZE)
		return PAGE_SIZE << get_order(size);

	/*
	 * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
	 * and very large size - kmalloc() may fail.
	 */
	return size;

}
EXPORT_SYMBOL(kmalloc_size_roundup);
762 
763 #ifdef CONFIG_ZONE_DMA
764 #define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
765 #else
766 #define KMALLOC_DMA_NAME(sz)
767 #endif
768 
769 #ifdef CONFIG_MEMCG
770 #define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
771 #else
772 #define KMALLOC_CGROUP_NAME(sz)
773 #endif
774 
775 #ifndef CONFIG_SLUB_TINY
776 #define KMALLOC_RCL_NAME(sz)	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
777 #else
778 #define KMALLOC_RCL_NAME(sz)
779 #endif
780 
781 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
782 #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
783 #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
784 #define KMA_RAND_1(sz)                  .name[KMALLOC_RANDOM_START +  1] = "kmalloc-rnd-01-" #sz,
785 #define KMA_RAND_2(sz)  KMA_RAND_1(sz)  .name[KMALLOC_RANDOM_START +  2] = "kmalloc-rnd-02-" #sz,
786 #define KMA_RAND_3(sz)  KMA_RAND_2(sz)  .name[KMALLOC_RANDOM_START +  3] = "kmalloc-rnd-03-" #sz,
787 #define KMA_RAND_4(sz)  KMA_RAND_3(sz)  .name[KMALLOC_RANDOM_START +  4] = "kmalloc-rnd-04-" #sz,
788 #define KMA_RAND_5(sz)  KMA_RAND_4(sz)  .name[KMALLOC_RANDOM_START +  5] = "kmalloc-rnd-05-" #sz,
789 #define KMA_RAND_6(sz)  KMA_RAND_5(sz)  .name[KMALLOC_RANDOM_START +  6] = "kmalloc-rnd-06-" #sz,
790 #define KMA_RAND_7(sz)  KMA_RAND_6(sz)  .name[KMALLOC_RANDOM_START +  7] = "kmalloc-rnd-07-" #sz,
791 #define KMA_RAND_8(sz)  KMA_RAND_7(sz)  .name[KMALLOC_RANDOM_START +  8] = "kmalloc-rnd-08-" #sz,
792 #define KMA_RAND_9(sz)  KMA_RAND_8(sz)  .name[KMALLOC_RANDOM_START +  9] = "kmalloc-rnd-09-" #sz,
793 #define KMA_RAND_10(sz) KMA_RAND_9(sz)  .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
794 #define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
795 #define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
796 #define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
797 #define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
798 #define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
799 #else // CONFIG_RANDOM_KMALLOC_CACHES
800 #define KMALLOC_RANDOM_NAME(N, sz)
801 #endif
802 
803 #define INIT_KMALLOC_INFO(__size, __short_size)			\
804 {								\
805 	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
806 	KMALLOC_RCL_NAME(__short_size)				\
807 	KMALLOC_CGROUP_NAME(__short_size)			\
808 	KMALLOC_DMA_NAME(__short_size)				\
809 	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
810 	.size = __size,						\
811 }
812 
813 /*
814  * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
815  * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
816  * kmalloc-2M.
817  */
818 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
819 	INIT_KMALLOC_INFO(0, 0),
820 	INIT_KMALLOC_INFO(96, 96),
821 	INIT_KMALLOC_INFO(192, 192),
822 	INIT_KMALLOC_INFO(8, 8),
823 	INIT_KMALLOC_INFO(16, 16),
824 	INIT_KMALLOC_INFO(32, 32),
825 	INIT_KMALLOC_INFO(64, 64),
826 	INIT_KMALLOC_INFO(128, 128),
827 	INIT_KMALLOC_INFO(256, 256),
828 	INIT_KMALLOC_INFO(512, 512),
829 	INIT_KMALLOC_INFO(1024, 1k),
830 	INIT_KMALLOC_INFO(2048, 2k),
831 	INIT_KMALLOC_INFO(4096, 4k),
832 	INIT_KMALLOC_INFO(8192, 8k),
833 	INIT_KMALLOC_INFO(16384, 16k),
834 	INIT_KMALLOC_INFO(32768, 32k),
835 	INIT_KMALLOC_INFO(65536, 64k),
836 	INIT_KMALLOC_INFO(131072, 128k),
837 	INIT_KMALLOC_INFO(262144, 256k),
838 	INIT_KMALLOC_INFO(524288, 512k),
839 	INIT_KMALLOC_INFO(1048576, 1M),
840 	INIT_KMALLOC_INFO(2097152, 2M)
841 };
842 
843 /*
844  * Patch up the size_index table if we have strange large alignment
845  * requirements for the kmalloc array. This is only the case for
846  * MIPS it seems. The standard arches will not generate any code here.
847  *
848  * Largest permitted alignment is 256 bytes due to the way we
849  * handle the index determination for the smaller caches.
850  *
851  * Make sure that nothing crazy happens if someone starts tinkering
852  * around with ARCH_KMALLOC_MINALIGN
853  */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int i;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		!is_power_of_2(KMALLOC_MIN_SIZE));

	/* Sizes below the minimum all map to the smallest available cache. */
	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		unsigned int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(kmalloc_size_index))
			break;
		kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte sized cache is not used if the alignment
		 * is 64 byte.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 7;	/* kmalloc-128 */

	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 8;	/* kmalloc-256 */
	}
}
889 
/*
 * Runtime minimum alignment for kmalloc caches: the DMA cache alignment,
 * unless SWIOTLB is available to bounce unaligned non-coherent DMA
 * buffers — and never below the architecture slab minimum.
 */
static unsigned int __kmalloc_minalign(void)
{
	unsigned int minalign = dma_get_cache_alignment();

	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
	    is_swiotlb_allocated())
		minalign = ARCH_KMALLOC_MINALIGN;

	return max(minalign, arch_slab_minalign());
}
900 
/*
 * Create kmalloc cache @idx of @type, or alias it to an already-created
 * cache when the type or an increased minimum alignment allows sharing.
 */
static void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
{
	slab_flags_t flags = 0;
	unsigned int minalign = __kmalloc_minalign();
	unsigned int aligned_size = kmalloc_info[idx].size;
	int aligned_idx = idx;

	if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
		flags |= SLAB_RECLAIM_ACCOUNT;
	} else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
		if (mem_cgroup_kmem_disabled()) {
			/* No kmem accounting: cgroup caches alias the normal ones. */
			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
			return;
		}
		flags |= SLAB_ACCOUNT;
	} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
		flags |= SLAB_CACHE_DMA;
	}

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
		flags |= SLAB_NO_MERGE;
#endif

	/*
	 * If CONFIG_MEMCG is enabled, disable cache merging for
	 * KMALLOC_NORMAL caches.
	 */
	if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
		flags |= SLAB_NO_MERGE;

	/* A larger runtime minalign may bump us into a bigger size class. */
	if (minalign > ARCH_KMALLOC_MINALIGN) {
		aligned_size = ALIGN(aligned_size, minalign);
		aligned_idx = __kmalloc_index(aligned_size, false);
	}

	if (!kmalloc_caches[type][aligned_idx])
		kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
					kmalloc_info[aligned_idx].name[type],
					aligned_size, flags);
	if (idx != aligned_idx)
		kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
}
945 
/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(void)
{
	int i;
	enum kmalloc_cache_type type;

	/*
	 * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
	 */
	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
		/* Caches that are NOT of the two-to-the-power-of size. */
		if (KMALLOC_MIN_SIZE <= 32)
			new_kmalloc_cache(1, type);	/* kmalloc-96 */
		if (KMALLOC_MIN_SIZE <= 64)
			new_kmalloc_cache(2, type);	/* kmalloc-192 */

		/* Caches that are of the two-to-the-power-of size. */
		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
			new_kmalloc_cache(i, type);
	}
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	random_kmalloc_seed = get_random_u64();
#endif

	/* Kmalloc array is now usable */
	slab_state = UP;

	if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
		kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
						       sizeof(kmem_buckets),
						       0, SLAB_NO_MERGE, NULL);
}
982 
/**
 * __ksize -- Report full size of underlying allocation
 * @object: pointer to the object
 *
 * This should only be used internally to query the true size of allocations.
 * It is not meant to be a way to discover the usable size of an allocation
 * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
 * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
 * and/or FORTIFY_SOURCE.
 *
 * Return: size of the actual memory used by @object in bytes
 */
size_t __ksize(const void *object)
{
	struct folio *folio;

	/* ZERO_SIZE_PTR marks a zero-byte allocation. */
	if (unlikely(object == ZERO_SIZE_PTR))
		return 0;

	folio = virt_to_folio(object);

	if (unlikely(!folio_test_slab(folio))) {
		/* Large kmalloc: the whole folio belongs to the allocation. */
		if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
			return 0;
		if (WARN_ON(object != folio_address(folio)))
			return 0;
		return folio_size(folio);
	}

#ifdef CONFIG_SLUB_DEBUG
	skip_orig_size_check(folio_slab(folio)->slab_cache, object);
#endif

	return slab_ksize(folio_slab(folio)->slab_cache);
}
1018 
/*
 * A caller passed GFP bits that slab cannot honour (GFP_SLAB_BUG_MASK).
 * Strip them and loudly report the buggy call site.
 */
gfp_t kmalloc_fix_flags(gfp_t flags)
{
	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;

	flags &= ~GFP_SLAB_BUG_MASK;
	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
			invalid_mask, &invalid_mask, flags, &flags);
	dump_stack();

	return flags;
}
1030 
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/*
 * Fill @list with the identity permutation of [0, count) and shuffle
 * it uniformly at random with a Fisher-Yates pass from the tail down.
 */
static void freelist_randomize(unsigned int *list,
			       unsigned int count)
{
	unsigned int idx, victim;

	for (idx = 0; idx < count; idx++)
		list[idx] = idx;

	/* Fisher-Yates shuffle */
	for (idx = count - 1; idx > 0; idx--) {
		victim = get_random_u32_below(idx + 1);
		swap(list[idx], list[victim]);
	}
}

/*
 * Create a random freelist sequence for @cachep covering @count slots.
 * A no-op when the cache already has a sequence, or when @count < 2 and
 * there is nothing to shuffle. Returns 0 on success, -ENOMEM otherwise.
 */
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
				    gfp_t gfp)
{
	unsigned int *seq;

	if (count < 2 || cachep->random_seq)
		return 0;

	seq = kcalloc(count, sizeof(unsigned int), gfp);
	if (!seq)
		return -ENOMEM;

	cachep->random_seq = seq;
	freelist_randomize(seq, count);
	return 0;
}

/* Release the per-cache random freelist sequence, if any. */
void cache_random_seq_destroy(struct kmem_cache *cachep)
{
	kfree(cachep->random_seq);
	cachep->random_seq = NULL;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1072 
#ifdef CONFIG_SLUB_DEBUG
/* /proc/slabinfo is readable by root only. */
#define SLABINFO_RIGHTS (0400)

/* Print the column legend that precedes the per-cache lines. */
static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
	seq_puts(m, "slabinfo - version: 2.1\n");
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
	seq_putc(m, '\n');
}
1088 
/*
 * seq_file iterator over slab_caches. slab_mutex is taken in
 * slab_start() and held across the whole walk until slab_stop().
 */
static void *slab_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);
	return seq_list_start(&slab_caches, *pos);
}

static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &slab_caches, pos);
}

/* Drops the mutex taken in slab_start(). */
static void slab_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&slab_mutex);
}
1104 
/* Emit one /proc/slabinfo line describing cache @s. */
static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
	struct slabinfo sinfo;

	memset(&sinfo, 0, sizeof(sinfo));
	get_slabinfo(s, &sinfo);

	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
		   s->name, sinfo.active_objs, sinfo.num_objs, s->size,
		   sinfo.objects_per_slab, (1 << sinfo.cache_order));

	seq_printf(m, " : tunables %4u %4u %4u",
		   sinfo.limit, sinfo.batchcount, sinfo.shared);
	seq_printf(m, " : slabdata %6lu %6lu %6lu",
		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
	seq_putc(m, '\n');
}
1122 
1123 static int slab_show(struct seq_file *m, void *p)
1124 {
1125 	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
1126 
1127 	if (p == slab_caches.next)
1128 		print_slabinfo_header(m);
1129 	cache_show(s, m);
1130 	return 0;
1131 }
1132 
void dump_unreclaimable_slab(void)
{
	struct kmem_cache *s;
	struct slabinfo sinfo;

	/*
	 * Acquiring slab_mutex here is risky since we do not want to
	 * sleep in the OOM path. But without holding the mutex the
	 * list traversal could crash on a concurrently modified list.
	 * Use mutex_trylock to protect the list traverse, dump nothing
	 * without acquiring the mutex.
	 */
	if (!mutex_trylock(&slab_mutex)) {
		pr_warn("excessive unreclaimable slab but cannot dump stats\n");
		return;
	}

	pr_info("Unreclaimable slab info:\n");
	pr_info("Name                      Used          Total\n");

	list_for_each_entry(s, &slab_caches, list) {
		/* Only unreclaimable caches are of interest here. */
		if (s->flags & SLAB_RECLAIM_ACCOUNT)
			continue;

		get_slabinfo(s, &sinfo);

		if (sinfo.num_objs > 0)
			pr_info("%-17s %10luKB %10luKB\n", s->name,
				(sinfo.active_objs * s->size) / 1024,
				(sinfo.num_objs * s->size) / 1024);
	}
	mutex_unlock(&slab_mutex);
}
1166 
1167 /*
1168  * slabinfo_op - iterator that generates /proc/slabinfo
1169  *
1170  * Output layout:
1171  * cache-name
1172  * num-active-objs
1173  * total-objs
1174  * object size
1175  * num-active-slabs
1176  * total-slabs
1177  * num-pages-per-slab
1178  * + further values on SMP and with statistics enabled
1179  */
1180 static const struct seq_operations slabinfo_op = {
1181 	.start = slab_start,
1182 	.next = slab_next,
1183 	.stop = slab_stop,
1184 	.show = slab_show,
1185 };
1186 
1187 static int slabinfo_open(struct inode *inode, struct file *file)
1188 {
1189 	return seq_open(file, &slabinfo_op);
1190 }
1191 
1192 static const struct proc_ops slabinfo_proc_ops = {
1193 	.proc_flags	= PROC_ENTRY_PERMANENT,
1194 	.proc_open	= slabinfo_open,
1195 	.proc_read	= seq_read,
1196 	.proc_lseek	= seq_lseek,
1197 	.proc_release	= seq_release,
1198 };
1199 
1200 static int __init slab_proc_init(void)
1201 {
1202 	proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
1203 	return 0;
1204 }
1205 module_init(slab_proc_init);
1206 
1207 #endif /* CONFIG_SLUB_DEBUG */
1208 
1209 /**
1210  * kfree_sensitive - Clear sensitive information in memory before freeing
1211  * @p: object to free memory of
1212  *
1213  * The memory of the object @p points to is zeroed before freed.
1214  * If @p is %NULL, kfree_sensitive() does nothing.
1215  *
1216  * Note: this function zeroes the whole allocated buffer which can be a good
1217  * deal bigger than the requested buffer size passed to kmalloc(). So be
1218  * careful when using this function in performance sensitive code.
1219  */
1220 void kfree_sensitive(const void *p)
1221 {
1222 	size_t ks;
1223 	void *mem = (void *)p;
1224 
1225 	ks = ksize(mem);
1226 	if (ks) {
1227 		kasan_unpoison_range(mem, ks);
1228 		memzero_explicit(mem, ks);
1229 	}
1230 	kfree(mem);
1231 }
1232 EXPORT_SYMBOL(kfree_sensitive);
1233 
/* Report the full usable size of the allocation @objp points into. */
size_t ksize(const void *objp)
{
	/*
	 * We need to first check that the pointer to the object is valid.
	 * The KASAN report printed from ksize() is more useful than when
	 * it's printed later when the behaviour could be undefined due to
	 * a potential use-after-free or double-free.
	 *
	 * We use kasan_check_byte(), which is supported for the hardware
	 * tag-based KASAN mode, unlike kasan_check_read/write().
	 *
	 * If the pointed to memory is invalid, we return 0 to avoid users of
	 * ksize() writing to and potentially corrupting the memory region.
	 *
	 * We want to perform the check before __ksize(), to avoid potentially
	 * crashing in __ksize() due to accessing invalid metadata.
	 */
	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
		return 0;

	/* KFENCE-managed allocations know their own size. */
	return kfence_ksize(objp) ?: __ksize(objp);
}
EXPORT_SYMBOL(ksize);
1257 
1258 #ifdef CONFIG_BPF_SYSCALL
1259 #include <linux/btf.h>
1260 
1261 __bpf_kfunc_start_defs();
1262 
1263 __bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
1264 {
1265 	struct slab *slab;
1266 
1267 	if (!virt_addr_valid((void *)(long)addr))
1268 		return NULL;
1269 
1270 	slab = virt_to_slab((void *)(long)addr);
1271 	return slab ? slab->slab_cache : NULL;
1272 }
1273 
1274 __bpf_kfunc_end_defs();
1275 #endif /* CONFIG_BPF_SYSCALL */
1276 
1277 /* Tracepoints definitions. */
1278 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1279 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1280 EXPORT_TRACEPOINT_SYMBOL(kfree);
1281 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1282 
1283 #ifndef CONFIG_KVFREE_RCU_BATCHED
1284 
/*
 * Trivial !CONFIG_KVFREE_RCU_BATCHED implementation: no batching. The
 * double-argument form is queued as an ordinary RCU callback; the
 * single-argument form sleeps for a grace period and frees inline.
 */
void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
	if (head) {
		kasan_record_aux_stack(ptr);
		call_rcu(head, kvfree_rcu_cb);
		return;
	}

	// kvfree_rcu(one_arg) call.
	might_sleep();
	synchronize_rcu();
	kvfree(ptr);
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);

/* Nothing to set up when batching is disabled. */
void __init kvfree_rcu_init(void)
{
}
1303 
1304 #else /* CONFIG_KVFREE_RCU_BATCHED */
1305 
1306 /*
1307  * This rcu parameter is runtime-read-only. It reflects
1308  * a minimum allowed number of objects which can be cached
1309  * per-CPU. Object size is equal to one page. This value
1310  * can be changed at boot time.
1311  */
1312 static int rcu_min_cached_objs = 5;
1313 module_param(rcu_min_cached_objs, int, 0444);
1314 
1315 // A page shrinker can ask for pages to be freed to make them
1316 // available for other parts of the system. This usually happens
1317 // under low memory conditions, and in that case we should also
1318 // defer page-cache filling for a short time period.
1319 //
1320 // The default value is 5 seconds, which is long enough to reduce
1321 // interference with the shrinker while it asks other systems to
1322 // drain their caches.
1323 static int rcu_delay_page_cache_fill_msec = 5000;
1324 module_param(rcu_delay_page_cache_fill_msec, int, 0444);
1325 
1326 static struct workqueue_struct *rcu_reclaim_wq;
1327 
1328 /* Maximum number of jiffies to wait before draining a batch. */
1329 #define KFREE_DRAIN_JIFFIES (5 * HZ)
1330 #define KFREE_N_BATCHES 2
1331 #define FREE_N_CHANNELS 2
1332 
1333 /**
1334  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
1335  * @list: List node. All blocks are linked between each other
1336  * @gp_snap: Snapshot of RCU state for objects placed to this bulk
1337  * @nr_records: Number of active pointers in the array
1338  * @records: Array of the kvfree_rcu() pointers
1339  */
1340 struct kvfree_rcu_bulk_data {
1341 	struct list_head list;
1342 	struct rcu_gp_oldstate gp_snap;
1343 	unsigned long nr_records;
1344 	void *records[] __counted_by(nr_records);
1345 };
1346 
1347 /*
1348  * This macro defines how many entries the "records" array
1349  * will contain. It is based on the fact that the size of
1350  * kvfree_rcu_bulk_data structure becomes exactly one page.
1351  */
1352 #define KVFREE_BULK_MAX_ENTR \
1353 	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
1354 
1355 /**
1356  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
1357  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
1358  * @head_free: List of kfree_rcu() objects waiting for a grace period
1359  * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
1360  * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
1361  * @krcp: Pointer to @kfree_rcu_cpu structure
1362  */
1363 
1364 struct kfree_rcu_cpu_work {
1365 	struct rcu_work rcu_work;
1366 	struct rcu_head *head_free;
1367 	struct rcu_gp_oldstate head_free_gp_snap;
1368 	struct list_head bulk_head_free[FREE_N_CHANNELS];
1369 	struct kfree_rcu_cpu *krcp;
1370 };
1371 
1372 /**
1373  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
1374  * @head: List of kfree_rcu() objects not yet waiting for a grace period
1375  * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
1376  * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
1377  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
1378  * @lock: Synchronize access to this structure
1379  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
1380  * @initialized: The @rcu_work fields have been initialized
1381  * @head_count: Number of objects in rcu_head singular list
1382  * @bulk_count: Number of objects in bulk-list
1383  * @bkvcache:
1384  *	A simple cache list that contains objects for reuse purpose.
1385  *	In order to save some per-cpu space the list is singular.
1386  *	Even though it is lockless an access has to be protected by the
1387  *	per-cpu lock.
1388  * @page_cache_work: A work to refill the cache when it is empty
1389  * @backoff_page_cache_fill: Delay cache refills
1390  * @work_in_progress: Indicates that page_cache_work is running
1391  * @hrtimer: A hrtimer for scheduling a page_cache_work
1392  * @nr_bkv_objs: number of allocated objects at @bkvcache.
1393  *
1394  * This is a per-CPU structure.  The reason that it is not included in
1395  * the rcu_data structure is to permit this code to be extracted from
1396  * the RCU files.  Such extraction could allow further optimization of
1397  * the interactions with the slab allocators.
1398  */
1399 struct kfree_rcu_cpu {
1400 	// Objects queued on a linked list
1401 	// through their rcu_head structures.
1402 	struct rcu_head *head;
1403 	unsigned long head_gp_snap;
1404 	atomic_t head_count;
1405 
1406 	// Objects queued on a bulk-list.
1407 	struct list_head bulk_head[FREE_N_CHANNELS];
1408 	atomic_t bulk_count[FREE_N_CHANNELS];
1409 
1410 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
1411 	raw_spinlock_t lock;
1412 	struct delayed_work monitor_work;
1413 	bool initialized;
1414 
1415 	struct delayed_work page_cache_work;
1416 	atomic_t backoff_page_cache_fill;
1417 	atomic_t work_in_progress;
1418 	struct hrtimer hrtimer;
1419 
1420 	struct llist_head bkvcache;
1421 	int nr_bkv_objs;
1422 };
1423 
1424 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
1425 	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
1426 };
1427 
/*
 * Tell debug-objects that every pointer recorded in @bhead is about to
 * be freed. No-op unless CONFIG_DEBUG_OBJECTS_RCU_HEAD is enabled.
 */
static __always_inline void
debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
	int i;

	for (i = 0; i < bhead->nr_records; i++)
		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}
1438 
/*
 * Lock this CPU's kfree_rcu_cpu structure. Interrupts are disabled
 * first so the task cannot migrate between this_cpu_ptr() and taking
 * the lock.
 */
static inline struct kfree_rcu_cpu *
krc_this_cpu_lock(unsigned long *flags)
{
	struct kfree_rcu_cpu *krcp;

	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
	krcp = this_cpu_ptr(&krc);
	raw_spin_lock(&krcp->lock);

	return krcp;
}

/* Counterpart of krc_this_cpu_lock(): drop the lock and restore IRQs. */
static inline void
krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
1456 
/*
 * Pop one page-sized block from @krcp's cache, or return NULL when the
 * cache is empty. The caller must hold krcp->lock (see @bkvcache doc).
 */
static inline struct kvfree_rcu_bulk_data *
get_cached_bnode(struct kfree_rcu_cpu *krcp)
{
	if (!krcp->nr_bkv_objs)
		return NULL;

	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
	return (struct kvfree_rcu_bulk_data *)
		llist_del_first(&krcp->bkvcache);
}

/*
 * Return @bnode to @krcp's cache. Fails (returns false) once the cache
 * already holds rcu_min_cached_objs blocks; the caller then frees the
 * page itself. The caller must hold krcp->lock.
 */
static inline bool
put_cached_bnode(struct kfree_rcu_cpu *krcp,
	struct kvfree_rcu_bulk_data *bnode)
{
	// Check the limit.
	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
		return false;

	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
	return true;
}
1480 
1481 static int
1482 drain_page_cache(struct kfree_rcu_cpu *krcp)
1483 {
1484 	unsigned long flags;
1485 	struct llist_node *page_list, *pos, *n;
1486 	int freed = 0;
1487 
1488 	if (!rcu_min_cached_objs)
1489 		return 0;
1490 
1491 	raw_spin_lock_irqsave(&krcp->lock, flags);
1492 	page_list = llist_del_all(&krcp->bkvcache);
1493 	WRITE_ONCE(krcp->nr_bkv_objs, 0);
1494 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1495 
1496 	llist_for_each_safe(pos, n, page_list) {
1497 		free_page((unsigned long)pos);
1498 		freed++;
1499 	}
1500 
1501 	return freed;
1502 }
1503 
/*
 * Free every pointer recorded in @bnode. @idx selects the channel:
 * 0 for kmalloc()'ed objects (freed in one bulk call), 1 for
 * vmalloc()'ed objects (freed one by one). Afterwards the block itself
 * is either parked in the per-CPU page cache or given back to the page
 * allocator.
 */
static void
kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
	struct kvfree_rcu_bulk_data *bnode, int idx)
{
	unsigned long flags;
	int i;

	/* Freeing before the recorded grace period elapsed would be a bug. */
	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
		debug_rcu_bhead_unqueue(bnode);
		rcu_lock_acquire(&rcu_callback_map);
		if (idx == 0) { // kmalloc() / kfree().
			trace_rcu_invoke_kfree_bulk_callback(
				"slab", bnode->nr_records,
				bnode->records);

			kfree_bulk(bnode->nr_records, bnode->records);
		} else { // vmalloc() / vfree().
			for (i = 0; i < bnode->nr_records; i++) {
				trace_rcu_invoke_kvfree_callback(
					"slab", bnode->records[i], 0);

				vfree(bnode->records[i]);
			}
		}
		rcu_lock_release(&rcu_callback_map);
	}

	raw_spin_lock_irqsave(&krcp->lock, flags);
	/* Try to recycle the block; free the page when the cache is full. */
	if (put_cached_bnode(krcp, bnode))
		bnode = NULL;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	if (bnode)
		free_page((unsigned long) bnode);

	cond_resched_tasks_rcu_qs();
}
1541 
/*
 * Free a "Channel 3" list of objects, each linked through its embedded
 * rcu_head. Note head->func does not hold a callback here: it records
 * the start of the enclosing object (set up by kvfree_call_rcu()).
 */
static void
kvfree_rcu_list(struct rcu_head *head)
{
	struct rcu_head *next;

	for (; head; head = next) {
		void *ptr = (void *) head->func;
		unsigned long offset = (void *) head - ptr;

		next = head->next;
		debug_rcu_head_unqueue((struct rcu_head *)ptr);
		rcu_lock_acquire(&rcu_callback_map);
		trace_rcu_invoke_kvfree_callback("slab", head, offset);

		kvfree(ptr);

		rcu_lock_release(&rcu_callback_map);
		cond_resched_tasks_rcu_qs();
	}
}
1562 
1563 /*
1564  * This function is invoked in workqueue context after a grace period.
1565  * It frees all the objects queued on ->bulk_head_free or ->head_free.
1566  */
1567 static void kfree_rcu_work(struct work_struct *work)
1568 {
1569 	unsigned long flags;
1570 	struct kvfree_rcu_bulk_data *bnode, *n;
1571 	struct list_head bulk_head[FREE_N_CHANNELS];
1572 	struct rcu_head *head;
1573 	struct kfree_rcu_cpu *krcp;
1574 	struct kfree_rcu_cpu_work *krwp;
1575 	struct rcu_gp_oldstate head_gp_snap;
1576 	int i;
1577 
1578 	krwp = container_of(to_rcu_work(work),
1579 		struct kfree_rcu_cpu_work, rcu_work);
1580 	krcp = krwp->krcp;
1581 
1582 	raw_spin_lock_irqsave(&krcp->lock, flags);
1583 	// Channels 1 and 2.
1584 	for (i = 0; i < FREE_N_CHANNELS; i++)
1585 		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
1586 
1587 	// Channel 3.
1588 	head = krwp->head_free;
1589 	krwp->head_free = NULL;
1590 	head_gp_snap = krwp->head_free_gp_snap;
1591 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1592 
1593 	// Handle the first two channels.
1594 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1595 		// Start from the tail page, so a GP is likely passed for it.
1596 		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
1597 			kvfree_rcu_bulk(krcp, bnode, i);
1598 	}
1599 
1600 	/*
1601 	 * This is used when the "bulk" path can not be used for the
1602 	 * double-argument of kvfree_rcu().  This happens when the
1603 	 * page-cache is empty, which means that objects are instead
1604 	 * queued on a linked list through their rcu_head structures.
1605 	 * This list is named "Channel 3".
1606 	 */
1607 	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
1608 		kvfree_rcu_list(head);
1609 }
1610 
1611 static bool kfree_rcu_sheaf(void *obj)
1612 {
1613 	struct kmem_cache *s;
1614 	struct folio *folio;
1615 	struct slab *slab;
1616 
1617 	if (is_vmalloc_addr(obj))
1618 		return false;
1619 
1620 	folio = virt_to_folio(obj);
1621 	if (unlikely(!folio_test_slab(folio)))
1622 		return false;
1623 
1624 	slab = folio_slab(folio);
1625 	s = slab->slab_cache;
1626 	if (s->cpu_sheaves) {
1627 		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
1628 			   slab_nid(slab) == numa_mem_id()))
1629 			return __kfree_rcu_sheaf(s, obj);
1630 	}
1631 
1632 	return false;
1633 }
1634 
1635 static bool
1636 need_offload_krc(struct kfree_rcu_cpu *krcp)
1637 {
1638 	int i;
1639 
1640 	for (i = 0; i < FREE_N_CHANNELS; i++)
1641 		if (!list_empty(&krcp->bulk_head[i]))
1642 			return true;
1643 
1644 	return !!READ_ONCE(krcp->head);
1645 }
1646 
1647 static bool
1648 need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
1649 {
1650 	int i;
1651 
1652 	for (i = 0; i < FREE_N_CHANNELS; i++)
1653 		if (!list_empty(&krwp->bulk_head_free[i]))
1654 			return true;
1655 
1656 	return !!krwp->head_free;
1657 }
1658 
1659 static int krc_count(struct kfree_rcu_cpu *krcp)
1660 {
1661 	int sum = atomic_read(&krcp->head_count);
1662 	int i;
1663 
1664 	for (i = 0; i < FREE_N_CHANNELS; i++)
1665 		sum += atomic_read(&krcp->bulk_count[i]);
1666 
1667 	return sum;
1668 }
1669 
/*
 * (Re)arm the monitor work for @krcp. Once the batch has grown to
 * KVFREE_BULK_MAX_ENTR objects, drain on the next jiffy rather than
 * waiting the full KFREE_DRAIN_JIFFIES. The caller must hold
 * krcp->lock (see schedule_delayed_monitor_work()).
 */
static void
__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	long delay, delay_left;

	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
	if (delayed_work_pending(&krcp->monitor_work)) {
		delay_left = krcp->monitor_work.timer.expires - jiffies;
		/* Only ever shorten a pending timeout, never extend it. */
		if (delay < delay_left)
			mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
		return;
	}
	queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
}

/* Lock-taking wrapper around __schedule_delayed_monitor_work(). */
static void
schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	__schedule_delayed_monitor_work(krcp);
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
1694 
/*
 * Free those queued objects whose grace period, according to the
 * snapshot taken when they were queued, has already elapsed. Anything
 * not yet safe to free stays queued for a later pass.
 */
static void
kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
{
	struct list_head bulk_ready[FREE_N_CHANNELS];
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct rcu_head *head_ready = NULL;
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		INIT_LIST_HEAD(&bulk_ready[i]);

		/*
		 * Blocks are added at the list head, so the reverse walk
		 * visits the oldest blocks first; stop at the first one
		 * whose grace period is still pending.
		 */
		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
			if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
				break;

			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
			list_move(&bnode->list, &bulk_ready[i]);
		}
	}

	/* Channel 3: the rcu_head list shares one snapshot for all entries. */
	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
		head_ready = krcp->head;
		atomic_set(&krcp->head_count, 0);
		WRITE_ONCE(krcp->head, NULL);
	}
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	for (i = 0; i < FREE_N_CHANNELS; i++) {
		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	if (head_ready)
		kvfree_rcu_list(head_ready);
}
1732 
1733 /*
1734  * Return: %true if a work is queued, %false otherwise.
1735  */
1736 static bool
1737 kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
1738 {
1739 	unsigned long flags;
1740 	bool queued = false;
1741 	int i, j;
1742 
1743 	raw_spin_lock_irqsave(&krcp->lock, flags);
1744 
1745 	// Attempt to start a new batch.
1746 	for (i = 0; i < KFREE_N_BATCHES; i++) {
1747 		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
1748 
1749 		// Try to detach bulk_head or head and attach it, only when
1750 		// all channels are free.  Any channel is not free means at krwp
1751 		// there is on-going rcu work to handle krwp's free business.
1752 		if (need_wait_for_krwp_work(krwp))
1753 			continue;
1754 
1755 		// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
1756 		if (need_offload_krc(krcp)) {
1757 			// Channel 1 corresponds to the SLAB-pointer bulk path.
1758 			// Channel 2 corresponds to vmalloc-pointer bulk path.
1759 			for (j = 0; j < FREE_N_CHANNELS; j++) {
1760 				if (list_empty(&krwp->bulk_head_free[j])) {
1761 					atomic_set(&krcp->bulk_count[j], 0);
1762 					list_replace_init(&krcp->bulk_head[j],
1763 						&krwp->bulk_head_free[j]);
1764 				}
1765 			}
1766 
1767 			// Channel 3 corresponds to both SLAB and vmalloc
1768 			// objects queued on the linked list.
1769 			if (!krwp->head_free) {
1770 				krwp->head_free = krcp->head;
1771 				get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
1772 				atomic_set(&krcp->head_count, 0);
1773 				WRITE_ONCE(krcp->head, NULL);
1774 			}
1775 
1776 			// One work is per one batch, so there are three
1777 			// "free channels", the batch can handle. Break
1778 			// the loop since it is done with this CPU thus
1779 			// queuing an RCU work is _always_ success here.
1780 			queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
1781 			WARN_ON_ONCE(!queued);
1782 			break;
1783 		}
1784 	}
1785 
1786 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1787 	return queued;
1788 }
1789 
1790 /*
1791  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
1792  */
1793 static void kfree_rcu_monitor(struct work_struct *work)
1794 {
1795 	struct kfree_rcu_cpu *krcp = container_of(work,
1796 		struct kfree_rcu_cpu, monitor_work.work);
1797 
1798 	// Drain ready for reclaim.
1799 	kvfree_rcu_drain_ready(krcp);
1800 
1801 	// Queue a batch for a rest.
1802 	kvfree_rcu_queue_batch(krcp);
1803 
1804 	// If there is nothing to detach, it means that our job is
1805 	// successfully done here. In case of having at least one
1806 	// of the channels that is still busy we should rearm the
1807 	// work to repeat an attempt. Because previous batches are
1808 	// still in progress.
1809 	if (need_offload_krc(krcp))
1810 		schedule_delayed_monitor_work(krcp);
1811 }
1812 
/*
 * Worker that refills @krcp's page cache up to rcu_min_cached_objs
 * pages, or just a single page while a shrinker-requested backoff is
 * in effect.
 */
static void fill_page_cache_func(struct work_struct *work)
{
	struct kvfree_rcu_bulk_data *bnode;
	struct kfree_rcu_cpu *krcp =
		container_of(work, struct kfree_rcu_cpu,
			page_cache_work.work);
	unsigned long flags;
	int nr_pages;
	bool pushed;
	int i;

	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
		1 : rcu_min_cached_objs;

	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);

		if (!bnode)
			break;

		raw_spin_lock_irqsave(&krcp->lock, flags);
		pushed = put_cached_bnode(krcp, bnode);
		raw_spin_unlock_irqrestore(&krcp->lock, flags);

		// The cache hit its limit meanwhile; give the page back.
		if (!pushed) {
			free_page((unsigned long) bnode);
			break;
		}
	}

	// Refill done: allow a new worker and clear any backoff request.
	atomic_set(&krcp->work_in_progress, 0);
	atomic_set(&krcp->backoff_page_cache_fill, 0);
}
1847 
// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
// state specified by flags.  If can_alloc is true, the caller must
// be schedulable and not be holding any locks or mutexes that might be
// acquired by the memory allocator or anything that it might invoke.
// Returns true if ptr was successfully recorded, else the caller must
// use a fallback.
static inline bool
add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
	unsigned long *flags, void *ptr, bool can_alloc)
{
	struct kvfree_rcu_bulk_data *bnode;
	int idx;

	*krcp = krc_this_cpu_lock(flags);
	if (unlikely(!(*krcp)->initialized))
		return false;

	// Channel 0 holds kmalloc()'ed pointers, channel 1 vmalloc()'ed ones.
	idx = !!is_vmalloc_addr(ptr);
	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
		struct kvfree_rcu_bulk_data, list);

	/* Check if a new block is required. */
	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
		bnode = get_cached_bnode(*krcp);
		if (!bnode && can_alloc) {
			krc_this_cpu_unlock(*krcp, *flags);

			// __GFP_NORETRY - allows a light-weight direct reclaim
			// what is OK from minimizing of fallback hitting point of
			// view. Apart of that it forbids any OOM invoking what is
			// also beneficial since we are about to release memory soon.
			//
			// __GFP_NOMEMALLOC - prevents from consuming of all the
			// memory reserves. Please note we have a fallback path.
			//
			// __GFP_NOWARN - it is supposed that an allocation can
			// be failed under low memory or high memory pressure
			// scenarios.
			bnode = (struct kvfree_rcu_bulk_data *)
				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			// The task may have migrated while unlocked; the krcp
			// it started on is re-locked regardless.
			raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
		}

		if (!bnode)
			return false;

		// Initialize the new block and attach it.
		bnode->nr_records = 0;
		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
	}

	// Finally insert and update the GP for this page.
	bnode->nr_records++;
	bnode->records[bnode->nr_records - 1] = ptr;
	get_state_synchronize_rcu_full(&bnode->gp_snap);
	atomic_inc(&(*krcp)->bulk_count[idx]);

	return true;
}
1907 
/*
 * hrtimer callback armed by run_page_cache_worker(): kick off the
 * page-cache refill work with no further delay.
 */
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
	struct kfree_rcu_cpu *krcp =
		container_of(t, struct kfree_rcu_cpu, hrtimer);

	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
	return HRTIMER_NORESTART;
}
1917 
/*
 * Arrange for @krcp's page cache to be refilled. While a shrinker
 * backoff is active the refill is delayed by
 * rcu_delay_page_cache_fill_msec; otherwise it is kicked right away
 * via a zero-length hrtimer. work_in_progress ensures that at most one
 * refill is in flight.
 */
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
	// If cache disabled, bail out.
	if (!rcu_min_cached_objs)
		return;

	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
			!atomic_xchg(&krcp->work_in_progress, 1)) {
		if (atomic_read(&krcp->backoff_page_cache_fill)) {
			queue_delayed_work(rcu_reclaim_wq,
				&krcp->page_cache_work,
					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
		} else {
			hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
				      HRTIMER_MODE_REL);
			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
		}
	}
}
1938 
/*
 * Invoked once the RCU scheduler is running: kvfree_call_rcu() does not
 * arm the monitor work before that point, so arm it now for every CPU
 * that already has objects queued.
 */
void __init kfree_rcu_scheduler_running(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		if (need_offload_krc(krcp))
			schedule_delayed_monitor_work(krcp);
	}
}
1950 
1951 /*
1952  * Queue a request for lazy invocation of the appropriate free routine
1953  * after a grace period.  Please note that three paths are maintained,
1954  * two for the common case using arrays of pointers and a third one that
1955  * is used only when the main paths cannot be used, for example, due to
1956  * memory pressure.
1957  *
1958  * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
1959  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
1960  * be free'd in workqueue context. This allows us to: batch requests together to
1961  * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
1962  */
1963 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1964 {
1965 	unsigned long flags;
1966 	struct kfree_rcu_cpu *krcp;
1967 	bool success;
1968 
1969 	/*
1970 	 * Please note there is a limitation for the head-less
1971 	 * variant, that is why there is a clear rule for such
1972 	 * objects: it can be used from might_sleep() context
1973 	 * only. For other places please embed an rcu_head to
1974 	 * your data.
1975 	 */
1976 	if (!head)
1977 		might_sleep();
1978 
1979 	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
1980 		return;
1981 
1982 	// Queue the object but don't yet schedule the batch.
1983 	if (debug_rcu_head_queue(ptr)) {
1984 		// Probable double kfree_rcu(), just leak.
1985 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
1986 			  __func__, head);
1987 
1988 		// Mark as success and leave.
1989 		return;
1990 	}
1991 
1992 	kasan_record_aux_stack(ptr);
1993 	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
1994 	if (!success) {
1995 		run_page_cache_worker(krcp);
1996 
1997 		if (head == NULL)
1998 			// Inline if kvfree_rcu(one_arg) call.
1999 			goto unlock_return;
2000 
2001 		head->func = ptr;
2002 		head->next = krcp->head;
2003 		WRITE_ONCE(krcp->head, head);
2004 		atomic_inc(&krcp->head_count);
2005 
2006 		// Take a snapshot for this krcp.
2007 		krcp->head_gp_snap = get_state_synchronize_rcu();
2008 		success = true;
2009 	}
2010 
2011 	/*
2012 	 * The kvfree_rcu() caller considers the pointer freed at this point
2013 	 * and likely removes any references to it. Since the actual slab
2014 	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
2015 	 * this object (no scanning or false positives reporting).
2016 	 */
2017 	kmemleak_ignore(ptr);
2018 
2019 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
2020 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
2021 		__schedule_delayed_monitor_work(krcp);
2022 
2023 unlock_return:
2024 	krc_this_cpu_unlock(krcp, flags);
2025 
2026 	/*
2027 	 * Inline kvfree() after synchronize_rcu(). We can do
2028 	 * it from might_sleep() context only, so the current
2029 	 * CPU can pass the QS state.
2030 	 */
2031 	if (!success) {
2032 		debug_rcu_head_unqueue((struct rcu_head *) ptr);
2033 		synchronize_rcu();
2034 		kvfree(ptr);
2035 	}
2036 }
2037 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
2038 
2039 /**
2040  * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
2041  *
2042  * Note that a single argument of kvfree_rcu() call has a slow path that
2043  * triggers synchronize_rcu() followed by freeing a pointer. It is done
2044  * before the return from the function. Therefore for any single-argument
2045  * call that will result in a kfree() to a cache that is to be destroyed
2046  * during module exit, it is developer's responsibility to ensure that all
2047  * such calls have returned before the call to kmem_cache_destroy().
2048  */
void kvfree_rcu_barrier(void)
{
	struct kfree_rcu_cpu_work *krwp;
	struct kfree_rcu_cpu *krcp;
	bool queued;
	int i, cpu;

	// Flush objects queued on per-CPU sheaves before the batch flush.
	flush_all_rcu_sheaves();

	/*
	 * Firstly we detach objects and queue them over an RCU-batch
	 * for all CPUs. Finally queued works are flushed for each CPU.
	 *
	 * Please note. If there are outstanding batches for a particular
	 * CPU, those have to be finished first following by queuing a new.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * Check if this CPU has any objects which have been queued for a
		 * new GP completion. If not(means nothing to detach), we are done
		 * with it. If any batch is pending/running for this "krcp", below
		 * per-cpu flush_rcu_work() waits its completion(see last step).
		 */
		if (!need_offload_krc(krcp))
			continue;

		/* Retry until a batch is queued or nothing is left to offload. */
		while (1) {
			/*
			 * If we are not able to queue a new RCU work it means:
			 * - batches for this CPU are still in flight which should
			 *   be flushed first and then repeat;
			 * - no objects to detach, because of concurrency.
			 */
			queued = kvfree_rcu_queue_batch(krcp);

			/*
			 * Bail out, if there is no need to offload this "krcp"
			 * anymore. As noted earlier it can run concurrently.
			 */
			if (queued || !need_offload_krc(krcp))
				break;

			// There are ongoing batches. Wait for their completion, then retry.
			for (i = 0; i < KFREE_N_BATCHES; i++) {
				krwp = &(krcp->krw_arr[i]);
				flush_rcu_work(&krwp->rcu_work);
			}
		}
	}

	/*
	 * Now we guarantee that all objects are flushed.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * A monitor work can drain ready to reclaim objects
		 * directly. Wait its completion if running or pending.
		 */
		cancel_delayed_work_sync(&krcp->monitor_work);

		// Wait for every batch queued in the first phase to finish.
		for (i = 0; i < KFREE_N_BATCHES; i++) {
			krwp = &(krcp->krw_arr[i]);
			flush_rcu_work(&krwp->rcu_work);
		}
	}
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
2120 
2121 static unsigned long
2122 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2123 {
2124 	int cpu;
2125 	unsigned long count = 0;
2126 
2127 	/* Snapshot count of all CPUs */
2128 	for_each_possible_cpu(cpu) {
2129 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2130 
2131 		count += krc_count(krcp);
2132 		count += READ_ONCE(krcp->nr_bkv_objs);
2133 		atomic_set(&krcp->backoff_page_cache_fill, 1);
2134 	}
2135 
2136 	return count == 0 ? SHRINK_EMPTY : count;
2137 }
2138 
2139 static unsigned long
2140 kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2141 {
2142 	int cpu, freed = 0;
2143 
2144 	for_each_possible_cpu(cpu) {
2145 		int count;
2146 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2147 
2148 		count = krc_count(krcp);
2149 		count += drain_page_cache(krcp);
2150 		kfree_rcu_monitor(&krcp->monitor_work.work);
2151 
2152 		sc->nr_to_scan -= count;
2153 		freed += count;
2154 
2155 		if (sc->nr_to_scan <= 0)
2156 			break;
2157 	}
2158 
2159 	return freed == 0 ? SHRINK_STOP : freed;
2160 }
2161 
2162 void __init kvfree_rcu_init(void)
2163 {
2164 	int cpu;
2165 	int i, j;
2166 	struct shrinker *kfree_rcu_shrinker;
2167 
2168 	rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
2169 			WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
2170 	WARN_ON(!rcu_reclaim_wq);
2171 
2172 	/* Clamp it to [0:100] seconds interval. */
2173 	if (rcu_delay_page_cache_fill_msec < 0 ||
2174 		rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
2175 
2176 		rcu_delay_page_cache_fill_msec =
2177 			clamp(rcu_delay_page_cache_fill_msec, 0,
2178 				(int) (100 * MSEC_PER_SEC));
2179 
2180 		pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
2181 			rcu_delay_page_cache_fill_msec);
2182 	}
2183 
2184 	for_each_possible_cpu(cpu) {
2185 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2186 
2187 		for (i = 0; i < KFREE_N_BATCHES; i++) {
2188 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
2189 			krcp->krw_arr[i].krcp = krcp;
2190 
2191 			for (j = 0; j < FREE_N_CHANNELS; j++)
2192 				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
2193 		}
2194 
2195 		for (i = 0; i < FREE_N_CHANNELS; i++)
2196 			INIT_LIST_HEAD(&krcp->bulk_head[i]);
2197 
2198 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
2199 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
2200 		krcp->initialized = true;
2201 	}
2202 
2203 	kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
2204 	if (!kfree_rcu_shrinker) {
2205 		pr_err("Failed to allocate kfree_rcu() shrinker!\n");
2206 		return;
2207 	}
2208 
2209 	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
2210 	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
2211 
2212 	shrinker_register(kfree_rcu_shrinker);
2213 }
2214 
2215 #endif /* CONFIG_KVFREE_RCU_BATCHED */
2216 
2217