xref: /linux/mm/slab_common.c (revision 24d9e8b3c9c8a6f72c8b4c196a703e144928d919)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Slab allocator functions that are independent of the allocator strategy
4  *
5  * (C) 2012 Christoph Lameter <cl@gentwo.org>
6  */
7 #include <linux/slab.h>
8 
9 #include <linux/mm.h>
10 #include <linux/poison.h>
11 #include <linux/interrupt.h>
12 #include <linux/memory.h>
13 #include <linux/cache.h>
14 #include <linux/compiler.h>
15 #include <linux/kfence.h>
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/seq_file.h>
20 #include <linux/dma-mapping.h>
21 #include <linux/swiotlb.h>
22 #include <linux/proc_fs.h>
23 #include <linux/debugfs.h>
24 #include <linux/kmemleak.h>
25 #include <linux/kasan.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/page.h>
29 #include <linux/memcontrol.h>
30 #include <linux/stackdepot.h>
31 #include <trace/events/rcu.h>
32 
33 #include "../kernel/rcu/rcu.h"
34 #include "internal.h"
35 #include "slab.h"
36 
37 #define CREATE_TRACE_POINTS
38 #include <trace/events/kmem.h>
39 
40 enum slab_state slab_state;
41 LIST_HEAD(slab_caches);
42 DEFINE_MUTEX(slab_mutex);
43 struct kmem_cache *kmem_cache;
44 
45 /*
46  * Set of flags that will prevent slab merging
47  */
48 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
49 		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
50 		SLAB_FAILSLAB | SLAB_NO_MERGE)
51 
52 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
53 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
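
/*
 * Two caches may only be merged when neither carries a SLAB_NEVER_MERGE
 * flag and both agree on every SLAB_MERGE_SAME flag; see find_mergeable()
 * below for the full set of checks (size, alignment, ctor, etc.).
 */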
54 
55 /*
56  * Merge control. If this is set then no merging of slab caches will occur.
57  */
58 static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
59 
60 static int __init setup_slab_nomerge(char *str)
61 {
62 	slab_nomerge = true;
63 	return 1;
64 }
65 
66 static int __init setup_slab_merge(char *str)
67 {
68 	slab_nomerge = false;
69 	return 1;
70 }
71 
72 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
73 __setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
74 
75 __setup("slab_nomerge", setup_slab_nomerge);
76 __setup("slab_merge", setup_slab_merge);
77 
78 /*
79  * Determine the size of a slab object
80  */
81 unsigned int kmem_cache_size(struct kmem_cache *s)
82 {
83 	return s->object_size;
84 }
85 EXPORT_SYMBOL(kmem_cache_size);
86 
87 #ifdef CONFIG_DEBUG_VM
88 
89 static bool kmem_cache_is_duplicate_name(const char *name)
90 {
91 	struct kmem_cache *s;
92 
93 	list_for_each_entry(s, &slab_caches, list) {
94 		if (!strcmp(s->name, name))
95 			return true;
96 	}
97 
98 	return false;
99 }
100 
101 static int kmem_cache_sanity_check(const char *name, unsigned int size)
102 {
103 	if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
104 		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
105 		return -EINVAL;
106 	}
107 
108 	/* Duplicate names will confuse slabtop, et al */
109 	WARN(kmem_cache_is_duplicate_name(name),
110 			"kmem_cache of name '%s' already exists\n", name);
111 
112 	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
113 	return 0;
114 }
115 #else
116 static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
117 {
118 	return 0;
119 }
120 #endif
121 
122 /*
123  * Figure out what the alignment of the objects will be given a set of
124  * flags, a user specified alignment and the size of the objects.
125  */
126 static unsigned int calculate_alignment(slab_flags_t flags,
127 		unsigned int align, unsigned int size)
128 {
129 	/*
130 	 * If the user wants hardware cache aligned objects then follow that
131 	 * suggestion if the object is sufficiently large.
132 	 *
133 	 * The hardware cache alignment cannot override the specified
134 	 * alignment though. If that is greater, use it.
135 	 */
136 	if (flags & SLAB_HWCACHE_ALIGN) {
137 		unsigned int ralign;
138 
139 		ralign = cache_line_size();
140 		while (size <= ralign / 2)
141 			ralign /= 2;
142 		align = max(align, ralign);
143 	}
144 
145 	align = max(align, arch_slab_minalign());
146 
147 	return ALIGN(align, sizeof(void *));
148 }
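
/*
 * For example (illustrative numbers): with SLAB_HWCACHE_ALIGN, a 64-byte
 * cache line, an object size of 24 and a requested alignment of 8, the
 * loop above halves ralign from 64 to 32 (24 <= 32 but 24 > 16), so the
 * resulting alignment is max(8, 32) == 32, possibly bumped further by
 * arch_slab_minalign() and rounded to a multiple of sizeof(void *).
 */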
149 
150 /*
151  * Determine whether a slab cache is allowed to be merged with another one
152  */
153 int slab_unmergeable(struct kmem_cache *s)
154 {
155 	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
156 		return 1;
157 
158 	if (s->ctor)
159 		return 1;
160 
161 #ifdef CONFIG_HARDENED_USERCOPY
162 	if (s->usersize)
163 		return 1;
164 #endif
165 
166 	if (s->cpu_sheaves)
167 		return 1;
168 
169 	/*
170 	 * We may have set a slab to be unmergeable during bootstrap.
171 	 */
172 	if (s->refcount < 0)
173 		return 1;
174 
175 	return 0;
176 }
177 
178 struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
179 		slab_flags_t flags, const char *name, void (*ctor)(void *))
180 {
181 	struct kmem_cache *s;
182 
183 	if (slab_nomerge)
184 		return NULL;
185 
186 	if (ctor)
187 		return NULL;
188 
189 	flags = kmem_cache_flags(flags, name);
190 
191 	if (flags & SLAB_NEVER_MERGE)
192 		return NULL;
193 
194 	size = ALIGN(size, sizeof(void *));
195 	align = calculate_alignment(flags, align, size);
196 	size = ALIGN(size, align);
197 
198 	list_for_each_entry_reverse(s, &slab_caches, list) {
199 		if (slab_unmergeable(s))
200 			continue;
201 
202 		if (size > s->size)
203 			continue;
204 
205 		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
206 			continue;
207 		/*
208 		 * Check if alignment is compatible.
209 		 * Courtesy of Adrian Drzewiecki
210 		 */
211 		if ((s->size & ~(align - 1)) != s->size)
212 			continue;
213 
214 		if (s->size - size >= sizeof(void *))
215 			continue;
216 
217 		return s;
218 	}
219 	return NULL;
220 }
221 
222 static struct kmem_cache *create_cache(const char *name,
223 				       unsigned int object_size,
224 				       struct kmem_cache_args *args,
225 				       slab_flags_t flags)
226 {
227 	struct kmem_cache *s;
228 	int err;
229 
230 	/* If a custom freelist pointer is requested make sure it's sane. */
231 	err = -EINVAL;
232 	if (args->use_freeptr_offset &&
233 	    (args->freeptr_offset >= object_size ||
234 	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
235 	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
236 		goto out;
237 
238 	err = -ENOMEM;
239 	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
240 	if (!s)
241 		goto out;
242 	err = do_kmem_cache_create(s, name, object_size, args, flags);
243 	if (err)
244 		goto out_free_cache;
245 
246 	s->refcount = 1;
247 	list_add(&s->list, &slab_caches);
248 	return s;
249 
250 out_free_cache:
251 	kmem_cache_free(kmem_cache, s);
252 out:
253 	return ERR_PTR(err);
254 }
255 
256 /**
257  * __kmem_cache_create_args - Create a kmem cache.
258  * @name: A string which is used in /proc/slabinfo to identify this cache.
259  * @object_size: The size of objects to be created in this cache.
260  * @args: Additional arguments for the cache creation (see
261  *        &struct kmem_cache_args).
262  * @flags: See the descriptions of individual flags. The common ones are listed
263  *         in the description below.
264  *
265  * Not to be called directly, use the kmem_cache_create() wrapper with the same
266  * parameters.
267  *
268  * Commonly used @flags:
269  *
270  * &SLAB_ACCOUNT - Account allocations to memcg.
271  *
272  * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
273  *
274  * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
275  *
276  * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
277  * by a grace period - see the full description before using.
278  *
279  * Context: Cannot be called within an interrupt, but can be interrupted.
280  *
281  * Return: a pointer to the cache on success, NULL on failure.
282  */
283 struct kmem_cache *__kmem_cache_create_args(const char *name,
284 					    unsigned int object_size,
285 					    struct kmem_cache_args *args,
286 					    slab_flags_t flags)
287 {
288 	struct kmem_cache *s = NULL;
289 	const char *cache_name;
290 	int err;
291 
292 #ifdef CONFIG_SLUB_DEBUG
293 	/*
294 	 * If no slab_debug was enabled globally, the static key is not yet
295 	 * enabled by setup_slub_debug(). Enable it if the cache is being
296 	 * created with any of the debugging flags passed explicitly.
297 	 * It's also possible that this is the first cache created with
298 	 * SLAB_STORE_USER and we should init stack_depot for it.
299 	 */
300 	if (flags & SLAB_DEBUG_FLAGS)
301 		static_branch_enable(&slub_debug_enabled);
302 	if (flags & SLAB_STORE_USER)
303 		stack_depot_init();
304 #else
305 	flags &= ~SLAB_DEBUG_FLAGS;
306 #endif
307 
308 	mutex_lock(&slab_mutex);
309 
310 	err = kmem_cache_sanity_check(name, object_size);
311 	if (err) {
312 		goto out_unlock;
313 	}
314 
315 	if (flags & ~SLAB_FLAGS_PERMITTED) {
316 		err = -EINVAL;
317 		goto out_unlock;
318 	}
319 
320 	/* Fail closed on bad usersize or useroffset values. */
321 	if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
322 	    WARN_ON(!args->usersize && args->useroffset) ||
323 	    WARN_ON(object_size < args->usersize ||
324 		    object_size - args->usersize < args->useroffset))
325 		args->usersize = args->useroffset = 0;
326 
327 	if (!args->usersize && !args->sheaf_capacity)
328 		s = __kmem_cache_alias(name, object_size, args->align, flags,
329 				       args->ctor);
330 	if (s)
331 		goto out_unlock;
332 
333 	cache_name = kstrdup_const(name, GFP_KERNEL);
334 	if (!cache_name) {
335 		err = -ENOMEM;
336 		goto out_unlock;
337 	}
338 
339 	args->align = calculate_alignment(flags, args->align, object_size);
340 	s = create_cache(cache_name, object_size, args, flags);
341 	if (IS_ERR(s)) {
342 		err = PTR_ERR(s);
343 		kfree_const(cache_name);
344 	}
345 
346 out_unlock:
347 	mutex_unlock(&slab_mutex);
348 
349 	if (err) {
350 		if (flags & SLAB_PANIC)
351 			panic("%s: Failed to create slab '%s'. Error %d\n",
352 				__func__, name, err);
353 		else {
354 			pr_warn("%s(%s) failed with error %d\n",
355 				__func__, name, err);
356 			dump_stack();
357 		}
358 		return NULL;
359 	}
360 	return s;
361 }
362 EXPORT_SYMBOL(__kmem_cache_create_args);
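
/*
 * Example (illustrative only, not used in this file): a typical caller
 * creates its cache once at init time via the kmem_cache_create() wrapper
 * and then allocates/frees objects from it:
 *
 *	struct foo { int a; long b; };			// hypothetical type
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 *	...
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */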
363 
364 static struct kmem_cache *kmem_buckets_cache __ro_after_init;
365 
366 /**
367  * kmem_buckets_create - Create a set of caches that handle dynamic sized
368  *			 allocations via kmem_buckets_alloc()
369  * @name: A prefix string which is used in /proc/slabinfo to identify this
370  *	  cache. The individual caches will have their sizes as the suffix.
371  * @flags: SLAB flags (see kmem_cache_create() for details).
372  * @useroffset: Starting offset within an allocation that may be copied
373  *		to/from userspace.
374  * @usersize: How many bytes, starting at @useroffset, may be copied
375  *		to/from userspace.
376  * @ctor: A constructor for the objects, run when new allocations are made.
377  *
378  * Cannot be called within an interrupt, but can be interrupted.
379  *
380  * Return: a pointer to the cache on success, NULL on failure. When
381  * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
382  * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
383  * (i.e. callers only need to check for NULL on failure.)
384  */
385 kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
386 				  unsigned int useroffset,
387 				  unsigned int usersize,
388 				  void (*ctor)(void *))
389 {
390 	unsigned long mask = 0;
391 	unsigned int idx;
392 	kmem_buckets *b;
393 
394 	BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);
395 
396 	/*
397 	 * When the separate buckets API is not built in, just return
398 	 * a non-NULL value for the kmem_buckets pointer, which will be
399 	 * unused when performing allocations.
400 	 */
401 	if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
402 		return ZERO_SIZE_PTR;
403 
404 	if (WARN_ON(!kmem_buckets_cache))
405 		return NULL;
406 
407 	b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
408 	if (WARN_ON(!b))
409 		return NULL;
410 
411 	flags |= SLAB_NO_MERGE;
412 
413 	for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
414 		char *short_size, *cache_name;
415 		unsigned int cache_useroffset, cache_usersize;
416 		unsigned int size, aligned_idx;
417 
418 		if (!kmalloc_caches[KMALLOC_NORMAL][idx])
419 			continue;
420 
421 		size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
422 		if (!size)
423 			continue;
424 
425 		short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
426 		if (WARN_ON(!short_size))
427 			goto fail;
428 
429 		if (useroffset >= size) {
430 			cache_useroffset = 0;
431 			cache_usersize = 0;
432 		} else {
433 			cache_useroffset = useroffset;
434 			cache_usersize = min(size - cache_useroffset, usersize);
435 		}
436 
437 		aligned_idx = __kmalloc_index(size, false);
438 		if (!(*b)[aligned_idx]) {
439 			cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
440 			if (WARN_ON(!cache_name))
441 				goto fail;
442 			(*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
443 					0, flags, cache_useroffset,
444 					cache_usersize, ctor);
445 			kfree(cache_name);
446 			if (WARN_ON(!(*b)[aligned_idx]))
447 				goto fail;
448 			set_bit(aligned_idx, &mask);
449 		}
450 		if (idx != aligned_idx)
451 			(*b)[idx] = (*b)[aligned_idx];
452 	}
453 
454 	return b;
455 
456 fail:
457 	for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
458 		kmem_cache_destroy((*b)[idx]);
459 	kmem_cache_free(kmem_buckets_cache, b);
460 
461 	return NULL;
462 }
463 EXPORT_SYMBOL(kmem_buckets_create);
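
/*
 * Example (illustrative only): a subsystem making attacker-influenced,
 * dynamically sized allocations can give them dedicated buckets so they
 * are not mixed with other kmalloc users of the same size:
 *
 *	static kmem_buckets *foo_buckets;		// hypothetical
 *
 *	foo_buckets = kmem_buckets_create("foo", SLAB_ACCOUNT, 0, 0, NULL);
 *	...
 *	void *p = kmem_buckets_alloc(foo_buckets, len, GFP_KERNEL);
 *	...
 *	kfree(p);
 */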
464 
465 /*
466  * For a given kmem_cache, kmem_cache_destroy() should only be called
467  * once or there will be a use-after-free problem. The actual deletion
468  * and release of the kobject do not need slab_mutex or cpu_hotplug_lock
469  * protection, so they are done without holding those locks.
470  */
471 static void kmem_cache_release(struct kmem_cache *s)
472 {
473 	kfence_shutdown_cache(s);
474 	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
475 		sysfs_slab_release(s);
476 	else
477 		slab_kmem_cache_release(s);
478 }
479 
480 void slab_kmem_cache_release(struct kmem_cache *s)
481 {
482 	__kmem_cache_release(s);
483 	kfree_const(s->name);
484 	kmem_cache_free(kmem_cache, s);
485 }
486 
487 void kmem_cache_destroy(struct kmem_cache *s)
488 {
489 	int err;
490 
491 	if (unlikely(!s) || !kasan_check_byte(s))
492 		return;
493 
494 	/* in-flight kfree_rcu()'s may include objects from our cache */
495 	kvfree_rcu_barrier();
496 
497 	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
498 	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
499 		/*
500 		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
501 		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
502 		 * defer their freeing with call_rcu().
503 		 * Wait for such call_rcu() invocations here before actually
504 		 * destroying the cache.
505 		 *
506 		 * It doesn't matter that we haven't looked at the slab refcount
507 		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
508 		 * the refcount should be 1 here.
509 		 */
510 		rcu_barrier();
511 	}
512 
513 	/* Wait for deferred work from kmalloc/kfree_nolock() */
514 	defer_free_barrier();
515 
516 	cpus_read_lock();
517 	mutex_lock(&slab_mutex);
518 
519 	s->refcount--;
520 	if (s->refcount) {
521 		mutex_unlock(&slab_mutex);
522 		cpus_read_unlock();
523 		return;
524 	}
525 
526 	/* free asan quarantined objects */
527 	kasan_cache_shutdown(s);
528 
529 	err = __kmem_cache_shutdown(s);
530 	if (!slab_in_kunit_test())
531 		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
532 		     __func__, s->name, (void *)_RET_IP_);
533 
534 	list_del(&s->list);
535 
536 	mutex_unlock(&slab_mutex);
537 	cpus_read_unlock();
538 
539 	if (slab_state >= FULL)
540 		sysfs_slab_unlink(s);
541 	debugfs_slab_release(s);
542 
543 	if (err)
544 		return;
545 
546 	if (s->flags & SLAB_TYPESAFE_BY_RCU)
547 		rcu_barrier();
548 
549 	kmem_cache_release(s);
550 }
551 EXPORT_SYMBOL(kmem_cache_destroy);
552 
553 /**
554  * kmem_cache_shrink - Shrink a cache.
555  * @cachep: The cache to shrink.
556  *
557  * Releases as many slabs as possible for a cache.
558  * To help debugging, a zero exit status indicates all slabs were released.
559  *
560  * Return: %0 if all slabs were released, non-zero otherwise
561  */
562 int kmem_cache_shrink(struct kmem_cache *cachep)
563 {
564 	kasan_cache_shrink(cachep);
565 
566 	return __kmem_cache_shrink(cachep);
567 }
568 EXPORT_SYMBOL(kmem_cache_shrink);
569 
570 bool slab_is_available(void)
571 {
572 	return slab_state >= UP;
573 }
574 
575 #ifdef CONFIG_PRINTK
576 static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
577 {
578 	if (__kfence_obj_info(kpp, object, slab))
579 		return;
580 	__kmem_obj_info(kpp, object, slab);
581 }
582 
583 /**
584  * kmem_dump_obj - Print available slab provenance information
585  * @object: slab object for which to find provenance information.
586  *
587  * This function uses pr_cont(), so that the caller is expected to have
588  * printed out whatever preamble is appropriate.  The provenance information
589  * depends on the type of object and on how much debugging is enabled.
590  * For a slab-cache object, the fact that it is a slab object is printed,
591  * and, if available, the slab name, return address, and stack trace from
592  * the allocation and last free path of that object.
593  *
594  * Return: %true if the pointer is to a not-yet-freed object from
595  * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
596  * is to an already-freed object, and %false otherwise.
597  */
598 bool kmem_dump_obj(void *object)
599 {
600 	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
601 	int i;
602 	struct slab *slab;
603 	unsigned long ptroffset;
604 	struct kmem_obj_info kp = { };
605 
606 	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
607 	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
608 		return false;
609 	slab = virt_to_slab(object);
610 	if (!slab)
611 		return false;
612 
613 	kmem_obj_info(&kp, object, slab);
614 	if (kp.kp_slab_cache)
615 		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
616 	else
617 		pr_cont(" slab%s", cp);
618 	if (is_kfence_address(object))
619 		pr_cont(" (kfence)");
620 	if (kp.kp_objp)
621 		pr_cont(" start %px", kp.kp_objp);
622 	if (kp.kp_data_offset)
623 		pr_cont(" data offset %lu", kp.kp_data_offset);
624 	if (kp.kp_objp) {
625 		ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
626 		pr_cont(" pointer offset %lu", ptroffset);
627 	}
628 	if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
629 		pr_cont(" size %u", kp.kp_slab_cache->object_size);
630 	if (kp.kp_ret)
631 		pr_cont(" allocated at %pS\n", kp.kp_ret);
632 	else
633 		pr_cont("\n");
634 	for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
635 		if (!kp.kp_stack[i])
636 			break;
637 		pr_info("    %pS\n", kp.kp_stack[i]);
638 	}
639 
640 	if (kp.kp_free_stack[0])
641 		pr_cont(" Free path:\n");
642 
643 	for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
644 		if (!kp.kp_free_stack[i])
645 			break;
646 		pr_info("    %pS\n", kp.kp_free_stack[i]);
647 	}
648 
649 	return true;
650 }
651 EXPORT_SYMBOL_GPL(kmem_dump_obj);
652 #endif
653 
654 /* Create a cache during boot when no slab services are available yet */
655 void __init create_boot_cache(struct kmem_cache *s, const char *name,
656 		unsigned int size, slab_flags_t flags,
657 		unsigned int useroffset, unsigned int usersize)
658 {
659 	int err;
660 	unsigned int align = ARCH_KMALLOC_MINALIGN;
661 	struct kmem_cache_args kmem_args = {};
662 
663 	/*
664 	 * kmalloc caches guarantee alignment of at least the largest
665 	 * power-of-two divisor of the size. For power-of-two sizes,
666 	 * it is the size itself.
667 	 */
668 	if (flags & SLAB_KMALLOC)
669 		align = max(align, 1U << (ffs(size) - 1));
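	/*
	 * For example, a hypothetical kmalloc size of 96 has 32 as its
	 * largest power-of-two divisor (1U << (ffs(96) - 1) == 32), so
	 * kmalloc-96 objects end up at least 32-byte aligned.
	 */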
670 	kmem_args.align = calculate_alignment(flags, align, size);
671 
672 #ifdef CONFIG_HARDENED_USERCOPY
673 	kmem_args.useroffset = useroffset;
674 	kmem_args.usersize = usersize;
675 #endif
676 
677 	err = do_kmem_cache_create(s, name, size, &kmem_args, flags);
678 
679 	if (err)
680 		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
681 					name, size, err);
682 
683 	s->refcount = -1;	/* Exempt from merging for now */
684 }
685 
686 static struct kmem_cache *__init create_kmalloc_cache(const char *name,
687 						      unsigned int size,
688 						      slab_flags_t flags)
689 {
690 	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
691 
692 	if (!s)
693 		panic("Out of memory when creating slab %s\n", name);
694 
695 	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
696 	list_add(&s->list, &slab_caches);
697 	s->refcount = 1;
698 	return s;
699 }
700 
701 kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
702 { /* initialization for https://llvm.org/pr42570 */ };
703 EXPORT_SYMBOL(kmalloc_caches);
704 
705 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
706 unsigned long random_kmalloc_seed __ro_after_init;
707 EXPORT_SYMBOL(random_kmalloc_seed);
708 #endif
709 
710 /*
711  * Conversion table for small slab sizes / 8 to the index in the
712  * kmalloc array. This is necessary for slabs < 192 since we have
713  * non-power-of-two cache sizes there. The size of larger slabs can be
714  * determined using fls.
715  */
716 u8 kmalloc_size_index[24] __ro_after_init = {
717 	3,	/* 8 */
718 	4,	/* 16 */
719 	5,	/* 24 */
720 	5,	/* 32 */
721 	6,	/* 40 */
722 	6,	/* 48 */
723 	6,	/* 56 */
724 	6,	/* 64 */
725 	1,	/* 72 */
726 	1,	/* 80 */
727 	1,	/* 88 */
728 	1,	/* 96 */
729 	7,	/* 104 */
730 	7,	/* 112 */
731 	7,	/* 120 */
732 	7,	/* 128 */
733 	2,	/* 136 */
734 	2,	/* 144 */
735 	2,	/* 152 */
736 	2,	/* 160 */
737 	2,	/* 168 */
738 	2,	/* 176 */
739 	2,	/* 184 */
740 	2	/* 192 */
741 };
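
/*
 * For example (illustrative): a 72-byte request has size_index_elem(72)
 * == (72 - 1) / 8 == 8, and kmalloc_size_index[8] == 1, which is the
 * index of the 96-byte entry in kmalloc_info[] below - so kmalloc(72)
 * is served from kmalloc-96.
 */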
742 
743 size_t kmalloc_size_roundup(size_t size)
744 {
745 	if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
746 		/*
747 		 * The flags don't matter since size_index is common to all.
748 		 * Neither does the caller for just getting ->object_size.
749 		 */
750 		return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
751 	}
752 
753 	/* Above the smaller buckets, size is a multiple of page size. */
754 	if (size && size <= KMALLOC_MAX_SIZE)
755 		return PAGE_SIZE << get_order(size);
756 
757 	/*
758 	 * Return 'size' for 0 (kmalloc() returns ZERO_SIZE_PTR for it)
759 	 * and for very large sizes (kmalloc() may fail).
760 	 */
761 	return size;
763 }
764 EXPORT_SYMBOL(kmalloc_size_roundup);
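
/*
 * Example (illustrative): a caller growing a buffer can ask for the
 * bucket size up front so the slack is usable instead of wasted:
 *
 *	size_t want = 1000;
 *	size_t bucket = kmalloc_size_roundup(want);	// 1024 with the
 *							// default caches
 *	void *buf = kmalloc(bucket, GFP_KERNEL);	// hypothetical buffer
 */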
765 
766 #ifdef CONFIG_ZONE_DMA
767 #define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
768 #else
769 #define KMALLOC_DMA_NAME(sz)
770 #endif
771 
772 #ifdef CONFIG_MEMCG
773 #define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
774 #else
775 #define KMALLOC_CGROUP_NAME(sz)
776 #endif
777 
778 #ifndef CONFIG_SLUB_TINY
779 #define KMALLOC_RCL_NAME(sz)	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
780 #else
781 #define KMALLOC_RCL_NAME(sz)
782 #endif
783 
784 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
785 #define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
786 #define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
787 #define KMA_RAND_1(sz)                  .name[KMALLOC_RANDOM_START +  1] = "kmalloc-rnd-01-" #sz,
788 #define KMA_RAND_2(sz)  KMA_RAND_1(sz)  .name[KMALLOC_RANDOM_START +  2] = "kmalloc-rnd-02-" #sz,
789 #define KMA_RAND_3(sz)  KMA_RAND_2(sz)  .name[KMALLOC_RANDOM_START +  3] = "kmalloc-rnd-03-" #sz,
790 #define KMA_RAND_4(sz)  KMA_RAND_3(sz)  .name[KMALLOC_RANDOM_START +  4] = "kmalloc-rnd-04-" #sz,
791 #define KMA_RAND_5(sz)  KMA_RAND_4(sz)  .name[KMALLOC_RANDOM_START +  5] = "kmalloc-rnd-05-" #sz,
792 #define KMA_RAND_6(sz)  KMA_RAND_5(sz)  .name[KMALLOC_RANDOM_START +  6] = "kmalloc-rnd-06-" #sz,
793 #define KMA_RAND_7(sz)  KMA_RAND_6(sz)  .name[KMALLOC_RANDOM_START +  7] = "kmalloc-rnd-07-" #sz,
794 #define KMA_RAND_8(sz)  KMA_RAND_7(sz)  .name[KMALLOC_RANDOM_START +  8] = "kmalloc-rnd-08-" #sz,
795 #define KMA_RAND_9(sz)  KMA_RAND_8(sz)  .name[KMALLOC_RANDOM_START +  9] = "kmalloc-rnd-09-" #sz,
796 #define KMA_RAND_10(sz) KMA_RAND_9(sz)  .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
797 #define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
798 #define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
799 #define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
800 #define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
801 #define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
802 #else // CONFIG_RANDOM_KMALLOC_CACHES
803 #define KMALLOC_RANDOM_NAME(N, sz)
804 #endif
805 
806 #define INIT_KMALLOC_INFO(__size, __short_size)			\
807 {								\
808 	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
809 	KMALLOC_RCL_NAME(__short_size)				\
810 	KMALLOC_CGROUP_NAME(__short_size)			\
811 	KMALLOC_DMA_NAME(__short_size)				\
812 	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
813 	.size = __size,						\
814 }
815 
816 /*
817  * kmalloc_info[] exists to make the slab_debug=,kmalloc-xx option work at boot time.
818  * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
819  * kmalloc-2M.
820  */
821 const struct kmalloc_info_struct kmalloc_info[] __initconst = {
822 	INIT_KMALLOC_INFO(0, 0),
823 	INIT_KMALLOC_INFO(96, 96),
824 	INIT_KMALLOC_INFO(192, 192),
825 	INIT_KMALLOC_INFO(8, 8),
826 	INIT_KMALLOC_INFO(16, 16),
827 	INIT_KMALLOC_INFO(32, 32),
828 	INIT_KMALLOC_INFO(64, 64),
829 	INIT_KMALLOC_INFO(128, 128),
830 	INIT_KMALLOC_INFO(256, 256),
831 	INIT_KMALLOC_INFO(512, 512),
832 	INIT_KMALLOC_INFO(1024, 1k),
833 	INIT_KMALLOC_INFO(2048, 2k),
834 	INIT_KMALLOC_INFO(4096, 4k),
835 	INIT_KMALLOC_INFO(8192, 8k),
836 	INIT_KMALLOC_INFO(16384, 16k),
837 	INIT_KMALLOC_INFO(32768, 32k),
838 	INIT_KMALLOC_INFO(65536, 64k),
839 	INIT_KMALLOC_INFO(131072, 128k),
840 	INIT_KMALLOC_INFO(262144, 256k),
841 	INIT_KMALLOC_INFO(524288, 512k),
842 	INIT_KMALLOC_INFO(1048576, 1M),
843 	INIT_KMALLOC_INFO(2097152, 2M)
844 };
845 
846 /*
847  * Patch up the size_index table if we have strange large alignment
848  * requirements for the kmalloc array. This seems to be the case only
849  * for MIPS. The standard arches will not generate any code here.
850  *
851  * Largest permitted alignment is 256 bytes due to the way we
852  * handle the index determination for the smaller caches.
853  *
854  * Make sure that nothing crazy happens if someone starts tinkering
855  * around with ARCH_KMALLOC_MINALIGN
856  */
857 void __init setup_kmalloc_cache_index_table(void)
858 {
859 	unsigned int i;
860 
861 	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
862 		!is_power_of_2(KMALLOC_MIN_SIZE));
863 
864 	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
865 		unsigned int elem = size_index_elem(i);
866 
867 		if (elem >= ARRAY_SIZE(kmalloc_size_index))
868 			break;
869 		kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
870 	}
871 
872 	if (KMALLOC_MIN_SIZE >= 64) {
873 		/*
874 		 * The 96 byte sized cache is not used if the alignment
875 		 * is 64 bytes.
876 		 */
877 		for (i = 64 + 8; i <= 96; i += 8)
878 			kmalloc_size_index[size_index_elem(i)] = 7;
879 
880 	}
881 
882 	if (KMALLOC_MIN_SIZE >= 128) {
883 		/*
884 		 * The 192 byte sized cache is not used if the alignment
885 		 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
886 		 * instead.
887 		 */
888 		for (i = 128 + 8; i <= 192; i += 8)
889 			kmalloc_size_index[size_index_elem(i)] = 8;
890 	}
891 }
892 
893 static unsigned int __kmalloc_minalign(void)
894 {
895 	unsigned int minalign = dma_get_cache_alignment();
896 
897 	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
898 	    is_swiotlb_allocated())
899 		minalign = ARCH_KMALLOC_MINALIGN;
900 
901 	return max(minalign, arch_slab_minalign());
902 }
903 
904 static void __init
905 new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
906 {
907 	slab_flags_t flags = 0;
908 	unsigned int minalign = __kmalloc_minalign();
909 	unsigned int aligned_size = kmalloc_info[idx].size;
910 	int aligned_idx = idx;
911 
912 	if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
913 		flags |= SLAB_RECLAIM_ACCOUNT;
914 	} else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
915 		if (mem_cgroup_kmem_disabled()) {
916 			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
917 			return;
918 		}
919 		flags |= SLAB_ACCOUNT;
920 	} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
921 		flags |= SLAB_CACHE_DMA;
922 	}
923 
924 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
925 	if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
926 		flags |= SLAB_NO_MERGE;
927 #endif
928 
929 	/*
930 	 * If CONFIG_MEMCG is enabled, disable cache merging for
931 	 * KMALLOC_NORMAL caches.
932 	 */
933 	if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
934 		flags |= SLAB_NO_MERGE;
935 
936 	if (minalign > ARCH_KMALLOC_MINALIGN) {
937 		aligned_size = ALIGN(aligned_size, minalign);
938 		aligned_idx = __kmalloc_index(aligned_size, false);
939 	}
940 
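	/*
	 * For example, with a hypothetical minalign of 128, the kmalloc-96
	 * slot is redirected below to the 128-byte cache (created first if
	 * it does not exist yet), preserving the larger alignment guarantee.
	 */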
941 	if (!kmalloc_caches[type][aligned_idx])
942 		kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
943 					kmalloc_info[aligned_idx].name[type],
944 					aligned_size, flags);
945 	if (idx != aligned_idx)
946 		kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
947 }
948 
949 /*
950  * Create the kmalloc array. Some of the regular kmalloc arrays
951  * may already have been created because they were needed to
952  * enable allocations for slab creation.
953  */
954 void __init create_kmalloc_caches(void)
955 {
956 	int i;
957 	enum kmalloc_cache_type type;
958 
959 	/*
960 	 * Including KMALLOC_CGROUP if CONFIG_MEMCG is defined
961 	 */
962 	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
963 		/* Caches that are NOT power-of-two sized. */
964 		if (KMALLOC_MIN_SIZE <= 32)
965 			new_kmalloc_cache(1, type);
966 		if (KMALLOC_MIN_SIZE <= 64)
967 			new_kmalloc_cache(2, type);
968 
969 		/* Caches that are power-of-two sized. */
970 		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
971 			new_kmalloc_cache(i, type);
972 	}
973 #ifdef CONFIG_RANDOM_KMALLOC_CACHES
974 	random_kmalloc_seed = get_random_u64();
975 #endif
976 
977 	/* Kmalloc array is now usable */
978 	slab_state = UP;
979 
980 	if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
981 		kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
982 						       sizeof(kmem_buckets),
983 						       0, SLAB_NO_MERGE, NULL);
984 }
985 
986 /**
987  * __ksize -- Report full size of underlying allocation
988  * @object: pointer to the object
989  *
990  * This should only be used internally to query the true size of allocations.
991  * It is not meant to be a way to discover the usable size of an allocation
992  * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
993  * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
994  * and/or FORTIFY_SOURCE.
995  *
996  * Return: size of the actual memory used by @object in bytes
997  */
998 size_t __ksize(const void *object)
999 {
1000 	struct folio *folio;
1001 
1002 	if (unlikely(object == ZERO_SIZE_PTR))
1003 		return 0;
1004 
1005 	folio = virt_to_folio(object);
1006 
1007 	if (unlikely(!folio_test_slab(folio))) {
1008 		if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
1009 			return 0;
1010 		if (WARN_ON(object != folio_address(folio)))
1011 			return 0;
1012 		return folio_size(folio);
1013 	}
1014 
1015 #ifdef CONFIG_SLUB_DEBUG
1016 	skip_orig_size_check(folio_slab(folio)->slab_cache, object);
1017 #endif
1018 
1019 	return slab_ksize(folio_slab(folio)->slab_cache);
1020 }
1021 
1022 gfp_t kmalloc_fix_flags(gfp_t flags)
1023 {
1024 	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1025 
1026 	flags &= ~GFP_SLAB_BUG_MASK;
1027 	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1028 			invalid_mask, &invalid_mask, flags, &flags);
1029 	dump_stack();
1030 
1031 	return flags;
1032 }
1033 
1034 #ifdef CONFIG_SLAB_FREELIST_RANDOM
1035 /* Randomize a generic freelist */
1036 static void freelist_randomize(unsigned int *list,
1037 			       unsigned int count)
1038 {
1039 	unsigned int rand;
1040 	unsigned int i;
1041 
1042 	for (i = 0; i < count; i++)
1043 		list[i] = i;
1044 
1045 	/* Fisher-Yates shuffle */
1046 	for (i = count - 1; i > 0; i--) {
1047 		rand = get_random_u32_below(i + 1);
1048 		swap(list[i], list[rand]);
1049 	}
1050 }
1051 
1052 /* Create a random sequence per cache */
1053 int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
1054 				    gfp_t gfp)
1055 {
1057 	if (count < 2 || cachep->random_seq)
1058 		return 0;
1059 
1060 	cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
1061 	if (!cachep->random_seq)
1062 		return -ENOMEM;
1063 
1064 	freelist_randomize(cachep->random_seq, count);
1065 	return 0;
1066 }
1067 
1068 /* Destroy the per-cache random freelist sequence */
1069 void cache_random_seq_destroy(struct kmem_cache *cachep)
1070 {
1071 	kfree(cachep->random_seq);
1072 	cachep->random_seq = NULL;
1073 }
1074 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
1075 
1076 #ifdef CONFIG_SLUB_DEBUG
1077 #define SLABINFO_RIGHTS (0400)
1078 
1079 static void print_slabinfo_header(struct seq_file *m)
1080 {
1081 	/*
1082 	 * Output format version, so at least we can change it
1083 	 * without _too_ many complaints.
1084 	 */
1085 	seq_puts(m, "slabinfo - version: 2.1\n");
1086 	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
1087 	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
1088 	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
1089 	seq_putc(m, '\n');
1090 }
1091 
1092 static void *slab_start(struct seq_file *m, loff_t *pos)
1093 {
1094 	mutex_lock(&slab_mutex);
1095 	return seq_list_start(&slab_caches, *pos);
1096 }
1097 
1098 static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
1099 {
1100 	return seq_list_next(p, &slab_caches, pos);
1101 }
1102 
1103 static void slab_stop(struct seq_file *m, void *p)
1104 {
1105 	mutex_unlock(&slab_mutex);
1106 }
1107 
1108 static void cache_show(struct kmem_cache *s, struct seq_file *m)
1109 {
1110 	struct slabinfo sinfo;
1111 
1112 	memset(&sinfo, 0, sizeof(sinfo));
1113 	get_slabinfo(s, &sinfo);
1114 
1115 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
1116 		   s->name, sinfo.active_objs, sinfo.num_objs, s->size,
1117 		   sinfo.objects_per_slab, (1 << sinfo.cache_order));
1118 
1119 	seq_printf(m, " : tunables %4u %4u %4u",
1120 		   sinfo.limit, sinfo.batchcount, sinfo.shared);
1121 	seq_printf(m, " : slabdata %6lu %6lu %6lu",
1122 		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
1123 	seq_putc(m, '\n');
1124 }
1125 
1126 static int slab_show(struct seq_file *m, void *p)
1127 {
1128 	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
1129 
1130 	if (p == slab_caches.next)
1131 		print_slabinfo_header(m);
1132 	cache_show(s, m);
1133 	return 0;
1134 }
1135 
1136 void dump_unreclaimable_slab(void)
1137 {
1138 	struct kmem_cache *s;
1139 	struct slabinfo sinfo;
1140 
1141 	/*
1142 	 * Acquiring slab_mutex here is risky since we prefer not to sleep
1143 	 * in the OOM path. But traversing the list without holding the
1144 	 * mutex risks a crash.
1145 	 * So use mutex_trylock to protect the traversal, and dump nothing
1146 	 * if the mutex cannot be acquired.
1147 	 */
1148 	if (!mutex_trylock(&slab_mutex)) {
1149 		pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1150 		return;
1151 	}
1152 
1153 	pr_info("Unreclaimable slab info:\n");
1154 	pr_info("Name                      Used          Total\n");
1155 
1156 	list_for_each_entry(s, &slab_caches, list) {
1157 		if (s->flags & SLAB_RECLAIM_ACCOUNT)
1158 			continue;
1159 
1160 		get_slabinfo(s, &sinfo);
1161 
1162 		if (sinfo.num_objs > 0)
1163 			pr_info("%-17s %10luKB %10luKB\n", s->name,
1164 				(sinfo.active_objs * s->size) / 1024,
1165 				(sinfo.num_objs * s->size) / 1024);
1166 	}
1167 	mutex_unlock(&slab_mutex);
1168 }
1169 
1170 /*
1171  * slabinfo_op - iterator that generates /proc/slabinfo
1172  *
1173  * Output layout:
1174  * cache-name
1175  * num-active-objs
1176  * total-objs
1177  * object size
1178  * num-active-slabs
1179  * total-slabs
1180  * num-pages-per-slab
1181  * + further values on SMP and with statistics enabled
1182  */
1183 static const struct seq_operations slabinfo_op = {
1184 	.start = slab_start,
1185 	.next = slab_next,
1186 	.stop = slab_stop,
1187 	.show = slab_show,
1188 };
1189 
1190 static int slabinfo_open(struct inode *inode, struct file *file)
1191 {
1192 	return seq_open(file, &slabinfo_op);
1193 }
1194 
1195 static const struct proc_ops slabinfo_proc_ops = {
1196 	.proc_flags	= PROC_ENTRY_PERMANENT,
1197 	.proc_open	= slabinfo_open,
1198 	.proc_read	= seq_read,
1199 	.proc_lseek	= seq_lseek,
1200 	.proc_release	= seq_release,
1201 };
1202 
1203 static int __init slab_proc_init(void)
1204 {
1205 	proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
1206 	return 0;
1207 }
1208 module_init(slab_proc_init);
1209 
1210 #endif /* CONFIG_SLUB_DEBUG */
1211 
1212 /**
1213  * kfree_sensitive - Clear sensitive information in memory before freeing
1214  * @p: object to free memory of
1215  *
1216  * The memory of the object @p points to is zeroed before it is freed.
1217  * If @p is %NULL, kfree_sensitive() does nothing.
1218  *
1219  * Note: this function zeroes the whole allocated buffer which can be a good
1220  * deal bigger than the requested buffer size passed to kmalloc(). So be
1221  * careful when using this function in performance sensitive code.
1222  */
1223 void kfree_sensitive(const void *p)
1224 {
1225 	size_t ks;
1226 	void *mem = (void *)p;
1227 
1228 	ks = ksize(mem);
1229 	if (ks) {
1230 		kasan_unpoison_range(mem, ks);
1231 		memzero_explicit(mem, ks);
1232 	}
1233 	kfree(mem);
1234 }
1235 EXPORT_SYMBOL(kfree_sensitive);
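
/*
 * Example (illustrative): freeing a buffer that held key material:
 *
 *	u8 *key = kmalloc(key_len, GFP_KERNEL);		// hypothetical buffer
 *	...
 *	kfree_sensitive(key);	// zeroes the whole allocation, then frees it
 */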
1236 
1237 size_t ksize(const void *objp)
1238 {
1239 	/*
1240 	 * We need to first check that the pointer to the object is valid.
1241 	 * The KASAN report printed from ksize() is more useful than when
1242 	 * it's printed later, when the behaviour could be undefined due to
1243 	 * a potential use-after-free or double-free.
1244 	 *
1245 	 * We use kasan_check_byte(), which is supported for the hardware
1246 	 * tag-based KASAN mode, unlike kasan_check_read/write().
1247 	 *
1248 	 * If the pointed to memory is invalid, we return 0 to avoid users of
1249 	 * ksize() writing to and potentially corrupting the memory region.
1250 	 *
1251 	 * We want to perform the check before __ksize(), to avoid potentially
1252 	 * crashing in __ksize() due to accessing invalid metadata.
1253 	 */
1254 	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
1255 		return 0;
1256 
1257 	return kfence_ksize(objp) ?: __ksize(objp);
1258 }
1259 EXPORT_SYMBOL(ksize);
1260 
1261 #ifdef CONFIG_BPF_SYSCALL
1262 #include <linux/btf.h>
1263 
1264 __bpf_kfunc_start_defs();
1265 
1266 __bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
1267 {
1268 	struct slab *slab;
1269 
1270 	if (!virt_addr_valid((void *)(long)addr))
1271 		return NULL;
1272 
1273 	slab = virt_to_slab((void *)(long)addr);
1274 	return slab ? slab->slab_cache : NULL;
1275 }
1276 
1277 __bpf_kfunc_end_defs();
1278 #endif /* CONFIG_BPF_SYSCALL */
1279 
1280 /* Tracepoints definitions. */
1281 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1282 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1283 EXPORT_TRACEPOINT_SYMBOL(kfree);
1284 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1285 
1286 #ifndef CONFIG_KVFREE_RCU_BATCHED
1287 
1288 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1289 {
1290 	if (head) {
1291 		kasan_record_aux_stack(ptr);
1292 		call_rcu(head, kvfree_rcu_cb);
1293 		return;
1294 	}
1295 
1296 	// kvfree_rcu(one_arg) call.
1297 	might_sleep();
1298 	synchronize_rcu();
1299 	kvfree(ptr);
1300 }
1301 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
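
/*
 * Example (illustrative): callers normally use the kvfree_rcu() and
 * kvfree_rcu_mightsleep() wrappers rather than calling this directly.
 * As the code above shows, the double-argument form maps to call_rcu()
 * while the single-argument form blocks in synchronize_rcu():
 *
 *	struct foo { long data; struct rcu_head rcu; };	// hypothetical
 *	struct foo *p, *q;				// freed elsewhere
 *
 *	kvfree_rcu(p, rcu);		// double-argument form, no sleeping
 *	kvfree_rcu_mightsleep(q);	// single-argument form, may sleep
 */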
1302 
1303 void __init kvfree_rcu_init(void)
1304 {
1305 }
1306 
1307 #else /* CONFIG_KVFREE_RCU_BATCHED */
1308 
1309 /*
1310  * This rcu parameter is read-only at runtime. It is the
1311  * minimum number of objects which can be cached per CPU.
1312  * Each object is one page in size. This value can be
1313  * changed at boot time.
1314  */
1315 static int rcu_min_cached_objs = 5;
1316 module_param(rcu_min_cached_objs, int, 0444);
1317 
1318 // A page shrinker can ask for pages to be freed to make them
1319 // available for other parts of the system. This usually happens
1320 // under low memory conditions, and in that case we should also
1321 // defer page-cache filling for a short time period.
1322 //
1323 // The default value is 5 seconds, which is long enough to reduce
1324 // interference with the shrinker while it asks other systems to
1325 // drain their caches.
1326 static int rcu_delay_page_cache_fill_msec = 5000;
1327 module_param(rcu_delay_page_cache_fill_msec, int, 0444);
1328 
1329 static struct workqueue_struct *rcu_reclaim_wq;
1330 
1331 /* Maximum number of jiffies to wait before draining a batch. */
1332 #define KFREE_DRAIN_JIFFIES (5 * HZ)
1333 #define KFREE_N_BATCHES 2
1334 #define FREE_N_CHANNELS 2
1335 
1336 /**
1337  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
1338  * @list: List node. All blocks are linked to one another
1339  * @gp_snap: Snapshot of RCU state for objects placed to this bulk
1340  * @nr_records: Number of active pointers in the array
1341  * @records: Array of the kvfree_rcu() pointers
1342  */
1343 struct kvfree_rcu_bulk_data {
1344 	struct list_head list;
1345 	struct rcu_gp_oldstate gp_snap;
1346 	unsigned long nr_records;
1347 	void *records[] __counted_by(nr_records);
1348 };
1349 
1350 /*
1351  * This macro defines how many entries the "records" array
1352  * This macro defines how many entries the "records" array
1353  * will contain. It is chosen so that the size of a
1354  * kvfree_rcu_bulk_data structure is exactly one page.
1355 #define KVFREE_BULK_MAX_ENTR \
1356 	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
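
/*
 * For example (illustrative): with 4 KB pages, 64-bit pointers and a
 * 40-byte kvfree_rcu_bulk_data header, this works out to
 * (4096 - 40) / 8 == 507 pointers per block.
 */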
1357 
1358 /**
1359  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
1360  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
1361  * @head_free: List of kfree_rcu() objects waiting for a grace period
1362  * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
1363  * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
1364  * @krcp: Pointer to @kfree_rcu_cpu structure
1365  */
1366 
1367 struct kfree_rcu_cpu_work {
1368 	struct rcu_work rcu_work;
1369 	struct rcu_head *head_free;
1370 	struct rcu_gp_oldstate head_free_gp_snap;
1371 	struct list_head bulk_head_free[FREE_N_CHANNELS];
1372 	struct kfree_rcu_cpu *krcp;
1373 };
1374 
1375 /**
1376  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
1377  * @head: List of kfree_rcu() objects not yet waiting for a grace period
1378  * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
1379  * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
1380  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
1381  * @lock: Synchronize access to this structure
1382  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
1383  * @initialized: The @rcu_work fields have been initialized
1384  * @head_count: Number of objects in rcu_head singular list
1385  * @bulk_count: Number of objects in bulk-list
1386  * @bkvcache:
1387  *	A simple cache list that contains objects for reuse.
1388  *	In order to save some per-cpu space the list is singly linked.
1389  *	Even though it is lockless, access has to be protected by the
1390  *	per-cpu lock.
1391  * @page_cache_work: A work to refill the cache when it is empty
1392  * @backoff_page_cache_fill: Delay cache refills
1393  * @work_in_progress: Indicates that page_cache_work is running
1394  * @hrtimer: A hrtimer for scheduling a page_cache_work
1395  * @nr_bkv_objs: number of allocated objects at @bkvcache.
1396  *
1397  * This is a per-CPU structure.  The reason that it is not included in
1398  * the rcu_data structure is to permit this code to be extracted from
1399  * the RCU files.  Such extraction could allow further optimization of
1400  * the interactions with the slab allocators.
1401  */
1402 struct kfree_rcu_cpu {
1403 	// Objects queued on a linked list
1404 	// through their rcu_head structures.
1405 	struct rcu_head *head;
1406 	unsigned long head_gp_snap;
1407 	atomic_t head_count;
1408 
1409 	// Objects queued on a bulk-list.
1410 	struct list_head bulk_head[FREE_N_CHANNELS];
1411 	atomic_t bulk_count[FREE_N_CHANNELS];
1412 
1413 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
1414 	raw_spinlock_t lock;
1415 	struct delayed_work monitor_work;
1416 	bool initialized;
1417 
1418 	struct delayed_work page_cache_work;
1419 	atomic_t backoff_page_cache_fill;
1420 	atomic_t work_in_progress;
1421 	struct hrtimer hrtimer;
1422 
1423 	struct llist_head bkvcache;
1424 	int nr_bkv_objs;
1425 };
1426 
1427 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
1428 	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
1429 };
1430 
1431 static __always_inline void
1432 debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
1433 {
1434 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1435 	int i;
1436 
1437 	for (i = 0; i < bhead->nr_records; i++)
1438 		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
1439 #endif
1440 }
1441 
1442 static inline struct kfree_rcu_cpu *
1443 krc_this_cpu_lock(unsigned long *flags)
1444 {
1445 	struct kfree_rcu_cpu *krcp;
1446 
1447 	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
1448 	krcp = this_cpu_ptr(&krc);
1449 	raw_spin_lock(&krcp->lock);
1450 
1451 	return krcp;
1452 }
1453 
1454 static inline void
1455 krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
1456 {
1457 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1458 }
1459 
1460 static inline struct kvfree_rcu_bulk_data *
1461 get_cached_bnode(struct kfree_rcu_cpu *krcp)
1462 {
1463 	if (!krcp->nr_bkv_objs)
1464 		return NULL;
1465 
1466 	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
1467 	return (struct kvfree_rcu_bulk_data *)
1468 		llist_del_first(&krcp->bkvcache);
1469 }
1470 
1471 static inline bool
1472 put_cached_bnode(struct kfree_rcu_cpu *krcp,
1473 	struct kvfree_rcu_bulk_data *bnode)
1474 {
1475 	// Check the limit.
1476 	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
1477 		return false;
1478 
1479 	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
1480 	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
1481 	return true;
1482 }
1483 
1484 static int
1485 drain_page_cache(struct kfree_rcu_cpu *krcp)
1486 {
1487 	unsigned long flags;
1488 	struct llist_node *page_list, *pos, *n;
1489 	int freed = 0;
1490 
1491 	if (!rcu_min_cached_objs)
1492 		return 0;
1493 
1494 	raw_spin_lock_irqsave(&krcp->lock, flags);
1495 	page_list = llist_del_all(&krcp->bkvcache);
1496 	WRITE_ONCE(krcp->nr_bkv_objs, 0);
1497 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1498 
1499 	llist_for_each_safe(pos, n, page_list) {
1500 		free_page((unsigned long)pos);
1501 		freed++;
1502 	}
1503 
1504 	return freed;
1505 }
1506 
1507 static void
1508 kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
1509 	struct kvfree_rcu_bulk_data *bnode, int idx)
1510 {
1511 	unsigned long flags;
1512 	int i;
1513 
1514 	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
1515 		debug_rcu_bhead_unqueue(bnode);
1516 		rcu_lock_acquire(&rcu_callback_map);
1517 		if (idx == 0) { // kmalloc() / kfree().
1518 			trace_rcu_invoke_kfree_bulk_callback(
1519 				"slab", bnode->nr_records,
1520 				bnode->records);
1521 
1522 			kfree_bulk(bnode->nr_records, bnode->records);
1523 		} else { // vmalloc() / vfree().
1524 			for (i = 0; i < bnode->nr_records; i++) {
1525 				trace_rcu_invoke_kvfree_callback(
1526 					"slab", bnode->records[i], 0);
1527 
1528 				vfree(bnode->records[i]);
1529 			}
1530 		}
1531 		rcu_lock_release(&rcu_callback_map);
1532 	}
1533 
1534 	raw_spin_lock_irqsave(&krcp->lock, flags);
1535 	if (put_cached_bnode(krcp, bnode))
1536 		bnode = NULL;
1537 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1538 
1539 	if (bnode)
1540 		free_page((unsigned long) bnode);
1541 
1542 	cond_resched_tasks_rcu_qs();
1543 }
1544 
1545 static void
1546 kvfree_rcu_list(struct rcu_head *head)
1547 {
1548 	struct rcu_head *next;
1549 
1550 	for (; head; head = next) {
1551 		void *ptr = (void *) head->func;
1552 		unsigned long offset = (void *) head - ptr;
1553 
1554 		next = head->next;
1555 		debug_rcu_head_unqueue((struct rcu_head *)ptr);
1556 		rcu_lock_acquire(&rcu_callback_map);
1557 		trace_rcu_invoke_kvfree_callback("slab", head, offset);
1558 
1559 		kvfree(ptr);
1560 
1561 		rcu_lock_release(&rcu_callback_map);
1562 		cond_resched_tasks_rcu_qs();
1563 	}
1564 }
1565 
1566 /*
1567  * This function is invoked in workqueue context after a grace period.
1568  * It frees all the objects queued on ->bulk_head_free or ->head_free.
1569  */
1570 static void kfree_rcu_work(struct work_struct *work)
1571 {
1572 	unsigned long flags;
1573 	struct kvfree_rcu_bulk_data *bnode, *n;
1574 	struct list_head bulk_head[FREE_N_CHANNELS];
1575 	struct rcu_head *head;
1576 	struct kfree_rcu_cpu *krcp;
1577 	struct kfree_rcu_cpu_work *krwp;
1578 	struct rcu_gp_oldstate head_gp_snap;
1579 	int i;
1580 
1581 	krwp = container_of(to_rcu_work(work),
1582 		struct kfree_rcu_cpu_work, rcu_work);
1583 	krcp = krwp->krcp;
1584 
1585 	raw_spin_lock_irqsave(&krcp->lock, flags);
1586 	// Channels 1 and 2.
1587 	for (i = 0; i < FREE_N_CHANNELS; i++)
1588 		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
1589 
1590 	// Channel 3.
1591 	head = krwp->head_free;
1592 	krwp->head_free = NULL;
1593 	head_gp_snap = krwp->head_free_gp_snap;
1594 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1595 
1596 	// Handle the first two channels.
1597 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1598 		// Start from the tail page, so a GP has likely already elapsed for it.
1599 		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
1600 			kvfree_rcu_bulk(krcp, bnode, i);
1601 	}
1602 
1603 	/*
1604 	 * This is used when the "bulk" path cannot be used for the
1605 	 * double-argument form of kvfree_rcu().  This happens when the
1606 	 * page-cache is empty, which means that objects are instead
1607 	 * queued on a linked list through their rcu_head structures.
1608 	 * This list is named "Channel 3".
1609 	 */
1610 	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
1611 		kvfree_rcu_list(head);
1612 }
1613 
1614 static bool kfree_rcu_sheaf(void *obj)
1615 {
1616 	struct kmem_cache *s;
1617 	struct folio *folio;
1618 	struct slab *slab;
1619 
1620 	if (is_vmalloc_addr(obj))
1621 		return false;
1622 
1623 	folio = virt_to_folio(obj);
1624 	if (unlikely(!folio_test_slab(folio)))
1625 		return false;
1626 
1627 	slab = folio_slab(folio);
1628 	s = slab->slab_cache;
1629 	if (s->cpu_sheaves) {
1630 		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
1631 			   slab_nid(slab) == numa_mem_id()))
1632 			return __kfree_rcu_sheaf(s, obj);
1633 	}
1634 
1635 	return false;
1636 }
1637 
1638 static bool
1639 need_offload_krc(struct kfree_rcu_cpu *krcp)
1640 {
1641 	int i;
1642 
1643 	for (i = 0; i < FREE_N_CHANNELS; i++)
1644 		if (!list_empty(&krcp->bulk_head[i]))
1645 			return true;
1646 
1647 	return !!READ_ONCE(krcp->head);
1648 }
1649 
1650 static bool
1651 need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
1652 {
1653 	int i;
1654 
1655 	for (i = 0; i < FREE_N_CHANNELS; i++)
1656 		if (!list_empty(&krwp->bulk_head_free[i]))
1657 			return true;
1658 
1659 	return !!krwp->head_free;
1660 }
1661 
1662 static int krc_count(struct kfree_rcu_cpu *krcp)
1663 {
1664 	int sum = atomic_read(&krcp->head_count);
1665 	int i;
1666 
1667 	for (i = 0; i < FREE_N_CHANNELS; i++)
1668 		sum += atomic_read(&krcp->bulk_count[i]);
1669 
1670 	return sum;
1671 }
1672 
1673 static void
1674 __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1675 {
1676 	long delay, delay_left;
1677 
1678 	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
1679 	if (delayed_work_pending(&krcp->monitor_work)) {
1680 		delay_left = krcp->monitor_work.timer.expires - jiffies;
1681 		if (delay < delay_left)
1682 			mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1683 		return;
1684 	}
1685 	queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1686 }
1687 
1688 static void
1689 schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1690 {
1691 	unsigned long flags;
1692 
1693 	raw_spin_lock_irqsave(&krcp->lock, flags);
1694 	__schedule_delayed_monitor_work(krcp);
1695 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1696 }
1697 
1698 static void
1699 kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
1700 {
1701 	struct list_head bulk_ready[FREE_N_CHANNELS];
1702 	struct kvfree_rcu_bulk_data *bnode, *n;
1703 	struct rcu_head *head_ready = NULL;
1704 	unsigned long flags;
1705 	int i;
1706 
1707 	raw_spin_lock_irqsave(&krcp->lock, flags);
1708 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1709 		INIT_LIST_HEAD(&bulk_ready[i]);
1710 
1711 		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
1712 			if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
1713 				break;
1714 
1715 			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
1716 			list_move(&bnode->list, &bulk_ready[i]);
1717 		}
1718 	}
1719 
1720 	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
1721 		head_ready = krcp->head;
1722 		atomic_set(&krcp->head_count, 0);
1723 		WRITE_ONCE(krcp->head, NULL);
1724 	}
1725 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1726 
1727 	for (i = 0; i < FREE_N_CHANNELS; i++) {
1728 		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
1729 			kvfree_rcu_bulk(krcp, bnode, i);
1730 	}
1731 
1732 	if (head_ready)
1733 		kvfree_rcu_list(head_ready);
1734 }
1735 
1736 /*
1737  * Return: %true if a work is queued, %false otherwise.
1738  */
1739 static bool
1740 kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
1741 {
1742 	unsigned long flags;
1743 	bool queued = false;
1744 	int i, j;
1745 
1746 	raw_spin_lock_irqsave(&krcp->lock, flags);
1747 
1748 	// Attempt to start a new batch.
1749 	for (i = 0; i < KFREE_N_BATCHES; i++) {
1750 		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
1751 
1752 		// Try to detach bulk_head or head and attach it, but only when
1753 		// all channels are free.  If any channel is not free, then krwp
1754 		// still has on-going RCU work handling its previous batch.
1755 		if (need_wait_for_krwp_work(krwp))
1756 			continue;
1757 
1758 		// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
1759 		if (need_offload_krc(krcp)) {
1760 			// Channel 1 corresponds to the SLAB-pointer bulk path.
1761 			// Channel 2 corresponds to vmalloc-pointer bulk path.
1762 			for (j = 0; j < FREE_N_CHANNELS; j++) {
1763 				if (list_empty(&krwp->bulk_head_free[j])) {
1764 					atomic_set(&krcp->bulk_count[j], 0);
1765 					list_replace_init(&krcp->bulk_head[j],
1766 						&krwp->bulk_head_free[j]);
1767 				}
1768 			}
1769 
1770 			// Channel 3 corresponds to both SLAB and vmalloc
1771 			// objects queued on the linked list.
1772 			if (!krwp->head_free) {
1773 				krwp->head_free = krcp->head;
1774 				get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
1775 				atomic_set(&krcp->head_count, 0);
1776 				WRITE_ONCE(krcp->head, NULL);
1777 			}
1778 
1779 			// One work item serves one batch, which covers all
1780 			// three "free channels". Break out of the loop since
1781 			// this CPU is done; queuing the RCU work here is
1782 			// _always_ a success.
1783 			queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
1784 			WARN_ON_ONCE(!queued);
1785 			break;
1786 		}
1787 	}
1788 
1789 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
1790 	return queued;
1791 }
1792 
1793 /*
1794  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
1795  */
1796 static void kfree_rcu_monitor(struct work_struct *work)
1797 {
1798 	struct kfree_rcu_cpu *krcp = container_of(work,
1799 		struct kfree_rcu_cpu, monitor_work.work);
1800 
1801 	// Drain ready for reclaim.
1802 	kvfree_rcu_drain_ready(krcp);
1803 
1804 	// Queue a batch for the rest.
1805 	kvfree_rcu_queue_batch(krcp);
1806 
	// If there is nothing left to detach, our job here is done.
	// If at least one channel is still busy, previous batches are
	// still in progress, so rearm the work to repeat the attempt.
1812 	if (need_offload_krc(krcp))
1813 		schedule_delayed_monitor_work(krcp);
1814 }
1815 
1816 static void fill_page_cache_func(struct work_struct *work)
1817 {
1818 	struct kvfree_rcu_bulk_data *bnode;
1819 	struct kfree_rcu_cpu *krcp =
1820 		container_of(work, struct kfree_rcu_cpu,
1821 			page_cache_work.work);
1822 	unsigned long flags;
1823 	int nr_pages;
1824 	bool pushed;
1825 	int i;
1826 
1827 	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1828 		1 : rcu_min_cached_objs;
1829 
1830 	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
1831 		bnode = (struct kvfree_rcu_bulk_data *)
1832 			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1833 
1834 		if (!bnode)
1835 			break;
1836 
1837 		raw_spin_lock_irqsave(&krcp->lock, flags);
1838 		pushed = put_cached_bnode(krcp, bnode);
1839 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
1840 
1841 		if (!pushed) {
1842 			free_page((unsigned long) bnode);
1843 			break;
1844 		}
1845 	}
1846 
1847 	atomic_set(&krcp->work_in_progress, 0);
1848 	atomic_set(&krcp->backoff_page_cache_fill, 0);
1849 }
1850 
1851 // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
1852 // state specified by flags.  If can_alloc is true, the caller must
1853 // be schedulable and not be holding any locks or mutexes that might be
1854 // acquired by the memory allocator or anything that it might invoke.
1855 // Returns true if ptr was successfully recorded, else the caller must
1856 // use a fallback.
1857 static inline bool
1858 add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
1859 	unsigned long *flags, void *ptr, bool can_alloc)
1860 {
1861 	struct kvfree_rcu_bulk_data *bnode;
1862 	int idx;
1863 
1864 	*krcp = krc_this_cpu_lock(flags);
1865 	if (unlikely(!(*krcp)->initialized))
1866 		return false;
1867 
1868 	idx = !!is_vmalloc_addr(ptr);
1869 	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
1870 		struct kvfree_rcu_bulk_data, list);
1871 
1872 	/* Check if a new block is required. */
1873 	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
1874 		bnode = get_cached_bnode(*krcp);
1875 		if (!bnode && can_alloc) {
1876 			krc_this_cpu_unlock(*krcp, *flags);
1877 
			// __GFP_NORETRY - allows a light-weight direct reclaim,
			// which is acceptable since it minimizes how often the
			// fallback path is hit.  It also forbids invoking the
			// OOM killer, which is beneficial because memory is
			// about to be released anyway.
			//
			// __GFP_NOMEMALLOC - prevents consuming all of the
			// memory reserves.  Please note there is a fallback
			// path.
			//
			// __GFP_NOWARN - the allocation is expected to fail
			// under low-memory or high memory-pressure scenarios,
			// so do not warn about it.
1889 			bnode = (struct kvfree_rcu_bulk_data *)
1890 				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1891 			raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
1892 		}
1893 
1894 		if (!bnode)
1895 			return false;
1896 
1897 		// Initialize the new block and attach it.
1898 		bnode->nr_records = 0;
1899 		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
1900 	}
1901 
1902 	// Finally insert and update the GP for this page.
1903 	bnode->nr_records++;
1904 	bnode->records[bnode->nr_records - 1] = ptr;
1905 	get_state_synchronize_rcu_full(&bnode->gp_snap);
1906 	atomic_inc(&(*krcp)->bulk_count[idx]);
1907 
1908 	return true;
1909 }
1910 
1911 static enum hrtimer_restart
1912 schedule_page_work_fn(struct hrtimer *t)
1913 {
1914 	struct kfree_rcu_cpu *krcp =
1915 		container_of(t, struct kfree_rcu_cpu, hrtimer);
1916 
1917 	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
1918 	return HRTIMER_NORESTART;
1919 }
1920 
1921 static void
1922 run_page_cache_worker(struct kfree_rcu_cpu *krcp)
1923 {
1924 	// If cache disabled, bail out.
1925 	if (!rcu_min_cached_objs)
1926 		return;
1927 
1928 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
1929 			!atomic_xchg(&krcp->work_in_progress, 1)) {
1930 		if (atomic_read(&krcp->backoff_page_cache_fill)) {
1931 			queue_delayed_work(rcu_reclaim_wq,
1932 				&krcp->page_cache_work,
1933 					msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
1934 		} else {
1935 			hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
1936 				      HRTIMER_MODE_REL);
1937 			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
1938 		}
1939 	}
1940 }
1941 
1942 void __init kfree_rcu_scheduler_running(void)
1943 {
1944 	int cpu;
1945 
1946 	for_each_possible_cpu(cpu) {
1947 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
1948 
1949 		if (need_offload_krc(krcp))
1950 			schedule_delayed_monitor_work(krcp);
1951 	}
1952 }
1953 
1954 /*
1955  * Queue a request for lazy invocation of the appropriate free routine
1956  * after a grace period.  Please note that three paths are maintained,
1957  * two for the common case using arrays of pointers and a third one that
1958  * is used only when the main paths cannot be used, for example, due to
1959  * memory pressure.
1960  *
 * Each kvfree_call_rcu() request is added to a batch. The batch is drained
 * every KFREE_DRAIN_JIFFIES jiffies, and all objects in the batch are freed
 * in workqueue context. Batching requests together reduces the number of
 * grace periods needed during heavy kfree_rcu()/kvfree_rcu() load.
1965  */
1966 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1967 {
1968 	unsigned long flags;
1969 	struct kfree_rcu_cpu *krcp;
1970 	bool success;
1971 
1972 	/*
	 * Please note that the head-less variant has a limitation,
	 * which is why there is a clear rule for such objects: they
	 * may only be used from might_sleep() context. For all other
	 * contexts, embed an rcu_head into your data.
1978 	 */
1979 	if (!head)
1980 		might_sleep();
1981 
1982 	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
1983 		return;
1984 
1985 	// Queue the object but don't yet schedule the batch.
1986 	if (debug_rcu_head_queue(ptr)) {
1987 		// Probable double kfree_rcu(), just leak.
1988 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
1989 			  __func__, head);
1990 
		// Treat the double free as handled and leave.
1992 		return;
1993 	}
1994 
1995 	kasan_record_aux_stack(ptr);
1996 	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
1997 	if (!success) {
1998 		run_page_cache_worker(krcp);
1999 
2000 		if (head == NULL)
			// Single-argument kvfree_rcu() call: fall back to the
			// inline freeing path below.
2002 			goto unlock_return;
2003 
2004 		head->func = ptr;
2005 		head->next = krcp->head;
2006 		WRITE_ONCE(krcp->head, head);
2007 		atomic_inc(&krcp->head_count);
2008 
2009 		// Take a snapshot for this krcp.
2010 		krcp->head_gp_snap = get_state_synchronize_rcu();
2011 		success = true;
2012 	}
2013 
2014 	/*
2015 	 * The kvfree_rcu() caller considers the pointer freed at this point
2016 	 * and likely removes any references to it. Since the actual slab
2017 	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
2018 	 * this object (no scanning or false positives reporting).
2019 	 */
2020 	kmemleak_ignore(ptr);
2021 
2022 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
2023 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
2024 		__schedule_delayed_monitor_work(krcp);
2025 
2026 unlock_return:
2027 	krc_this_cpu_unlock(krcp, flags);
2028 
2029 	/*
	 * Inline kvfree() after synchronize_rcu(). This is possible
	 * only from might_sleep() context, where the current CPU can
	 * pass through a quiescent state.
2033 	 */
2034 	if (!success) {
2035 		debug_rcu_head_unqueue((struct rcu_head *) ptr);
2036 		synchronize_rcu();
2037 		kvfree(ptr);
2038 	}
2039 }
2040 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
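
/*
 * Illustrative usage sketch for the batched paths implemented above.  The
 * structure and function names are hypothetical and not part of this file:
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_release(struct foo *fp)
 *	{
 *		// Double-argument form: may be used from non-sleeping
 *		// context; the pointer is queued on a per-CPU bulk page.
 *		kvfree_rcu(fp, rcu);
 *	}
 *
 *	static void blob_release(void *p)
 *	{
 *		// Single-argument (head-less) form: might_sleep() context
 *		// only, since the fallback is synchronize_rcu() + kvfree().
 *		kvfree_rcu_mightsleep(p);
 *	}
 */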
2041 
2042 /**
2043  * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
2044  *
 * Note that the single-argument form of kvfree_rcu() has a slow path that
 * invokes synchronize_rcu() followed by freeing of the pointer, all before
 * the function returns. Therefore, for any single-argument call that results
 * in a kfree() to a cache that is to be destroyed during module exit, it is
 * the developer's responsibility to ensure that all such calls have returned
 * before kmem_cache_destroy() is called.
2051  */
2052 void kvfree_rcu_barrier(void)
2053 {
2054 	struct kfree_rcu_cpu_work *krwp;
2055 	struct kfree_rcu_cpu *krcp;
2056 	bool queued;
2057 	int i, cpu;
2058 
2059 	flush_all_rcu_sheaves();
2060 
2061 	/*
	 * First, detach objects and queue them in an RCU batch for each
	 * CPU. Then flush the queued works for each CPU.
	 *
	 * Please note: if there are outstanding batches for a particular
	 * CPU, those have to finish first before a new one can be queued.
2067 	 */
2068 	for_each_possible_cpu(cpu) {
2069 		krcp = per_cpu_ptr(&krc, cpu);
2070 
2071 		/*
		 * Check if this CPU has any objects which have been queued
		 * to wait for a new GP completion. If not (nothing to detach),
		 * we are done with it. If any batch is pending/running for
		 * this "krcp", the per-CPU flush_rcu_work() below waits for
		 * its completion (see the last step).
2076 		 */
2077 		if (!need_offload_krc(krcp))
2078 			continue;
2079 
2080 		while (1) {
2081 			/*
			 * If a new RCU work cannot be queued, it means either:
			 * - batches for this CPU are still in flight and must
			 *   be flushed first, after which the attempt is
			 *   repeated; or
			 * - there are no objects to detach, due to concurrency.
2086 			 */
2087 			queued = kvfree_rcu_queue_batch(krcp);
2088 
2089 			/*
			 * Bail out if there is no need to offload this "krcp"
2091 			 * anymore. As noted earlier it can run concurrently.
2092 			 */
2093 			if (queued || !need_offload_krc(krcp))
2094 				break;
2095 
2096 			/* There are ongoing batches. */
2097 			for (i = 0; i < KFREE_N_BATCHES; i++) {
2098 				krwp = &(krcp->krw_arr[i]);
2099 				flush_rcu_work(&krwp->rcu_work);
2100 			}
2101 		}
2102 	}
2103 
2104 	/*
2105 	 * Now we guarantee that all objects are flushed.
2106 	 */
2107 	for_each_possible_cpu(cpu) {
2108 		krcp = per_cpu_ptr(&krc, cpu);
2109 
2110 		/*
		 * A monitor work can directly drain objects that are ready
		 * to be reclaimed. Wait for its completion if it is running
		 * or pending.
2113 		 */
2114 		cancel_delayed_work_sync(&krcp->monitor_work);
2115 
2116 		for (i = 0; i < KFREE_N_BATCHES; i++) {
2117 			krwp = &(krcp->krw_arr[i]);
2118 			flush_rcu_work(&krwp->rcu_work);
2119 		}
2120 	}
2121 }
2122 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
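
/*
 * Illustrative module-exit pattern (hypothetical names): kvfree_rcu_barrier()
 * guarantees that objects queued via the double-argument kvfree_rcu() have
 * been returned to their caches before those caches are destroyed.  As noted
 * in the kernel-doc above, single-argument callers must additionally have
 * returned by this point.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kvfree_rcu_barrier();
 *		kmem_cache_destroy(foo_cachep);
 *	}
 */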
2123 
2124 static unsigned long
2125 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2126 {
2127 	int cpu;
2128 	unsigned long count = 0;
2129 
2130 	/* Snapshot count of all CPUs */
2131 	for_each_possible_cpu(cpu) {
2132 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2133 
2134 		count += krc_count(krcp);
2135 		count += READ_ONCE(krcp->nr_bkv_objs);
2136 		atomic_set(&krcp->backoff_page_cache_fill, 1);
2137 	}
2138 
2139 	return count == 0 ? SHRINK_EMPTY : count;
2140 }
2141 
2142 static unsigned long
2143 kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2144 {
2145 	int cpu, freed = 0;
2146 
2147 	for_each_possible_cpu(cpu) {
2148 		int count;
2149 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2150 
2151 		count = krc_count(krcp);
2152 		count += drain_page_cache(krcp);
2153 		kfree_rcu_monitor(&krcp->monitor_work.work);
2154 
2155 		sc->nr_to_scan -= count;
2156 		freed += count;
2157 
2158 		if (sc->nr_to_scan <= 0)
2159 			break;
2160 	}
2161 
2162 	return freed == 0 ? SHRINK_STOP : freed;
2163 }
2164 
2165 void __init kvfree_rcu_init(void)
2166 {
2167 	int cpu;
2168 	int i, j;
2169 	struct shrinker *kfree_rcu_shrinker;
2170 
2171 	rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
2172 			WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
2173 	WARN_ON(!rcu_reclaim_wq);
2174 
	/* Clamp it to the [0:100] second interval. */
2176 	if (rcu_delay_page_cache_fill_msec < 0 ||
2177 		rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
2178 
2179 		rcu_delay_page_cache_fill_msec =
2180 			clamp(rcu_delay_page_cache_fill_msec, 0,
2181 				(int) (100 * MSEC_PER_SEC));
2182 
2183 		pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
2184 			rcu_delay_page_cache_fill_msec);
2185 	}
2186 
2187 	for_each_possible_cpu(cpu) {
2188 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2189 
2190 		for (i = 0; i < KFREE_N_BATCHES; i++) {
2191 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
2192 			krcp->krw_arr[i].krcp = krcp;
2193 
2194 			for (j = 0; j < FREE_N_CHANNELS; j++)
2195 				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
2196 		}
2197 
2198 		for (i = 0; i < FREE_N_CHANNELS; i++)
2199 			INIT_LIST_HEAD(&krcp->bulk_head[i]);
2200 
2201 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
2202 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
2203 		krcp->initialized = true;
2204 	}
2205 
2206 	kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
2207 	if (!kfree_rcu_shrinker) {
2208 		pr_err("Failed to allocate kfree_rcu() shrinker!\n");
2209 		return;
2210 	}
2211 
2212 	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
2213 	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
2214 
2215 	shrinker_register(kfree_rcu_shrinker);
2216 }
2217 
2218 #endif /* CONFIG_KVFREE_RCU_BATCHED */
2219 
2220