1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * SLUB: A slab allocator that limits cache line use instead of queuing
4 * objects in per cpu and per node lists.
5 *
6 * The allocator synchronizes using per slab locks or atomic operations
7 * and only uses a centralized lock to manage a pool of partial slabs.
8 *
9 * (C) 2007 SGI, Christoph Lameter
10 * (C) 2011 Linux Foundation, Christoph Lameter
11 */
12
13 #include <linux/mm.h>
14 #include <linux/swap.h> /* mm_account_reclaimed_pages() */
15 #include <linux/module.h>
16 #include <linux/bit_spinlock.h>
17 #include <linux/interrupt.h>
18 #include <linux/swab.h>
19 #include <linux/bitops.h>
20 #include <linux/slab.h>
21 #include "slab.h"
22 #include <linux/vmalloc.h>
23 #include <linux/proc_fs.h>
24 #include <linux/seq_file.h>
25 #include <linux/kasan.h>
26 #include <linux/node.h>
27 #include <linux/kmsan.h>
28 #include <linux/cpu.h>
29 #include <linux/cpuset.h>
30 #include <linux/mempolicy.h>
31 #include <linux/ctype.h>
32 #include <linux/stackdepot.h>
33 #include <linux/debugobjects.h>
34 #include <linux/kallsyms.h>
35 #include <linux/kfence.h>
36 #include <linux/memory.h>
37 #include <linux/math64.h>
38 #include <linux/fault-inject.h>
39 #include <linux/kmemleak.h>
40 #include <linux/stacktrace.h>
41 #include <linux/prefetch.h>
42 #include <linux/memcontrol.h>
43 #include <linux/random.h>
44 #include <kunit/test.h>
45 #include <kunit/test-bug.h>
46 #include <linux/sort.h>
47 #include <linux/irq_work.h>
48 #include <linux/kprobes.h>
49 #include <linux/debugfs.h>
50 #include <trace/events/kmem.h>
51
52 #include "internal.h"
53
54 /*
55 * Lock order:
56 * 1. slab_mutex (Global Mutex)
57 * 2. node->list_lock (Spinlock)
58 * 3. kmem_cache->cpu_slab->lock (Local lock)
59 * 4. slab_lock(slab) (Only on some arches)
60 * 5. object_map_lock (Only for debugging)
61 *
62 * slab_mutex
63 *
64 * The role of the slab_mutex is to protect the list of all the slabs
65 * and to synchronize major metadata changes to slab cache structures.
66 * Also synchronizes memory hotplug callbacks.
67 *
68 * slab_lock
69 *
70 * The slab_lock is a wrapper around the page lock, thus it is a bit
71 * spinlock.
72 *
73 * The slab_lock is only used on arches that do not have the ability
74 * to do a cmpxchg_double. It only protects:
75 *
76 * A. slab->freelist -> List of free objects in a slab
77 * B. slab->inuse -> Number of objects in use
78 * C. slab->objects -> Number of objects in slab
79 * D. slab->frozen -> frozen state
80 *
81 * Frozen slabs
82 *
83 * If a slab is frozen then it is exempt from list management. It is
84 * the cpu slab which is actively allocated from by the processor that
85 * froze it and it is not on any list. The processor that froze the
86 * slab is the one who can perform list operations on the slab. Other
87 * processors may put objects onto the freelist but the processor that
88 * froze the slab is the only one that can retrieve the objects from the
89 * slab's freelist.
90 *
91 * CPU partial slabs
92 *
93 * Partially empty slabs are cached on the CPU partial list for
94 * performance reasons: keeping them speeds up the allocation process.
95 * These slabs are not frozen, but are also exempt from list management,
96 * by clearing the SL_partial flag when moving out of the node
97 * partial list. Please see __slab_free() for more details.
98 *
99 * To sum up, the current scheme is:
100 * - node partial slab: SL_partial && !frozen
101 * - cpu partial slab: !SL_partial && !frozen
102 * - cpu slab: !SL_partial && frozen
103 * - full slab: !SL_partial && !frozen
104 *
105 * list_lock
106 *
107 * The list_lock protects the partial and full list on each node and
108 * the partial slab counter. If taken then no new slabs may be added to or
109 * removed from the lists, nor may the number of partial slabs be modified.
110 * (Note that the total number of slabs is an atomic value that may be
111 * modified without taking the list lock).
112 *
113 * The list_lock is a centralized lock and thus we avoid taking it as
114 * much as possible. As long as SLUB does not have to handle partial
115 * slabs, operations can continue without any centralized lock. F.e.
116 * allocating a long series of objects that fill up slabs does not require
117 * the list lock.
118 *
119 * For debug caches, all allocations are forced to go through a list_lock
120 * protected region to serialize against concurrent validation.
121 *
122 * cpu_slab->lock local lock
123 *
124 * This lock protects slowpath manipulation of all kmem_cache_cpu fields
125 * except the stat counters. This is a percpu structure manipulated only by
126 * the local cpu, so the lock protects against being preempted or interrupted
127 * by an irq. Fast path operations rely on lockless operations instead.
128 *
129 * On PREEMPT_RT, the local lock neither disables interrupts nor preemption
130 * which means the lockless fastpath cannot be used as it might interfere with
131 * an in-progress slow path operation. In this case the local lock is always
132 * taken but it still utilizes the freelist for the common operations.
133 *
134 * lockless fastpaths
135 *
136 * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
137 * are fully lockless when satisfied from the percpu slab (and when
138 * cmpxchg_double is possible to use, otherwise slab_lock is taken).
139 * They also don't disable preemption or migration or irqs. They rely on
140 * the transaction id (tid) field to detect being preempted or moved to
141 * another cpu.
142 *
143 * irq, preemption, migration considerations
144 *
145 * Interrupts are disabled as part of list_lock or local_lock operations, or
146 * around the slab_lock operation, in order to make the slab allocator safe
147 * to use in the context of an irq.
148 *
149 * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
150 * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
151 * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
152 * doesn't have to be revalidated in each section protected by the local lock.
153 *
154 * SLUB assigns one slab for allocation to each processor.
155 * Allocations only occur from these slabs called cpu slabs.
156 *
157 * Slabs with free elements are kept on a partial list and during regular
158 * operations no list for full slabs is used. If an object in a full slab is
159 * freed then the slab will show up again on the partial lists.
160 * We track full slabs for debugging purposes though because otherwise we
161 * cannot scan all objects.
162 *
163 * Slabs are freed when they become empty. Teardown and setup is
164 * minimal so we rely on the page allocators per cpu caches for
165 * fast frees and allocs.
166 *
167 * slab->frozen The slab is frozen and exempt from list processing.
168 * This means that the slab is dedicated to a purpose
169 * such as satisfying allocations for a specific
170 * processor. Objects may be freed in the slab while
171 * it is frozen but slab_free will then skip the usual
172 * list operations. It is up to the processor holding
173 * the slab to integrate the slab into the slab lists
174 * when the slab is no longer needed.
175 *
176 * One use of this flag is to mark slabs that are
177 * used for allocations. Then such a slab becomes a cpu
178 * slab. The cpu slab may be equipped with an additional
179 * freelist that allows lockless access to
180 * free objects in addition to the regular freelist
181 * that requires the slab lock.
182 *
183 * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
184 * options set. This moves slab handling out of
185 * the fast path and disables lockless freelists.
186 */
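/*
 * Illustrative sketch of the list_lock usage described above (not an
 * excerpt of a real code path in this file): node partial list
 * manipulation is always bracketed by the per-node spinlock with
 * interrupts disabled, e.g.:
 *
 *	spin_lock_irqsave(&n->list_lock, flags);
 *	list_add(&slab->slab_list, &n->partial);
 *	n->nr_partial++;
 *	spin_unlock_irqrestore(&n->list_lock, flags);
 */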
187
188 /**
189 * enum slab_flags - How the slab flags bits are used.
190 * @SL_locked: Is locked with slab_lock()
191 * @SL_partial: On the per-node partial list
192 * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
193 *
194 * The slab flags share space with the page flags but some bits have
195 * different interpretations. The high bits are used for information
196 * like zone/node/section.
197 */
198 enum slab_flags {
199 SL_locked = PG_locked,
200 SL_partial = PG_workingset, /* Historical reasons for this bit */
201 SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
202 };
203
204 /*
205 * We could simply use migrate_disable()/enable() but, as long as that is a
206 * function call even on !PREEMPT_RT, use the inline preempt_disable() there.
207 */
208 #ifndef CONFIG_PREEMPT_RT
209 #define slub_get_cpu_ptr(var) get_cpu_ptr(var)
210 #define slub_put_cpu_ptr(var) put_cpu_ptr(var)
211 #define USE_LOCKLESS_FAST_PATH() (true)
212 #else
213 #define slub_get_cpu_ptr(var) \
214 ({ \
215 migrate_disable(); \
216 this_cpu_ptr(var); \
217 })
218 #define slub_put_cpu_ptr(var) \
219 do { \
220 (void)(var); \
221 migrate_enable(); \
222 } while (0)
223 #define USE_LOCKLESS_FAST_PATH() (false)
224 #endif
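/*
 * Illustrative usage sketch of the helpers above (assumed, not taken
 * verbatim from this file): callers pin themselves to a cpu, operate on
 * that cpu's kmem_cache_cpu and then unpin:
 *
 *	struct kmem_cache_cpu *c = slub_get_cpu_ptr(s->cpu_slab);
 *
 *	...use *c without being migrated to another cpu...
 *
 *	slub_put_cpu_ptr(s->cpu_slab);
 *
 * On !PREEMPT_RT this is preempt_disable()/enable(); on PREEMPT_RT only
 * migration is disabled, so the section stays preemptible.
 */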
225
226 #ifndef CONFIG_SLUB_TINY
227 #define __fastpath_inline __always_inline
228 #else
229 #define __fastpath_inline
230 #endif
231
232 #ifdef CONFIG_SLUB_DEBUG
233 #ifdef CONFIG_SLUB_DEBUG_ON
234 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
235 #else
236 DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
237 #endif
238 #endif /* CONFIG_SLUB_DEBUG */
239
240 #ifdef CONFIG_NUMA
241 static DEFINE_STATIC_KEY_FALSE(strict_numa);
242 #endif
243
244 /* Structure holding parameters for get_partial() call chain */
245 struct partial_context {
246 gfp_t flags;
247 unsigned int orig_size;
248 void *object;
249 };
250
251 static inline bool kmem_cache_debug(struct kmem_cache *s)
252 {
253 return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
254 }
255
256 void *fixup_red_left(struct kmem_cache *s, void *p)
257 {
258 if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
259 p += s->red_left_pad;
260
261 return p;
262 }
263
264 static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
265 {
266 #ifdef CONFIG_SLUB_CPU_PARTIAL
267 return !kmem_cache_debug(s);
268 #else
269 return false;
270 #endif
271 }
272
273 /*
274 * Issues still to be resolved:
275 *
276 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
277 *
278 * - Variable sizing of the per node arrays
279 */
280
281 /* Enable to log cmpxchg failures */
282 #undef SLUB_DEBUG_CMPXCHG
283
284 #ifndef CONFIG_SLUB_TINY
285 /*
286 * Minimum number of partial slabs. These will be left on the partial
287 * lists even if they are empty. kmem_cache_shrink may reclaim them.
288 */
289 #define MIN_PARTIAL 5
290
291 /*
292 * Maximum number of desirable partial slabs.
293 * The existence of more partial slabs makes kmem_cache_shrink
294 * sort the partial list by the number of objects in use.
295 */
296 #define MAX_PARTIAL 10
297 #else
298 #define MIN_PARTIAL 0
299 #define MAX_PARTIAL 0
300 #endif
301
302 #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
303 SLAB_POISON | SLAB_STORE_USER)
304
305 /*
306 * These debug flags cannot use CMPXCHG because there might be consistency
307 * issues when checking or reading debug information
308 */
309 #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
310 SLAB_TRACE)
311
312
313 /*
314 * Debugging flags that require metadata to be stored in the slab. These get
315 * disabled when slab_debug=O is used and a cache's min order increases with
316 * metadata.
317 */
318 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
319
320 #define OO_SHIFT 16
321 #define OO_MASK ((1 << OO_SHIFT) - 1)
322 #define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */
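/* 32767 == 2^15 - 1, the largest value a 15-bit objects field can hold */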
323
324 /* Internal SLUB flags */
325 /* Poison object */
326 #define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
327 /* Use cmpxchg_double */
328
329 #ifdef system_has_freelist_aba
330 #define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
331 #else
332 #define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED
333 #endif
334
335 /*
336 * Tracking user of a slab.
337 */
338 #define TRACK_ADDRS_COUNT 16
339 struct track {
340 unsigned long addr; /* Called from address */
341 #ifdef CONFIG_STACKDEPOT
342 depot_stack_handle_t handle;
343 #endif
344 int cpu; /* Was running on cpu */
345 int pid; /* Pid context */
346 unsigned long when; /* When did the operation occur */
347 };
348
349 enum track_item { TRACK_ALLOC, TRACK_FREE };
350
351 #ifdef SLAB_SUPPORTS_SYSFS
352 static int sysfs_slab_add(struct kmem_cache *);
353 static int sysfs_slab_alias(struct kmem_cache *, const char *);
354 #else
355 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
356 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
357 { return 0; }
358 #endif
359
360 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
361 static void debugfs_slab_add(struct kmem_cache *);
362 #else
363 static inline void debugfs_slab_add(struct kmem_cache *s) { }
364 #endif
365
366 enum stat_item {
367 ALLOC_PCS, /* Allocation from percpu sheaf */
368 ALLOC_FASTPATH, /* Allocation from cpu slab */
369 ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
370 FREE_PCS, /* Free to percpu sheaf */
371 FREE_RCU_SHEAF, /* Free to rcu_free sheaf */
372 FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */
373 FREE_FASTPATH, /* Free to cpu slab */
374 FREE_SLOWPATH, /* Freeing not to cpu slab */
375 FREE_FROZEN, /* Freeing to frozen slab */
376 FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */
377 FREE_REMOVE_PARTIAL, /* Freeing removes last object */
378 ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */
379 ALLOC_SLAB, /* Cpu slab acquired from page allocator */
380 ALLOC_REFILL, /* Refill cpu slab from slab freelist */
381 ALLOC_NODE_MISMATCH, /* Switching cpu slab */
382 FREE_SLAB, /* Slab freed to the page allocator */
383 CPUSLAB_FLUSH, /* Abandoning of the cpu slab */
384 DEACTIVATE_FULL, /* Cpu slab was full when deactivated */
385 DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */
386 DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */
387 DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */
388 DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
389 DEACTIVATE_BYPASS, /* Implicit deactivation */
390 ORDER_FALLBACK, /* Number of times fallback was necessary */
391 CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
392 CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */
393 CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */
394 CPU_PARTIAL_FREE, /* Refill cpu partial on free */
395 CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
396 CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
397 SHEAF_FLUSH, /* Objects flushed from a sheaf */
398 SHEAF_REFILL, /* Objects refilled to a sheaf */
399 SHEAF_ALLOC, /* Allocation of an empty sheaf */
400 SHEAF_FREE, /* Freeing of an empty sheaf */
401 BARN_GET, /* Got full sheaf from barn */
402 BARN_GET_FAIL, /* Failed to get full sheaf from barn */
403 BARN_PUT, /* Put full sheaf to barn */
404 BARN_PUT_FAIL, /* Failed to put full sheaf to barn */
405 SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */
406 SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */
407 SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */
408 SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */
409 SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */
410 NR_SLUB_STAT_ITEMS
411 };
412
413 #ifndef CONFIG_SLUB_TINY
414 /*
415 * When changing the layout, make sure freelist and tid are still compatible
416 * with this_cpu_cmpxchg_double() alignment requirements.
417 */
418 struct kmem_cache_cpu {
419 union {
420 struct {
421 void **freelist; /* Pointer to next available object */
422 unsigned long tid; /* Globally unique transaction id */
423 };
424 freelist_aba_t freelist_tid;
425 };
426 struct slab *slab; /* The slab from which we are allocating */
427 #ifdef CONFIG_SLUB_CPU_PARTIAL
428 struct slab *partial; /* Partially allocated slabs */
429 #endif
430 local_trylock_t lock; /* Protects the fields above */
431 #ifdef CONFIG_SLUB_STATS
432 unsigned int stat[NR_SLUB_STAT_ITEMS];
433 #endif
434 };
435 #endif /* CONFIG_SLUB_TINY */
436
437 static inline void stat(const struct kmem_cache *s, enum stat_item si)
438 {
439 #ifdef CONFIG_SLUB_STATS
440 /*
441 * The rmw is racy on a preemptible kernel but this is acceptable, so
442 * avoid this_cpu_add()'s irq-disable overhead.
443 */
444 raw_cpu_inc(s->cpu_slab->stat[si]);
445 #endif
446 }
447
448 static inline
449 void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
450 {
451 #ifdef CONFIG_SLUB_STATS
452 raw_cpu_add(s->cpu_slab->stat[si], v);
453 #endif
454 }
455
456 #define MAX_FULL_SHEAVES 10
457 #define MAX_EMPTY_SHEAVES 10
458
459 struct node_barn {
460 spinlock_t lock;
461 struct list_head sheaves_full;
462 struct list_head sheaves_empty;
463 unsigned int nr_full;
464 unsigned int nr_empty;
465 };
466
467 struct slab_sheaf {
468 union {
469 struct rcu_head rcu_head;
470 struct list_head barn_list;
471 /* only used for prefilled sheaves */
472 unsigned int capacity;
473 };
474 struct kmem_cache *cache;
475 unsigned int size;
476 int node; /* only used for rcu_sheaf */
477 void *objects[];
478 };
479
480 struct slub_percpu_sheaves {
481 local_trylock_t lock;
482 struct slab_sheaf *main; /* never NULL when unlocked */
483 struct slab_sheaf *spare; /* empty or full, may be NULL */
484 struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
485 };
486
487 /*
488 * The slab lists for all objects.
489 */
490 struct kmem_cache_node {
491 spinlock_t list_lock;
492 unsigned long nr_partial;
493 struct list_head partial;
494 #ifdef CONFIG_SLUB_DEBUG
495 atomic_long_t nr_slabs;
496 atomic_long_t total_objects;
497 struct list_head full;
498 #endif
499 struct node_barn *barn;
500 };
501
502 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
503 {
504 return s->node[node];
505 }
506
507 /* Get the barn of the current cpu's memory node */
508 static inline struct node_barn *get_barn(struct kmem_cache *s)
509 {
510 return get_node(s, numa_mem_id())->barn;
511 }
512
513 /*
514 * Iterator over all nodes. The body will be executed for each node that has
515 * a kmem_cache_node structure allocated (which is true for all online nodes)
516 */
517 #define for_each_kmem_cache_node(__s, __node, __n) \
518 for (__node = 0; __node < nr_node_ids; __node++) \
519 if ((__n = get_node(__s, __node)))
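/*
 * Illustrative use of the iterator above (hypothetical caller, not from
 * this file):
 *
 *	struct kmem_cache_node *n;
 *	unsigned long nr = 0;
 *	int node;
 *
 *	for_each_kmem_cache_node(s, node, n)
 *		nr += n->nr_partial;
 */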
520
521 /*
522 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
523 * Corresponds to node_state[N_MEMORY], but can temporarily
524 * differ during memory hotplug/hotremove operations.
525 * Protected by slab_mutex.
526 */
527 static nodemask_t slab_nodes;
528
529 /*
530 * Workqueue used for flush_cpu_slab().
531 */
532 static struct workqueue_struct *flushwq;
533
534 struct slub_flush_work {
535 struct work_struct work;
536 struct kmem_cache *s;
537 bool skip;
538 };
539
540 static DEFINE_MUTEX(flush_lock);
541 static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
542
543 /********************************************************************
544 * Core slab cache functions
545 *******************************************************************/
546
547 /*
548 * Returns freelist pointer (ptr). With hardening, this is obfuscated
549 * with an XOR of the address where the pointer is held and a per-cache
550 * random number.
551 */
552 static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
553 void *ptr, unsigned long ptr_addr)
554 {
555 unsigned long encoded;
556
557 #ifdef CONFIG_SLAB_FREELIST_HARDENED
558 encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
559 #else
560 encoded = (unsigned long)ptr;
561 #endif
562 return (freeptr_t){.v = encoded};
563 }
564
565 static inline void *freelist_ptr_decode(const struct kmem_cache *s,
566 freeptr_t ptr, unsigned long ptr_addr)
567 {
568 void *decoded;
569
570 #ifdef CONFIG_SLAB_FREELIST_HARDENED
571 decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
572 #else
573 decoded = (void *)ptr.v;
574 #endif
575 return decoded;
576 }
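/*
 * Note on the two helpers above: with CONFIG_SLAB_FREELIST_HARDENED the
 * encoding is a plain XOR mask, so decoding with the same cache and the
 * same slot address is its own inverse (illustrative, not a real call
 * site):
 *
 *	freeptr_t enc = freelist_ptr_encode(s, p, ptr_addr);
 *	void *dec = freelist_ptr_decode(s, enc, ptr_addr);	// dec == p
 *
 * A value read back through a different address, or after s->random
 * changes, decodes to an invalid pointer, which makes freelist
 * corruption harder to exploit.
 */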
577
578 static inline void *get_freepointer(struct kmem_cache *s, void *object)
579 {
580 unsigned long ptr_addr;
581 freeptr_t p;
582
583 object = kasan_reset_tag(object);
584 ptr_addr = (unsigned long)object + s->offset;
585 p = *(freeptr_t *)(ptr_addr);
586 return freelist_ptr_decode(s, p, ptr_addr);
587 }
588
589 #ifndef CONFIG_SLUB_TINY
590 static void prefetch_freepointer(const struct kmem_cache *s, void *object)
591 {
592 prefetchw(object + s->offset);
593 }
594 #endif
595
596 /*
597 * When running under KMSAN, get_freepointer_safe() may return an uninitialized
598 * pointer value in the case the current thread loses the race for the next
599 * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
600 * slab_alloc_node() will fail, so the uninitialized value won't be used, but
601 * KMSAN will still check all arguments of cmpxchg because of imperfect
602 * handling of inline assembly.
603 * To work around this problem, we apply __no_kmsan_checks to ensure that
604 * get_freepointer_safe() returns initialized memory.
605 */
606 __no_kmsan_checks
607 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
608 {
609 unsigned long freepointer_addr;
610 freeptr_t p;
611
612 if (!debug_pagealloc_enabled_static())
613 return get_freepointer(s, object);
614
615 object = kasan_reset_tag(object);
616 freepointer_addr = (unsigned long)object + s->offset;
617 copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
618 return freelist_ptr_decode(s, p, freepointer_addr);
619 }
620
621 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
622 {
623 unsigned long freeptr_addr = (unsigned long)object + s->offset;
624
625 #ifdef CONFIG_SLAB_FREELIST_HARDENED
626 BUG_ON(object == fp); /* naive detection of double free or corruption */
627 #endif
628
629 freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
630 *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
631 }
632
633 /*
634 * See comment in calculate_sizes().
635 */
636 static inline bool freeptr_outside_object(struct kmem_cache *s)
637 {
638 return s->offset >= s->inuse;
639 }
640
641 /*
642 * Return the offset of the end of the info block, which is inuse plus the
643 * free pointer if the free pointer does not overlap with the object.
644 */
645 static inline unsigned int get_info_end(struct kmem_cache *s)
646 {
647 if (freeptr_outside_object(s))
648 return s->inuse + sizeof(void *);
649 else
650 return s->inuse;
651 }
652
653 /* Loop over all objects in a slab */
654 #define for_each_object(__p, __s, __addr, __objects) \
655 for (__p = fixup_red_left(__s, __addr); \
656 __p < (__addr) + (__objects) * (__s)->size; \
657 __p += (__s)->size)
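/*
 * Illustrative use of for_each_object() (hypothetical, mirrors how the
 * debug code below walks a slab):
 *
 *	void *addr = slab_address(slab);
 *	void *p;
 *
 *	for_each_object(p, s, addr, slab->objects)
 *		...inspect the object at p...
 */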
658
659 static inline unsigned int order_objects(unsigned int order, unsigned int size)
660 {
661 return ((unsigned int)PAGE_SIZE << order) / size;
662 }
663
664 static inline struct kmem_cache_order_objects oo_make(unsigned int order,
665 unsigned int size)
666 {
667 struct kmem_cache_order_objects x = {
668 (order << OO_SHIFT) + order_objects(order, size)
669 };
670
671 return x;
672 }
673
674 static inline unsigned int oo_order(struct kmem_cache_order_objects x)
675 {
676 return x.x >> OO_SHIFT;
677 }
678
679 static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
680 {
681 return x.x & OO_MASK;
682 }
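/*
 * Worked example for the packed order/objects encoding above
 * (illustrative, assuming a 4K PAGE_SIZE): order_objects(1, 256) is
 * (4096 << 1) / 256 == 32, so oo_make(1, 256) stores
 * (1 << OO_SHIFT) + 32 == 0x10020. oo_order() then recovers 1 and
 * oo_objects() recovers 32 from that single word.
 */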
683
684 #ifdef CONFIG_SLUB_CPU_PARTIAL
685 static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
686 {
687 unsigned int nr_slabs;
688
689 s->cpu_partial = nr_objects;
690
691 /*
692 * We take the number of objects but actually limit the number of
693 * slabs on the per cpu partial list, in order to limit excessive
694 * growth of the list. For simplicity we assume that the slabs will
695 * be half-full.
696 */
697 nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
698 s->cpu_partial_slabs = nr_slabs;
699 }
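/*
 * Example for the sizing above (illustrative numbers): with
 * oo_objects(s->oo) == 32 objects per slab and nr_objects == 120, the
 * assumption of half-full slabs gives DIV_ROUND_UP(120 * 2, 32) == 8
 * slabs allowed on the per cpu partial list.
 */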
700
701 static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
702 {
703 return s->cpu_partial_slabs;
704 }
705 #else
706 static inline void
707 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
708 {
709 }
710
711 static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
712 {
713 return 0;
714 }
715 #endif /* CONFIG_SLUB_CPU_PARTIAL */
716
717 /*
718 * If network-based swap is enabled, slub must keep track of whether memory
719 * was allocated from pfmemalloc reserves.
720 */
721 static inline bool slab_test_pfmemalloc(const struct slab *slab)
722 {
723 return test_bit(SL_pfmemalloc, &slab->flags.f);
724 }
725
726 static inline void slab_set_pfmemalloc(struct slab *slab)
727 {
728 set_bit(SL_pfmemalloc, &slab->flags.f);
729 }
730
731 static inline void __slab_clear_pfmemalloc(struct slab *slab)
732 {
733 __clear_bit(SL_pfmemalloc, &slab->flags.f);
734 }
735
736 /*
737 * Per slab locking using the pagelock
738 */
739 static __always_inline void slab_lock(struct slab *slab)
740 {
741 bit_spin_lock(SL_locked, &slab->flags.f);
742 }
743
744 static __always_inline void slab_unlock(struct slab *slab)
745 {
746 bit_spin_unlock(SL_locked, &slab->flags.f);
747 }
748
749 static inline bool
750 __update_freelist_fast(struct slab *slab,
751 void *freelist_old, unsigned long counters_old,
752 void *freelist_new, unsigned long counters_new)
753 {
754 #ifdef system_has_freelist_aba
755 freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
756 freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
757
758 return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
759 #else
760 return false;
761 #endif
762 }
763
764 static inline bool
765 __update_freelist_slow(struct slab *slab,
766 void *freelist_old, unsigned long counters_old,
767 void *freelist_new, unsigned long counters_new)
768 {
769 bool ret = false;
770
771 slab_lock(slab);
772 if (slab->freelist == freelist_old &&
773 slab->counters == counters_old) {
774 slab->freelist = freelist_new;
775 slab->counters = counters_new;
776 ret = true;
777 }
778 slab_unlock(slab);
779
780 return ret;
781 }
782
783 /*
784 * Interrupts must be disabled (for the fallback code to work right), typically
785 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
786 * part of bit_spin_lock(), is sufficient because the policy is not to allow any
787 * allocation/ free operation in hardirq context. Therefore nothing can
788 * interrupt the operation.
789 */
790 static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
791 void *freelist_old, unsigned long counters_old,
792 void *freelist_new, unsigned long counters_new,
793 const char *n)
794 {
795 bool ret;
796
797 if (USE_LOCKLESS_FAST_PATH())
798 lockdep_assert_irqs_disabled();
799
800 if (s->flags & __CMPXCHG_DOUBLE) {
801 ret = __update_freelist_fast(slab, freelist_old, counters_old,
802 freelist_new, counters_new);
803 } else {
804 ret = __update_freelist_slow(slab, freelist_old, counters_old,
805 freelist_new, counters_new);
806 }
807 if (likely(ret))
808 return true;
809
810 cpu_relax();
811 stat(s, CMPXCHG_DOUBLE_FAIL);
812
813 #ifdef SLUB_DEBUG_CMPXCHG
814 pr_info("%s %s: cmpxchg double redo ", n, s->name);
815 #endif
816
817 return false;
818 }
819
820 static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
821 void *freelist_old, unsigned long counters_old,
822 void *freelist_new, unsigned long counters_new,
823 const char *n)
824 {
825 bool ret;
826
827 if (s->flags & __CMPXCHG_DOUBLE) {
828 ret = __update_freelist_fast(slab, freelist_old, counters_old,
829 freelist_new, counters_new);
830 } else {
831 unsigned long flags;
832
833 local_irq_save(flags);
834 ret = __update_freelist_slow(slab, freelist_old, counters_old,
835 freelist_new, counters_new);
836 local_irq_restore(flags);
837 }
838 if (likely(ret))
839 return true;
840
841 cpu_relax();
842 stat(s, CMPXCHG_DOUBLE_FAIL);
843
844 #ifdef SLUB_DEBUG_CMPXCHG
845 pr_info("%s %s: cmpxchg double redo ", n, s->name);
846 #endif
847
848 return false;
849 }
850
851 /*
852 * kmalloc caches have fixed sizes (mostly powers of 2), and the kmalloc()
853 * API family rounds up the real request size to these fixed ones, so the
854 * allocated area can be larger than what was requested. Save the original
855 * request size in the metadata area, for better debugging and sanity checks.
856 */
857 static inline void set_orig_size(struct kmem_cache *s,
858 void *object, unsigned int orig_size)
859 {
860 void *p = kasan_reset_tag(object);
861
862 if (!slub_debug_orig_size(s))
863 return;
864
865 p += get_info_end(s);
866 p += sizeof(struct track) * 2;
867
868 *(unsigned int *)p = orig_size;
869 }
870
871 static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
872 {
873 void *p = kasan_reset_tag(object);
874
875 if (is_kfence_address(object))
876 return kfence_ksize(object);
877
878 if (!slub_debug_orig_size(s))
879 return s->object_size;
880
881 p += get_info_end(s);
882 p += sizeof(struct track) * 2;
883
884 return *(unsigned int *)p;
885 }
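/*
 * Metadata layout assumed by the two helpers above (illustrative,
 * requires slub_debug_orig_size(), i.e. SLAB_STORE_USER on a kmalloc
 * cache):
 *
 *	object + get_info_end(s)                            alloc track
 *	object + get_info_end(s) + sizeof(struct track)     free track
 *	object + get_info_end(s) + 2 * sizeof(struct track) orig_size
 */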
886
887 #ifdef CONFIG_SLUB_DEBUG
888
889 /*
890 * For debugging context when we want to check if the struct slab pointer
891 * appears to be valid.
892 */
893 static inline bool validate_slab_ptr(struct slab *slab)
894 {
895 return PageSlab(slab_page(slab));
896 }
897
898 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
899 static DEFINE_SPINLOCK(object_map_lock);
900
901 static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
902 struct slab *slab)
903 {
904 void *addr = slab_address(slab);
905 void *p;
906
907 bitmap_zero(obj_map, slab->objects);
908
909 for (p = slab->freelist; p; p = get_freepointer(s, p))
910 set_bit(__obj_to_index(s, addr, p), obj_map);
911 }
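/*
 * Illustrative use of the object map filled above (hypothetical): a set
 * bit means the object is on the freelist, so
 *
 *	bool in_use = !test_bit(__obj_to_index(s, addr, p), obj_map);
 *
 * tells whether object p is currently allocated.
 */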
912
913 #if IS_ENABLED(CONFIG_KUNIT)
914 static bool slab_add_kunit_errors(void)
915 {
916 struct kunit_resource *resource;
917
918 if (!kunit_get_current_test())
919 return false;
920
921 resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
922 if (!resource)
923 return false;
924
925 (*(int *)resource->data)++;
926 kunit_put_resource(resource);
927 return true;
928 }
929
930 bool slab_in_kunit_test(void)
931 {
932 struct kunit_resource *resource;
933
934 if (!kunit_get_current_test())
935 return false;
936
937 resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
938 if (!resource)
939 return false;
940
941 kunit_put_resource(resource);
942 return true;
943 }
944 #else
945 static inline bool slab_add_kunit_errors(void) { return false; }
946 #endif
947
948 static inline unsigned int size_from_object(struct kmem_cache *s)
949 {
950 if (s->flags & SLAB_RED_ZONE)
951 return s->size - s->red_left_pad;
952
953 return s->size;
954 }
955
956 static inline void *restore_red_left(struct kmem_cache *s, void *p)
957 {
958 if (s->flags & SLAB_RED_ZONE)
959 p -= s->red_left_pad;
960
961 return p;
962 }
963
964 /*
965 * Debug settings:
966 */
967 #if defined(CONFIG_SLUB_DEBUG_ON)
968 static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
969 #else
970 static slab_flags_t slub_debug;
971 #endif
972
973 static char *slub_debug_string;
974 static int disable_higher_order_debug;
975
976 /*
977 * slub is about to manipulate internal object metadata. This memory lies
978 * outside the range of the allocated object, so accessing it would normally
979 * be reported by kasan as a bounds error. metadata_access_enable() is used
980 * to tell kasan that these accesses are OK.
981 */
982 static inline void metadata_access_enable(void)
983 {
984 kasan_disable_current();
985 kmsan_disable_current();
986 }
987
988 static inline void metadata_access_disable(void)
989 {
990 kmsan_enable_current();
991 kasan_enable_current();
992 }
993
994 /*
995 * Object debugging
996 */
997
998 /* Verify that a pointer has an address that is valid within a slab page */
999 static inline int check_valid_pointer(struct kmem_cache *s,
1000 struct slab *slab, void *object)
1001 {
1002 void *base;
1003
1004 if (!object)
1005 return 1;
1006
1007 base = slab_address(slab);
1008 object = kasan_reset_tag(object);
1009 object = restore_red_left(s, object);
1010 if (object < base || object >= base + slab->objects * s->size ||
1011 (object - base) % s->size) {
1012 return 0;
1013 }
1014
1015 return 1;
1016 }
1017
1018 static void print_section(char *level, char *text, u8 *addr,
1019 unsigned int length)
1020 {
1021 metadata_access_enable();
1022 print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
1023 16, 1, kasan_reset_tag((void *)addr), length, 1);
1024 metadata_access_disable();
1025 }
1026
1027 static struct track *get_track(struct kmem_cache *s, void *object,
1028 enum track_item alloc)
1029 {
1030 struct track *p;
1031
1032 p = object + get_info_end(s);
1033
1034 return kasan_reset_tag(p + alloc);
1035 }
1036
1037 #ifdef CONFIG_STACKDEPOT
1038 static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
1039 {
1040 depot_stack_handle_t handle;
1041 unsigned long entries[TRACK_ADDRS_COUNT];
1042 unsigned int nr_entries;
1043
1044 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
1045 handle = stack_depot_save(entries, nr_entries, gfp_flags);
1046
1047 return handle;
1048 }
1049 #else
1050 static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
1051 {
1052 return 0;
1053 }
1054 #endif
1055
1056 static void set_track_update(struct kmem_cache *s, void *object,
1057 enum track_item alloc, unsigned long addr,
1058 depot_stack_handle_t handle)
1059 {
1060 struct track *p = get_track(s, object, alloc);
1061
1062 #ifdef CONFIG_STACKDEPOT
1063 p->handle = handle;
1064 #endif
1065 p->addr = addr;
1066 p->cpu = smp_processor_id();
1067 p->pid = current->pid;
1068 p->when = jiffies;
1069 }
1070
1071 static __always_inline void set_track(struct kmem_cache *s, void *object,
1072 enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
1073 {
1074 depot_stack_handle_t handle = set_track_prepare(gfp_flags);
1075
1076 set_track_update(s, object, alloc, addr, handle);
1077 }
1078
1079 static void init_tracking(struct kmem_cache *s, void *object)
1080 {
1081 struct track *p;
1082
1083 if (!(s->flags & SLAB_STORE_USER))
1084 return;
1085
1086 p = get_track(s, object, TRACK_ALLOC);
1087 memset(p, 0, 2*sizeof(struct track));
1088 }
1089
1090 static void print_track(const char *s, struct track *t, unsigned long pr_time)
1091 {
1092 depot_stack_handle_t handle __maybe_unused;
1093
1094 if (!t->addr)
1095 return;
1096
1097 pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
1098 s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
1099 #ifdef CONFIG_STACKDEPOT
1100 handle = READ_ONCE(t->handle);
1101 if (handle)
1102 stack_depot_print(handle);
1103 else
1104 pr_err("object allocation/free stack trace missing\n");
1105 #endif
1106 }
1107
1108 void print_tracking(struct kmem_cache *s, void *object)
1109 {
1110 unsigned long pr_time = jiffies;
1111 if (!(s->flags & SLAB_STORE_USER))
1112 return;
1113
1114 print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
1115 print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
1116 }
1117
1118 static void print_slab_info(const struct slab *slab)
1119 {
1120 pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
1121 slab, slab->objects, slab->inuse, slab->freelist,
1122 &slab->flags.f);
1123 }
1124
1125 void skip_orig_size_check(struct kmem_cache *s, const void *object)
1126 {
1127 set_orig_size(s, (void *)object, s->object_size);
1128 }
1129
1130 static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
1131 {
1132 struct va_format vaf;
1133 va_list args;
1134
1135 va_copy(args, argsp);
1136 vaf.fmt = fmt;
1137 vaf.va = &args;
1138 pr_err("=============================================================================\n");
1139 pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
1140 pr_err("-----------------------------------------------------------------------------\n\n");
1141 va_end(args);
1142 }
1143
1144 static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
1145 {
1146 va_list args;
1147
1148 va_start(args, fmt);
1149 __slab_bug(s, fmt, args);
1150 va_end(args);
1151 }
1152
1153 __printf(2, 3)
1154 static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
1155 {
1156 struct va_format vaf;
1157 va_list args;
1158
1159 if (slab_add_kunit_errors())
1160 return;
1161
1162 va_start(args, fmt);
1163 vaf.fmt = fmt;
1164 vaf.va = &args;
1165 pr_err("FIX %s: %pV\n", s->name, &vaf);
1166 va_end(args);
1167 }
1168
1169 static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
1170 {
1171 unsigned int off; /* Offset of last byte */
1172 u8 *addr = slab_address(slab);
1173
1174 print_tracking(s, p);
1175
1176 print_slab_info(slab);
1177
1178 pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
1179 p, p - addr, get_freepointer(s, p));
1180
1181 if (s->flags & SLAB_RED_ZONE)
1182 print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
1183 s->red_left_pad);
1184 else if (p > addr + 16)
1185 print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
1186
1187 print_section(KERN_ERR, "Object ", p,
1188 min_t(unsigned int, s->object_size, PAGE_SIZE));
1189 if (s->flags & SLAB_RED_ZONE)
1190 print_section(KERN_ERR, "Redzone ", p + s->object_size,
1191 s->inuse - s->object_size);
1192
1193 off = get_info_end(s);
1194
1195 if (s->flags & SLAB_STORE_USER)
1196 off += 2 * sizeof(struct track);
1197
1198 if (slub_debug_orig_size(s))
1199 off += sizeof(unsigned int);
1200
1201 off += kasan_metadata_size(s, false);
1202
1203 if (off != size_from_object(s))
1204 /* Beginning of the filler is the free pointer */
1205 print_section(KERN_ERR, "Padding ", p + off,
1206 size_from_object(s) - off);
1207 }
1208
1209 static void object_err(struct kmem_cache *s, struct slab *slab,
1210 u8 *object, const char *reason)
1211 {
1212 if (slab_add_kunit_errors())
1213 return;
1214
1215 slab_bug(s, reason);
1216 if (!object || !check_valid_pointer(s, slab, object)) {
1217 print_slab_info(slab);
1218 pr_err("Invalid pointer 0x%p\n", object);
1219 } else {
1220 print_trailer(s, slab, object);
1221 }
1222 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1223
1224 WARN_ON(1);
1225 }
1226
1227 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1228 void **freelist, void *nextfree)
1229 {
1230 if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
1231 !check_valid_pointer(s, slab, nextfree) && freelist) {
1232 object_err(s, slab, *freelist, "Freechain corrupt");
1233 *freelist = NULL;
1234 slab_fix(s, "Isolate corrupted freechain");
1235 return true;
1236 }
1237
1238 return false;
1239 }
1240
1241 static void __slab_err(struct slab *slab)
1242 {
1243 if (slab_in_kunit_test())
1244 return;
1245
1246 print_slab_info(slab);
1247 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1248
1249 WARN_ON(1);
1250 }
1251
1252 static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
1253 const char *fmt, ...)
1254 {
1255 va_list args;
1256
1257 if (slab_add_kunit_errors())
1258 return;
1259
1260 va_start(args, fmt);
1261 __slab_bug(s, fmt, args);
1262 va_end(args);
1263
1264 __slab_err(slab);
1265 }
1266
1267 static void init_object(struct kmem_cache *s, void *object, u8 val)
1268 {
1269 u8 *p = kasan_reset_tag(object);
1270 unsigned int poison_size = s->object_size;
1271
1272 if (s->flags & SLAB_RED_ZONE) {
1273 /*
1274 * Here and below, avoid overwriting the KMSAN shadow. Keeping
1275 * the shadow makes it possible to distinguish uninit-value
1276 * from use-after-free.
1277 */
1278 memset_no_sanitize_memory(p - s->red_left_pad, val,
1279 s->red_left_pad);
1280
1281 if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
1282 /*
1283 * Redzone the extra space kmalloc allocated beyond the
1284 * requested size, and limit the poison size to
1285 * the original request size accordingly.
1286 */
1287 poison_size = get_orig_size(s, object);
1288 }
1289 }
1290
1291 if (s->flags & __OBJECT_POISON) {
1292 memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1);
1293 memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1);
1294 }
1295
1296 if (s->flags & SLAB_RED_ZONE)
1297 memset_no_sanitize_memory(p + poison_size, val,
1298 s->inuse - poison_size);
1299 }
1300
1301 static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
1302 void *from, void *to)
1303 {
1304 slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
1305 memset(from, data, to - from);
1306 }
1307
1308 #ifdef CONFIG_KMSAN
1309 #define pad_check_attributes noinline __no_kmsan_checks
1310 #else
1311 #define pad_check_attributes
1312 #endif
1313
1314 static pad_check_attributes int
1315 check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
1316 u8 *object, const char *what, u8 *start, unsigned int value,
1317 unsigned int bytes, bool slab_obj_print)
1318 {
1319 u8 *fault;
1320 u8 *end;
1321 u8 *addr = slab_address(slab);
1322
1323 metadata_access_enable();
1324 fault = memchr_inv(kasan_reset_tag(start), value, bytes);
1325 metadata_access_disable();
1326 if (!fault)
1327 return 1;
1328
1329 end = start + bytes;
1330 while (end > fault && end[-1] == value)
1331 end--;
1332
1333 if (slab_add_kunit_errors())
1334 goto skip_bug_print;
1335
1336 pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
1337 what, fault, end - 1, fault - addr, fault[0], value);
1338
1339 if (slab_obj_print)
1340 object_err(s, slab, object, "Object corrupt");
1341
1342 skip_bug_print:
1343 restore_bytes(s, what, value, fault, end);
1344 return 0;
1345 }
1346
1347 /*
1348 * Object layout:
1349 *
1350 * object address
1351 * Bytes of the object to be managed.
1352 * If the freepointer may overlay the object then the free
1353 * pointer is at the middle of the object.
1354 *
1355 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
1356 * 0xa5 (POISON_END)
1357 *
1358 * object + s->object_size
1359 * Padding to reach word boundary. This is also used for Redzoning.
1360 * Padding is extended by another word if Redzoning is enabled and
1361 * object_size == inuse.
1362 *
1363 * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
1364 * 0xcc (SLUB_RED_ACTIVE) for objects in use.
1365 *
1366 * object + s->inuse
1367 * Meta data starts here.
1368 *
1369 * A. Free pointer (if we cannot overwrite object on free)
1370 * B. Tracking data for SLAB_STORE_USER
1371 * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
1372 * D. Padding to reach required alignment boundary or at minimum
1373 * one word if debugging is on to be able to detect writes
1374 * before the word boundary.
1375 *
1376 * Padding is done using 0x5a (POISON_INUSE)
1377 *
1378 * object + s->size
1379 * Nothing is used beyond s->size.
1380 *
1381 * If slabcaches are merged then the object_size and inuse boundaries are mostly
1382 * ignored. And therefore no slab options that rely on these boundaries
1383 * may be used with merged slabcaches.
1384 */
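/*
 * Concrete sketch of the layout above (illustrative, assuming
 * SLAB_RED_ZONE and SLAB_STORE_USER on a small cache):
 *
 *	[left redzone (red_left_pad)]
 *	[object: 0 .. object_size-1, poisoned 0x6b.. ending in 0xa5 when free]
 *	[right redzone / padding up to s->inuse]
 *	[free pointer, if it cannot live inside the object]
 *	[2 x struct track][orig_size for kmalloc caches][padding to s->size]
 */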
1385
1386 static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
1387 {
1388 unsigned long off = get_info_end(s); /* The end of info */
1389
1390 if (s->flags & SLAB_STORE_USER) {
1391 /* We also have user information there */
1392 off += 2 * sizeof(struct track);
1393
1394 if (s->flags & SLAB_KMALLOC)
1395 off += sizeof(unsigned int);
1396 }
1397
1398 off += kasan_metadata_size(s, false);
1399
1400 if (size_from_object(s) == off)
1401 return 1;
1402
1403 return check_bytes_and_report(s, slab, p, "Object padding",
1404 p + off, POISON_INUSE, size_from_object(s) - off, true);
1405 }
1406
1407 /* Check the pad bytes at the end of a slab page */
1408 static pad_check_attributes void
1409 slab_pad_check(struct kmem_cache *s, struct slab *slab)
1410 {
1411 u8 *start;
1412 u8 *fault;
1413 u8 *end;
1414 u8 *pad;
1415 int length;
1416 int remainder;
1417
1418 if (!(s->flags & SLAB_POISON))
1419 return;
1420
1421 start = slab_address(slab);
1422 length = slab_size(slab);
1423 end = start + length;
1424 remainder = length % s->size;
1425 if (!remainder)
1426 return;
1427
1428 pad = end - remainder;
1429 metadata_access_enable();
1430 fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
1431 metadata_access_disable();
1432 if (!fault)
1433 return;
1434 while (end > fault && end[-1] == POISON_INUSE)
1435 end--;
1436
1437 slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
1438 fault, end - 1, fault - start);
1439 print_section(KERN_ERR, "Padding ", pad, remainder);
1440 __slab_err(slab);
1441
1442 restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
1443 }
1444
1445 static int check_object(struct kmem_cache *s, struct slab *slab,
1446 void *object, u8 val)
1447 {
1448 u8 *p = object;
1449 u8 *endobject = object + s->object_size;
1450 unsigned int orig_size, kasan_meta_size;
1451 int ret = 1;
1452
1453 if (s->flags & SLAB_RED_ZONE) {
1454 if (!check_bytes_and_report(s, slab, object, "Left Redzone",
1455 object - s->red_left_pad, val, s->red_left_pad, ret))
1456 ret = 0;
1457
1458 if (!check_bytes_and_report(s, slab, object, "Right Redzone",
1459 endobject, val, s->inuse - s->object_size, ret))
1460 ret = 0;
1461
1462 if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
1463 orig_size = get_orig_size(s, object);
1464
1465 if (s->object_size > orig_size &&
1466 !check_bytes_and_report(s, slab, object,
1467 "kmalloc Redzone", p + orig_size,
1468 val, s->object_size - orig_size, ret)) {
1469 ret = 0;
1470 }
1471 }
1472 } else {
1473 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
1474 if (!check_bytes_and_report(s, slab, p, "Alignment padding",
1475 endobject, POISON_INUSE,
1476 s->inuse - s->object_size, ret))
1477 ret = 0;
1478 }
1479 }
1480
1481 if (s->flags & SLAB_POISON) {
1482 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
1483 /*
1484 * KASAN can save its free meta data inside of the
1485 * object at offset 0. Thus, skip checking the part of
1486 * the redzone that overlaps with the meta data.
1487 */
1488 kasan_meta_size = kasan_metadata_size(s, true);
1489 if (kasan_meta_size < s->object_size - 1 &&
1490 !check_bytes_and_report(s, slab, p, "Poison",
1491 p + kasan_meta_size, POISON_FREE,
1492 s->object_size - kasan_meta_size - 1, ret))
1493 ret = 0;
1494 if (kasan_meta_size < s->object_size &&
1495 !check_bytes_and_report(s, slab, p, "End Poison",
1496 p + s->object_size - 1, POISON_END, 1, ret))
1497 ret = 0;
1498 }
1499 /*
1500 * check_pad_bytes cleans up on its own.
1501 */
1502 if (!check_pad_bytes(s, slab, p))
1503 ret = 0;
1504 }
1505
1506 /*
1507 * Cannot check freepointer while object is allocated if
1508 * object and freepointer overlap.
1509 */
1510 if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
1511 !check_valid_pointer(s, slab, get_freepointer(s, p))) {
1512 object_err(s, slab, p, "Freepointer corrupt");
1513 /*
1514 * No choice but to zap it and thus lose the remainder
1515 * of the free objects in this slab. May cause
1516 * another error because the object count is now wrong.
1517 */
1518 set_freepointer(s, p, NULL);
1519 ret = 0;
1520 }
1521
1522 return ret;
1523 }
1524
1525 /*
1526 * Checks if the slab state looks sane. Assumes the struct slab pointer
1527 * was either obtained in a way that ensures it's valid, or validated
1528 * by validate_slab_ptr()
1529 */
1530 static int check_slab(struct kmem_cache *s, struct slab *slab)
1531 {
1532 int maxobj;
1533
1534 maxobj = order_objects(slab_order(slab), s->size);
1535 if (slab->objects > maxobj) {
1536 slab_err(s, slab, "objects %u > max %u",
1537 slab->objects, maxobj);
1538 return 0;
1539 }
1540 if (slab->inuse > slab->objects) {
1541 slab_err(s, slab, "inuse %u > max %u",
1542 slab->inuse, slab->objects);
1543 return 0;
1544 }
1545 if (slab->frozen) {
1546 slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed");
1547 return 0;
1548 }
1549
1550 /* Slab_pad_check fixes things up after itself */
1551 slab_pad_check(s, slab);
1552 return 1;
1553 }
1554
1555 /*
1556 * Determine if a certain object in a slab is on the freelist. Must hold the
1557 * slab lock to guarantee that the chains are in a consistent state.
1558 */
1559 static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
1560 {
1561 int nr = 0;
1562 void *fp;
1563 void *object = NULL;
1564 int max_objects;
1565
1566 fp = slab->freelist;
1567 while (fp && nr <= slab->objects) {
1568 if (fp == search)
1569 return true;
1570 if (!check_valid_pointer(s, slab, fp)) {
1571 if (object) {
1572 object_err(s, slab, object,
1573 "Freechain corrupt");
1574 set_freepointer(s, object, NULL);
1575 break;
1576 } else {
1577 slab_err(s, slab, "Freepointer corrupt");
1578 slab->freelist = NULL;
1579 slab->inuse = slab->objects;
1580 slab_fix(s, "Freelist cleared");
1581 return false;
1582 }
1583 }
1584 object = fp;
1585 fp = get_freepointer(s, object);
1586 nr++;
1587 }
1588
1589 if (nr > slab->objects) {
1590 slab_err(s, slab, "Freelist cycle detected");
1591 slab->freelist = NULL;
1592 slab->inuse = slab->objects;
1593 slab_fix(s, "Freelist cleared");
1594 return false;
1595 }
1596
1597 max_objects = order_objects(slab_order(slab), s->size);
1598 if (max_objects > MAX_OBJS_PER_PAGE)
1599 max_objects = MAX_OBJS_PER_PAGE;
1600
1601 if (slab->objects != max_objects) {
1602 slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
1603 slab->objects, max_objects);
1604 slab->objects = max_objects;
1605 slab_fix(s, "Number of objects adjusted");
1606 }
1607 if (slab->inuse != slab->objects - nr) {
1608 slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
1609 slab->inuse, slab->objects - nr);
1610 slab->inuse = slab->objects - nr;
1611 slab_fix(s, "Object count adjusted");
1612 }
1613 return search == NULL;
1614 }
1615
1616 static void trace(struct kmem_cache *s, struct slab *slab, void *object,
1617 int alloc)
1618 {
1619 if (s->flags & SLAB_TRACE) {
1620 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1621 s->name,
1622 alloc ? "alloc" : "free",
1623 object, slab->inuse,
1624 slab->freelist);
1625
1626 if (!alloc)
1627 print_section(KERN_INFO, "Object ", (void *)object,
1628 s->object_size);
1629
1630 dump_stack();
1631 }
1632 }
1633
1634 /*
1635 * Tracking of fully allocated slabs for debugging purposes.
1636 */
1637 static void add_full(struct kmem_cache *s,
1638 struct kmem_cache_node *n, struct slab *slab)
1639 {
1640 if (!(s->flags & SLAB_STORE_USER))
1641 return;
1642
1643 lockdep_assert_held(&n->list_lock);
1644 list_add(&slab->slab_list, &n->full);
1645 }
1646
1647 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
1648 {
1649 if (!(s->flags & SLAB_STORE_USER))
1650 return;
1651
1652 lockdep_assert_held(&n->list_lock);
1653 list_del(&slab->slab_list);
1654 }
1655
1656 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1657 {
1658 return atomic_long_read(&n->nr_slabs);
1659 }
1660
1661 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1662 {
1663 struct kmem_cache_node *n = get_node(s, node);
1664
1665 atomic_long_inc(&n->nr_slabs);
1666 atomic_long_add(objects, &n->total_objects);
1667 }
1668 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1669 {
1670 struct kmem_cache_node *n = get_node(s, node);
1671
1672 atomic_long_dec(&n->nr_slabs);
1673 atomic_long_sub(objects, &n->total_objects);
1674 }
1675
1676 /* Object debug checks for alloc/free paths */
1677 static void setup_object_debug(struct kmem_cache *s, void *object)
1678 {
1679 if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
1680 return;
1681
1682 init_object(s, object, SLUB_RED_INACTIVE);
1683 init_tracking(s, object);
1684 }
1685
1686 static
1687 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
1688 {
1689 if (!kmem_cache_debug_flags(s, SLAB_POISON))
1690 return;
1691
1692 metadata_access_enable();
1693 memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
1694 metadata_access_disable();
1695 }
1696
1697 static inline int alloc_consistency_checks(struct kmem_cache *s,
1698 struct slab *slab, void *object)
1699 {
1700 if (!check_slab(s, slab))
1701 return 0;
1702
1703 if (!check_valid_pointer(s, slab, object)) {
1704 object_err(s, slab, object, "Freelist Pointer check fails");
1705 return 0;
1706 }
1707
1708 if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
1709 return 0;
1710
1711 return 1;
1712 }
1713
1714 static noinline bool alloc_debug_processing(struct kmem_cache *s,
1715 struct slab *slab, void *object, int orig_size)
1716 {
1717 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1718 if (!alloc_consistency_checks(s, slab, object))
1719 goto bad;
1720 }
1721
1722 /* Success. Perform special debug activities for allocs */
1723 trace(s, slab, object, 1);
1724 set_orig_size(s, object, orig_size);
1725 init_object(s, object, SLUB_RED_ACTIVE);
1726 return true;
1727
1728 bad:
1729 /*
1730 * Let's do the best we can to avoid issues in the future. Marking all
1731 * objects as used avoids touching the remaining objects.
1732 */
1733 slab_fix(s, "Marking all objects used");
1734 slab->inuse = slab->objects;
1735 slab->freelist = NULL;
1736 slab->frozen = 1; /* mark consistency-failed slab as frozen */
1737
1738 return false;
1739 }
1740
1741 static inline int free_consistency_checks(struct kmem_cache *s,
1742 struct slab *slab, void *object, unsigned long addr)
1743 {
1744 if (!check_valid_pointer(s, slab, object)) {
1745 slab_err(s, slab, "Invalid object pointer 0x%p", object);
1746 return 0;
1747 }
1748
1749 if (on_freelist(s, slab, object)) {
1750 object_err(s, slab, object, "Object already free");
1751 return 0;
1752 }
1753
1754 if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
1755 return 0;
1756
1757 if (unlikely(s != slab->slab_cache)) {
1758 if (!slab->slab_cache) {
1759 slab_err(NULL, slab, "No slab cache for object 0x%p",
1760 object);
1761 } else {
1762 object_err(s, slab, object,
1763 "page slab pointer corrupt.");
1764 }
1765 return 0;
1766 }
1767 return 1;
1768 }
1769
1770 /*
1771 * Parse a block of slab_debug options. Blocks are delimited by ';'
1772 *
1773 * @str: start of block
1774 * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1775 * @slabs: return start of list of slabs, or NULL when there's no list
1776 * @init: assume this is initial parsing and not per-kmem-create parsing
1777 *
1778 * returns the start of next block if there's any, or NULL
1779 */
1780 static char *
1781 parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
1782 {
1783 bool higher_order_disable = false;
1784
1785 /* Skip any completely empty blocks */
1786 while (*str && *str == ';')
1787 str++;
1788
1789 if (*str == ',') {
1790 /*
1791 * No options but restriction on slabs. This means full
1792 * debugging for slabs matching a pattern.
1793 */
1794 *flags = DEBUG_DEFAULT_FLAGS;
1795 goto check_slabs;
1796 }
1797 *flags = 0;
1798
1799 /* Determine which debug features should be switched on */
1800 for (; *str && *str != ',' && *str != ';'; str++) {
1801 switch (tolower(*str)) {
1802 case '-':
1803 *flags = 0;
1804 break;
1805 case 'f':
1806 *flags |= SLAB_CONSISTENCY_CHECKS;
1807 break;
1808 case 'z':
1809 *flags |= SLAB_RED_ZONE;
1810 break;
1811 case 'p':
1812 *flags |= SLAB_POISON;
1813 break;
1814 case 'u':
1815 *flags |= SLAB_STORE_USER;
1816 break;
1817 case 't':
1818 *flags |= SLAB_TRACE;
1819 break;
1820 case 'a':
1821 *flags |= SLAB_FAILSLAB;
1822 break;
1823 case 'o':
1824 /*
1825 * Avoid enabling debugging on caches if its minimum
1826 * order would increase as a result.
1827 */
1828 higher_order_disable = true;
1829 break;
1830 default:
1831 if (init)
1832 pr_err("slab_debug option '%c' unknown. skipped\n", *str);
1833 }
1834 }
1835 check_slabs:
1836 if (*str == ',')
1837 *slabs = ++str;
1838 else
1839 *slabs = NULL;
1840
1841 /* Skip over the slab list */
1842 while (*str && *str != ';')
1843 str++;
1844
1845 /* Skip any completely empty blocks */
1846 while (*str && *str == ';')
1847 str++;
1848
1849 if (init && higher_order_disable)
1850 disable_higher_order_debug = 1;
1851
1852 if (*str)
1853 return str;
1854 else
1855 return NULL;
1856 }
1857
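/*
 * Illustrative example only (the flag letters and cache names are just an
 * example): the boot option
 *
 *	slab_debug=FZ;P,kmalloc-64,dentry
 *
 * is parsed as two ';'-delimited blocks. The first has no slab list, so F
 * (consistency checks) and Z (red zoning) become the global defaults; the
 * second applies P (poisoning) to caches named kmalloc-64 and dentry, which
 * then use that block's flags instead of the global defaults.
 */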
1858 static int __init setup_slub_debug(char *str)
1859 {
1860 slab_flags_t flags;
1861 slab_flags_t global_flags;
1862 char *saved_str;
1863 char *slab_list;
1864 bool global_slub_debug_changed = false;
1865 bool slab_list_specified = false;
1866
1867 global_flags = DEBUG_DEFAULT_FLAGS;
1868 if (*str++ != '=' || !*str)
1869 /*
1870 * No options specified. Switch on full debugging.
1871 */
1872 goto out;
1873
1874 saved_str = str;
1875 while (str) {
1876 str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1877
1878 if (!slab_list) {
1879 global_flags = flags;
1880 global_slub_debug_changed = true;
1881 } else {
1882 slab_list_specified = true;
1883 if (flags & SLAB_STORE_USER)
1884 stack_depot_request_early_init();
1885 }
1886 }
1887
1888 /*
1889 * For backwards compatibility, a single list of flags with list of
1890 * slabs means debugging is only changed for those slabs, so the global
1891 * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1892 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
1893 * long as there is no option specifying flags without a slab list.
1894 */
1895 if (slab_list_specified) {
1896 if (!global_slub_debug_changed)
1897 global_flags = slub_debug;
1898 slub_debug_string = saved_str;
1899 }
1900 out:
1901 slub_debug = global_flags;
1902 if (slub_debug & SLAB_STORE_USER)
1903 stack_depot_request_early_init();
1904 if (slub_debug != 0 || slub_debug_string)
1905 static_branch_enable(&slub_debug_enabled);
1906 else
1907 static_branch_disable(&slub_debug_enabled);
1908 if ((static_branch_unlikely(&init_on_alloc) ||
1909 static_branch_unlikely(&init_on_free)) &&
1910 (slub_debug & SLAB_POISON))
1911 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1912 return 1;
1913 }
1914
1915 __setup("slab_debug", setup_slub_debug);
1916 __setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
1917
1918 /*
1919 * kmem_cache_flags - apply debugging options to the cache
1920 * @flags: flags to set
1921 * @name: name of the cache
1922 *
1923 * Debug option(s) are applied to @flags. In addition to the debug
1924 * option(s), if a slab name (or multiple) is specified i.e.
1925 * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1926 * then only the select slabs will receive the debug option(s).
1927 */
1928 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
1929 {
1930 char *iter;
1931 size_t len;
1932 char *next_block;
1933 slab_flags_t block_flags;
1934 slab_flags_t slub_debug_local = slub_debug;
1935
1936 if (flags & SLAB_NO_USER_FLAGS)
1937 return flags;
1938
1939 /*
1940 * If the slab cache is for debugging (e.g. kmemleak) then
1941 * don't store user (stack trace) information by default,
1942 * but let the user enable it via the command line below.
1943 */
1944 if (flags & SLAB_NOLEAKTRACE)
1945 slub_debug_local &= ~SLAB_STORE_USER;
1946
1947 len = strlen(name);
1948 next_block = slub_debug_string;
1949 /* Go through all blocks of debug options, see if any matches our slab's name */
1950 while (next_block) {
1951 next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1952 if (!iter)
1953 continue;
1954 /* Found a block that has a slab list, search it */
1955 while (*iter) {
1956 char *end, *glob;
1957 size_t cmplen;
1958
1959 end = strchrnul(iter, ',');
1960 if (next_block && next_block < end)
1961 end = next_block - 1;
1962
1963 glob = strnchr(iter, end - iter, '*');
1964 if (glob)
1965 cmplen = glob - iter;
1966 else
1967 cmplen = max_t(size_t, len, (end - iter));
1968
1969 if (!strncmp(name, iter, cmplen)) {
1970 flags |= block_flags;
1971 return flags;
1972 }
1973
1974 if (!*end || *end == ';')
1975 break;
1976 iter = end + 1;
1977 }
1978 }
1979
1980 return flags | slub_debug_local;
1981 }
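/*
 * Illustrative example: with slab_debug=U,kmalloc-* (pattern chosen only as
 * an example), the '*' stops the comparison at the "kmalloc-" prefix, so any
 * cache whose name starts with "kmalloc-" gets SLAB_STORE_USER in addition
 * to the flags it was created with.
 */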
1982 #else /* !CONFIG_SLUB_DEBUG */
1983 static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
1984 static inline
1985 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
1986
1987 static inline bool alloc_debug_processing(struct kmem_cache *s,
1988 struct slab *slab, void *object, int orig_size) { return true; }
1989
1990 static inline bool free_debug_processing(struct kmem_cache *s,
1991 struct slab *slab, void *head, void *tail, int *bulk_cnt,
1992 unsigned long addr, depot_stack_handle_t handle) { return true; }
1993
1994 static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
1995 static inline int check_object(struct kmem_cache *s, struct slab *slab,
1996 void *object, u8 val) { return 1; }
1997 static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
1998 static inline void set_track(struct kmem_cache *s, void *object,
1999 enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
2000 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
2001 struct slab *slab) {}
2002 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
2003 struct slab *slab) {}
2004 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
2005 {
2006 return flags;
2007 }
2008 #define slub_debug 0
2009
2010 #define disable_higher_order_debug 0
2011
2012 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
2013 { return 0; }
2014 static inline void inc_slabs_node(struct kmem_cache *s, int node,
2015 int objects) {}
2016 static inline void dec_slabs_node(struct kmem_cache *s, int node,
2017 int objects) {}
2018 #ifndef CONFIG_SLUB_TINY
2019 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
2020 void **freelist, void *nextfree)
2021 {
2022 return false;
2023 }
2024 #endif
2025 #endif /* CONFIG_SLUB_DEBUG */
2026
2027 #ifdef CONFIG_SLAB_OBJ_EXT
2028
2029 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
2030
2031 static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
2032 {
2033 struct slabobj_ext *slab_exts;
2034 struct slab *obj_exts_slab;
2035
2036 obj_exts_slab = virt_to_slab(obj_exts);
2037 slab_exts = slab_obj_exts(obj_exts_slab);
2038 if (slab_exts) {
2039 unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
2040 obj_exts_slab, obj_exts);
2041 /* codetag should be NULL */
2042 WARN_ON(slab_exts[offs].ref.ct);
2043 set_codetag_empty(&slab_exts[offs].ref);
2044 }
2045 }
2046
2047 static inline void mark_failed_objexts_alloc(struct slab *slab)
2048 {
2049 slab->obj_exts = OBJEXTS_ALLOC_FAIL;
2050 }
2051
2052 static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
2053 struct slabobj_ext *vec, unsigned int objects)
2054 {
2055 /*
2056 * If vector previously failed to allocate then we have live
2057 * objects with no tag reference. Mark all references in this
2058 * vector as empty to avoid warnings later on.
2059 */
2060 if (obj_exts == OBJEXTS_ALLOC_FAIL) {
2061 unsigned int i;
2062
2063 for (i = 0; i < objects; i++)
2064 set_codetag_empty(&vec[i].ref);
2065 }
2066 }
2067
2068 #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
2069
2070 static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
2071 static inline void mark_failed_objexts_alloc(struct slab *slab) {}
2072 static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
2073 struct slabobj_ext *vec, unsigned int objects) {}
2074
2075 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
2076
2077 /*
2078 * The allocated objcg pointers array is not accounted directly.
2079 * Moreover, it should not come from a DMA buffer and is not readily
2080 * reclaimable. So those GFP bits should be masked off.
2081 */
2082 #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
2083 __GFP_ACCOUNT | __GFP_NOFAIL)
2084
2085 static inline void init_slab_obj_exts(struct slab *slab)
2086 {
2087 slab->obj_exts = 0;
2088 }
2089
2090 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
2091 gfp_t gfp, bool new_slab)
2092 {
2093 bool allow_spin = gfpflags_allow_spinning(gfp);
2094 unsigned int objects = objs_per_slab(s, slab);
2095 unsigned long new_exts;
2096 unsigned long old_exts;
2097 struct slabobj_ext *vec;
2098
2099 gfp &= ~OBJCGS_CLEAR_MASK;
2100 /* Prevent recursive extension vector allocation */
2101 gfp |= __GFP_NO_OBJ_EXT;
2102
2103 /*
2104 * Note that allow_spin may be false during early boot, with its
2105 * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
2106 * architectures with cmpxchg16b, early obj_exts will be missing for
2107 * very early allocations on those.
2108 */
2109 if (unlikely(!allow_spin)) {
2110 size_t sz = objects * sizeof(struct slabobj_ext);
2111
2112 vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
2113 slab_nid(slab));
2114 } else {
2115 vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
2116 slab_nid(slab));
2117 }
2118 if (!vec) {
2119 /* Mark vectors which failed to allocate */
2120 mark_failed_objexts_alloc(slab);
2121
2122 return -ENOMEM;
2123 }
2124
2125 new_exts = (unsigned long)vec;
2126 if (unlikely(!allow_spin))
2127 new_exts |= OBJEXTS_NOSPIN_ALLOC;
2128 #ifdef CONFIG_MEMCG
2129 new_exts |= MEMCG_DATA_OBJEXTS;
2130 #endif
2131 old_exts = READ_ONCE(slab->obj_exts);
2132 handle_failed_objexts_alloc(old_exts, vec, objects);
2133 if (new_slab) {
2134 /*
2135 * If the slab is brand new and nobody can yet access its
2136 * obj_exts, no synchronization is required and obj_exts can
2137 * be simply assigned.
2138 */
2139 slab->obj_exts = new_exts;
2140 } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) ||
2141 cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
2142 /*
2143 * If the slab is already in use, somebody can allocate and
2144 * assign slabobj_exts in parallel. In this case the existing
2145 * objcg vector should be reused.
2146 */
2147 mark_objexts_empty(vec);
2148 if (unlikely(!allow_spin))
2149 kfree_nolock(vec);
2150 else
2151 kfree(vec);
2152 return 0;
2153 }
2154
2155 kmemleak_not_leak(vec);
2156 return 0;
2157 }
2158
2159 static inline void free_slab_obj_exts(struct slab *slab)
2160 {
2161 struct slabobj_ext *obj_exts;
2162
2163 obj_exts = slab_obj_exts(slab);
2164 if (!obj_exts)
2165 return;
2166
2167 /*
2168 * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
2169 * corresponding extension will be NULL. alloc_tag_sub() will throw a
2170 * warning if slab has extensions but the extension of an object is
2171 * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
2172 * the extension for obj_exts is expected to be NULL.
2173 */
2174 mark_objexts_empty(obj_exts);
2175 if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
2176 kfree_nolock(obj_exts);
2177 else
2178 kfree(obj_exts);
2179 slab->obj_exts = 0;
2180 }
2181
2182 #else /* CONFIG_SLAB_OBJ_EXT */
2183
2184 static inline void init_slab_obj_exts(struct slab *slab)
2185 {
2186 }
2187
2188 static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
2189 gfp_t gfp, bool new_slab)
2190 {
2191 return 0;
2192 }
2193
2194 static inline void free_slab_obj_exts(struct slab *slab)
2195 {
2196 }
2197
2198 #endif /* CONFIG_SLAB_OBJ_EXT */
2199
2200 #ifdef CONFIG_MEM_ALLOC_PROFILING
2201
2202 static inline struct slabobj_ext *
2203 prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
2204 {
2205 struct slab *slab;
2206
2207 slab = virt_to_slab(p);
2208 if (!slab_obj_exts(slab) &&
2209 alloc_slab_obj_exts(slab, s, flags, false)) {
2210 pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
2211 __func__, s->name);
2212 return NULL;
2213 }
2214
2215 return slab_obj_exts(slab) + obj_to_index(s, slab, p);
2216 }
2217
2218 /* Should be called only if mem_alloc_profiling_enabled() */
2219 static noinline void
2220 __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2221 {
2222 struct slabobj_ext *obj_exts;
2223
2224 if (!object)
2225 return;
2226
2227 if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2228 return;
2229
2230 if (flags & __GFP_NO_OBJ_EXT)
2231 return;
2232
2233 obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
2234 /*
2235 * Currently obj_exts is used only for allocation profiling.
2236 * If other users appear then mem_alloc_profiling_enabled()
2237 * check should be added before alloc_tag_add().
2238 */
2239 if (likely(obj_exts))
2240 alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
2241 else
2242 alloc_tag_set_inaccurate(current->alloc_tag);
2243 }
2244
2245 static inline void
2246 alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2247 {
2248 if (mem_alloc_profiling_enabled())
2249 __alloc_tagging_slab_alloc_hook(s, object, flags);
2250 }
2251
2252 /* Should be called only if mem_alloc_profiling_enabled() */
2253 static noinline void
2254 __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2255 int objects)
2256 {
2257 struct slabobj_ext *obj_exts;
2258 int i;
2259
2260 /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
2261 if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2262 return;
2263
2264 obj_exts = slab_obj_exts(slab);
2265 if (!obj_exts)
2266 return;
2267
2268 for (i = 0; i < objects; i++) {
2269 unsigned int off = obj_to_index(s, slab, p[i]);
2270
2271 alloc_tag_sub(&obj_exts[off].ref, s->size);
2272 }
2273 }
2274
2275 static inline void
2276 alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2277 int objects)
2278 {
2279 if (mem_alloc_profiling_enabled())
2280 __alloc_tagging_slab_free_hook(s, slab, p, objects);
2281 }
2282
2283 #else /* CONFIG_MEM_ALLOC_PROFILING */
2284
2285 static inline void
2286 alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2287 {
2288 }
2289
2290 static inline void
2291 alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2292 int objects)
2293 {
2294 }
2295
2296 #endif /* CONFIG_MEM_ALLOC_PROFILING */
2297
2298
2299 #ifdef CONFIG_MEMCG
2300
2301 static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
2302
2303 static __fastpath_inline
2304 bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2305 gfp_t flags, size_t size, void **p)
2306 {
2307 if (likely(!memcg_kmem_online()))
2308 return true;
2309
2310 if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
2311 return true;
2312
2313 if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
2314 return true;
2315
2316 if (likely(size == 1)) {
2317 memcg_alloc_abort_single(s, *p);
2318 *p = NULL;
2319 } else {
2320 kmem_cache_free_bulk(s, size, p);
2321 }
2322
2323 return false;
2324 }
2325
2326 static __fastpath_inline
2327 void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2328 int objects)
2329 {
2330 struct slabobj_ext *obj_exts;
2331
2332 if (!memcg_kmem_online())
2333 return;
2334
2335 obj_exts = slab_obj_exts(slab);
2336 if (likely(!obj_exts))
2337 return;
2338
2339 __memcg_slab_free_hook(s, slab, p, objects, obj_exts);
2340 }
2341
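/*
 * Charge an already allocated object (or, for a large kmalloc allocation,
 * its folio) to the current memcg after the fact. Returns true when the
 * allocation is charged or needs no charging, false if charging the backing
 * page failed.
 */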
2342 static __fastpath_inline
2343 bool memcg_slab_post_charge(void *p, gfp_t flags)
2344 {
2345 struct slabobj_ext *slab_exts;
2346 struct kmem_cache *s;
2347 struct folio *folio;
2348 struct slab *slab;
2349 unsigned long off;
2350
2351 folio = virt_to_folio(p);
2352 if (!folio_test_slab(folio)) {
2353 int size;
2354
2355 if (folio_memcg_kmem(folio))
2356 return true;
2357
2358 if (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
2359 folio_order(folio)))
2360 return false;
2361
2362 /*
2363 * This folio has already been accounted in the global stats but
2364 * not in the memcg stats. So, subtract from the global and use
2365 * the interface which adds to both global and memcg stats.
2366 */
2367 size = folio_size(folio);
2368 node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size);
2369 lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size);
2370 return true;
2371 }
2372
2373 slab = folio_slab(folio);
2374 s = slab->slab_cache;
2375
2376 /*
2377 * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
2378 * of slab_obj_exts being allocated from the same slab and thus the slab
2379 * becoming effectively unfreeable.
2380 */
2381 if (is_kmalloc_normal(s))
2382 return true;
2383
2384 /* Ignore already charged objects. */
2385 slab_exts = slab_obj_exts(slab);
2386 if (slab_exts) {
2387 off = obj_to_index(s, slab, p);
2388 if (unlikely(slab_exts[off].objcg))
2389 return true;
2390 }
2391
2392 return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
2393 }
2394
2395 #else /* CONFIG_MEMCG */
2396 static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
2397 struct list_lru *lru,
2398 gfp_t flags, size_t size,
2399 void **p)
2400 {
2401 return true;
2402 }
2403
2404 static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
2405 void **p, int objects)
2406 {
2407 }
2408
2409 static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
2410 {
2411 return true;
2412 }
2413 #endif /* CONFIG_MEMCG */
2414
2415 #ifdef CONFIG_SLUB_RCU_DEBUG
2416 static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
2417
2418 struct rcu_delayed_free {
2419 struct rcu_head head;
2420 void *object;
2421 };
2422 #endif
2423
2424 /*
2425 * Hooks for other subsystems that check memory allocations. In a typical
2426 * production configuration these hooks should all produce no code at all.
2427 *
2428 * Returns true if freeing of the object can proceed, false if its reuse
2429 * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
2430 * to KFENCE.
2431 */
2432 static __always_inline
2433 bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
2434 bool after_rcu_delay)
2435 {
2436 /* Are the object contents still accessible? */
2437 bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;
2438
2439 kmemleak_free_recursive(x, s->flags);
2440 kmsan_slab_free(s, x);
2441
2442 debug_check_no_locks_freed(x, s->object_size);
2443
2444 if (!(s->flags & SLAB_DEBUG_OBJECTS))
2445 debug_check_no_obj_freed(x, s->object_size);
2446
2447 /* Use KCSAN to help debug racy use-after-free. */
2448 if (!still_accessible)
2449 __kcsan_check_access(x, s->object_size,
2450 KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
2451
2452 if (kfence_free(x))
2453 return false;
2454
2455 /*
2456 * Give KASAN a chance to notice an invalid free operation before we
2457 * modify the object.
2458 */
2459 if (kasan_slab_pre_free(s, x))
2460 return false;
2461
2462 #ifdef CONFIG_SLUB_RCU_DEBUG
2463 if (still_accessible) {
2464 struct rcu_delayed_free *delayed_free;
2465
2466 delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
2467 if (delayed_free) {
2468 /*
2469 * Let KASAN track our call stack as a "related work
2470 * creation", just like if the object had been freed
2471 * normally via kfree_rcu().
2472 * We have to do this manually because the rcu_head is
2473 * not located inside the object.
2474 */
2475 kasan_record_aux_stack(x);
2476
2477 delayed_free->object = x;
2478 call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
2479 return false;
2480 }
2481 }
2482 #endif /* CONFIG_SLUB_RCU_DEBUG */
2483
2484 /*
2485 * As memory initialization might be integrated into KASAN,
2486 * kasan_slab_free and initialization memset's must be
2487 * kept together to avoid discrepancies in behavior.
2488 *
2489 * The initialization memset's clear the object and the metadata,
2490 * but don't touch the SLAB redzone.
2491 *
2492 * The object's freepointer is also avoided if stored outside the
2493 * object.
2494 */
2495 if (unlikely(init)) {
2496 int rsize;
2497 unsigned int inuse, orig_size;
2498
2499 inuse = get_info_end(s);
2500 orig_size = get_orig_size(s, x);
2501 if (!kasan_has_integrated_init())
2502 memset(kasan_reset_tag(x), 0, orig_size);
2503 rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
2504 memset((char *)kasan_reset_tag(x) + inuse, 0,
2505 s->size - inuse - rsize);
2506 /*
2507 * Restore orig_size, otherwise a kmalloc redzone overwrite
2508 * would be reported
2509 */
2510 set_orig_size(s, x, orig_size);
2511
2512 }
2513 /* KASAN might put x into memory quarantine, delaying its reuse. */
2514 return !kasan_slab_free(s, x, init, still_accessible, false);
2515 }
2516
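/*
 * Run slab_free_hook() on every object of the detached freelist [*head,
 * *tail] and rebuild the list with only the objects that can be freed right
 * away; *cnt is decremented for each object whose reuse is delayed. Returns
 * false when nothing is left to free immediately.
 */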
2517 static __fastpath_inline
2518 bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
2519 int *cnt)
2520 {
2521
2522 void *object;
2523 void *next = *head;
2524 void *old_tail = *tail;
2525 bool init;
2526
2527 if (is_kfence_address(next)) {
2528 slab_free_hook(s, next, false, false);
2529 return false;
2530 }
2531
2532 /* Head and tail of the reconstructed freelist */
2533 *head = NULL;
2534 *tail = NULL;
2535
2536 init = slab_want_init_on_free(s);
2537
2538 do {
2539 object = next;
2540 next = get_freepointer(s, object);
2541
2542 /* If object's reuse doesn't have to be delayed */
2543 if (likely(slab_free_hook(s, object, init, false))) {
2544 /* Move object to the new freelist */
2545 set_freepointer(s, object, *head);
2546 *head = object;
2547 if (!*tail)
2548 *tail = object;
2549 } else {
2550 /*
2551 * Adjust the reconstructed freelist depth
2552 * accordingly if object's reuse is delayed.
2553 */
2554 --(*cnt);
2555 }
2556 } while (object != old_tail);
2557
2558 return *head != NULL;
2559 }
2560
2561 static void *setup_object(struct kmem_cache *s, void *object)
2562 {
2563 setup_object_debug(s, object);
2564 object = kasan_init_slab_obj(s, object);
2565 if (unlikely(s->ctor)) {
2566 kasan_unpoison_new_object(s, object);
2567 s->ctor(object);
2568 kasan_poison_new_object(s, object);
2569 }
2570 return object;
2571 }
2572
2573 static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
2574 {
2575 struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects,
2576 s->sheaf_capacity), gfp);
2577
2578 if (unlikely(!sheaf))
2579 return NULL;
2580
2581 sheaf->cache = s;
2582
2583 stat(s, SHEAF_ALLOC);
2584
2585 return sheaf;
2586 }
2587
2588 static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
2589 {
2590 kfree(sheaf);
2591
2592 stat(s, SHEAF_FREE);
2593 }
2594
2595 static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
2596 size_t size, void **p);
2597
2598
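/*
 * Fill the sheaf up to s->sheaf_capacity with a bulk allocation. Returns 0
 * when the sheaf ends up full, or -ENOMEM when only some of the requested
 * objects could be allocated; those stay in the sheaf either way.
 */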
2599 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
2600 gfp_t gfp)
2601 {
2602 int to_fill = s->sheaf_capacity - sheaf->size;
2603 int filled;
2604
2605 if (!to_fill)
2606 return 0;
2607
2608 filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
2609 &sheaf->objects[sheaf->size]);
2610
2611 sheaf->size += filled;
2612
2613 stat_add(s, SHEAF_REFILL, filled);
2614
2615 if (filled < to_fill)
2616 return -ENOMEM;
2617
2618 return 0;
2619 }
2620
2621
2622 static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
2623 {
2624 struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
2625
2626 if (!sheaf)
2627 return NULL;
2628
2629 if (refill_sheaf(s, sheaf, gfp)) {
2630 free_empty_sheaf(s, sheaf);
2631 return NULL;
2632 }
2633
2634 return sheaf;
2635 }
2636
2637 /*
2638 * Maximum number of objects freed during a single flush of main pcs sheaf.
2639 * Translates directly to an on-stack array size.
2640 */
2641 #define PCS_BATCH_MAX 32U
2642
2643 static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
2644
2645 /*
2646 * Free all objects from the main sheaf. In order to perform
2647 * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
2648 * object pointers are moved to an on-stack array under the lock. To bound the
2649 * stack usage, limit each batch to PCS_BATCH_MAX.
2650 *
2651 * returns true if at least partially flushed
2652 */
2653 static bool sheaf_flush_main(struct kmem_cache *s)
2654 {
2655 struct slub_percpu_sheaves *pcs;
2656 unsigned int batch, remaining;
2657 void *objects[PCS_BATCH_MAX];
2658 struct slab_sheaf *sheaf;
2659 bool ret = false;
2660
2661 next_batch:
2662 if (!local_trylock(&s->cpu_sheaves->lock))
2663 return ret;
2664
2665 pcs = this_cpu_ptr(s->cpu_sheaves);
2666 sheaf = pcs->main;
2667
2668 batch = min(PCS_BATCH_MAX, sheaf->size);
2669
2670 sheaf->size -= batch;
2671 memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
2672
2673 remaining = sheaf->size;
2674
2675 local_unlock(&s->cpu_sheaves->lock);
2676
2677 __kmem_cache_free_bulk(s, batch, &objects[0]);
2678
2679 stat_add(s, SHEAF_FLUSH, batch);
2680
2681 ret = true;
2682
2683 if (remaining)
2684 goto next_batch;
2685
2686 return ret;
2687 }
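/*
 * For example, flushing a main sheaf that holds 80 objects typically takes
 * three trylock-protected batches of 32, 32 and 16 objects, so no more than
 * PCS_BATCH_MAX object pointers ever occupy the stack at once.
 */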
2688
2689 /*
2690 * Free all objects from a sheaf that's unused, i.e. not linked to any
2691 * cpu_sheaves, so no locking or batching is needed. The locking is also not
2692 * necessary when flushing cpu's sheaves (both spare and main) during cpu
2693 * hotremove as the cpu is not executing anymore.
2694 */
2695 static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
2696 {
2697 if (!sheaf->size)
2698 return;
2699
2700 stat_add(s, SHEAF_FLUSH, sheaf->size);
2701
2702 __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
2703
2704 sheaf->size = 0;
2705 }
2706
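/*
 * Run the free hooks for all objects in a sheaf being freed after an RCU
 * grace period. Objects whose reuse must be delayed further (e.g. by KASAN
 * quarantine) are dropped from the sheaf by swapping in the current last
 * object, so only the remaining objects get bulk freed afterwards.
 */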
2707 static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
2708 struct slab_sheaf *sheaf)
2709 {
2710 bool init = slab_want_init_on_free(s);
2711 void **p = &sheaf->objects[0];
2712 unsigned int i = 0;
2713
2714 while (i < sheaf->size) {
2715 struct slab *slab = virt_to_slab(p[i]);
2716
2717 memcg_slab_free_hook(s, slab, p + i, 1);
2718 alloc_tagging_slab_free_hook(s, slab, p + i, 1);
2719
2720 if (unlikely(!slab_free_hook(s, p[i], init, true))) {
2721 p[i] = p[--sheaf->size];
2722 continue;
2723 }
2724
2725 i++;
2726 }
2727 }
2728
2729 static void rcu_free_sheaf_nobarn(struct rcu_head *head)
2730 {
2731 struct slab_sheaf *sheaf;
2732 struct kmem_cache *s;
2733
2734 sheaf = container_of(head, struct slab_sheaf, rcu_head);
2735 s = sheaf->cache;
2736
2737 __rcu_free_sheaf_prepare(s, sheaf);
2738
2739 sheaf_flush_unused(s, sheaf);
2740
2741 free_empty_sheaf(s, sheaf);
2742 }
2743
2744 /*
2745 * The caller needs to make sure migration is disabled in order to fully flush
2746 * a single cpu's sheaves
2747 *
2748 * must not be called from an irq
2749 *
2750 * flushing operations are rare so let's keep it simple and flush to slabs
2751 * directly, skipping the barn
2752 */
2753 static void pcs_flush_all(struct kmem_cache *s)
2754 {
2755 struct slub_percpu_sheaves *pcs;
2756 struct slab_sheaf *spare, *rcu_free;
2757
2758 local_lock(&s->cpu_sheaves->lock);
2759 pcs = this_cpu_ptr(s->cpu_sheaves);
2760
2761 spare = pcs->spare;
2762 pcs->spare = NULL;
2763
2764 rcu_free = pcs->rcu_free;
2765 pcs->rcu_free = NULL;
2766
2767 local_unlock(&s->cpu_sheaves->lock);
2768
2769 if (spare) {
2770 sheaf_flush_unused(s, spare);
2771 free_empty_sheaf(s, spare);
2772 }
2773
2774 if (rcu_free)
2775 call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
2776
2777 sheaf_flush_main(s);
2778 }
2779
2780 static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
2781 {
2782 struct slub_percpu_sheaves *pcs;
2783
2784 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
2785
2786 /* The cpu is not executing anymore so we don't need pcs->lock */
2787 sheaf_flush_unused(s, pcs->main);
2788 if (pcs->spare) {
2789 sheaf_flush_unused(s, pcs->spare);
2790 free_empty_sheaf(s, pcs->spare);
2791 pcs->spare = NULL;
2792 }
2793
2794 if (pcs->rcu_free) {
2795 call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
2796 pcs->rcu_free = NULL;
2797 }
2798 }
2799
2800 static void pcs_destroy(struct kmem_cache *s)
2801 {
2802 int cpu;
2803
2804 for_each_possible_cpu(cpu) {
2805 struct slub_percpu_sheaves *pcs;
2806
2807 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
2808
2809 /* can happen when unwinding failed create */
2810 if (!pcs->main)
2811 continue;
2812
2813 /*
2814 * We have already passed __kmem_cache_shutdown() so everything
2815 * was flushed and there should be no objects allocated from
2816 * slabs, otherwise kmem_cache_destroy() would have aborted.
2817 * Therefore something would have to be really wrong if the
2818 * warnings here trigger, and we should rather leave objects and
2819 * sheaves to leak in that case.
2820 */
2821
2822 WARN_ON(pcs->spare);
2823 WARN_ON(pcs->rcu_free);
2824
2825 if (!WARN_ON(pcs->main->size)) {
2826 free_empty_sheaf(s, pcs->main);
2827 pcs->main = NULL;
2828 }
2829 }
2830
2831 free_percpu(s->cpu_sheaves);
2832 s->cpu_sheaves = NULL;
2833 }
2834
2835 static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
2836 {
2837 struct slab_sheaf *empty = NULL;
2838 unsigned long flags;
2839
2840 if (!data_race(barn->nr_empty))
2841 return NULL;
2842
2843 spin_lock_irqsave(&barn->lock, flags);
2844
2845 if (likely(barn->nr_empty)) {
2846 empty = list_first_entry(&barn->sheaves_empty,
2847 struct slab_sheaf, barn_list);
2848 list_del(&empty->barn_list);
2849 barn->nr_empty--;
2850 }
2851
2852 spin_unlock_irqrestore(&barn->lock, flags);
2853
2854 return empty;
2855 }
2856
2857 /*
2858 * The following two functions are used mainly in cases where we have to undo an
2859 * intended action due to a race or cpu migration. Thus they do not check the
2860 * empty or full sheaf limits for simplicity.
2861 */
2862
2863 static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
2864 {
2865 unsigned long flags;
2866
2867 spin_lock_irqsave(&barn->lock, flags);
2868
2869 list_add(&sheaf->barn_list, &barn->sheaves_empty);
2870 barn->nr_empty++;
2871
2872 spin_unlock_irqrestore(&barn->lock, flags);
2873 }
2874
2875 static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
2876 {
2877 unsigned long flags;
2878
2879 spin_lock_irqsave(&barn->lock, flags);
2880
2881 list_add(&sheaf->barn_list, &barn->sheaves_full);
2882 barn->nr_full++;
2883
2884 spin_unlock_irqrestore(&barn->lock, flags);
2885 }
2886
2887 static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
2888 {
2889 struct slab_sheaf *sheaf = NULL;
2890 unsigned long flags;
2891
2892 if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
2893 return NULL;
2894
2895 spin_lock_irqsave(&barn->lock, flags);
2896
2897 if (barn->nr_full) {
2898 sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
2899 barn_list);
2900 list_del(&sheaf->barn_list);
2901 barn->nr_full--;
2902 } else if (barn->nr_empty) {
2903 sheaf = list_first_entry(&barn->sheaves_empty,
2904 struct slab_sheaf, barn_list);
2905 list_del(&sheaf->barn_list);
2906 barn->nr_empty--;
2907 }
2908
2909 spin_unlock_irqrestore(&barn->lock, flags);
2910
2911 return sheaf;
2912 }
2913
2914 /*
2915 * If a full sheaf is available, return it and put the supplied empty one to
2916 * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
2917 * change.
2918 */
2919 static struct slab_sheaf *
2920 barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
2921 {
2922 struct slab_sheaf *full = NULL;
2923 unsigned long flags;
2924
2925 if (!data_race(barn->nr_full))
2926 return NULL;
2927
2928 spin_lock_irqsave(&barn->lock, flags);
2929
2930 if (likely(barn->nr_full)) {
2931 full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
2932 barn_list);
2933 list_del(&full->barn_list);
2934 list_add(&empty->barn_list, &barn->sheaves_empty);
2935 barn->nr_full--;
2936 barn->nr_empty++;
2937 }
2938
2939 spin_unlock_irqrestore(&barn->lock, flags);
2940
2941 return full;
2942 }
2943
2944 /*
2945 * If an empty sheaf is available, return it and put the supplied full one to
2946 * barn. But if there are too many full sheaves, reject this with -E2BIG.
2947 */
2948 static struct slab_sheaf *
2949 barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
2950 {
2951 struct slab_sheaf *empty;
2952 unsigned long flags;
2953
2954 /* we don't repeat this check under barn->lock as it's not critical */
2955 if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
2956 return ERR_PTR(-E2BIG);
2957 if (!data_race(barn->nr_empty))
2958 return ERR_PTR(-ENOMEM);
2959
2960 spin_lock_irqsave(&barn->lock, flags);
2961
2962 if (likely(barn->nr_empty)) {
2963 empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
2964 barn_list);
2965 list_del(&empty->barn_list);
2966 list_add(&full->barn_list, &barn->sheaves_full);
2967 barn->nr_empty--;
2968 barn->nr_full++;
2969 } else {
2970 empty = ERR_PTR(-ENOMEM);
2971 }
2972
2973 spin_unlock_irqrestore(&barn->lock, flags);
2974
2975 return empty;
2976 }
2977
2978 static void barn_init(struct node_barn *barn)
2979 {
2980 spin_lock_init(&barn->lock);
2981 INIT_LIST_HEAD(&barn->sheaves_full);
2982 INIT_LIST_HEAD(&barn->sheaves_empty);
2983 barn->nr_full = 0;
2984 barn->nr_empty = 0;
2985 }
2986
2987 static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
2988 {
2989 struct list_head empty_list;
2990 struct list_head full_list;
2991 struct slab_sheaf *sheaf, *sheaf2;
2992 unsigned long flags;
2993
2994 INIT_LIST_HEAD(&empty_list);
2995 INIT_LIST_HEAD(&full_list);
2996
2997 spin_lock_irqsave(&barn->lock, flags);
2998
2999 list_splice_init(&barn->sheaves_full, &full_list);
3000 barn->nr_full = 0;
3001 list_splice_init(&barn->sheaves_empty, &empty_list);
3002 barn->nr_empty = 0;
3003
3004 spin_unlock_irqrestore(&barn->lock, flags);
3005
3006 list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
3007 sheaf_flush_unused(s, sheaf);
3008 free_empty_sheaf(s, sheaf);
3009 }
3010
3011 list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
3012 free_empty_sheaf(s, sheaf);
3013 }
3014
3015 /*
3016 * Slab allocation and freeing
3017 */
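/*
 * Allocate the backing pages for a new slab. Callers that cannot spin
 * (!allow_spin) get the non-blocking frozen-page allocator; otherwise the
 * node-aware or node-agnostic frozen-page allocation is used depending on
 * @node.
 */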
3018 static inline struct slab *alloc_slab_page(gfp_t flags, int node,
3019 struct kmem_cache_order_objects oo,
3020 bool allow_spin)
3021 {
3022 struct folio *folio;
3023 struct slab *slab;
3024 unsigned int order = oo_order(oo);
3025
3026 if (unlikely(!allow_spin))
3027 folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
3028 node, order);
3029 else if (node == NUMA_NO_NODE)
3030 folio = (struct folio *)alloc_frozen_pages(flags, order);
3031 else
3032 folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
3033
3034 if (!folio)
3035 return NULL;
3036
3037 slab = folio_slab(folio);
3038 __folio_set_slab(folio);
3039 if (folio_is_pfmemalloc(folio))
3040 slab_set_pfmemalloc(slab);
3041
3042 return slab;
3043 }
3044
3045 #ifdef CONFIG_SLAB_FREELIST_RANDOM
3046 /* Pre-initialize the random sequence cache */
3047 static int init_cache_random_seq(struct kmem_cache *s)
3048 {
3049 unsigned int count = oo_objects(s->oo);
3050 int err;
3051
3052 /* Bailout if already initialised */
3053 if (s->random_seq)
3054 return 0;
3055
3056 err = cache_random_seq_create(s, count, GFP_KERNEL);
3057 if (err) {
3058 pr_err("SLUB: Unable to initialize free list for %s\n",
3059 s->name);
3060 return err;
3061 }
3062
3063 /* Transform to an offset on the set of pages */
3064 if (s->random_seq) {
3065 unsigned int i;
3066
3067 for (i = 0; i < count; i++)
3068 s->random_seq[i] *= s->size;
3069 }
3070 return 0;
3071 }
3072
3073 /* Initialize each random sequence freelist per cache */
3074 static void __init init_freelist_randomization(void)
3075 {
3076 struct kmem_cache *s;
3077
3078 mutex_lock(&slab_mutex);
3079
3080 list_for_each_entry(s, &slab_caches, list)
3081 init_cache_random_seq(s);
3082
3083 mutex_unlock(&slab_mutex);
3084 }
3085
3086 /* Get the next entry on the pre-computed freelist randomized */
3087 static void *next_freelist_entry(struct kmem_cache *s,
3088 unsigned long *pos, void *start,
3089 unsigned long page_limit,
3090 unsigned long freelist_count)
3091 {
3092 unsigned int idx;
3093
3094 /*
3095 * If the target page allocation failed, the number of objects on the
3096 * page might be smaller than the usual size defined by the cache.
3097 */
3098 do {
3099 idx = s->random_seq[*pos];
3100 *pos += 1;
3101 if (*pos >= freelist_count)
3102 *pos = 0;
3103 } while (unlikely(idx >= page_limit));
3104
3105 return (char *)start + idx;
3106 }
3107
3108 /* Shuffle the single linked freelist based on a random pre-computed sequence */
3109 static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
3110 {
3111 void *start;
3112 void *cur;
3113 void *next;
3114 unsigned long idx, pos, page_limit, freelist_count;
3115
3116 if (slab->objects < 2 || !s->random_seq)
3117 return false;
3118
3119 freelist_count = oo_objects(s->oo);
3120 pos = get_random_u32_below(freelist_count);
3121
3122 page_limit = slab->objects * s->size;
3123 start = fixup_red_left(s, slab_address(slab));
3124
3125 /* First entry is used as the base of the freelist */
3126 cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
3127 cur = setup_object(s, cur);
3128 slab->freelist = cur;
3129
3130 for (idx = 1; idx < slab->objects; idx++) {
3131 next = next_freelist_entry(s, &pos, start, page_limit,
3132 freelist_count);
3133 next = setup_object(s, next);
3134 set_freepointer(s, cur, next);
3135 cur = next;
3136 }
3137 set_freepointer(s, cur, NULL);
3138
3139 return true;
3140 }
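/*
 * Illustrative example (sizes made up): for a cache with s->size == 64 and a
 * precomputed random_seq of {3, 0, 2, 1}, init_cache_random_seq() scales the
 * entries to byte offsets {192, 0, 128, 64}. shuffle_freelist() then walks
 * this sequence from a random starting position and links the objects at
 * those offsets from the slab base, instead of in address order.
 */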
3141 #else
3142 static inline int init_cache_random_seq(struct kmem_cache *s)
3143 {
3144 return 0;
3145 }
3146 static inline void init_freelist_randomization(void) { }
3147 static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
3148 {
3149 return false;
3150 }
3151 #endif /* CONFIG_SLAB_FREELIST_RANDOM */
3152
3153 static __always_inline void account_slab(struct slab *slab, int order,
3154 struct kmem_cache *s, gfp_t gfp)
3155 {
3156 if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
3157 alloc_slab_obj_exts(slab, s, gfp, true);
3158
3159 mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
3160 PAGE_SIZE << order);
3161 }
3162
3163 static __always_inline void unaccount_slab(struct slab *slab, int order,
3164 struct kmem_cache *s)
3165 {
3166 /*
3167 * The slab object extensions should now be freed regardless of
3168 * whether mem_alloc_profiling_enabled() is true, because profiling
3169 * might have been disabled after slab->obj_exts got allocated.
3170 */
3171 free_slab_obj_exts(slab);
3172
3173 mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
3174 -(PAGE_SIZE << order));
3175 }
3176
3177 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
3178 {
3179 bool allow_spin = gfpflags_allow_spinning(flags);
3180 struct slab *slab;
3181 struct kmem_cache_order_objects oo = s->oo;
3182 gfp_t alloc_gfp;
3183 void *start, *p, *next;
3184 int idx;
3185 bool shuffle;
3186
3187 flags &= gfp_allowed_mask;
3188
3189 flags |= s->allocflags;
3190
3191 /*
3192 * Let the initial higher-order allocation fail under memory pressure
3193 * so we fall back to the minimum order allocation.
3194 */
3195 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
3196 if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
3197 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
3198
3199 /*
3200 * __GFP_RECLAIM could be cleared on the first allocation attempt,
3201 * so pass allow_spin flag directly.
3202 */
3203 slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
3204 if (unlikely(!slab)) {
3205 oo = s->min;
3206 alloc_gfp = flags;
3207 /*
3208 * Allocation may have failed due to fragmentation.
3209 * Try a lower order alloc if possible
3210 */
3211 slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
3212 if (unlikely(!slab))
3213 return NULL;
3214 stat(s, ORDER_FALLBACK);
3215 }
3216
3217 slab->objects = oo_objects(oo);
3218 slab->inuse = 0;
3219 slab->frozen = 0;
3220 init_slab_obj_exts(slab);
3221
3222 account_slab(slab, oo_order(oo), s, flags);
3223
3224 slab->slab_cache = s;
3225
3226 kasan_poison_slab(slab);
3227
3228 start = slab_address(slab);
3229
3230 setup_slab_debug(s, slab, start);
3231
3232 shuffle = shuffle_freelist(s, slab);
3233
3234 if (!shuffle) {
3235 start = fixup_red_left(s, start);
3236 start = setup_object(s, start);
3237 slab->freelist = start;
3238 for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
3239 next = p + s->size;
3240 next = setup_object(s, next);
3241 set_freepointer(s, p, next);
3242 p = next;
3243 }
3244 set_freepointer(s, p, NULL);
3245 }
3246
3247 return slab;
3248 }
3249
3250 static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
3251 {
3252 if (unlikely(flags & GFP_SLAB_BUG_MASK))
3253 flags = kmalloc_fix_flags(flags);
3254
3255 WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
3256
3257 return allocate_slab(s,
3258 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
3259 }
3260
3261 static void __free_slab(struct kmem_cache *s, struct slab *slab)
3262 {
3263 struct folio *folio = slab_folio(slab);
3264 int order = folio_order(folio);
3265 int pages = 1 << order;
3266
3267 __slab_clear_pfmemalloc(slab);
3268 folio->mapping = NULL;
3269 __folio_clear_slab(folio);
3270 mm_account_reclaimed_pages(pages);
3271 unaccount_slab(slab, order, s);
3272 free_frozen_pages(&folio->page, order);
3273 }
3274
3275 static void rcu_free_slab(struct rcu_head *h)
3276 {
3277 struct slab *slab = container_of(h, struct slab, rcu_head);
3278
3279 __free_slab(slab->slab_cache, slab);
3280 }
3281
3282 static void free_slab(struct kmem_cache *s, struct slab *slab)
3283 {
3284 if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
3285 void *p;
3286
3287 slab_pad_check(s, slab);
3288 for_each_object(p, s, slab_address(slab), slab->objects)
3289 check_object(s, slab, p, SLUB_RED_INACTIVE);
3290 }
3291
3292 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
3293 call_rcu(&slab->rcu_head, rcu_free_slab);
3294 else
3295 __free_slab(s, slab);
3296 }
3297
3298 static void discard_slab(struct kmem_cache *s, struct slab *slab)
3299 {
3300 dec_slabs_node(s, slab_nid(slab), slab->objects);
3301 free_slab(s, slab);
3302 }
3303
3304 static inline bool slab_test_node_partial(const struct slab *slab)
3305 {
3306 return test_bit(SL_partial, &slab->flags.f);
3307 }
3308
3309 static inline void slab_set_node_partial(struct slab *slab)
3310 {
3311 set_bit(SL_partial, &slab->flags.f);
3312 }
3313
3314 static inline void slab_clear_node_partial(struct slab *slab)
3315 {
3316 clear_bit(SL_partial, &slab->flags.f);
3317 }
3318
3319 /*
3320 * Management of partially allocated slabs.
3321 */
3322 static inline void
3323 __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
3324 {
3325 n->nr_partial++;
3326 if (tail == DEACTIVATE_TO_TAIL)
3327 list_add_tail(&slab->slab_list, &n->partial);
3328 else
3329 list_add(&slab->slab_list, &n->partial);
3330 slab_set_node_partial(slab);
3331 }
3332
3333 static inline void add_partial(struct kmem_cache_node *n,
3334 struct slab *slab, int tail)
3335 {
3336 lockdep_assert_held(&n->list_lock);
3337 __add_partial(n, slab, tail);
3338 }
3339
3340 static inline void remove_partial(struct kmem_cache_node *n,
3341 struct slab *slab)
3342 {
3343 lockdep_assert_held(&n->list_lock);
3344 list_del(&slab->slab_list);
3345 slab_clear_node_partial(slab);
3346 n->nr_partial--;
3347 }
3348
3349 /*
3350 * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
3351 * slab from the n->partial list. Remove only a single object from the slab, do
3352 * the alloc_debug_processing() checks and leave the slab on the list, or move
3353 * it to the full list if it was the last free object.
3354 */
3355 static void *alloc_single_from_partial(struct kmem_cache *s,
3356 struct kmem_cache_node *n, struct slab *slab, int orig_size)
3357 {
3358 void *object;
3359
3360 lockdep_assert_held(&n->list_lock);
3361
3362 #ifdef CONFIG_SLUB_DEBUG
3363 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
3364 if (!validate_slab_ptr(slab)) {
3365 slab_err(s, slab, "Not a valid slab page");
3366 return NULL;
3367 }
3368 }
3369 #endif
3370
3371 object = slab->freelist;
3372 slab->freelist = get_freepointer(s, object);
3373 slab->inuse++;
3374
3375 if (!alloc_debug_processing(s, slab, object, orig_size)) {
3376 remove_partial(n, slab);
3377 return NULL;
3378 }
3379
3380 if (slab->inuse == slab->objects) {
3381 remove_partial(n, slab);
3382 add_full(s, n, slab);
3383 }
3384
3385 return object;
3386 }
3387
3388 static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
3389
3390 /*
3391 * Called only for kmem_cache_debug() caches to allocate from a freshly
3392 * allocated slab. Allocate a single object instead of whole freelist
3393 * and put the slab to the partial (or full) list.
3394 */
3395 static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
3396 int orig_size, gfp_t gfpflags)
3397 {
3398 bool allow_spin = gfpflags_allow_spinning(gfpflags);
3399 int nid = slab_nid(slab);
3400 struct kmem_cache_node *n = get_node(s, nid);
3401 unsigned long flags;
3402 void *object;
3403
3404 if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
3405 /* Unlucky, discard newly allocated slab */
3406 slab->frozen = 1;
3407 defer_deactivate_slab(slab, NULL);
3408 return NULL;
3409 }
3410
3411 object = slab->freelist;
3412 slab->freelist = get_freepointer(s, object);
3413 slab->inuse = 1;
3414
3415 if (!alloc_debug_processing(s, slab, object, orig_size)) {
3416 /*
3417 * It's not really expected that this would fail on a
3418 * freshly allocated slab, but a concurrent memory
3419 * corruption in theory could cause that.
3420 * Leak memory of allocated slab.
3421 */
3422 if (!allow_spin)
3423 spin_unlock_irqrestore(&n->list_lock, flags);
3424 return NULL;
3425 }
3426
3427 if (allow_spin)
3428 spin_lock_irqsave(&n->list_lock, flags);
3429
3430 if (slab->inuse == slab->objects)
3431 add_full(s, n, slab);
3432 else
3433 add_partial(n, slab, DEACTIVATE_TO_HEAD);
3434
3435 inc_slabs_node(s, nid, slab->objects);
3436 spin_unlock_irqrestore(&n->list_lock, flags);
3437
3438 return object;
3439 }
3440
3441 #ifdef CONFIG_SLUB_CPU_PARTIAL
3442 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
3443 #else
3444 static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
3445 int drain) { }
3446 #endif
3447 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
3448
3449 /*
3450 * Try to allocate a partial slab from a specific node.
3451 */
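/*
 * In the non-debug case, the first usable slab found is returned and further
 * partial slabs are moved to the per-cpu partial list until more than half
 * of slub_get_cpu_partial(s) extra slabs have been taken, which bounds how
 * much a single allocation can drain the node's partial list.
 */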
3452 static struct slab *get_partial_node(struct kmem_cache *s,
3453 struct kmem_cache_node *n,
3454 struct partial_context *pc)
3455 {
3456 struct slab *slab, *slab2, *partial = NULL;
3457 unsigned long flags;
3458 unsigned int partial_slabs = 0;
3459
3460 /*
3461 * Racy check. If we mistakenly see no partial slabs then we
3462 * just allocate an empty slab. If we mistakenly try to get a
3463 * partial slab and there is none available then get_partial()
3464 * will return NULL.
3465 */
3466 if (!n || !n->nr_partial)
3467 return NULL;
3468
3469 if (gfpflags_allow_spinning(pc->flags))
3470 spin_lock_irqsave(&n->list_lock, flags);
3471 else if (!spin_trylock_irqsave(&n->list_lock, flags))
3472 return NULL;
3473 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
3474 if (!pfmemalloc_match(slab, pc->flags))
3475 continue;
3476
3477 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
3478 void *object = alloc_single_from_partial(s, n, slab,
3479 pc->orig_size);
3480 if (object) {
3481 partial = slab;
3482 pc->object = object;
3483 break;
3484 }
3485 continue;
3486 }
3487
3488 remove_partial(n, slab);
3489
3490 if (!partial) {
3491 partial = slab;
3492 stat(s, ALLOC_FROM_PARTIAL);
3493
3494 if ((slub_get_cpu_partial(s) == 0)) {
3495 break;
3496 }
3497 } else {
3498 put_cpu_partial(s, slab, 0);
3499 stat(s, CPU_PARTIAL_NODE);
3500
3501 if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
3502 break;
3503 }
3504 }
3505 }
3506 spin_unlock_irqrestore(&n->list_lock, flags);
3507 return partial;
3508 }
3509
3510 /*
3511 * Get a slab from somewhere. Search in increasing NUMA distances.
3512 */
3513 static struct slab *get_any_partial(struct kmem_cache *s,
3514 struct partial_context *pc)
3515 {
3516 #ifdef CONFIG_NUMA
3517 struct zonelist *zonelist;
3518 struct zoneref *z;
3519 struct zone *zone;
3520 enum zone_type highest_zoneidx = gfp_zone(pc->flags);
3521 struct slab *slab;
3522 unsigned int cpuset_mems_cookie;
3523
3524 /*
3525 * The defrag ratio allows a configuration of the tradeoffs between
3526 * inter node defragmentation and node local allocations. A lower
3527 * defrag_ratio increases the tendency to do local allocations
3528 * instead of attempting to obtain partial slabs from other nodes.
3529 *
3530 * If the defrag_ratio is set to 0 then kmalloc() always
3531 * returns node local objects. If the ratio is higher then kmalloc()
3532 * may return off node objects because partial slabs are obtained
3533 * from other nodes and filled up.
3534 *
3535 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
3536 * (which makes defrag_ratio = 1000) then every (well almost)
3537 * allocation will first attempt to defrag slab caches on other nodes.
3538 * This means scanning over all nodes to look for partial slabs which
3539 * may be expensive if we do it every time we are trying to find a slab
3540 * with available objects.
3541 */
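	/*
	 * Illustrative arithmetic (editor's example, not from the original
	 * source): a sysfs remote_node_defrag_ratio of 100 is stored here as
	 * 1000, so the check below bails out only when get_cycles() % 1024
	 * lands in 1001..1023, i.e. roughly 2% of the time; the remaining
	 * ~98% of slow-path allocations go on to scan remote nodes for
	 * partial slabs.
	 */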
3542 if (!s->remote_node_defrag_ratio ||
3543 get_cycles() % 1024 > s->remote_node_defrag_ratio)
3544 return NULL;
3545
3546 do {
3547 cpuset_mems_cookie = read_mems_allowed_begin();
3548 zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
3549 for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
3550 struct kmem_cache_node *n;
3551
3552 n = get_node(s, zone_to_nid(zone));
3553
3554 if (n && cpuset_zone_allowed(zone, pc->flags) &&
3555 n->nr_partial > s->min_partial) {
3556 slab = get_partial_node(s, n, pc);
3557 if (slab) {
3558 /*
3559 * Don't check read_mems_allowed_retry()
3560 * here - if mems_allowed was updated in
3561 * parallel, that was a harmless race
3562 * between allocation and the cpuset
3563 * update
3564 */
3565 return slab;
3566 }
3567 }
3568 }
3569 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3570 #endif /* CONFIG_NUMA */
3571 return NULL;
3572 }
3573
3574 /*
3575 * Get a partial slab, lock it and return it.
3576 */
3577 static struct slab *get_partial(struct kmem_cache *s, int node,
3578 struct partial_context *pc)
3579 {
3580 struct slab *slab;
3581 int searchnode = node;
3582
3583 if (node == NUMA_NO_NODE)
3584 searchnode = numa_mem_id();
3585
3586 slab = get_partial_node(s, get_node(s, searchnode), pc);
3587 if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
3588 return slab;
3589
3590 return get_any_partial(s, pc);
3591 }
3592
3593 #ifndef CONFIG_SLUB_TINY
3594
3595 #ifdef CONFIG_PREEMPTION
3596 /*
3597 * Calculate the next globally unique transaction for disambiguation
3598 * during cmpxchg. The transactions start with the cpu number and are then
3599 * incremented by TID_STEP (CONFIG_NR_CPUS rounded up to a power of two).
3600 */
3601 #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
3602 #else
3603 /*
3604 * No preemption supported therefore also no need to check for
3605 * different cpus.
3606 */
3607 #define TID_STEP 1
3608 #endif /* CONFIG_PREEMPTION */
3609
3610 static inline unsigned long next_tid(unsigned long tid)
3611 {
3612 return tid + TID_STEP;
3613 }
3614
3615 #ifdef SLUB_DEBUG_CMPXCHG
3616 static inline unsigned int tid_to_cpu(unsigned long tid)
3617 {
3618 return tid % TID_STEP;
3619 }
3620
3621 static inline unsigned long tid_to_event(unsigned long tid)
3622 {
3623 return tid / TID_STEP;
3624 }
3625 #endif
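/*
 * Worked example (editor's illustration, assuming CONFIG_NR_CPUS == 6 and
 * CONFIG_PREEMPTION): TID_STEP is roundup_pow_of_two(6) == 8, so CPU 2
 * cycles through tids 2, 10, 18, ...  For tid == 18, tid_to_cpu(18) == 2
 * and tid_to_event(18) == 2, which is how note_cmpxchg_failure() below can
 * tell a cpu migration apart from an interleaved operation on the same cpu.
 */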
3626
3627 static inline unsigned int init_tid(int cpu)
3628 {
3629 return cpu;
3630 }
3631
3632 static inline void note_cmpxchg_failure(const char *n,
3633 const struct kmem_cache *s, unsigned long tid)
3634 {
3635 #ifdef SLUB_DEBUG_CMPXCHG
3636 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
3637
3638 pr_info("%s %s: cmpxchg redo ", n, s->name);
3639
3640 if (IS_ENABLED(CONFIG_PREEMPTION) &&
3641 tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
3642 pr_warn("due to cpu change %d -> %d\n",
3643 tid_to_cpu(tid), tid_to_cpu(actual_tid));
3644 } else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
3645 pr_warn("due to cpu running other code. Event %ld->%ld\n",
3646 tid_to_event(tid), tid_to_event(actual_tid));
3647 } else {
3648 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
3649 actual_tid, tid, next_tid(tid));
3650 }
3651 #endif
3652 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
3653 }
3654
3655 static void init_kmem_cache_cpus(struct kmem_cache *s)
3656 {
3657 #ifdef CONFIG_PREEMPT_RT
3658 /*
3659 * Register lockdep key for non-boot kmem caches to avoid
3660 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
3661 */
3662 bool finegrain_lockdep = !init_section_contains(s, 1);
3663 #else
3664 /*
3665 * Don't bother with different lockdep classes for each
3666 * kmem_cache, since we only use local_trylock_irqsave().
3667 */
3668 bool finegrain_lockdep = false;
3669 #endif
3670 int cpu;
3671 struct kmem_cache_cpu *c;
3672
3673 if (finegrain_lockdep)
3674 lockdep_register_key(&s->lock_key);
3675 for_each_possible_cpu(cpu) {
3676 c = per_cpu_ptr(s->cpu_slab, cpu);
3677 local_trylock_init(&c->lock);
3678 if (finegrain_lockdep)
3679 lockdep_set_class(&c->lock, &s->lock_key);
3680 c->tid = init_tid(cpu);
3681 }
3682 }
3683
3684 /*
3685 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
3686 * unfreezes the slab and puts it on the proper list.
3687 * Assumes the slab has been already safely taken away from kmem_cache_cpu
3688 * by the caller.
3689 */
3690 static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
3691 void *freelist)
3692 {
3693 struct kmem_cache_node *n = get_node(s, slab_nid(slab));
3694 int free_delta = 0;
3695 void *nextfree, *freelist_iter, *freelist_tail;
3696 int tail = DEACTIVATE_TO_HEAD;
3697 unsigned long flags = 0;
3698 struct slab new;
3699 struct slab old;
3700
3701 if (READ_ONCE(slab->freelist)) {
3702 stat(s, DEACTIVATE_REMOTE_FREES);
3703 tail = DEACTIVATE_TO_TAIL;
3704 }
3705
3706 /*
3707 * Stage one: Count the objects on cpu's freelist as free_delta and
3708 * remember the last object in freelist_tail for later splicing.
3709 */
3710 freelist_tail = NULL;
3711 freelist_iter = freelist;
3712 while (freelist_iter) {
3713 nextfree = get_freepointer(s, freelist_iter);
3714
3715 /*
3716 * If 'nextfree' is invalid, it is possible that the object at
3717 * 'freelist_iter' is already corrupted. So isolate all objects
3718 * starting at 'freelist_iter' by skipping them.
3719 */
3720 if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
3721 break;
3722
3723 freelist_tail = freelist_iter;
3724 free_delta++;
3725
3726 freelist_iter = nextfree;
3727 }
3728
3729 /*
3730 * Stage two: Unfreeze the slab while splicing the per-cpu
3731 * freelist to the head of slab's freelist.
3732 */
3733 do {
3734 old.freelist = READ_ONCE(slab->freelist);
3735 old.counters = READ_ONCE(slab->counters);
3736 VM_BUG_ON(!old.frozen);
3737
3738 /* Determine target state of the slab */
3739 new.counters = old.counters;
3740 new.frozen = 0;
3741 if (freelist_tail) {
3742 new.inuse -= free_delta;
3743 set_freepointer(s, freelist_tail, old.freelist);
3744 new.freelist = freelist;
3745 } else {
3746 new.freelist = old.freelist;
3747 }
3748 } while (!slab_update_freelist(s, slab,
3749 old.freelist, old.counters,
3750 new.freelist, new.counters,
3751 "unfreezing slab"));
3752
3753 /*
3754 * Stage three: Manipulate the slab list based on the updated state.
3755 */
3756 if (!new.inuse && n->nr_partial >= s->min_partial) {
3757 stat(s, DEACTIVATE_EMPTY);
3758 discard_slab(s, slab);
3759 stat(s, FREE_SLAB);
3760 } else if (new.freelist) {
3761 spin_lock_irqsave(&n->list_lock, flags);
3762 add_partial(n, slab, tail);
3763 spin_unlock_irqrestore(&n->list_lock, flags);
3764 stat(s, tail);
3765 } else {
3766 stat(s, DEACTIVATE_FULL);
3767 }
3768 }
3769
3770 /*
3771 * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
3772 * can be acquired without a deadlock before invoking the function.
3773 *
3774 * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
3775 * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
3776 * and kmalloc() is not used in an unsupported context.
3777 *
3778 * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
3779 * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
3780 * lockdep_assert() will catch a bug in case:
3781 * #1
3782 * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
3783 * or
3784 * #2
3785 * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
3786 *
3787 * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
3788 * disabled context. The lock will always be acquired and if needed it
3789 * will block and sleep until the lock is available.
3790 * #1 is possible in !PREEMPT_RT only.
3791 * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
3792 * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
3793 * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
3794 *
3795 * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
3796 */
3797 #if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
3798 #define local_lock_cpu_slab(s, flags) \
3799 local_lock_irqsave(&(s)->cpu_slab->lock, flags)
3800 #else
3801 #define local_lock_cpu_slab(s, flags) \
3802 do { \
3803 bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
3804 lockdep_assert(__l); \
3805 } while (0)
3806 #endif
3807
3808 #define local_unlock_cpu_slab(s, flags) \
3809 local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
3810
3811 #ifdef CONFIG_SLUB_CPU_PARTIAL
3812 static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
3813 {
3814 struct kmem_cache_node *n = NULL, *n2 = NULL;
3815 struct slab *slab, *slab_to_discard = NULL;
3816 unsigned long flags = 0;
3817
3818 while (partial_slab) {
3819 slab = partial_slab;
3820 partial_slab = slab->next;
3821
3822 n2 = get_node(s, slab_nid(slab));
3823 if (n != n2) {
3824 if (n)
3825 spin_unlock_irqrestore(&n->list_lock, flags);
3826
3827 n = n2;
3828 spin_lock_irqsave(&n->list_lock, flags);
3829 }
3830
3831 if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
3832 slab->next = slab_to_discard;
3833 slab_to_discard = slab;
3834 } else {
3835 add_partial(n, slab, DEACTIVATE_TO_TAIL);
3836 stat(s, FREE_ADD_PARTIAL);
3837 }
3838 }
3839
3840 if (n)
3841 spin_unlock_irqrestore(&n->list_lock, flags);
3842
3843 while (slab_to_discard) {
3844 slab = slab_to_discard;
3845 slab_to_discard = slab_to_discard->next;
3846
3847 stat(s, DEACTIVATE_EMPTY);
3848 discard_slab(s, slab);
3849 stat(s, FREE_SLAB);
3850 }
3851 }
3852
3853 /*
3854 * Put all the cpu partial slabs to the node partial list.
3855 */
3856 static void put_partials(struct kmem_cache *s)
3857 {
3858 struct slab *partial_slab;
3859 unsigned long flags;
3860
3861 local_lock_irqsave(&s->cpu_slab->lock, flags);
3862 partial_slab = this_cpu_read(s->cpu_slab->partial);
3863 this_cpu_write(s->cpu_slab->partial, NULL);
3864 local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3865
3866 if (partial_slab)
3867 __put_partials(s, partial_slab);
3868 }
3869
3870 static void put_partials_cpu(struct kmem_cache *s,
3871 struct kmem_cache_cpu *c)
3872 {
3873 struct slab *partial_slab;
3874
3875 partial_slab = slub_percpu_partial(c);
3876 c->partial = NULL;
3877
3878 if (partial_slab)
3879 __put_partials(s, partial_slab);
3880 }
3881
3882 /*
3883 * Put a slab into a partial slab slot if available.
3884 *
3885 * If we did not find a slot then simply move all the partials to the
3886 * per node partial list.
3887 */
3888 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
3889 {
3890 struct slab *oldslab;
3891 struct slab *slab_to_put = NULL;
3892 unsigned long flags;
3893 int slabs = 0;
3894
3895 local_lock_cpu_slab(s, flags);
3896
3897 oldslab = this_cpu_read(s->cpu_slab->partial);
3898
3899 if (oldslab) {
3900 if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
3901 /*
3902 * Partial array is full. Move the existing set to the
3903 * per node partial list. Postpone the actual unfreezing
3904 * outside of the critical section.
3905 */
3906 slab_to_put = oldslab;
3907 oldslab = NULL;
3908 } else {
3909 slabs = oldslab->slabs;
3910 }
3911 }
3912
3913 slabs++;
3914
3915 slab->slabs = slabs;
3916 slab->next = oldslab;
3917
3918 this_cpu_write(s->cpu_slab->partial, slab);
3919
3920 local_unlock_cpu_slab(s, flags);
3921
3922 if (slab_to_put) {
3923 __put_partials(s, slab_to_put);
3924 stat(s, CPU_PARTIAL_DRAIN);
3925 }
3926 }
3927
3928 #else /* CONFIG_SLUB_CPU_PARTIAL */
3929
3930 static inline void put_partials(struct kmem_cache *s) { }
3931 static inline void put_partials_cpu(struct kmem_cache *s,
3932 struct kmem_cache_cpu *c) { }
3933
3934 #endif /* CONFIG_SLUB_CPU_PARTIAL */
3935
3936 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
3937 {
3938 unsigned long flags;
3939 struct slab *slab;
3940 void *freelist;
3941
3942 local_lock_irqsave(&s->cpu_slab->lock, flags);
3943
3944 slab = c->slab;
3945 freelist = c->freelist;
3946
3947 c->slab = NULL;
3948 c->freelist = NULL;
3949 c->tid = next_tid(c->tid);
3950
3951 local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3952
3953 if (slab) {
3954 deactivate_slab(s, slab, freelist);
3955 stat(s, CPUSLAB_FLUSH);
3956 }
3957 }
3958
3959 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
3960 {
3961 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3962 void *freelist = c->freelist;
3963 struct slab *slab = c->slab;
3964
3965 c->slab = NULL;
3966 c->freelist = NULL;
3967 c->tid = next_tid(c->tid);
3968
3969 if (slab) {
3970 deactivate_slab(s, slab, freelist);
3971 stat(s, CPUSLAB_FLUSH);
3972 }
3973
3974 put_partials_cpu(s, c);
3975 }
3976
3977 static inline void flush_this_cpu_slab(struct kmem_cache *s)
3978 {
3979 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
3980
3981 if (c->slab)
3982 flush_slab(s, c);
3983
3984 put_partials(s);
3985 }
3986
3987 static bool has_cpu_slab(int cpu, struct kmem_cache *s)
3988 {
3989 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3990
3991 return c->slab || slub_percpu_partial(c);
3992 }
3993
3994 #else /* CONFIG_SLUB_TINY */
3995 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
3996 static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; }
3997 static inline void flush_this_cpu_slab(struct kmem_cache *s) { }
3998 #endif /* CONFIG_SLUB_TINY */
3999
4000 static bool has_pcs_used(int cpu, struct kmem_cache *s)
4001 {
4002 struct slub_percpu_sheaves *pcs;
4003
4004 if (!s->cpu_sheaves)
4005 return false;
4006
4007 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
4008
4009 return (pcs->spare || pcs->rcu_free || pcs->main->size);
4010 }
4011
4012 /*
4013 * Flush cpu slab.
4014 *
4015 * Called from CPU work handler with migration disabled.
4016 */
4017 static void flush_cpu_slab(struct work_struct *w)
4018 {
4019 struct kmem_cache *s;
4020 struct slub_flush_work *sfw;
4021
4022 sfw = container_of(w, struct slub_flush_work, work);
4023
4024 s = sfw->s;
4025
4026 if (s->cpu_sheaves)
4027 pcs_flush_all(s);
4028
4029 flush_this_cpu_slab(s);
4030 }
4031
4032 static void flush_all_cpus_locked(struct kmem_cache *s)
4033 {
4034 struct slub_flush_work *sfw;
4035 unsigned int cpu;
4036
4037 lockdep_assert_cpus_held();
4038 mutex_lock(&flush_lock);
4039
4040 for_each_online_cpu(cpu) {
4041 sfw = &per_cpu(slub_flush, cpu);
4042 if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
4043 sfw->skip = true;
4044 continue;
4045 }
4046 INIT_WORK(&sfw->work, flush_cpu_slab);
4047 sfw->skip = false;
4048 sfw->s = s;
4049 queue_work_on(cpu, flushwq, &sfw->work);
4050 }
4051
4052 for_each_online_cpu(cpu) {
4053 sfw = &per_cpu(slub_flush, cpu);
4054 if (sfw->skip)
4055 continue;
4056 flush_work(&sfw->work);
4057 }
4058
4059 mutex_unlock(&flush_lock);
4060 }
4061
4062 static void flush_all(struct kmem_cache *s)
4063 {
4064 cpus_read_lock();
4065 flush_all_cpus_locked(s);
4066 cpus_read_unlock();
4067 }
4068
4069 static void flush_rcu_sheaf(struct work_struct *w)
4070 {
4071 struct slub_percpu_sheaves *pcs;
4072 struct slab_sheaf *rcu_free;
4073 struct slub_flush_work *sfw;
4074 struct kmem_cache *s;
4075
4076 sfw = container_of(w, struct slub_flush_work, work);
4077 s = sfw->s;
4078
4079 local_lock(&s->cpu_sheaves->lock);
4080 pcs = this_cpu_ptr(s->cpu_sheaves);
4081
4082 rcu_free = pcs->rcu_free;
4083 pcs->rcu_free = NULL;
4084
4085 local_unlock(&s->cpu_sheaves->lock);
4086
4087 if (rcu_free)
4088 call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
4089 }
4090
4091
4092 /* needed for kvfree_rcu_barrier() */
4093 void flush_all_rcu_sheaves(void)
4094 {
4095 struct slub_flush_work *sfw;
4096 struct kmem_cache *s;
4097 unsigned int cpu;
4098
4099 cpus_read_lock();
4100 mutex_lock(&slab_mutex);
4101
4102 list_for_each_entry(s, &slab_caches, list) {
4103 if (!s->cpu_sheaves)
4104 continue;
4105
4106 mutex_lock(&flush_lock);
4107
4108 for_each_online_cpu(cpu) {
4109 sfw = &per_cpu(slub_flush, cpu);
4110
4111 /*
4112 * We don't check whether an rcu_free sheaf exists - a racing
4113 * __kfree_rcu_sheaf() might have just removed it.
4114 * By executing flush_rcu_sheaf() on the cpu we make
4115 * sure that __kfree_rcu_sheaf() finished its call_rcu().
4116 */
4117
4118 INIT_WORK(&sfw->work, flush_rcu_sheaf);
4119 sfw->s = s;
4120 queue_work_on(cpu, flushwq, &sfw->work);
4121 }
4122
4123 for_each_online_cpu(cpu) {
4124 sfw = &per_cpu(slub_flush, cpu);
4125 flush_work(&sfw->work);
4126 }
4127
4128 mutex_unlock(&flush_lock);
4129 }
4130
4131 mutex_unlock(&slab_mutex);
4132 cpus_read_unlock();
4133
4134 rcu_barrier();
4135 }
4136
4137 /*
4138 * Use the cpu notifier to ensure that the cpu slabs are flushed when
4139 * necessary.
4140 */
4141 static int slub_cpu_dead(unsigned int cpu)
4142 {
4143 struct kmem_cache *s;
4144
4145 mutex_lock(&slab_mutex);
4146 list_for_each_entry(s, &slab_caches, list) {
4147 __flush_cpu_slab(s, cpu);
4148 if (s->cpu_sheaves)
4149 __pcs_flush_all_cpu(s, cpu);
4150 }
4151 mutex_unlock(&slab_mutex);
4152 return 0;
4153 }
4154
4155 /*
4156 * Check if the objects in a per cpu structure fit numa
4157 * locality expectations.
4158 */
4159 static inline int node_match(struct slab *slab, int node)
4160 {
4161 #ifdef CONFIG_NUMA
4162 if (node != NUMA_NO_NODE && slab_nid(slab) != node)
4163 return 0;
4164 #endif
4165 return 1;
4166 }
4167
4168 #ifdef CONFIG_SLUB_DEBUG
4169 static int count_free(struct slab *slab)
4170 {
4171 return slab->objects - slab->inuse;
4172 }
4173
4174 static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
4175 {
4176 return atomic_long_read(&n->total_objects);
4177 }
4178
4179 /* Supports checking bulk free of a constructed freelist */
4180 static inline bool free_debug_processing(struct kmem_cache *s,
4181 struct slab *slab, void *head, void *tail, int *bulk_cnt,
4182 unsigned long addr, depot_stack_handle_t handle)
4183 {
4184 bool checks_ok = false;
4185 void *object = head;
4186 int cnt = 0;
4187
4188 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
4189 if (!check_slab(s, slab))
4190 goto out;
4191 }
4192
4193 if (slab->inuse < *bulk_cnt) {
4194 slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
4195 slab->inuse, *bulk_cnt);
4196 goto out;
4197 }
4198
4199 next_object:
4200
4201 if (++cnt > *bulk_cnt)
4202 goto out_cnt;
4203
4204 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
4205 if (!free_consistency_checks(s, slab, object, addr))
4206 goto out;
4207 }
4208
4209 if (s->flags & SLAB_STORE_USER)
4210 set_track_update(s, object, TRACK_FREE, addr, handle);
4211 trace(s, slab, object, 0);
4212 /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
4213 init_object(s, object, SLUB_RED_INACTIVE);
4214
4215 /* Reached end of constructed freelist yet? */
4216 if (object != tail) {
4217 object = get_freepointer(s, object);
4218 goto next_object;
4219 }
4220 checks_ok = true;
4221
4222 out_cnt:
4223 if (cnt != *bulk_cnt) {
4224 slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
4225 *bulk_cnt, cnt);
4226 *bulk_cnt = cnt;
4227 }
4228
4229 out:
4230
4231 if (!checks_ok)
4232 slab_fix(s, "Object at 0x%p not freed", object);
4233
4234 return checks_ok;
4235 }
4236 #endif /* CONFIG_SLUB_DEBUG */
4237
4238 #if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
4239 static unsigned long count_partial(struct kmem_cache_node *n,
4240 int (*get_count)(struct slab *))
4241 {
4242 unsigned long flags;
4243 unsigned long x = 0;
4244 struct slab *slab;
4245
4246 spin_lock_irqsave(&n->list_lock, flags);
4247 list_for_each_entry(slab, &n->partial, slab_list)
4248 x += get_count(slab);
4249 spin_unlock_irqrestore(&n->list_lock, flags);
4250 return x;
4251 }
4252 #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
4253
4254 #ifdef CONFIG_SLUB_DEBUG
4255 #define MAX_PARTIAL_TO_SCAN 10000
4256
4257 static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
4258 {
4259 unsigned long flags;
4260 unsigned long x = 0;
4261 struct slab *slab;
4262
4263 spin_lock_irqsave(&n->list_lock, flags);
4264 if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
4265 list_for_each_entry(slab, &n->partial, slab_list)
4266 x += slab->objects - slab->inuse;
4267 } else {
4268 /*
4269 * For a long list, approximate the total count of objects in
4270 * it to meet the limit on the number of slabs to scan.
4271 * Scan from both the list's head and tail for better accuracy.
4272 */
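		/*
		 * Editor's illustration with made-up numbers: if the 10000
		 * slabs scanned from head and tail hold 20000 free objects
		 * and n->nr_partial is 30000, the estimate below is
		 * 20000 * 30000 / 10000 = 60000, clamped to node_nr_objs(n).
		 */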
4273 unsigned long scanned = 0;
4274
4275 list_for_each_entry(slab, &n->partial, slab_list) {
4276 x += slab->objects - slab->inuse;
4277 if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
4278 break;
4279 }
4280 list_for_each_entry_reverse(slab, &n->partial, slab_list) {
4281 x += slab->objects - slab->inuse;
4282 if (++scanned == MAX_PARTIAL_TO_SCAN)
4283 break;
4284 }
4285 x = mult_frac(x, n->nr_partial, scanned);
4286 x = min(x, node_nr_objs(n));
4287 }
4288 spin_unlock_irqrestore(&n->list_lock, flags);
4289 return x;
4290 }
4291
4292 static noinline void
4293 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
4294 {
4295 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
4296 DEFAULT_RATELIMIT_BURST);
4297 int cpu = raw_smp_processor_id();
4298 int node;
4299 struct kmem_cache_node *n;
4300
4301 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
4302 return;
4303
4304 pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
4305 cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
4306 pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
4307 s->name, s->object_size, s->size, oo_order(s->oo),
4308 oo_order(s->min));
4309
4310 if (oo_order(s->min) > get_order(s->object_size))
4311 pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n",
4312 s->name);
4313
4314 for_each_kmem_cache_node(s, node, n) {
4315 unsigned long nr_slabs;
4316 unsigned long nr_objs;
4317 unsigned long nr_free;
4318
4319 nr_free = count_partial_free_approx(n);
4320 nr_slabs = node_nr_slabs(n);
4321 nr_objs = node_nr_objs(n);
4322
4323 pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
4324 node, nr_slabs, nr_objs, nr_free);
4325 }
4326 }
4327 #else /* CONFIG_SLUB_DEBUG */
4328 static inline void
4329 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
4330 #endif
4331
4332 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
4333 {
4334 if (unlikely(slab_test_pfmemalloc(slab)))
4335 return gfp_pfmemalloc_allowed(gfpflags);
4336
4337 return true;
4338 }
4339
4340 #ifndef CONFIG_SLUB_TINY
4341 static inline bool
4342 __update_cpu_freelist_fast(struct kmem_cache *s,
4343 void *freelist_old, void *freelist_new,
4344 unsigned long tid)
4345 {
4346 freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
4347 freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
4348
4349 return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
4350 &old.full, new.full);
4351 }
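/*
 * Editor's illustration: with TID_STEP == 8 (see the worked example above),
 * a successful fastpath update on CPU 2 replaces the pair
 * {freelist_old, tid == 18} with {freelist_new, tid == 26} in one
 * double-width cmpxchg; any interleaved alloc/free on this cpu would have
 * advanced the tid and therefore makes the cmpxchg fail.
 */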
4352
4353 /*
4354 * Check the slab->freelist and either transfer the freelist to the
4355 * per cpu freelist or deactivate the slab.
4356 *
4357 * The slab is still frozen if the return value is not NULL.
4358 *
4359 * If this function returns NULL then the slab has been unfrozen.
4360 */
4361 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
4362 {
4363 struct slab new;
4364 unsigned long counters;
4365 void *freelist;
4366
4367 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
4368
4369 do {
4370 freelist = slab->freelist;
4371 counters = slab->counters;
4372
4373 new.counters = counters;
4374
4375 new.inuse = slab->objects;
4376 new.frozen = freelist != NULL;
4377
4378 } while (!__slab_update_freelist(s, slab,
4379 freelist, counters,
4380 NULL, new.counters,
4381 "get_freelist"));
4382
4383 return freelist;
4384 }
4385
4386 /*
4387 * Freeze the partial slab and return the pointer to the freelist.
4388 */
4389 static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
4390 {
4391 struct slab new;
4392 unsigned long counters;
4393 void *freelist;
4394
4395 do {
4396 freelist = slab->freelist;
4397 counters = slab->counters;
4398
4399 new.counters = counters;
4400 VM_BUG_ON(new.frozen);
4401
4402 new.inuse = slab->objects;
4403 new.frozen = 1;
4404
4405 } while (!slab_update_freelist(s, slab,
4406 freelist, counters,
4407 NULL, new.counters,
4408 "freeze_slab"));
4409
4410 return freelist;
4411 }
4412
4413 /*
4414 * Slow path. The lockless freelist is empty or we need to perform
4415 * debugging duties.
4416 *
4417 * Processing is still very fast if new objects have been freed to the
4418 * regular freelist. In that case we simply take over the regular freelist
4419 * as the lockless freelist and zap the regular freelist.
4420 *
4421 * If that is not working then we fall back to the partial lists. We take the
4422 * first element of the freelist as the object to allocate now and move the
4423 * rest of the freelist to the lockless freelist.
4424 *
4425 * And if we were unable to get a new slab from the partial slab lists then
4426 * we need to allocate a new slab. This is the slowest path since it involves
4427 * a call to the page allocator and the setup of a new slab.
4428 *
4429 * Version of __slab_alloc to use when we know that preemption is
4430 * already disabled (which is the case for bulk allocation).
4431 */
4432 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
4433 unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
4434 {
4435 bool allow_spin = gfpflags_allow_spinning(gfpflags);
4436 void *freelist;
4437 struct slab *slab;
4438 unsigned long flags;
4439 struct partial_context pc;
4440 bool try_thisnode = true;
4441
4442 stat(s, ALLOC_SLOWPATH);
4443
4444 reread_slab:
4445
4446 slab = READ_ONCE(c->slab);
4447 if (!slab) {
4448 /*
4449 * if the node is not online or has no normal memory, just
4450 * ignore the node constraint
4451 */
4452 if (unlikely(node != NUMA_NO_NODE &&
4453 !node_isset(node, slab_nodes)))
4454 node = NUMA_NO_NODE;
4455 goto new_slab;
4456 }
4457
4458 if (unlikely(!node_match(slab, node))) {
4459 /*
4460 * same as above but node_match() being false already
4461 * implies node != NUMA_NO_NODE.
4462 *
4463 * We don't strictly honor pfmemalloc and NUMA preferences
4464 * when !allow_spin because:
4465 *
4466 * 1. Most kmalloc() users allocate objects on the local node,
4467 * so kmalloc_nolock() tries not to interfere with them by
4468 * deactivating the cpu slab.
4469 *
4470 * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
4471 * unnecessary slab allocations even when n->partial list
4472 * is not empty.
4473 */
4474 if (!node_isset(node, slab_nodes) ||
4475 !allow_spin) {
4476 node = NUMA_NO_NODE;
4477 } else {
4478 stat(s, ALLOC_NODE_MISMATCH);
4479 goto deactivate_slab;
4480 }
4481 }
4482
4483 /*
4484 * By rights, we should be searching for a slab page that was
4485 * PFMEMALLOC but right now, we are losing the pfmemalloc
4486 * information when the page leaves the per-cpu allocator
4487 */
4488 if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
4489 goto deactivate_slab;
4490
4491 /* must check again c->slab in case we got preempted and it changed */
4492 local_lock_cpu_slab(s, flags);
4493
4494 if (unlikely(slab != c->slab)) {
4495 local_unlock_cpu_slab(s, flags);
4496 goto reread_slab;
4497 }
4498 freelist = c->freelist;
4499 if (freelist)
4500 goto load_freelist;
4501
4502 freelist = get_freelist(s, slab);
4503
4504 if (!freelist) {
4505 c->slab = NULL;
4506 c->tid = next_tid(c->tid);
4507 local_unlock_cpu_slab(s, flags);
4508 stat(s, DEACTIVATE_BYPASS);
4509 goto new_slab;
4510 }
4511
4512 stat(s, ALLOC_REFILL);
4513
4514 load_freelist:
4515
4516 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
4517
4518 /*
4519 * freelist is pointing to the list of objects to be used.
4520 * slab is pointing to the slab from which the objects are obtained.
4521 * That slab must be frozen for per cpu allocations to work.
4522 */
4523 VM_BUG_ON(!c->slab->frozen);
4524 c->freelist = get_freepointer(s, freelist);
4525 c->tid = next_tid(c->tid);
4526 local_unlock_cpu_slab(s, flags);
4527 return freelist;
4528
4529 deactivate_slab:
4530
4531 local_lock_cpu_slab(s, flags);
4532 if (slab != c->slab) {
4533 local_unlock_cpu_slab(s, flags);
4534 goto reread_slab;
4535 }
4536 freelist = c->freelist;
4537 c->slab = NULL;
4538 c->freelist = NULL;
4539 c->tid = next_tid(c->tid);
4540 local_unlock_cpu_slab(s, flags);
4541 deactivate_slab(s, slab, freelist);
4542
4543 new_slab:
4544
4545 #ifdef CONFIG_SLUB_CPU_PARTIAL
4546 while (slub_percpu_partial(c)) {
4547 local_lock_cpu_slab(s, flags);
4548 if (unlikely(c->slab)) {
4549 local_unlock_cpu_slab(s, flags);
4550 goto reread_slab;
4551 }
4552 if (unlikely(!slub_percpu_partial(c))) {
4553 local_unlock_cpu_slab(s, flags);
4554 /* we were preempted and partial list got empty */
4555 goto new_objects;
4556 }
4557
4558 slab = slub_percpu_partial(c);
4559 slub_set_percpu_partial(c, slab);
4560
4561 if (likely(node_match(slab, node) &&
4562 pfmemalloc_match(slab, gfpflags)) ||
4563 !allow_spin) {
4564 c->slab = slab;
4565 freelist = get_freelist(s, slab);
4566 VM_BUG_ON(!freelist);
4567 stat(s, CPU_PARTIAL_ALLOC);
4568 goto load_freelist;
4569 }
4570
4571 local_unlock_cpu_slab(s, flags);
4572
4573 slab->next = NULL;
4574 __put_partials(s, slab);
4575 }
4576 #endif
4577
4578 new_objects:
4579
4580 pc.flags = gfpflags;
4581 /*
4582 * When a preferred node is indicated but no __GFP_THISNODE
4583 *
4584 * 1) try to get a partial slab from target node only by having
4585 * __GFP_THISNODE in pc.flags for get_partial()
4586 * 2) if 1) failed, try to allocate a new slab from target node with
4587 * GFP_NOWAIT | __GFP_THISNODE opportunistically
4588 * 3) if 2) failed, retry with original gfpflags which will allow
4589 * get_partial() try partial lists of other nodes before potentially
4590 * allocating new page from other nodes
4591 */
4592 if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
4593 && try_thisnode)) {
4594 if (unlikely(!allow_spin))
4595 /* Do not upgrade gfp to NOWAIT from more restrictive mode */
4596 pc.flags = gfpflags | __GFP_THISNODE;
4597 else
4598 pc.flags = GFP_NOWAIT | __GFP_THISNODE;
4599 }
4600
4601 pc.orig_size = orig_size;
4602 slab = get_partial(s, node, &pc);
4603 if (slab) {
4604 if (kmem_cache_debug(s)) {
4605 freelist = pc.object;
4606 /*
4607 * For debug caches here we had to go through
4608 * alloc_single_from_partial() so just store the
4609 * tracking info and return the object.
4610 *
4611 * Due to disabled preemption we need to disallow
4612 * blocking. The flags are further adjusted by
4613 * gfp_nested_mask() in stack_depot itself.
4614 */
4615 if (s->flags & SLAB_STORE_USER)
4616 set_track(s, freelist, TRACK_ALLOC, addr,
4617 gfpflags & ~(__GFP_DIRECT_RECLAIM));
4618
4619 return freelist;
4620 }
4621
4622 freelist = freeze_slab(s, slab);
4623 goto retry_load_slab;
4624 }
4625
4626 slub_put_cpu_ptr(s->cpu_slab);
4627 slab = new_slab(s, pc.flags, node);
4628 c = slub_get_cpu_ptr(s->cpu_slab);
4629
4630 if (unlikely(!slab)) {
4631 if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
4632 && try_thisnode) {
4633 try_thisnode = false;
4634 goto new_objects;
4635 }
4636 slab_out_of_memory(s, gfpflags, node);
4637 return NULL;
4638 }
4639
4640 stat(s, ALLOC_SLAB);
4641
4642 if (kmem_cache_debug(s)) {
4643 freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
4644
4645 if (unlikely(!freelist))
4646 goto new_objects;
4647
4648 if (s->flags & SLAB_STORE_USER)
4649 set_track(s, freelist, TRACK_ALLOC, addr,
4650 gfpflags & ~(__GFP_DIRECT_RECLAIM));
4651
4652 return freelist;
4653 }
4654
4655 /*
4656 * No other reference to the slab yet so we can
4657 * muck around with it freely without cmpxchg
4658 */
4659 freelist = slab->freelist;
4660 slab->freelist = NULL;
4661 slab->inuse = slab->objects;
4662 slab->frozen = 1;
4663
4664 inc_slabs_node(s, slab_nid(slab), slab->objects);
4665
4666 if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
4667 /*
4668 * For !pfmemalloc_match() case we don't load freelist so that
4669 * we don't make further mismatched allocations easier.
4670 */
4671 deactivate_slab(s, slab, get_freepointer(s, freelist));
4672 return freelist;
4673 }
4674
4675 retry_load_slab:
4676
4677 local_lock_cpu_slab(s, flags);
4678 if (unlikely(c->slab)) {
4679 void *flush_freelist = c->freelist;
4680 struct slab *flush_slab = c->slab;
4681
4682 c->slab = NULL;
4683 c->freelist = NULL;
4684 c->tid = next_tid(c->tid);
4685
4686 local_unlock_cpu_slab(s, flags);
4687
4688 if (unlikely(!allow_spin)) {
4689 /* Reentrant slub cannot take locks, defer */
4690 defer_deactivate_slab(flush_slab, flush_freelist);
4691 } else {
4692 deactivate_slab(s, flush_slab, flush_freelist);
4693 }
4694
4695 stat(s, CPUSLAB_FLUSH);
4696
4697 goto retry_load_slab;
4698 }
4699 c->slab = slab;
4700
4701 goto load_freelist;
4702 }
4703 /*
4704 * We disallow kprobes in ___slab_alloc() to prevent reentrance
4705 *
4706 * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
4707 * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
4708 * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
4709 * manipulating c->freelist without lock.
4710 *
4711 * This does not prevent kprobe in functions called from ___slab_alloc() such as
4712 * local_lock_irqsave() itself, and that is fine, we only need to protect the
4713 * c->freelist manipulation in ___slab_alloc() itself.
4714 */
4715 NOKPROBE_SYMBOL(___slab_alloc);
4716
4717 /*
4718 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
4719 * disabled. Compensates for possible cpu changes by refetching the per cpu area
4720 * pointer.
4721 */
4722 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
4723 unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
4724 {
4725 void *p;
4726
4727 #ifdef CONFIG_PREEMPT_COUNT
4728 /*
4729 * We may have been preempted and rescheduled on a different
4730 * cpu before disabling preemption. Need to reload cpu area
4731 * pointer.
4732 */
4733 c = slub_get_cpu_ptr(s->cpu_slab);
4734 #endif
4735 if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
4736 if (local_lock_is_locked(&s->cpu_slab->lock)) {
4737 /*
4738 * EBUSY is an internal signal to kmalloc_nolock() to
4739 * retry a different bucket. It's not propagated
4740 * to the caller.
4741 */
4742 p = ERR_PTR(-EBUSY);
4743 goto out;
4744 }
4745 }
4746 p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
4747 out:
4748 #ifdef CONFIG_PREEMPT_COUNT
4749 slub_put_cpu_ptr(s->cpu_slab);
4750 #endif
4751 return p;
4752 }
4753
4754 static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
4755 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
4756 {
4757 struct kmem_cache_cpu *c;
4758 struct slab *slab;
4759 unsigned long tid;
4760 void *object;
4761
4762 redo:
4763 /*
4764 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
4765 * enabled. We may switch back and forth between cpus while
4766 * reading from one cpu area. That does not matter as long
4767 * as we end up on the original cpu again when doing the cmpxchg.
4768 *
4769 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
4770 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
4771 * the tid. If we are preempted and switched to another cpu between the
4772 * two reads, it's OK as the two are still associated with the same cpu
4773 * and cmpxchg later will validate the cpu.
4774 */
4775 c = raw_cpu_ptr(s->cpu_slab);
4776 tid = READ_ONCE(c->tid);
4777
4778 /*
4779 * Irqless object alloc/free algorithm used here depends on sequence
4780 * of fetching cpu_slab's data. tid should be fetched before anything
4781 * on c to guarantee that object and slab associated with previous tid
4782 * won't be used with current tid. If we fetch tid first, object and
4783 * slab could be one associated with next tid and our alloc/free
4784 * request will be failed. In this case, we will retry. So, no problem.
4785 */
4786 barrier();
4787
4788 /*
4789 * The transaction ids are globally unique per cpu and per operation on
4790 * a per cpu queue. Thus it can be guaranteed that the cmpxchg_double
4791 * occurs on the right processor and that there was no operation on the
4792 * linked list in between.
4793 */
4794
4795 object = c->freelist;
4796 slab = c->slab;
4797
4798 #ifdef CONFIG_NUMA
4799 if (static_branch_unlikely(&strict_numa) &&
4800 node == NUMA_NO_NODE) {
4801
4802 struct mempolicy *mpol = current->mempolicy;
4803
4804 if (mpol) {
4805 /*
4806 * Special BIND rule support. If existing slab
4807 * is in permitted set then do not redirect
4808 * to a particular node.
4809 * Otherwise we apply the memory policy to get
4810 * the node we need to allocate on.
4811 */
4812 if (mpol->mode != MPOL_BIND || !slab ||
4813 !node_isset(slab_nid(slab), mpol->nodes))
4814
4815 node = mempolicy_slab_node();
4816 }
4817 }
4818 #endif
4819
4820 if (!USE_LOCKLESS_FAST_PATH() ||
4821 unlikely(!object || !slab || !node_match(slab, node))) {
4822 object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
4823 } else {
4824 void *next_object = get_freepointer_safe(s, object);
4825
4826 /*
4827 * The cmpxchg will only match if there was no additional
4828 * operation and if we are on the right processor.
4829 *
4830 * The cmpxchg does the following atomically (without lock
4831 * semantics!)
4832 * 1. Relocate first pointer to the current per cpu area.
4833 * 2. Verify that tid and freelist have not been changed
4834 * 3. If they were not changed replace tid and freelist
4835 *
4836 * Since this is without lock semantics the protection is only
4837 * against code executing on this cpu *not* from access by
4838 * other cpus.
4839 */
4840 if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
4841 note_cmpxchg_failure("slab_alloc", s, tid);
4842 goto redo;
4843 }
4844 prefetch_freepointer(s, next_object);
4845 stat(s, ALLOC_FASTPATH);
4846 }
4847
4848 return object;
4849 }
4850 #else /* CONFIG_SLUB_TINY */
4851 static void *__slab_alloc_node(struct kmem_cache *s,
4852 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
4853 {
4854 struct partial_context pc;
4855 struct slab *slab;
4856 void *object;
4857
4858 pc.flags = gfpflags;
4859 pc.orig_size = orig_size;
4860 slab = get_partial(s, node, &pc);
4861
4862 if (slab)
4863 return pc.object;
4864
4865 slab = new_slab(s, gfpflags, node);
4866 if (unlikely(!slab)) {
4867 slab_out_of_memory(s, gfpflags, node);
4868 return NULL;
4869 }
4870
4871 object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
4872
4873 return object;
4874 }
4875 #endif /* CONFIG_SLUB_TINY */
4876
4877 /*
4878 * If the object has been wiped upon free, make sure it's fully initialized by
4879 * zeroing out the freelist pointer.
4880 *
4881 * Note that we also wipe custom freelist pointers.
4882 */
4883 static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
4884 void *obj)
4885 {
4886 if (unlikely(slab_want_init_on_free(s)) && obj &&
4887 !freeptr_outside_object(s))
4888 memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
4889 0, sizeof(void *));
4890 }
4891
4892 static __fastpath_inline
4893 struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
4894 {
4895 flags &= gfp_allowed_mask;
4896
4897 might_alloc(flags);
4898
4899 if (unlikely(should_failslab(s, flags)))
4900 return NULL;
4901
4902 return s;
4903 }
4904
4905 static __fastpath_inline
4906 bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
4907 gfp_t flags, size_t size, void **p, bool init,
4908 unsigned int orig_size)
4909 {
4910 unsigned int zero_size = s->object_size;
4911 bool kasan_init = init;
4912 size_t i;
4913 gfp_t init_flags = flags & gfp_allowed_mask;
4914
4915 /*
4916 * For a kmalloc object, the allocated memory size (object_size) is likely
4917 * larger than the requested size (orig_size). If redzone check is
4918 * enabled for the extra space, don't zero it, as it will be redzoned
4919 * soon. The redzone operation for this extra space could be seen as a
4920 * replacement of current poisoning under certain debug option, and
4921 * won't break other sanity checks.
4922 */
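	/*
	 * Editor's example (assumed sizes): a kmalloc(20) served from the
	 * kmalloc-32 cache has object_size == 32 and orig_size == 20; with
	 * SLAB_STORE_USER or SLAB_RED_ZONE enabled only the first 20 bytes
	 * are zeroed here and the remaining 12 bytes are covered by the
	 * redzone instead.
	 */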
4923 if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
4924 (s->flags & SLAB_KMALLOC))
4925 zero_size = orig_size;
4926
4927 /*
4928 * When slab_debug is enabled, avoid memory initialization integrated
4929 * into KASAN and instead zero out the memory via the memset below with
4930 * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
4931 * cause false-positive reports. This does not lead to a performance
4932 * penalty on production builds, as slab_debug is not intended to be
4933 * enabled there.
4934 */
4935 if (__slub_debug_enabled())
4936 kasan_init = false;
4937
4938 /*
4939 * As memory initialization might be integrated into KASAN,
4940 * kasan_slab_alloc and initialization memset must be
4941 * kept together to avoid discrepancies in behavior.
4942 *
4943 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
4944 */
4945 for (i = 0; i < size; i++) {
4946 p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
4947 if (p[i] && init && (!kasan_init ||
4948 !kasan_has_integrated_init()))
4949 memset(p[i], 0, zero_size);
4950 if (gfpflags_allow_spinning(flags))
4951 kmemleak_alloc_recursive(p[i], s->object_size, 1,
4952 s->flags, init_flags);
4953 kmsan_slab_alloc(s, p[i], init_flags);
4954 alloc_tagging_slab_alloc_hook(s, p[i], flags);
4955 }
4956
4957 return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
4958 }
4959
4960 /*
4961 * Replace the empty main sheaf with a (at least partially) full sheaf.
4962 *
4963 * Must be called with the cpu_sheaves local lock locked. If successful, returns
4964 * the pcs pointer and the local lock locked (possibly on a different cpu than
4965 * initially called). If not successful, returns NULL and the local lock
4966 * unlocked.
4967 */
4968 static struct slub_percpu_sheaves *
4969 __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
4970 {
4971 struct slab_sheaf *empty = NULL;
4972 struct slab_sheaf *full;
4973 struct node_barn *barn;
4974 bool can_alloc;
4975
4976 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
4977
4978 if (pcs->spare && pcs->spare->size > 0) {
4979 swap(pcs->main, pcs->spare);
4980 return pcs;
4981 }
4982
4983 barn = get_barn(s);
4984
4985 full = barn_replace_empty_sheaf(barn, pcs->main);
4986
4987 if (full) {
4988 stat(s, BARN_GET);
4989 pcs->main = full;
4990 return pcs;
4991 }
4992
4993 stat(s, BARN_GET_FAIL);
4994
4995 can_alloc = gfpflags_allow_blocking(gfp);
4996
4997 if (can_alloc) {
4998 if (pcs->spare) {
4999 empty = pcs->spare;
5000 pcs->spare = NULL;
5001 } else {
5002 empty = barn_get_empty_sheaf(barn);
5003 }
5004 }
5005
5006 local_unlock(&s->cpu_sheaves->lock);
5007
5008 if (!can_alloc)
5009 return NULL;
5010
5011 if (empty) {
5012 if (!refill_sheaf(s, empty, gfp)) {
5013 full = empty;
5014 } else {
5015 /*
5016 * we must be very low on memory so don't bother
5017 * with the barn
5018 */
5019 free_empty_sheaf(s, empty);
5020 }
5021 } else {
5022 full = alloc_full_sheaf(s, gfp);
5023 }
5024
5025 if (!full)
5026 return NULL;
5027
5028 /*
5029 * We can reach here only when gfpflags_allow_blocking() is true,
5030 * so this must not be irq context.
5031 */
5032 local_lock(&s->cpu_sheaves->lock);
5033 pcs = this_cpu_ptr(s->cpu_sheaves);
5034
5035 /*
5036 * If we are returning empty sheaf, we either got it from the
5037 * barn or had to allocate one. If we are returning a full
5038 * sheaf, it's due to racing or being migrated to a different
5039 * cpu. Breaching the barn's sheaf limits should be thus rare
5040 * enough so just ignore them to simplify the recovery.
5041 */
5042
5043 if (pcs->main->size == 0) {
5044 barn_put_empty_sheaf(barn, pcs->main);
5045 pcs->main = full;
5046 return pcs;
5047 }
5048
5049 if (!pcs->spare) {
5050 pcs->spare = full;
5051 return pcs;
5052 }
5053
5054 if (pcs->spare->size == 0) {
5055 barn_put_empty_sheaf(barn, pcs->spare);
5056 pcs->spare = full;
5057 return pcs;
5058 }
5059
5060 barn_put_full_sheaf(barn, full);
5061 stat(s, BARN_PUT);
5062
5063 return pcs;
5064 }
5065
5066 static __fastpath_inline
5067 void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
5068 {
5069 struct slub_percpu_sheaves *pcs;
5070 bool node_requested;
5071 void *object;
5072
5073 #ifdef CONFIG_NUMA
5074 if (static_branch_unlikely(&strict_numa) &&
5075 node == NUMA_NO_NODE) {
5076
5077 struct mempolicy *mpol = current->mempolicy;
5078
5079 if (mpol) {
5080 /*
5081 * Special BIND rule support. If the local node
5082 * is in permitted set then do not redirect
5083 * to a particular node.
5084 * Otherwise we apply the memory policy to get
5085 * the node we need to allocate on.
5086 */
5087 if (mpol->mode != MPOL_BIND ||
5088 !node_isset(numa_mem_id(), mpol->nodes))
5089
5090 node = mempolicy_slab_node();
5091 }
5092 }
5093 #endif
5094
5095 node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;
5096
5097 /*
5098 * We assume the percpu sheaves contain only local objects although it's
5099 * not completely guaranteed, so we verify later.
5100 */
5101 if (unlikely(node_requested && node != numa_mem_id()))
5102 return NULL;
5103
5104 if (!local_trylock(&s->cpu_sheaves->lock))
5105 return NULL;
5106
5107 pcs = this_cpu_ptr(s->cpu_sheaves);
5108
5109 if (unlikely(pcs->main->size == 0)) {
5110 pcs = __pcs_replace_empty_main(s, pcs, gfp);
5111 if (unlikely(!pcs))
5112 return NULL;
5113 }
5114
5115 object = pcs->main->objects[pcs->main->size - 1];
5116
5117 if (unlikely(node_requested)) {
5118 /*
5119 * Verify that the object was from the node we want. This could
5120 * be false because of cpu migration during an unlocked part of
5121 * the current allocation or previous freeing process.
5122 */
5123 if (folio_nid(virt_to_folio(object)) != node) {
5124 local_unlock(&s->cpu_sheaves->lock);
5125 return NULL;
5126 }
5127 }
5128
5129 pcs->main->size--;
5130
5131 local_unlock(&s->cpu_sheaves->lock);
5132
5133 stat(s, ALLOC_PCS);
5134
5135 return object;
5136 }
5137
5138 static __fastpath_inline
5139 unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
5140 {
5141 struct slub_percpu_sheaves *pcs;
5142 struct slab_sheaf *main;
5143 unsigned int allocated = 0;
5144 unsigned int batch;
5145
5146 next_batch:
5147 if (!local_trylock(&s->cpu_sheaves->lock))
5148 return allocated;
5149
5150 pcs = this_cpu_ptr(s->cpu_sheaves);
5151
5152 if (unlikely(pcs->main->size == 0)) {
5153
5154 struct slab_sheaf *full;
5155
5156 if (pcs->spare && pcs->spare->size > 0) {
5157 swap(pcs->main, pcs->spare);
5158 goto do_alloc;
5159 }
5160
5161 full = barn_replace_empty_sheaf(get_barn(s), pcs->main);
5162
5163 if (full) {
5164 stat(s, BARN_GET);
5165 pcs->main = full;
5166 goto do_alloc;
5167 }
5168
5169 stat(s, BARN_GET_FAIL);
5170
5171 local_unlock(&s->cpu_sheaves->lock);
5172
5173 /*
5174 * Once full sheaves in barn are depleted, let the bulk
5175 * allocation continue from slab pages, otherwise we would just
5176 * be copying arrays of pointers twice.
5177 */
5178 return allocated;
5179 }
5180
5181 do_alloc:
5182
5183 main = pcs->main;
5184 batch = min(size, main->size);
5185
5186 main->size -= batch;
5187 memcpy(p, main->objects + main->size, batch * sizeof(void *));
5188
5189 local_unlock(&s->cpu_sheaves->lock);
5190
5191 stat_add(s, ALLOC_PCS, batch);
5192
5193 allocated += batch;
5194
5195 if (batch < size) {
5196 p += batch;
5197 size -= batch;
5198 goto next_batch;
5199 }
5200
5201 return allocated;
5202 }
5203
5204
5205 /*
5206 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
5207 * have the fastpath folded into their functions. So no function call
5208 * overhead for requests that can be satisfied on the fastpath.
5209 *
5210 * The fastpath works by first checking if the lockless freelist can be used.
5211 * If not then __slab_alloc is called for slow processing.
5212 *
5213 * Otherwise we can simply pick the next object from the lockless free list.
5214 */
5215 static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
5216 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
5217 {
5218 void *object;
5219 bool init = false;
5220
5221 s = slab_pre_alloc_hook(s, gfpflags);
5222 if (unlikely(!s))
5223 return NULL;
5224
5225 object = kfence_alloc(s, orig_size, gfpflags);
5226 if (unlikely(object))
5227 goto out;
5228
5229 if (s->cpu_sheaves)
5230 object = alloc_from_pcs(s, gfpflags, node);
5231
5232 if (!object)
5233 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
5234
5235 maybe_wipe_obj_freeptr(s, object);
5236 init = slab_want_init_on_alloc(gfpflags, s);
5237
5238 out:
5239 /*
5240 * When init equals 'true', like for kzalloc() family, only
5241 * @orig_size bytes might be zeroed instead of s->object_size.
5242 * In case this fails due to memcg_slab_post_alloc_hook(),
5243 * object is set to NULL
5244 */
5245 slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
5246
5247 return object;
5248 }
5249
5250 void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
5251 {
5252 void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
5253 s->object_size);
5254
5255 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
5256
5257 return ret;
5258 }
5259 EXPORT_SYMBOL(kmem_cache_alloc_noprof);
5260
5261 void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
5262 gfp_t gfpflags)
5263 {
5264 void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
5265 s->object_size);
5266
5267 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
5268
5269 return ret;
5270 }
5271 EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
5272
5273 bool kmem_cache_charge(void *objp, gfp_t gfpflags)
5274 {
5275 if (!memcg_kmem_online())
5276 return true;
5277
5278 return memcg_slab_post_charge(objp, gfpflags);
5279 }
5280 EXPORT_SYMBOL(kmem_cache_charge);
5281
5282 /**
5283 * kmem_cache_alloc_node - Allocate an object on the specified node
5284 * @s: The cache to allocate from.
5285 * @gfpflags: See kmalloc().
5286 * @node: node number of the target node.
5287 *
5288 * Identical to kmem_cache_alloc but it will allocate memory on the given
5289 * node, which can improve the performance for cpu bound structures.
5290 *
5291 * Fallback to other node is possible if __GFP_THISNODE is not set.
5292 *
5293 * Return: pointer to the new object or %NULL in case of error
5294 */
5295 void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
5296 {
5297 void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
5298
5299 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
5300
5301 return ret;
5302 }
5303 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
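
/*
 * Illustrative usage sketch, not part of the original source: the cache,
 * structure and helper names below are hypothetical, and kmem_cache_alloc_node()
 * is the plain wrapper around the _noprof variant above.
 *
 *	struct my_item *item;
 *
 *	item = kmem_cache_alloc_node(my_item_cache, GFP_KERNEL,
 *				     cpu_to_node(target_cpu));
 *	if (!item)
 *		return -ENOMEM;
 *
 * Without __GFP_THISNODE the allocation may still fall back to another node.
 */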
5304
5305 /*
5306 * Return a sheaf prefilled with at least the requested number of objects.
5307 * When prefilling is needed, it is done with the given gfp flags.
5308 *
5309 * Return NULL if the sheaf allocation or the prefilling failed.
5310 */
5311 struct slab_sheaf *
5312 kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
5313 {
5314 struct slub_percpu_sheaves *pcs;
5315 struct slab_sheaf *sheaf = NULL;
5316
5317 if (unlikely(size > s->sheaf_capacity)) {
5318
5319 /*
5320 * slab_debug disables cpu sheaves intentionally so all
5321 * prefilled sheaves become "oversize" and we give up on
5322 * performance for the debugging. Same with SLUB_TINY.
5323 * Creating a cache without sheaves and then requesting a
5324 * prefilled sheaf is however not expected, so warn.
5325 */
5326 WARN_ON_ONCE(s->sheaf_capacity == 0 &&
5327 !IS_ENABLED(CONFIG_SLUB_TINY) &&
5328 !(s->flags & SLAB_DEBUG_FLAGS));
5329
5330 sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
5331 if (!sheaf)
5332 return NULL;
5333
5334 stat(s, SHEAF_PREFILL_OVERSIZE);
5335 sheaf->cache = s;
5336 sheaf->capacity = size;
5337
5338 if (!__kmem_cache_alloc_bulk(s, gfp, size,
5339 &sheaf->objects[0])) {
5340 kfree(sheaf);
5341 return NULL;
5342 }
5343
5344 sheaf->size = size;
5345
5346 return sheaf;
5347 }
5348
5349 local_lock(&s->cpu_sheaves->lock);
5350 pcs = this_cpu_ptr(s->cpu_sheaves);
5351
5352 if (pcs->spare) {
5353 sheaf = pcs->spare;
5354 pcs->spare = NULL;
5355 stat(s, SHEAF_PREFILL_FAST);
5356 } else {
5357 stat(s, SHEAF_PREFILL_SLOW);
5358 sheaf = barn_get_full_or_empty_sheaf(get_barn(s));
5359 if (sheaf && sheaf->size)
5360 stat(s, BARN_GET);
5361 else
5362 stat(s, BARN_GET_FAIL);
5363 }
5364
5365 local_unlock(&s->cpu_sheaves->lock);
5366
5367
5368 if (!sheaf)
5369 sheaf = alloc_empty_sheaf(s, gfp);
5370
5371 if (sheaf && sheaf->size < size) {
5372 if (refill_sheaf(s, sheaf, gfp)) {
5373 sheaf_flush_unused(s, sheaf);
5374 free_empty_sheaf(s, sheaf);
5375 sheaf = NULL;
5376 }
5377 }
5378
5379 if (sheaf)
5380 sheaf->capacity = s->sheaf_capacity;
5381
5382 return sheaf;
5383 }
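
/*
 * Illustrative workflow sketch, not part of the original source. The cache
 * name, object count and helpers are hypothetical, and the
 * kmem_cache_alloc_from_sheaf() wrapper is assumed to be the non-_noprof
 * variant declared in slab.h.
 *
 *	struct slab_sheaf *sheaf;
 *	void *obj;
 *
 *	sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, 16);
 *	if (!sheaf)
 *		return -ENOMEM;
 *
 *	while (need_more_objects()) {
 *		obj = kmem_cache_alloc_from_sheaf(my_cache, GFP_KERNEL, sheaf);
 *		if (!obj)
 *			break;
 *		use(obj);
 *	}
 *
 *	kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);
 *
 * A NULL object above means the sheaf was exhausted; there is no fallback to
 * the slab cache itself.
 */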
5384
5385 /*
5386 * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
5387 *
5388 * If the sheaf cannot simply become the percpu spare sheaf, but there's space
5389 * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
5390 * sheaf_capacity to avoid handling partially full sheaves.
5391 *
5392 * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
5393 * sheaf is instead flushed and freed.
5394 */
5395 void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
5396 struct slab_sheaf *sheaf)
5397 {
5398 struct slub_percpu_sheaves *pcs;
5399 struct node_barn *barn;
5400
5401 if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
5402 sheaf_flush_unused(s, sheaf);
5403 kfree(sheaf);
5404 return;
5405 }
5406
5407 local_lock(&s->cpu_sheaves->lock);
5408 pcs = this_cpu_ptr(s->cpu_sheaves);
5409 barn = get_barn(s);
5410
5411 if (!pcs->spare) {
5412 pcs->spare = sheaf;
5413 sheaf = NULL;
5414 stat(s, SHEAF_RETURN_FAST);
5415 }
5416
5417 local_unlock(&s->cpu_sheaves->lock);
5418
5419 if (!sheaf)
5420 return;
5421
5422 stat(s, SHEAF_RETURN_SLOW);
5423
5424 /*
5425 * If the barn has too many full sheaves or we fail to refill the sheaf,
5426 * simply flush and free it.
5427 */
5428 if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
5429 refill_sheaf(s, sheaf, gfp)) {
5430 sheaf_flush_unused(s, sheaf);
5431 free_empty_sheaf(s, sheaf);
5432 return;
5433 }
5434
5435 barn_put_full_sheaf(barn, sheaf);
5436 stat(s, BARN_PUT);
5437 }
5438
5439 /*
5440 * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
5441 * the given size
5442 *
5443 * The sheaf might be replaced by a new one when requesting more than
5444 * s->sheaf_capacity objects. If such a replacement is necessary but the
5445 * refill fails (returning -ENOMEM), the existing sheaf is left intact.
5446 *
5447 * In practice we always refill to the sheaf's full capacity.
5448 */
5449 int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
5450 struct slab_sheaf **sheafp, unsigned int size)
5451 {
5452 struct slab_sheaf *sheaf;
5453
5454 /*
5455 * TODO: do we want to support *sheaf == NULL to be equivalent of
5456 * kmem_cache_prefill_sheaf() ?
5457 */
5458 if (!sheafp || !(*sheafp))
5459 return -EINVAL;
5460
5461 sheaf = *sheafp;
5462 if (sheaf->size >= size)
5463 return 0;
5464
5465 if (likely(sheaf->capacity >= size)) {
5466 if (likely(sheaf->capacity == s->sheaf_capacity))
5467 return refill_sheaf(s, sheaf, gfp);
5468
5469 if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
5470 &sheaf->objects[sheaf->size])) {
5471 return -ENOMEM;
5472 }
5473 sheaf->size = sheaf->capacity;
5474
5475 return 0;
5476 }
5477
5478 /*
5479 * We had a regular sized sheaf and need an oversize one, or we had an
5480 * oversize one already but need a larger one now.
5481 * This should be a very rare path so let's not complicate it.
5482 */
5483 sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
5484 if (!sheaf)
5485 return -ENOMEM;
5486
5487 kmem_cache_return_sheaf(s, gfp, *sheafp);
5488 *sheafp = sheaf;
5489 return 0;
5490 }
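
/*
 * Illustrative sketch, not part of the original source: growing a previously
 * prefilled sheaf when more objects turn out to be needed (the names are
 * hypothetical).
 *
 *	if (kmem_cache_refill_sheaf(my_cache, GFP_KERNEL, &sheaf, 32))
 *		return -ENOMEM;
 *
 * On failure the existing sheaf stays usable; on success the sheaf pointer
 * may have been replaced if an oversize sheaf was required.
 */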
5491
5492 /*
5493 * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
5494 *
5495 * Guaranteed not to fail for as many allocations as the requested prefill size.
5496 * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
5497 *
5498 * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT;
5499 * memcg charging is forced over the limit if necessary, to avoid failure.
5500 */
5501 void *
5502 kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
5503 struct slab_sheaf *sheaf)
5504 {
5505 void *ret = NULL;
5506 bool init;
5507
5508 if (sheaf->size == 0)
5509 goto out;
5510
5511 ret = sheaf->objects[--sheaf->size];
5512
5513 init = slab_want_init_on_alloc(gfp, s);
5514
5515 /* add __GFP_NOFAIL to force successful memcg charging */
5516 slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
5517 out:
5518 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
5519
5520 return ret;
5521 }
5522
5523 unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
5524 {
5525 return sheaf->size;
5526 }
5527 /*
5528 * To avoid unnecessary overhead, we pass through large allocation requests
5529 * directly to the page allocator. We use __GFP_COMP, because we will need to
5530 * know the allocation order to free the pages properly in kfree.
5531 */
5532 static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
5533 {
5534 struct folio *folio;
5535 void *ptr = NULL;
5536 unsigned int order = get_order(size);
5537
5538 if (unlikely(flags & GFP_SLAB_BUG_MASK))
5539 flags = kmalloc_fix_flags(flags);
5540
5541 flags |= __GFP_COMP;
5542
5543 if (node == NUMA_NO_NODE)
5544 folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
5545 else
5546 folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL);
5547
5548 if (folio) {
5549 ptr = folio_address(folio);
5550 lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
5551 PAGE_SIZE << order);
5552 __folio_set_large_kmalloc(folio);
5553 }
5554
5555 ptr = kasan_kmalloc_large(ptr, size, flags);
5556 /* As ptr might get tagged, call kmemleak hook after KASAN. */
5557 kmemleak_alloc(ptr, size, 1, flags);
5558 kmsan_kmalloc_large(ptr, size, flags);
5559
5560 return ptr;
5561 }
5562
5563 void *__kmalloc_large_noprof(size_t size, gfp_t flags)
5564 {
5565 void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
5566
5567 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
5568 flags, NUMA_NO_NODE);
5569 return ret;
5570 }
5571 EXPORT_SYMBOL(__kmalloc_large_noprof);
5572
5573 void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
5574 {
5575 void *ret = ___kmalloc_large_node(size, flags, node);
5576
5577 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
5578 flags, node);
5579 return ret;
5580 }
5581 EXPORT_SYMBOL(__kmalloc_large_node_noprof);
5582
5583 static __always_inline
5584 void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
5585 unsigned long caller)
5586 {
5587 struct kmem_cache *s;
5588 void *ret;
5589
5590 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
5591 ret = __kmalloc_large_node_noprof(size, flags, node);
5592 trace_kmalloc(caller, ret, size,
5593 PAGE_SIZE << get_order(size), flags, node);
5594 return ret;
5595 }
5596
5597 if (unlikely(!size))
5598 return ZERO_SIZE_PTR;
5599
5600 s = kmalloc_slab(size, b, flags, caller);
5601
5602 ret = slab_alloc_node(s, NULL, flags, node, caller, size);
5603 ret = kasan_kmalloc(s, ret, size, flags);
5604 trace_kmalloc(caller, ret, size, s->size, flags, node);
5605 return ret;
5606 }
5607 void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
5608 {
5609 return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
5610 }
5611 EXPORT_SYMBOL(__kmalloc_node_noprof);
5612
5613 void *__kmalloc_noprof(size_t size, gfp_t flags)
5614 {
5615 return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
5616 }
5617 EXPORT_SYMBOL(__kmalloc_noprof);
5618
5619 /**
5620 * kmalloc_nolock - Allocate an object of given size from any context.
5621 * @size: size to allocate
5622 * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
5623 * allowed.
5624 * @node: node number of the target node.
5625 *
5626 * Return: pointer to the new object or NULL in case of error.
5627 * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
5628 * There is no reason to call it again and expect !NULL.
5629 */
5630 void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
5631 {
5632 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
5633 struct kmem_cache *s;
5634 bool can_retry = true;
5635 void *ret = ERR_PTR(-EBUSY);
5636
5637 VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
5638 __GFP_NO_OBJ_EXT));
5639
5640 if (unlikely(!size))
5641 return ZERO_SIZE_PTR;
5642
5643 if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
5644 /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
5645 return NULL;
5646 retry:
5647 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
5648 return NULL;
5649 s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
5650
5651 if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
5652 /*
5653 * kmalloc_nolock() is not supported on architectures that
5654 * don't implement cmpxchg16b, but debug caches don't use
5655 * per-cpu slab and per-cpu partial slabs. They rely on
5656 * kmem_cache_node->list_lock, so kmalloc_nolock() can
5657 * attempt to allocate from debug caches by
5658 * spin_trylock_irqsave(&n->list_lock, ...)
5659 */
5660 return NULL;
5661
5662 /*
5663 * Do not call slab_alloc_node(), since trylock mode isn't
5664 * compatible with slab_pre_alloc_hook/should_failslab and
5665 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
5666 * and slab_post_alloc_hook() directly.
5667 *
5668 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
5669 * in irq saved region. It assumes that the same cpu will not
5670 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
5671 * Therefore use in_nmi() to check whether particular bucket is in
5672 * irq protected section.
5673 *
5674 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
5675 * this cpu was interrupted somewhere inside ___slab_alloc() after
5676 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
5677 * In this case fast path with __update_cpu_freelist_fast() is not safe.
5678 */
5679 #ifndef CONFIG_SLUB_TINY
5680 if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
5681 #endif
5682 ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
5683
5684 if (PTR_ERR(ret) == -EBUSY) {
5685 if (can_retry) {
5686 /* pick the next kmalloc bucket */
5687 size = s->object_size + 1;
5688 /*
5689 * Another alternative is to
5690 * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
5691 * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
5692 * to retry from bucket of the same size.
5693 */
5694 can_retry = false;
5695 goto retry;
5696 }
5697 ret = NULL;
5698 }
5699
5700 maybe_wipe_obj_freeptr(s, ret);
5701 slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
5702 slab_want_init_on_alloc(alloc_gfp, s), size);
5703
5704 ret = kasan_kmalloc(s, ret, size, alloc_gfp);
5705 return ret;
5706 }
5707 EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
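
/*
 * Illustrative sketch, not part of the original source: allocating from a
 * context that cannot take sleeping locks or even spin, such as a tracing or
 * NMI handler. The structure name is hypothetical and kmalloc_nolock() is
 * assumed to be the plain wrapper around the _noprof variant above.
 *
 *	struct my_event *ev;
 *
 *	ev = kmalloc_nolock(sizeof(*ev), __GFP_ZERO, NUMA_NO_NODE);
 *	if (!ev)
 *		return;
 *	...
 *	kfree_nolock(ev);
 *
 * A NULL return means ENOMEM and retrying will not help; such objects pair
 * with kfree_nolock() for freeing from similarly constrained contexts.
 */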
5708
5709 void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
5710 int node, unsigned long caller)
5711 {
5712 return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
5713
5714 }
5715 EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
5716
5717 void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
5718 {
5719 void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
5720 _RET_IP_, size);
5721
5722 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
5723
5724 ret = kasan_kmalloc(s, ret, size, gfpflags);
5725 return ret;
5726 }
5727 EXPORT_SYMBOL(__kmalloc_cache_noprof);
5728
5729 void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
5730 int node, size_t size)
5731 {
5732 void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
5733
5734 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
5735
5736 ret = kasan_kmalloc(s, ret, size, gfpflags);
5737 return ret;
5738 }
5739 EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
5740
5741 static noinline void free_to_partial_list(
5742 struct kmem_cache *s, struct slab *slab,
5743 void *head, void *tail, int bulk_cnt,
5744 unsigned long addr)
5745 {
5746 struct kmem_cache_node *n = get_node(s, slab_nid(slab));
5747 struct slab *slab_free = NULL;
5748 int cnt = bulk_cnt;
5749 unsigned long flags;
5750 depot_stack_handle_t handle = 0;
5751
5752 /*
5753 * We cannot use GFP_NOWAIT as there are callsites where waking up
5754 * kswapd could deadlock
5755 */
5756 if (s->flags & SLAB_STORE_USER)
5757 handle = set_track_prepare(__GFP_NOWARN);
5758
5759 spin_lock_irqsave(&n->list_lock, flags);
5760
5761 if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
5762 void *prior = slab->freelist;
5763
5764 /* Perform the actual freeing while we still hold the locks */
5765 slab->inuse -= cnt;
5766 set_freepointer(s, tail, prior);
5767 slab->freelist = head;
5768
5769 /*
5770 * If the slab is empty and the node's partial list is full,
5771 * it should be discarded anyway, no matter whether it is on the
5772 * full or the partial list.
5773 */
5774 if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
5775 slab_free = slab;
5776
5777 if (!prior) {
5778 /* was on full list */
5779 remove_full(s, n, slab);
5780 if (!slab_free) {
5781 add_partial(n, slab, DEACTIVATE_TO_TAIL);
5782 stat(s, FREE_ADD_PARTIAL);
5783 }
5784 } else if (slab_free) {
5785 remove_partial(n, slab);
5786 stat(s, FREE_REMOVE_PARTIAL);
5787 }
5788 }
5789
5790 if (slab_free) {
5791 /*
5792 * Update the counters while still holding n->list_lock to
5793 * prevent spurious validation warnings
5794 */
5795 dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
5796 }
5797
5798 spin_unlock_irqrestore(&n->list_lock, flags);
5799
5800 if (slab_free) {
5801 stat(s, FREE_SLAB);
5802 free_slab(s, slab_free);
5803 }
5804 }
5805
5806 /*
5807 * Slow path handling. This may still be called frequently since objects
5808 * have a longer lifetime than the cpu slabs in most processing loads.
5809 *
5810 * So we still attempt to reduce cache line usage. Just take the slab
5811 * lock and free the item. If there is no additional partial slab
5812 * handling required then we can return immediately.
5813 */
5814 static void __slab_free(struct kmem_cache *s, struct slab *slab,
5815 void *head, void *tail, int cnt,
5816 unsigned long addr)
5817
5818 {
5819 void *prior;
5820 int was_frozen;
5821 struct slab new;
5822 unsigned long counters;
5823 struct kmem_cache_node *n = NULL;
5824 unsigned long flags;
5825 bool on_node_partial;
5826
5827 stat(s, FREE_SLOWPATH);
5828
5829 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
5830 free_to_partial_list(s, slab, head, tail, cnt, addr);
5831 return;
5832 }
5833
5834 do {
5835 if (unlikely(n)) {
5836 spin_unlock_irqrestore(&n->list_lock, flags);
5837 n = NULL;
5838 }
5839 prior = slab->freelist;
5840 counters = slab->counters;
5841 set_freepointer(s, tail, prior);
5842 new.counters = counters;
5843 was_frozen = new.frozen;
5844 new.inuse -= cnt;
5845 if ((!new.inuse || !prior) && !was_frozen) {
5846 /* Needs to be taken off a list */
5847 if (!kmem_cache_has_cpu_partial(s) || prior) {
5848
5849 n = get_node(s, slab_nid(slab));
5850 /*
5851 * Speculatively acquire the list_lock.
5852 * If the cmpxchg does not succeed then we may
5853 * drop the list_lock without any processing.
5854 *
5855 * Otherwise the list_lock will synchronize with
5856 * other processors updating the list of slabs.
5857 */
5858 spin_lock_irqsave(&n->list_lock, flags);
5859
5860 on_node_partial = slab_test_node_partial(slab);
5861 }
5862 }
5863
5864 } while (!slab_update_freelist(s, slab,
5865 prior, counters,
5866 head, new.counters,
5867 "__slab_free"));
5868
5869 if (likely(!n)) {
5870
5871 if (likely(was_frozen)) {
5872 /*
5873 * The list lock was not taken therefore no list
5874 * activity can be necessary.
5875 */
5876 stat(s, FREE_FROZEN);
5877 } else if (kmem_cache_has_cpu_partial(s) && !prior) {
5878 /*
5879 * If we started with a full slab then put it onto the
5880 * per cpu partial list.
5881 */
5882 put_cpu_partial(s, slab, 1);
5883 stat(s, CPU_PARTIAL_FREE);
5884 }
5885
5886 return;
5887 }
5888
5889 /*
5890 * This slab was partially empty but not on the per-node partial list,
5891 * in which case we shouldn't manipulate its list, just return.
5892 */
5893 if (prior && !on_node_partial) {
5894 spin_unlock_irqrestore(&n->list_lock, flags);
5895 return;
5896 }
5897
5898 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
5899 goto slab_empty;
5900
5901 /*
5902 * Objects left in the slab. If it was not on the partial list before
5903 * then add it.
5904 */
5905 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
5906 add_partial(n, slab, DEACTIVATE_TO_TAIL);
5907 stat(s, FREE_ADD_PARTIAL);
5908 }
5909 spin_unlock_irqrestore(&n->list_lock, flags);
5910 return;
5911
5912 slab_empty:
5913 if (prior) {
5914 /*
5915 * Slab on the partial list.
5916 */
5917 remove_partial(n, slab);
5918 stat(s, FREE_REMOVE_PARTIAL);
5919 }
5920
5921 spin_unlock_irqrestore(&n->list_lock, flags);
5922 stat(s, FREE_SLAB);
5923 discard_slab(s, slab);
5924 }
5925
5926 /*
5927 * pcs is locked. We should have gotten rid of the spare sheaf and obtained an
5928 * empty sheaf, while the main sheaf is full. We want to install the empty sheaf
5929 * as a main sheaf, and make the current main sheaf a spare sheaf.
5930 *
5931 * However due to having relinquished the cpu_sheaves lock when obtaining
5932 * the empty sheaf, we need to handle some unlikely but possible cases.
5933 *
5934 * If we put any sheaf to barn here, it's because we were interrupted or have
5935 * been migrated to a different cpu, which should be rare enough so just ignore
5936 * the barn's limits to simplify the handling.
5937 *
5938 * An alternative scenario that gets us here is when we fail
5939 * barn_replace_full_sheaf(), because there's no empty sheaf available in the
5940 * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
5941 * limit on full sheaves was not exceeded, we assume it didn't change and just
5942 * put the full sheaf there.
5943 */
5944 static void __pcs_install_empty_sheaf(struct kmem_cache *s,
5945 struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty)
5946 {
5947 struct node_barn *barn;
5948
5949 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
5950
5951 /* This is what we expect to find if nobody interrupted us. */
5952 if (likely(!pcs->spare)) {
5953 pcs->spare = pcs->main;
5954 pcs->main = empty;
5955 return;
5956 }
5957
5958 barn = get_barn(s);
5959
5960 /*
5961 * Unlikely because if the main sheaf had space, we would have just
5962 * freed to it. Get rid of our empty sheaf.
5963 */
5964 if (pcs->main->size < s->sheaf_capacity) {
5965 barn_put_empty_sheaf(barn, empty);
5966 return;
5967 }
5968
5969 /* Also unlikely for the same reason */
5970 if (pcs->spare->size < s->sheaf_capacity) {
5971 swap(pcs->main, pcs->spare);
5972 barn_put_empty_sheaf(barn, empty);
5973 return;
5974 }
5975
5976 /*
5977 * We probably failed barn_replace_full_sheaf() due to no empty sheaf
5978 * available there, but we allocated one, so finish the job.
5979 */
5980 barn_put_full_sheaf(barn, pcs->main);
5981 stat(s, BARN_PUT);
5982 pcs->main = empty;
5983 }
5984
5985 /*
5986 * Replace the full main sheaf with an (at least partially) empty sheaf.
5987 *
5988 * Must be called with the cpu_sheaves local lock locked. If successful, returns
5989 * the pcs pointer and the local lock locked (possibly on a different cpu than
5990 * initially called). If not successful, returns NULL and the local lock
5991 * unlocked.
5992 */
5993 static struct slub_percpu_sheaves *
5994 __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
5995 {
5996 struct slab_sheaf *empty;
5997 struct node_barn *barn;
5998 bool put_fail;
5999
6000 restart:
6001 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
6002
6003 barn = get_barn(s);
6004 put_fail = false;
6005
6006 if (!pcs->spare) {
6007 empty = barn_get_empty_sheaf(barn);
6008 if (empty) {
6009 pcs->spare = pcs->main;
6010 pcs->main = empty;
6011 return pcs;
6012 }
6013 goto alloc_empty;
6014 }
6015
6016 if (pcs->spare->size < s->sheaf_capacity) {
6017 swap(pcs->main, pcs->spare);
6018 return pcs;
6019 }
6020
6021 empty = barn_replace_full_sheaf(barn, pcs->main);
6022
6023 if (!IS_ERR(empty)) {
6024 stat(s, BARN_PUT);
6025 pcs->main = empty;
6026 return pcs;
6027 }
6028
6029 if (PTR_ERR(empty) == -E2BIG) {
6030 /* Since we got here, spare exists and is full */
6031 struct slab_sheaf *to_flush = pcs->spare;
6032
6033 stat(s, BARN_PUT_FAIL);
6034
6035 pcs->spare = NULL;
6036 local_unlock(&s->cpu_sheaves->lock);
6037
6038 sheaf_flush_unused(s, to_flush);
6039 empty = to_flush;
6040 goto got_empty;
6041 }
6042
6043 /*
6044 * We could not replace full sheaf because barn had no empty
6045 * sheaves. We can still allocate it and put the full sheaf in
6046 * __pcs_install_empty_sheaf(), but if we fail to allocate it,
6047 * make sure to count the fail.
6048 */
6049 put_fail = true;
6050
6051 alloc_empty:
6052 local_unlock(&s->cpu_sheaves->lock);
6053
6054 empty = alloc_empty_sheaf(s, GFP_NOWAIT);
6055 if (empty)
6056 goto got_empty;
6057
6058 if (put_fail)
6059 stat(s, BARN_PUT_FAIL);
6060
6061 if (!sheaf_flush_main(s))
6062 return NULL;
6063
6064 if (!local_trylock(&s->cpu_sheaves->lock))
6065 return NULL;
6066
6067 pcs = this_cpu_ptr(s->cpu_sheaves);
6068
6069 /*
6070 * we flushed the main sheaf so it should be empty now,
6071 * but in case we got preempted or migrated, we need to
6072 * check again
6073 */
6074 if (pcs->main->size == s->sheaf_capacity)
6075 goto restart;
6076
6077 return pcs;
6078
6079 got_empty:
6080 if (!local_trylock(&s->cpu_sheaves->lock)) {
6081 barn_put_empty_sheaf(barn, empty);
6082 return NULL;
6083 }
6084
6085 pcs = this_cpu_ptr(s->cpu_sheaves);
6086 __pcs_install_empty_sheaf(s, pcs, empty);
6087
6088 return pcs;
6089 }
6090
6091 /*
6092 * Free an object to the percpu sheaves.
6093 * The object is expected to have passed slab_free_hook() already.
6094 */
6095 static __fastpath_inline
6096 bool free_to_pcs(struct kmem_cache *s, void *object)
6097 {
6098 struct slub_percpu_sheaves *pcs;
6099
6100 if (!local_trylock(&s->cpu_sheaves->lock))
6101 return false;
6102
6103 pcs = this_cpu_ptr(s->cpu_sheaves);
6104
6105 if (unlikely(pcs->main->size == s->sheaf_capacity)) {
6106
6107 pcs = __pcs_replace_full_main(s, pcs);
6108 if (unlikely(!pcs))
6109 return false;
6110 }
6111
6112 pcs->main->objects[pcs->main->size++] = object;
6113
6114 local_unlock(&s->cpu_sheaves->lock);
6115
6116 stat(s, FREE_PCS);
6117
6118 return true;
6119 }
6120
6121 static void rcu_free_sheaf(struct rcu_head *head)
6122 {
6123 struct slab_sheaf *sheaf;
6124 struct node_barn *barn;
6125 struct kmem_cache *s;
6126
6127 sheaf = container_of(head, struct slab_sheaf, rcu_head);
6128
6129 s = sheaf->cache;
6130
6131 /*
6132 * This may remove some objects due to slab_free_hook() returning false,
6133 * so that the sheaf might no longer be completely full. But it's easier
6134 * to handle it as full (unless it became completely empty), as the code
6135 * handles it fine. The only downside is that the sheaf will serve fewer
6136 * allocations when reused. This only happens due to debugging, which is a
6137 * performance hit anyway.
6138 */
6139 __rcu_free_sheaf_prepare(s, sheaf);
6140
6141 barn = get_node(s, sheaf->node)->barn;
6142
6143 /* due to slab_free_hook() */
6144 if (unlikely(sheaf->size == 0))
6145 goto empty;
6146
6147 /*
6148 * Checking nr_full/nr_empty outside lock avoids contention in case the
6149 * barn is at the respective limit. Due to the race we might go over the
6150 * limit but that should be rare and harmless.
6151 */
6152
6153 if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
6154 stat(s, BARN_PUT);
6155 barn_put_full_sheaf(barn, sheaf);
6156 return;
6157 }
6158
6159 stat(s, BARN_PUT_FAIL);
6160 sheaf_flush_unused(s, sheaf);
6161
6162 empty:
6163 if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
6164 barn_put_empty_sheaf(barn, sheaf);
6165 return;
6166 }
6167
6168 free_empty_sheaf(s, sheaf);
6169 }
6170
6171 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
6172 {
6173 struct slub_percpu_sheaves *pcs;
6174 struct slab_sheaf *rcu_sheaf;
6175
6176 if (!local_trylock(&s->cpu_sheaves->lock))
6177 goto fail;
6178
6179 pcs = this_cpu_ptr(s->cpu_sheaves);
6180
6181 if (unlikely(!pcs->rcu_free)) {
6182
6183 struct slab_sheaf *empty;
6184 struct node_barn *barn;
6185
6186 if (pcs->spare && pcs->spare->size == 0) {
6187 pcs->rcu_free = pcs->spare;
6188 pcs->spare = NULL;
6189 goto do_free;
6190 }
6191
6192 barn = get_barn(s);
6193
6194 empty = barn_get_empty_sheaf(barn);
6195
6196 if (empty) {
6197 pcs->rcu_free = empty;
6198 goto do_free;
6199 }
6200
6201 local_unlock(&s->cpu_sheaves->lock);
6202
6203 empty = alloc_empty_sheaf(s, GFP_NOWAIT);
6204
6205 if (!empty)
6206 goto fail;
6207
6208 if (!local_trylock(&s->cpu_sheaves->lock)) {
6209 barn_put_empty_sheaf(barn, empty);
6210 goto fail;
6211 }
6212
6213 pcs = this_cpu_ptr(s->cpu_sheaves);
6214
6215 if (unlikely(pcs->rcu_free))
6216 barn_put_empty_sheaf(barn, empty);
6217 else
6218 pcs->rcu_free = empty;
6219 }
6220
6221 do_free:
6222
6223 rcu_sheaf = pcs->rcu_free;
6224
6225 /*
6226 * Since we flush immediately when size reaches capacity, we never reach
6227 * this with size already at capacity, so no OOB write is possible.
6228 */
6229 rcu_sheaf->objects[rcu_sheaf->size++] = obj;
6230
6231 if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
6232 rcu_sheaf = NULL;
6233 } else {
6234 pcs->rcu_free = NULL;
6235 rcu_sheaf->node = numa_mem_id();
6236 }
6237
6238 /*
6239 * we flush before local_unlock to make sure a racing
6240 * flush_all_rcu_sheaves() doesn't miss this sheaf
6241 */
6242 if (rcu_sheaf)
6243 call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
6244
6245 local_unlock(&s->cpu_sheaves->lock);
6246
6247 stat(s, FREE_RCU_SHEAF);
6248 return true;
6249
6250 fail:
6251 stat(s, FREE_RCU_SHEAF_FAIL);
6252 return false;
6253 }
6254
6255 /*
6256 * Bulk free objects to the percpu sheaves.
6257 * Unlike free_to_pcs() this includes the calls to all necessary hooks
6258 * and the fallback to freeing to slab pages.
6259 */
6260 static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
6261 {
6262 struct slub_percpu_sheaves *pcs;
6263 struct slab_sheaf *main, *empty;
6264 bool init = slab_want_init_on_free(s);
6265 unsigned int batch, i = 0;
6266 struct node_barn *barn;
6267 void *remote_objects[PCS_BATCH_MAX];
6268 unsigned int remote_nr = 0;
6269 int node = numa_mem_id();
6270
6271 next_remote_batch:
6272 while (i < size) {
6273 struct slab *slab = virt_to_slab(p[i]);
6274
6275 memcg_slab_free_hook(s, slab, p + i, 1);
6276 alloc_tagging_slab_free_hook(s, slab, p + i, 1);
6277
6278 if (unlikely(!slab_free_hook(s, p[i], init, false))) {
6279 p[i] = p[--size];
6280 if (!size)
6281 goto flush_remote;
6282 continue;
6283 }
6284
6285 if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
6286 remote_objects[remote_nr] = p[i];
6287 p[i] = p[--size];
6288 if (++remote_nr >= PCS_BATCH_MAX)
6289 goto flush_remote;
6290 continue;
6291 }
6292
6293 i++;
6294 }
6295
6296 next_batch:
6297 if (!local_trylock(&s->cpu_sheaves->lock))
6298 goto fallback;
6299
6300 pcs = this_cpu_ptr(s->cpu_sheaves);
6301
6302 if (likely(pcs->main->size < s->sheaf_capacity))
6303 goto do_free;
6304
6305 barn = get_barn(s);
6306
6307 if (!pcs->spare) {
6308 empty = barn_get_empty_sheaf(barn);
6309 if (!empty)
6310 goto no_empty;
6311
6312 pcs->spare = pcs->main;
6313 pcs->main = empty;
6314 goto do_free;
6315 }
6316
6317 if (pcs->spare->size < s->sheaf_capacity) {
6318 swap(pcs->main, pcs->spare);
6319 goto do_free;
6320 }
6321
6322 empty = barn_replace_full_sheaf(barn, pcs->main);
6323 if (IS_ERR(empty)) {
6324 stat(s, BARN_PUT_FAIL);
6325 goto no_empty;
6326 }
6327
6328 stat(s, BARN_PUT);
6329 pcs->main = empty;
6330
6331 do_free:
6332 main = pcs->main;
6333 batch = min(size, s->sheaf_capacity - main->size);
6334
6335 memcpy(main->objects + main->size, p, batch * sizeof(void *));
6336 main->size += batch;
6337
6338 local_unlock(&s->cpu_sheaves->lock);
6339
6340 stat_add(s, FREE_PCS, batch);
6341
6342 if (batch < size) {
6343 p += batch;
6344 size -= batch;
6345 goto next_batch;
6346 }
6347
6348 return;
6349
6350 no_empty:
6351 local_unlock(&s->cpu_sheaves->lock);
6352
6353 /*
6354 * if we depleted all empty sheaves in the barn or there are too
6355 * many full sheaves, free the rest to slab pages
6356 */
6357 fallback:
6358 __kmem_cache_free_bulk(s, size, p);
6359
6360 flush_remote:
6361 if (remote_nr) {
6362 __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
6363 if (i < size) {
6364 remote_nr = 0;
6365 goto next_remote_batch;
6366 }
6367 }
6368 }
6369
6370 struct defer_free {
6371 struct llist_head objects;
6372 struct llist_head slabs;
6373 struct irq_work work;
6374 };
6375
6376 static void free_deferred_objects(struct irq_work *work);
6377
6378 static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
6379 .objects = LLIST_HEAD_INIT(objects),
6380 .slabs = LLIST_HEAD_INIT(slabs),
6381 .work = IRQ_WORK_INIT(free_deferred_objects),
6382 };
6383
6384 /*
6385 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
6386 * to take sleeping spin_locks from __slab_free() and deactivate_slab().
6387 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
6388 */
6389 static void free_deferred_objects(struct irq_work *work)
6390 {
6391 struct defer_free *df = container_of(work, struct defer_free, work);
6392 struct llist_head *objs = &df->objects;
6393 struct llist_head *slabs = &df->slabs;
6394 struct llist_node *llnode, *pos, *t;
6395
6396 if (llist_empty(objs) && llist_empty(slabs))
6397 return;
6398
6399 llnode = llist_del_all(objs);
6400 llist_for_each_safe(pos, t, llnode) {
6401 struct kmem_cache *s;
6402 struct slab *slab;
6403 void *x = pos;
6404
6405 slab = virt_to_slab(x);
6406 s = slab->slab_cache;
6407
6408 /*
6409 * We used freepointer in 'x' to link 'x' into df->objects.
6410 * Clear it to NULL to avoid false positive detection
6411 * of "Freepointer corruption".
6412 */
6413 *(void **)x = NULL;
6414
6415 /* Point 'x' back to the beginning of allocated object */
6416 x -= s->offset;
6417 __slab_free(s, slab, x, x, 1, _THIS_IP_);
6418 }
6419
6420 llnode = llist_del_all(slabs);
6421 llist_for_each_safe(pos, t, llnode) {
6422 struct slab *slab = container_of(pos, struct slab, llnode);
6423
6424 #ifdef CONFIG_SLUB_TINY
6425 discard_slab(slab->slab_cache, slab);
6426 #else
6427 deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
6428 #endif
6429 }
6430 }
6431
6432 static void defer_free(struct kmem_cache *s, void *head)
6433 {
6434 struct defer_free *df = this_cpu_ptr(&defer_free_objects);
6435
6436 if (llist_add(head + s->offset, &df->objects))
6437 irq_work_queue(&df->work);
6438 }
6439
6440 static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
6441 {
6442 struct defer_free *df = this_cpu_ptr(&defer_free_objects);
6443
6444 slab->flush_freelist = flush_freelist;
6445 if (llist_add(&slab->llnode, &df->slabs))
6446 irq_work_queue(&df->work);
6447 }
6448
6449 void defer_free_barrier(void)
6450 {
6451 int cpu;
6452
6453 for_each_possible_cpu(cpu)
6454 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
6455 }
6456
6457 #ifndef CONFIG_SLUB_TINY
6458 /*
6459 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
6460 * can perform fastpath freeing without additional function calls.
6461 *
6462 * The fastpath is only possible if we are freeing to the current cpu slab
6463 * of this processor. This is typically the case if we have just allocated
6464 * the item before.
6465 *
6466 * If fastpath is not possible then fall back to __slab_free where we deal
6467 * with all sorts of special processing.
6468 *
6469 * Bulk free of a freelist with several objects (all pointing to the
6470 * same slab) is possible by specifying the head and tail pointers, plus the
6471 * object count (cnt). Bulk free is indicated by the tail pointer being set.
6472 */
6473 static __always_inline void do_slab_free(struct kmem_cache *s,
6474 struct slab *slab, void *head, void *tail,
6475 int cnt, unsigned long addr)
6476 {
6477 /* cnt == 0 signals that it's called from kfree_nolock() */
6478 bool allow_spin = cnt;
6479 struct kmem_cache_cpu *c;
6480 unsigned long tid;
6481 void **freelist;
6482
6483 redo:
6484 /*
6485 * Determine the current cpu's per cpu slab.
6486 * The cpu may change afterward. However that does not matter since
6487 * data is retrieved via this pointer. If we are on the same cpu
6488 * during the cmpxchg then the free will succeed.
6489 */
6490 c = raw_cpu_ptr(s->cpu_slab);
6491 tid = READ_ONCE(c->tid);
6492
6493 /* Same with comment on barrier() in __slab_alloc_node() */
6494 barrier();
6495
6496 if (unlikely(slab != c->slab)) {
6497 if (unlikely(!allow_spin)) {
6498 /*
6499 * __slab_free() can locklessly cmpxchg16 into a slab,
6500 * but then it might need to take spin_lock or local_lock
6501 * in put_cpu_partial() for further processing.
6502 * Avoid the complexity and simply add to a deferred list.
6503 */
6504 defer_free(s, head);
6505 } else {
6506 __slab_free(s, slab, head, tail, cnt, addr);
6507 }
6508 return;
6509 }
6510
6511 if (unlikely(!allow_spin)) {
6512 if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
6513 local_lock_is_locked(&s->cpu_slab->lock)) {
6514 defer_free(s, head);
6515 return;
6516 }
6517 cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
6518 }
6519
6520 if (USE_LOCKLESS_FAST_PATH()) {
6521 freelist = READ_ONCE(c->freelist);
6522
6523 set_freepointer(s, tail, freelist);
6524
6525 if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
6526 note_cmpxchg_failure("slab_free", s, tid);
6527 goto redo;
6528 }
6529 } else {
6530 __maybe_unused unsigned long flags = 0;
6531
6532 /* Update the free list under the local lock */
6533 local_lock_cpu_slab(s, flags);
6534 c = this_cpu_ptr(s->cpu_slab);
6535 if (unlikely(slab != c->slab)) {
6536 local_unlock_cpu_slab(s, flags);
6537 goto redo;
6538 }
6539 tid = c->tid;
6540 freelist = c->freelist;
6541
6542 set_freepointer(s, tail, freelist);
6543 c->freelist = head;
6544 c->tid = next_tid(tid);
6545
6546 local_unlock_cpu_slab(s, flags);
6547 }
6548 stat_add(s, FREE_FASTPATH, cnt);
6549 }
6550 #else /* CONFIG_SLUB_TINY */
6551 static void do_slab_free(struct kmem_cache *s,
6552 struct slab *slab, void *head, void *tail,
6553 int cnt, unsigned long addr)
6554 {
6555 __slab_free(s, slab, head, tail, cnt, addr);
6556 }
6557 #endif /* CONFIG_SLUB_TINY */
6558
6559 static __fastpath_inline
6560 void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
6561 unsigned long addr)
6562 {
6563 memcg_slab_free_hook(s, slab, &object, 1);
6564 alloc_tagging_slab_free_hook(s, slab, &object, 1);
6565
6566 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
6567 return;
6568
6569 if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
6570 slab_nid(slab) == numa_mem_id())) {
6571 if (likely(free_to_pcs(s, object)))
6572 return;
6573 }
6574
6575 do_slab_free(s, slab, object, object, 1, addr);
6576 }
6577
6578 #ifdef CONFIG_MEMCG
6579 /* Do not inline the rare memcg charging failed path into the allocation path */
6580 static noinline
6581 void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
6582 {
6583 if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
6584 do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
6585 }
6586 #endif
6587
6588 static __fastpath_inline
6589 void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
6590 void *tail, void **p, int cnt, unsigned long addr)
6591 {
6592 memcg_slab_free_hook(s, slab, p, cnt);
6593 alloc_tagging_slab_free_hook(s, slab, p, cnt);
6594 /*
6595 * With KASAN enabled slab_free_freelist_hook modifies the freelist
6596 * to remove objects, whose reuse must be delayed.
6597 */
6598 if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
6599 do_slab_free(s, slab, head, tail, cnt, addr);
6600 }
6601
6602 #ifdef CONFIG_SLUB_RCU_DEBUG
6603 static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
6604 {
6605 struct rcu_delayed_free *delayed_free =
6606 container_of(rcu_head, struct rcu_delayed_free, head);
6607 void *object = delayed_free->object;
6608 struct slab *slab = virt_to_slab(object);
6609 struct kmem_cache *s;
6610
6611 kfree(delayed_free);
6612
6613 if (WARN_ON(is_kfence_address(object)))
6614 return;
6615
6616 /* find the object and the cache again */
6617 if (WARN_ON(!slab))
6618 return;
6619 s = slab->slab_cache;
6620 if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
6621 return;
6622
6623 /* resume freeing */
6624 if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
6625 do_slab_free(s, slab, object, object, 1, _THIS_IP_);
6626 }
6627 #endif /* CONFIG_SLUB_RCU_DEBUG */
6628
6629 #ifdef CONFIG_KASAN_GENERIC
6630 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
6631 {
6632 do_slab_free(cache, virt_to_slab(x), x, x, 1, addr);
6633 }
6634 #endif
6635
6636 static inline struct kmem_cache *virt_to_cache(const void *obj)
6637 {
6638 struct slab *slab;
6639
6640 slab = virt_to_slab(obj);
6641 if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
6642 return NULL;
6643 return slab->slab_cache;
6644 }
6645
6646 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
6647 {
6648 struct kmem_cache *cachep;
6649
6650 if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
6651 !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
6652 return s;
6653
6654 cachep = virt_to_cache(x);
6655 if (WARN(cachep && cachep != s,
6656 "%s: Wrong slab cache. %s but object is from %s\n",
6657 __func__, s->name, cachep->name))
6658 print_tracking(cachep, x);
6659 return cachep;
6660 }
6661
6662 /**
6663 * kmem_cache_free - Deallocate an object
6664 * @s: The cache the allocation was from.
6665 * @x: The previously allocated object.
6666 *
6667 * Free an object which was previously allocated from this
6668 * cache.
6669 */
6670 void kmem_cache_free(struct kmem_cache *s, void *x)
6671 {
6672 s = cache_from_obj(s, x);
6673 if (!s)
6674 return;
6675 trace_kmem_cache_free(_RET_IP_, x, s);
6676 slab_free(s, virt_to_slab(x), x, _RET_IP_);
6677 }
6678 EXPORT_SYMBOL(kmem_cache_free);
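
/*
 * Illustrative pairing sketch, not part of the original source (the cache and
 * type names are hypothetical):
 *
 *	struct my_node *n = kmem_cache_alloc(my_node_cache, GFP_KERNEL);
 *
 *	if (!n)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(my_node_cache, n);
 *
 * The object must be returned to the cache it came from; with
 * CONFIG_SLAB_FREELIST_HARDENED or consistency checks enabled a mismatch is
 * caught by cache_from_obj() above.
 */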
6679
6680 static void free_large_kmalloc(struct folio *folio, void *object)
6681 {
6682 unsigned int order = folio_order(folio);
6683
6684 if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
6685 dump_page(&folio->page, "Not a kmalloc allocation");
6686 return;
6687 }
6688
6689 if (WARN_ON_ONCE(order == 0))
6690 pr_warn_once("object pointer: 0x%p\n", object);
6691
6692 kmemleak_free(object);
6693 kasan_kfree_large(object);
6694 kmsan_kfree_large(object);
6695
6696 lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
6697 -(PAGE_SIZE << order));
6698 __folio_clear_large_kmalloc(folio);
6699 free_frozen_pages(&folio->page, order);
6700 }
6701
6702 /*
6703 * Given an rcu_head embedded within an object obtained from kvmalloc at an
6704 * offset < 4k, free the object in question.
6705 */
6706 void kvfree_rcu_cb(struct rcu_head *head)
6707 {
6708 void *obj = head;
6709 struct folio *folio;
6710 struct slab *slab;
6711 struct kmem_cache *s;
6712 void *slab_addr;
6713
6714 if (is_vmalloc_addr(obj)) {
6715 obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
6716 vfree(obj);
6717 return;
6718 }
6719
6720 folio = virt_to_folio(obj);
6721 if (!folio_test_slab(folio)) {
6722 /*
6723 * rcu_head offset can be only less than page size so no need to
6724 * consider folio order
6725 */
6726 obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
6727 free_large_kmalloc(folio, obj);
6728 return;
6729 }
6730
6731 slab = folio_slab(folio);
6732 s = slab->slab_cache;
6733 slab_addr = folio_address(folio);
6734
6735 if (is_kfence_address(obj)) {
6736 obj = kfence_object_start(obj);
6737 } else {
6738 unsigned int idx = __obj_to_index(s, slab_addr, obj);
6739
6740 obj = slab_addr + s->size * idx;
6741 obj = fixup_red_left(s, obj);
6742 }
6743
6744 slab_free(s, slab, obj, _RET_IP_);
6745 }
6746
6747 /**
6748 * kfree - free previously allocated memory
6749 * @object: pointer returned by kmalloc() or kmem_cache_alloc()
6750 *
6751 * If @object is NULL, no operation is performed.
6752 */
6753 void kfree(const void *object)
6754 {
6755 struct folio *folio;
6756 struct slab *slab;
6757 struct kmem_cache *s;
6758 void *x = (void *)object;
6759
6760 trace_kfree(_RET_IP_, object);
6761
6762 if (unlikely(ZERO_OR_NULL_PTR(object)))
6763 return;
6764
6765 folio = virt_to_folio(object);
6766 if (unlikely(!folio_test_slab(folio))) {
6767 free_large_kmalloc(folio, (void *)object);
6768 return;
6769 }
6770
6771 slab = folio_slab(folio);
6772 s = slab->slab_cache;
6773 slab_free(s, slab, x, _RET_IP_);
6774 }
6775 EXPORT_SYMBOL(kfree);
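
/*
 * Illustrative sketch, not part of the original source: because kfree(NULL)
 * is a no-op, error paths can free unconditionally (lengths and names are
 * hypothetical).
 *
 *	char *a = kmalloc(len_a, GFP_KERNEL);
 *	char *b = kmalloc(len_b, GFP_KERNEL);
 *
 *	if (!a || !b) {
 *		kfree(a);
 *		kfree(b);
 *		return -ENOMEM;
 *	}
 */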
6776
6777 /*
6778 * Can be called while holding raw_spinlock_t or from IRQ and NMI,
6779 * but ONLY for objects allocated by kmalloc_nolock().
6780 * Debug checks (like kmemleak and kfence) were skipped on allocation,
6781 * hence
6782 * obj = kmalloc(); kfree_nolock(obj);
6783 * will miss kmemleak/kfence bookkeeping and will cause false positives.
6784 * large_kmalloc is not supported either.
6785 */
6786 void kfree_nolock(const void *object)
6787 {
6788 struct folio *folio;
6789 struct slab *slab;
6790 struct kmem_cache *s;
6791 void *x = (void *)object;
6792
6793 if (unlikely(ZERO_OR_NULL_PTR(object)))
6794 return;
6795
6796 folio = virt_to_folio(object);
6797 if (unlikely(!folio_test_slab(folio))) {
6798 WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
6799 return;
6800 }
6801
6802 slab = folio_slab(folio);
6803 s = slab->slab_cache;
6804
6805 memcg_slab_free_hook(s, slab, &x, 1);
6806 alloc_tagging_slab_free_hook(s, slab, &x, 1);
6807 /*
6808 * Unlike slab_free() do NOT call the following:
6809 * kmemleak_free_recursive(x, s->flags);
6810 * debug_check_no_locks_freed(x, s->object_size);
6811 * debug_check_no_obj_freed(x, s->object_size);
6812 * __kcsan_check_access(x, s->object_size, ..);
6813 * kfence_free(x);
6814 * since they take spinlocks or are not safe from any context.
6815 */
6816 kmsan_slab_free(s, x);
6817 /*
6818 * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
6819 * which will call raw_spin_lock_irqsave() which is technically
6820 * unsafe from NMI, but take the chance and report the kernel bug.
6821 * The sequence of
6822 * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
6823 * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
6824 * is double buggy and deserves to deadlock.
6825 */
6826 if (kasan_slab_pre_free(s, x))
6827 return;
6828 /*
6829 * memcg, kasan_slab_pre_free are done for 'x'.
6830 * The only thing left is kasan_poison without quarantine,
6831 * since the kasan quarantine takes locks and is not supported from NMI.
6832 */
6833 kasan_slab_free(s, x, false, false, /* skip quarantine */true);
6834 #ifndef CONFIG_SLUB_TINY
6835 do_slab_free(s, slab, x, x, 0, _RET_IP_);
6836 #else
6837 defer_free(s, x);
6838 #endif
6839 }
6840 EXPORT_SYMBOL_GPL(kfree_nolock);
6841
6842 static __always_inline __realloc_size(2) void *
6843 __do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
6844 {
6845 void *ret;
6846 size_t ks = 0;
6847 int orig_size = 0;
6848 struct kmem_cache *s = NULL;
6849
6850 if (unlikely(ZERO_OR_NULL_PTR(p)))
6851 goto alloc_new;
6852
6853 /* Check for double-free. */
6854 if (!kasan_check_byte(p))
6855 return NULL;
6856
6857 /*
6858 * If reallocation is not necessary (e. g. the new size is less
6859 * than the current allocated size), the current allocation will be
6860 * preserved unless __GFP_THISNODE is set. In the latter case a new
6861 * allocation on the requested node will be attempted.
6862 */
6863 if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
6864 nid != page_to_nid(virt_to_page(p)))
6865 goto alloc_new;
6866
6867 if (is_kfence_address(p)) {
6868 ks = orig_size = kfence_ksize(p);
6869 } else {
6870 struct folio *folio;
6871
6872 folio = virt_to_folio(p);
6873 if (unlikely(!folio_test_slab(folio))) {
6874 /* Big kmalloc object */
6875 WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE);
6876 WARN_ON(p != folio_address(folio));
6877 ks = folio_size(folio);
6878 } else {
6879 s = folio_slab(folio)->slab_cache;
6880 orig_size = get_orig_size(s, (void *)p);
6881 ks = s->object_size;
6882 }
6883 }
6884
6885 /* If the old object doesn't fit, allocate a bigger one */
6886 if (new_size > ks)
6887 goto alloc_new;
6888
6889 /* If the old object doesn't satisfy the new alignment, allocate a new one */
6890 if (!IS_ALIGNED((unsigned long)p, align))
6891 goto alloc_new;
6892
6893 /* Zero out spare memory. */
6894 if (want_init_on_alloc(flags)) {
6895 kasan_disable_current();
6896 if (orig_size && orig_size < new_size)
6897 memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size);
6898 else
6899 memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
6900 kasan_enable_current();
6901 }
6902
6903 /* Setup kmalloc redzone when needed */
6904 if (s && slub_debug_orig_size(s)) {
6905 set_orig_size(s, (void *)p, new_size);
6906 if (s->flags & SLAB_RED_ZONE && new_size < ks)
6907 memset_no_sanitize_memory(kasan_reset_tag(p) + new_size,
6908 SLUB_RED_ACTIVE, ks - new_size);
6909 }
6910
6911 p = kasan_krealloc(p, new_size, flags);
6912 return (void *)p;
6913
6914 alloc_new:
6915 ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
6916 if (ret && p) {
6917 /* Disable KASAN checks as the object's redzone is accessed. */
6918 kasan_disable_current();
6919 memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
6920 kasan_enable_current();
6921 }
6922
6923 return ret;
6924 }
6925
6926 /**
6927 * krealloc_node_align - reallocate memory. The contents will remain unchanged.
6928 * @p: object to reallocate memory for.
6929 * @new_size: how many bytes of memory are required.
6930 * @align: desired alignment.
6931 * @flags: the type of memory to allocate.
6932 * @nid: NUMA node or NUMA_NO_NODE
6933 *
6934 * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
6935 * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
6936 *
6937 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
6938 * Documentation/core-api/memory-allocation.rst for more details.
6939 *
6940 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
6941 * initial memory allocation, every subsequent call to this API for the same
6942 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
6943 * __GFP_ZERO is not fully honored by this API.
6944 *
6945 * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
6946 * size of an allocation (but not the exact size it was allocated with) and
6947 * hence implements the following semantics for shrinking and growing buffers
6948 * with __GFP_ZERO::
6949 *
6950 * new bucket
6951 * 0 size size
6952 * |--------|----------------|
6953 * | keep | zero |
6954 *
6955 * Otherwise, the original allocation size 'orig_size' could be used to
6956 * precisely clear the requested size, and the new size will also be stored
6957 * as the new 'orig_size'.
6958 *
6959 * In any case, the contents of the object pointed to are preserved up to the
6960 * lesser of the new and old sizes.
6961 *
6962 * Return: pointer to the allocated memory or %NULL in case of error
6963 */
6964 void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
6965 gfp_t flags, int nid)
6966 {
6967 void *ret;
6968
6969 if (unlikely(!new_size)) {
6970 kfree(p);
6971 return ZERO_SIZE_PTR;
6972 }
6973
6974 ret = __do_krealloc(p, new_size, align, flags, nid);
6975 if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
6976 kfree(p);
6977
6978 return ret;
6979 }
6980 EXPORT_SYMBOL(krealloc_node_align_noprof);
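
/*
 * Illustrative sketch, not part of the original source: growing a buffer with
 * the plain krealloc() wrapper. Assigning to a temporary first avoids leaking
 * the old buffer when the reallocation fails (names are hypothetical).
 *
 *	void *tmp;
 *
 *	tmp = krealloc(buf, new_len, GFP_KERNEL);
 *	if (!tmp)
 *		return -ENOMEM;
 *	buf = tmp;
 *
 * On failure the original buf is left untouched and must still be freed by
 * the caller.
 */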
6981
6982 static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
6983 {
6984 /*
6985 * We want to attempt a large physically contiguous block first because
6986 * it is less likely to fragment multiple larger blocks and therefore
6987 * contributes less to long-term fragmentation than the vmalloc fallback.
6988 * However, make sure that larger requests are not too disruptive - i.e.
6989 * do not direct reclaim unless physically contiguous memory is preferred
6990 * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
6991 * start working in the background.
6992 */
6993 if (size > PAGE_SIZE) {
6994 flags |= __GFP_NOWARN;
6995
6996 if (!(flags & __GFP_RETRY_MAYFAIL))
6997 flags &= ~__GFP_DIRECT_RECLAIM;
6998
6999 /* nofail semantic is implemented by the vmalloc fallback */
7000 flags &= ~__GFP_NOFAIL;
7001 }
7002
7003 return flags;
7004 }
7005
7006 /**
7007 * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
7008 * failure, fall back to non-contiguous (vmalloc) allocation.
7009 * @size: size of the request.
7010 * @b: which set of kmalloc buckets to allocate from.
7011 * @align: desired alignment.
7012 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
7013 * @node: numa node to allocate from
7014 *
7015 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
7016 * Documentation/core-api/memory-allocation.rst for more details.
7017 *
7018 * Uses kmalloc to get the memory but if the allocation fails then falls back
7019 * to the vmalloc allocator. Use kvfree for freeing the memory.
7020 *
7021 * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
7022 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
7023 * preferable to the vmalloc fallback, due to visible performance drawbacks.
7024 *
7025 * Return: pointer to the allocated memory or %NULL in case of failure
7026 */
7027 void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
7028 gfp_t flags, int node)
7029 {
7030 void *ret;
7031
7032 /*
7033 * It doesn't really make sense to fall back to vmalloc for sub-page
7034 * requests.
7035 */
7036 ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
7037 kmalloc_gfp_adjust(flags, size),
7038 node, _RET_IP_);
7039 if (ret || size <= PAGE_SIZE)
7040 return ret;
7041
7042 /* non-sleeping allocations are not supported by vmalloc */
7043 if (!gfpflags_allow_blocking(flags))
7044 return NULL;
7045
7046 /* Don't even allow crazy sizes */
7047 if (unlikely(size > INT_MAX)) {
7048 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
7049 return NULL;
7050 }
7051
7052 /*
7053 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
7054 * since the callers already cannot assume anything
7055 * about the resulting pointer, and cannot play
7056 * protection games.
7057 */
7058 return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
7059 flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
7060 node, __builtin_return_address(0));
7061 }
7062 EXPORT_SYMBOL(__kvmalloc_node_noprof);
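
/*
 * Illustrative sketch, not part of the original source: a potentially large
 * table allocated through the plain kvcalloc() wrapper, which may end up
 * kmalloc-backed or vmalloc-backed, and released with kvfree(), which picks
 * the matching free path (names are hypothetical).
 *
 *	struct entry *table;
 *
 *	table = kvcalloc(nr_entries, sizeof(*table), GFP_KERNEL);
 *	if (!table)
 *		return -ENOMEM;
 *	...
 *	kvfree(table);
 */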
7063
7064 /**
7065 * kvfree() - Free memory.
7066 * @addr: Pointer to allocated memory.
7067 *
7068 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
7069 * It is slightly more efficient to use kfree() or vfree() if you are certain
7070 * that you know which one to use.
7071 *
7072 * Context: Either preemptible task context or not-NMI interrupt.
7073 */
7074 void kvfree(const void *addr)
7075 {
7076 if (is_vmalloc_addr(addr))
7077 vfree(addr);
7078 else
7079 kfree(addr);
7080 }
7081 EXPORT_SYMBOL(kvfree);
7082
7083 /**
7084 * kvfree_sensitive - Free a data object containing sensitive information.
7085 * @addr: address of the data object to be freed.
7086 * @len: length of the data object.
7087 *
7088 * Use the special memzero_explicit() function to clear the content of a
7089 * kvmalloc'ed object containing sensitive data to make sure that the
7090 * compiler won't optimize out the data clearing.
7091 */
7092 void kvfree_sensitive(const void *addr, size_t len)
7093 {
7094 if (likely(!ZERO_OR_NULL_PTR(addr))) {
7095 memzero_explicit((void *)addr, len);
7096 kvfree(addr);
7097 }
7098 }
7099 EXPORT_SYMBOL(kvfree_sensitive);
7100
7101 /**
7102 * kvrealloc_node_align - reallocate memory; contents remain unchanged
7103 * @p: object to reallocate memory for
7104 * @size: the size to reallocate
7105 * @align: desired alignment
7106 * @flags: the flags for the page level allocator
7107 * @nid: NUMA node id
7108 *
7109 * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
7110 * and @p is not a %NULL pointer, the object pointed to is freed.
7111 *
7112 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
7113 * Documentation/core-api/memory-allocation.rst for more details.
7114 *
7115 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
7116 * initial memory allocation, every subsequent call to this API for the same
7117 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
7118 * __GFP_ZERO is not fully honored by this API.
7119 *
7120 * In any case, the contents of the object pointed to are preserved up to the
7121 * lesser of the new and old sizes.
7122 *
7123 * This function must not be called concurrently with itself or kvfree() for the
7124 * same memory allocation.
7125 *
7126 * Return: pointer to the allocated memory or %NULL in case of error
7127 */
7128 void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
7129 gfp_t flags, int nid)
7130 {
7131 void *n;
7132
7133 if (is_vmalloc_addr(p))
7134 return vrealloc_node_align_noprof(p, size, align, flags, nid);
7135
7136 n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
7137 if (!n) {
7138 /* We failed to krealloc(), fall back to kvmalloc(). */
7139 n = kvmalloc_node_align_noprof(size, align, flags, nid);
7140 if (!n)
7141 return NULL;
7142
7143 if (p) {
7144 /* We already know that `p` is not a vmalloc address. */
7145 kasan_disable_current();
7146 memcpy(n, kasan_reset_tag(p), ksize(p));
7147 kasan_enable_current();
7148
7149 kfree(p);
7150 }
7151 }
7152
7153 return n;
7154 }
7155 EXPORT_SYMBOL(kvrealloc_node_align_noprof);
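/*
 * Editorial example (illustrative only; names are hypothetical): growing a
 * kvmalloc'ed buffer with the public kvrealloc() wrapper of the function
 * above. A temporary pointer is used so the old buffer is not leaked on
 * failure.
 *
 *	static int grow_buffer(void **bufp, size_t new_size)
 *	{
 *		void *tmp;
 *
 *		tmp = kvrealloc(*bufp, new_size, GFP_KERNEL);
 *		if (!tmp)
 *			return -ENOMEM;	// *bufp is still valid and unchanged
 *		*bufp = tmp;
 *		return 0;
 *	}
 */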
7156
7157 struct detached_freelist {
7158 struct slab *slab;
7159 void *tail;
7160 void *freelist;
7161 int cnt;
7162 struct kmem_cache *s;
7163 };
7164
7165 /*
7166  * This function progressively scans the array of free objects (with
7167  * a limited look ahead) and extracts objects belonging to the same
7168  * slab. It builds a detached freelist directly within the given
7169  * slab/objects. This can happen without any need for
7170  * synchronization, because the objects are owned by the running process.
7171  * The freelist is built up as a singly linked list in the objects.
7172  * The idea is that this detached freelist can then be bulk
7173  * transferred to the real freelist(s), requiring only a single
7174  * synchronization primitive. Look ahead in the array is limited for
7175  * performance reasons.
7176 */
7177 static inline
7178 int build_detached_freelist(struct kmem_cache *s, size_t size,
7179 void **p, struct detached_freelist *df)
7180 {
7181 int lookahead = 3;
7182 void *object;
7183 struct folio *folio;
7184 size_t same;
7185
7186 object = p[--size];
7187 folio = virt_to_folio(object);
7188 if (!s) {
7189 		/* Handle kmalloc'ed objects */
7190 if (unlikely(!folio_test_slab(folio))) {
7191 free_large_kmalloc(folio, object);
7192 df->slab = NULL;
7193 return size;
7194 }
7195 /* Derive kmem_cache from object */
7196 df->slab = folio_slab(folio);
7197 df->s = df->slab->slab_cache;
7198 } else {
7199 df->slab = folio_slab(folio);
7200 df->s = cache_from_obj(s, object); /* Support for memcg */
7201 }
7202
7203 /* Start new detached freelist */
7204 df->tail = object;
7205 df->freelist = object;
7206 df->cnt = 1;
7207
7208 if (is_kfence_address(object))
7209 return size;
7210
7211 set_freepointer(df->s, object, NULL);
7212
7213 same = size;
7214 while (size) {
7215 object = p[--size];
7216 /* df->slab is always set at this point */
7217 if (df->slab == virt_to_slab(object)) {
7218 			/* Opportunistically build the freelist */
7219 set_freepointer(df->s, object, df->freelist);
7220 df->freelist = object;
7221 df->cnt++;
7222 same--;
7223 if (size != same)
7224 swap(p[size], p[same]);
7225 continue;
7226 }
7227
7228 /* Limit look ahead search */
7229 if (!--lookahead)
7230 break;
7231 }
7232
7233 return same;
7234 }
7235
7236 /*
7237 * Internal bulk free of objects that were not initialised by the post alloc
7238 * hooks and thus should not be processed by the free hooks
7239 */
7240 static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
7241 {
7242 if (!size)
7243 return;
7244
7245 do {
7246 struct detached_freelist df;
7247
7248 size = build_detached_freelist(s, size, p, &df);
7249 if (!df.slab)
7250 continue;
7251
7252 if (kfence_free(df.freelist))
7253 continue;
7254
7255 do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
7256 _RET_IP_);
7257 } while (likely(size));
7258 }
7259
7260 /* Note that interrupts must be enabled when calling this function. */
7261 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
7262 {
7263 if (!size)
7264 return;
7265
7266 /*
7267 	 * Freeing to sheaves is incompatible with the detached freelist, so
7268 	 * once we go that way, we have to do everything differently.
7269 */
7270 if (s && s->cpu_sheaves) {
7271 free_to_pcs_bulk(s, size, p);
7272 return;
7273 }
7274
7275 do {
7276 struct detached_freelist df;
7277
7278 size = build_detached_freelist(s, size, p, &df);
7279 if (!df.slab)
7280 continue;
7281
7282 slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
7283 df.cnt, _RET_IP_);
7284 } while (likely(size));
7285 }
7286 EXPORT_SYMBOL(kmem_cache_free_bulk);
7287
7288 #ifndef CONFIG_SLUB_TINY
7289 static inline
7290 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
7291 void **p)
7292 {
7293 struct kmem_cache_cpu *c;
7294 unsigned long irqflags;
7295 int i;
7296
7297 /*
7298 * Drain objects in the per cpu slab, while disabling local
7299 	 * IRQs, which protects against PREEMPT and interrupt
7300 	 * handlers invoking the normal fastpath.
7301 */
7302 c = slub_get_cpu_ptr(s->cpu_slab);
7303 local_lock_irqsave(&s->cpu_slab->lock, irqflags);
7304
7305 for (i = 0; i < size; i++) {
7306 void *object = kfence_alloc(s, s->object_size, flags);
7307
7308 if (unlikely(object)) {
7309 p[i] = object;
7310 continue;
7311 }
7312
7313 object = c->freelist;
7314 if (unlikely(!object)) {
7315 /*
7316 * We may have removed an object from c->freelist using
7317 * the fastpath in the previous iteration; in that case,
7318 * c->tid has not been bumped yet.
7319 * Since ___slab_alloc() may reenable interrupts while
7320 * allocating memory, we should bump c->tid now.
7321 */
7322 c->tid = next_tid(c->tid);
7323
7324 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
7325
7326 /*
7327 			 * Invoking the slow path likely has the side effect
7328 			 * of re-populating the per CPU c->freelist.
7329 */
7330 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
7331 _RET_IP_, c, s->object_size);
7332 if (unlikely(!p[i]))
7333 goto error;
7334
7335 c = this_cpu_ptr(s->cpu_slab);
7336 maybe_wipe_obj_freeptr(s, p[i]);
7337
7338 local_lock_irqsave(&s->cpu_slab->lock, irqflags);
7339
7340 continue; /* goto for-loop */
7341 }
7342 c->freelist = get_freepointer(s, object);
7343 p[i] = object;
7344 maybe_wipe_obj_freeptr(s, p[i]);
7345 stat(s, ALLOC_FASTPATH);
7346 }
7347 c->tid = next_tid(c->tid);
7348 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
7349 slub_put_cpu_ptr(s->cpu_slab);
7350
7351 return i;
7352
7353 error:
7354 slub_put_cpu_ptr(s->cpu_slab);
7355 __kmem_cache_free_bulk(s, i, p);
7356 return 0;
7357
7358 }
7359 #else /* CONFIG_SLUB_TINY */
7360 static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
7361 size_t size, void **p)
7362 {
7363 int i;
7364
7365 for (i = 0; i < size; i++) {
7366 void *object = kfence_alloc(s, s->object_size, flags);
7367
7368 if (unlikely(object)) {
7369 p[i] = object;
7370 continue;
7371 }
7372
7373 p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
7374 _RET_IP_, s->object_size);
7375 if (unlikely(!p[i]))
7376 goto error;
7377
7378 maybe_wipe_obj_freeptr(s, p[i]);
7379 }
7380
7381 return i;
7382
7383 error:
7384 __kmem_cache_free_bulk(s, i, p);
7385 return 0;
7386 }
7387 #endif /* CONFIG_SLUB_TINY */
7388
7389 /* Note that interrupts must be enabled when calling this function. */
7390 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
7391 void **p)
7392 {
7393 unsigned int i = 0;
7394
7395 if (!size)
7396 return 0;
7397
7398 s = slab_pre_alloc_hook(s, flags);
7399 if (unlikely(!s))
7400 return 0;
7401
7402 if (s->cpu_sheaves)
7403 i = alloc_from_pcs_bulk(s, size, p);
7404
7405 if (i < size) {
7406 /*
7407 * If we ran out of memory, don't bother with freeing back to
7408 		 * the percpu sheaves; we have bigger problems.
7409 */
7410 if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
7411 if (i > 0)
7412 __kmem_cache_free_bulk(s, i, p);
7413 return 0;
7414 }
7415 }
7416
7417 /*
7418 * memcg and kmem_cache debug support and memory initialization.
7419 * Done outside of the IRQ disabled fastpath loop.
7420 */
7421 if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
7422 slab_want_init_on_alloc(flags, s), s->object_size))) {
7423 return 0;
7424 }
7425
7426 return size;
7427 }
7428 EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
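/*
 * Editorial example (illustrative only; the cache is hypothetical): typical
 * use of the bulk API through the public kmem_cache_alloc_bulk() and
 * kmem_cache_free_bulk() wrappers. In the current implementation the bulk
 * allocation is all-or-nothing, so the return value is either @size or 0
 * and there is no partial result to clean up.
 *
 *	void *objs[16];
 *
 *	if (!kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
 *		return -ENOMEM;
 *	// ... use the objects ...
 *	kmem_cache_free_bulk(my_cache, ARRAY_SIZE(objs), objs);
 */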
7429
7430 /*
7431 * Object placement in a slab is made very easy because we always start at
7432 * offset 0. If we tune the size of the object to the alignment then we can
7433 * get the required alignment by putting one properly sized object after
7434 * another.
7435 *
7436 * Notice that the allocation order determines the sizes of the per cpu
7437  * caches. Each processor always has one slab available for allocations.
7438 * Increasing the allocation order reduces the number of times that slabs
7439 * must be moved on and off the partial lists and is therefore a factor in
7440 * locking overhead.
7441 */
7442
7443 /*
7444 * Minimum / Maximum order of slab pages. This influences locking overhead
7445 * and slab fragmentation. A higher order reduces the number of partial slabs
7446 * and increases the number of allocations possible without having to
7447 * take the list_lock.
7448 */
7449 static unsigned int slub_min_order;
7450 static unsigned int slub_max_order =
7451 IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
7452 static unsigned int slub_min_objects;
7453
7454 /*
7455  * Calculate the order of allocation given a slab object size.
7456 *
7457 * The order of allocation has significant impact on performance and other
7458 * system components. Generally order 0 allocations should be preferred since
7459 * order 0 does not cause fragmentation in the page allocator. Larger objects
7460 * be problematic to put into order 0 slabs because there may be too much
7461  * can be problematic to put into order 0 slabs because there may be too much
7462 * would be wasted.
7463 *
7464 * In order to reach satisfactory performance we must ensure that a minimum
7465 * number of objects is in one slab. Otherwise we may generate too much
7466 * activity on the partial lists which requires taking the list_lock. This is
7467  * less of a concern for large slabs, though, which are rarely used.
7468 *
7469  * slab_max_order specifies the order at which we stop considering the
7470 * number of objects in a slab as critical. If we reach slab_max_order then
7471 * we try to keep the page order as low as possible. So we accept more waste
7472 * of space in favor of a small page order.
7473 *
7474 * Higher order allocations also allow the placement of more objects in a
7475 * slab and thereby reduce object handling overhead. If the user has
7476 * requested a higher minimum order then we start with that one instead of
7477 * the smallest order which will fit the object.
7478 */
7479 static inline unsigned int calc_slab_order(unsigned int size,
7480 unsigned int min_order, unsigned int max_order,
7481 unsigned int fract_leftover)
7482 {
7483 unsigned int order;
7484
7485 for (order = min_order; order <= max_order; order++) {
7486
7487 unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
7488 unsigned int rem;
7489
7490 rem = slab_size % size;
7491
7492 if (rem <= slab_size / fract_leftover)
7493 break;
7494 }
7495
7496 return order;
7497 }
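/*
 * Editorial worked example (hypothetical numbers): for a 700 byte object
 * with the default 1/16 waste limit, order 0 leaves 4096 % 700 = 596 bytes
 * unused, which is more than 4096 / 16 = 256, so order 0 is rejected.
 * Order 1 leaves 8192 % 700 = 492 bytes, which is within 8192 / 16 = 512,
 * so calc_slab_order() returns 1 (11 objects per slab).
 */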
7498
7499 static inline int calculate_order(unsigned int size)
7500 {
7501 unsigned int order;
7502 unsigned int min_objects;
7503 unsigned int max_objects;
7504 unsigned int min_order;
7505
7506 min_objects = slub_min_objects;
7507 if (!min_objects) {
7508 /*
7509 * Some architectures will only update present cpus when
7510 * onlining them, so don't trust the number if it's just 1. But
7511 * we also don't want to use nr_cpu_ids always, as on some other
7512 * architectures, there can be many possible cpus, but never
7513 * onlined. Here we compromise between trying to avoid too high
7514 * order on systems that appear larger than they are, and too
7515 * low order on systems that appear smaller than they are.
7516 */
7517 unsigned int nr_cpus = num_present_cpus();
7518 if (nr_cpus <= 1)
7519 nr_cpus = nr_cpu_ids;
7520 min_objects = 4 * (fls(nr_cpus) + 1);
7521 }
7522 /* min_objects can't be 0 because get_order(0) is undefined */
7523 max_objects = max(order_objects(slub_max_order, size), 1U);
7524 min_objects = min(min_objects, max_objects);
7525
7526 min_order = max_t(unsigned int, slub_min_order,
7527 get_order(min_objects * size));
7528 if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
7529 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
7530
7531 /*
7532 * Attempt to find best configuration for a slab. This works by first
7533 * attempting to generate a layout with the best possible configuration
7534 * and backing off gradually.
7535 *
7536 * We start with accepting at most 1/16 waste and try to find the
7537 * smallest order from min_objects-derived/slab_min_order up to
7538 * slab_max_order that will satisfy the constraint. Note that increasing
7539 * the order can only result in same or less fractional waste, not more.
7540 *
7541 * If that fails, we increase the acceptable fraction of waste and try
7542 * again. The last iteration with fraction of 1/2 would effectively
7543 * accept any waste and give us the order determined by min_objects, as
7544 	 * long as at least a single object fits within slab_max_order.
7545 */
7546 for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
7547 order = calc_slab_order(size, min_order, slub_max_order,
7548 fraction);
7549 if (order <= slub_max_order)
7550 return order;
7551 }
7552
7553 /*
7554 * Doh this slab cannot be placed using slab_max_order.
7555 */
7556 order = get_order(size);
7557 if (order <= MAX_PAGE_ORDER)
7558 return order;
7559 return -ENOSYS;
7560 }
7561
7562 static void
7563 init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
7564 {
7565 n->nr_partial = 0;
7566 spin_lock_init(&n->list_lock);
7567 INIT_LIST_HEAD(&n->partial);
7568 #ifdef CONFIG_SLUB_DEBUG
7569 atomic_long_set(&n->nr_slabs, 0);
7570 atomic_long_set(&n->total_objects, 0);
7571 INIT_LIST_HEAD(&n->full);
7572 #endif
7573 n->barn = barn;
7574 if (barn)
7575 barn_init(barn);
7576 }
7577
7578 #ifndef CONFIG_SLUB_TINY
7579 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
7580 {
7581 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
7582 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
7583 sizeof(struct kmem_cache_cpu));
7584
7585 /*
7586 * Must align to double word boundary for the double cmpxchg
7587 * instructions to work; see __pcpu_double_call_return_bool().
7588 */
7589 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
7590 2 * sizeof(void *));
7591
7592 if (!s->cpu_slab)
7593 return 0;
7594
7595 init_kmem_cache_cpus(s);
7596
7597 return 1;
7598 }
7599 #else
7600 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
7601 {
7602 return 1;
7603 }
7604 #endif /* CONFIG_SLUB_TINY */
7605
7606 static int init_percpu_sheaves(struct kmem_cache *s)
7607 {
7608 int cpu;
7609
7610 for_each_possible_cpu(cpu) {
7611 struct slub_percpu_sheaves *pcs;
7612
7613 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
7614
7615 local_trylock_init(&pcs->lock);
7616
7617 pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
7618
7619 if (!pcs->main)
7620 return -ENOMEM;
7621 }
7622
7623 return 0;
7624 }
7625
7626 static struct kmem_cache *kmem_cache_node;
7627
7628 /*
7629 * No kmalloc_node yet so do it by hand. We know that this is the first
7630 * slab on the node for this slabcache. There are no concurrent accesses
7631 * possible.
7632 *
7633 * Note that this function only works on the kmem_cache_node
7634  * when allocating for the kmem_cache_node cache. This is used for bootstrapping
7635 * memory on a fresh node that has no slab structures yet.
7636 */
7637 static void early_kmem_cache_node_alloc(int node)
7638 {
7639 struct slab *slab;
7640 struct kmem_cache_node *n;
7641
7642 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
7643
7644 slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
7645
7646 BUG_ON(!slab);
7647 if (slab_nid(slab) != node) {
7648 pr_err("SLUB: Unable to allocate memory from node %d\n", node);
7649 pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
7650 }
7651
7652 n = slab->freelist;
7653 BUG_ON(!n);
7654 #ifdef CONFIG_SLUB_DEBUG
7655 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
7656 #endif
7657 n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
7658 slab->freelist = get_freepointer(kmem_cache_node, n);
7659 slab->inuse = 1;
7660 kmem_cache_node->node[node] = n;
7661 init_kmem_cache_node(n, NULL);
7662 inc_slabs_node(kmem_cache_node, node, slab->objects);
7663
7664 /*
7665 * No locks need to be taken here as it has just been
7666 * initialized and there is no concurrent access.
7667 */
7668 __add_partial(n, slab, DEACTIVATE_TO_HEAD);
7669 }
7670
7671 static void free_kmem_cache_nodes(struct kmem_cache *s)
7672 {
7673 int node;
7674 struct kmem_cache_node *n;
7675
7676 for_each_kmem_cache_node(s, node, n) {
7677 if (n->barn) {
7678 WARN_ON(n->barn->nr_full);
7679 WARN_ON(n->barn->nr_empty);
7680 kfree(n->barn);
7681 n->barn = NULL;
7682 }
7683
7684 s->node[node] = NULL;
7685 kmem_cache_free(kmem_cache_node, n);
7686 }
7687 }
7688
7689 void __kmem_cache_release(struct kmem_cache *s)
7690 {
7691 cache_random_seq_destroy(s);
7692 if (s->cpu_sheaves)
7693 pcs_destroy(s);
7694 #ifndef CONFIG_SLUB_TINY
7695 #ifdef CONFIG_PREEMPT_RT
7696 lockdep_unregister_key(&s->lock_key);
7697 #endif
7698 free_percpu(s->cpu_slab);
7699 #endif
7700 free_kmem_cache_nodes(s);
7701 }
7702
7703 static int init_kmem_cache_nodes(struct kmem_cache *s)
7704 {
7705 int node;
7706
7707 for_each_node_mask(node, slab_nodes) {
7708 struct kmem_cache_node *n;
7709 struct node_barn *barn = NULL;
7710
7711 if (slab_state == DOWN) {
7712 early_kmem_cache_node_alloc(node);
7713 continue;
7714 }
7715
7716 if (s->cpu_sheaves) {
7717 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
7718
7719 if (!barn)
7720 return 0;
7721 }
7722
7723 n = kmem_cache_alloc_node(kmem_cache_node,
7724 GFP_KERNEL, node);
7725 if (!n) {
7726 kfree(barn);
7727 return 0;
7728 }
7729
7730 init_kmem_cache_node(n, barn);
7731
7732 s->node[node] = n;
7733 }
7734 return 1;
7735 }
7736
7737 static void set_cpu_partial(struct kmem_cache *s)
7738 {
7739 #ifdef CONFIG_SLUB_CPU_PARTIAL
7740 unsigned int nr_objects;
7741
7742 /*
7743 	 * cpu_partial determines the maximum number of objects kept in the
7744 * per cpu partial lists of a processor.
7745 *
7746 * Per cpu partial lists mainly contain slabs that just have one
7747 * object freed. If they are used for allocation then they can be
7748 * filled up again with minimal effort. The slab will never hit the
7749 * per node partial lists and therefore no locking will be required.
7750 *
7751 	 * For backwards compatibility reasons, this is determined as a number
7752 	 * of objects, even though we now limit the maximum number of pages; see
7753 	 * slub_set_cpu_partial().
7754 */
7755 if (!kmem_cache_has_cpu_partial(s))
7756 nr_objects = 0;
7757 else if (s->size >= PAGE_SIZE)
7758 nr_objects = 6;
7759 else if (s->size >= 1024)
7760 nr_objects = 24;
7761 else if (s->size >= 256)
7762 nr_objects = 52;
7763 else
7764 nr_objects = 120;
7765
7766 slub_set_cpu_partial(s, nr_objects);
7767 #endif
7768 }
7769
7770 /*
7771 * calculate_sizes() determines the order and the distribution of data within
7772 * a slab object.
7773 */
7774 static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
7775 {
7776 slab_flags_t flags = s->flags;
7777 unsigned int size = s->object_size;
7778 unsigned int order;
7779
7780 /*
7781 * Round up object size to the next word boundary. We can only
7782 * place the free pointer at word boundaries and this determines
7783 * the possible location of the free pointer.
7784 */
7785 size = ALIGN(size, sizeof(void *));
7786
7787 #ifdef CONFIG_SLUB_DEBUG
7788 /*
7789 * Determine if we can poison the object itself. If the user of
7790 * the slab may touch the object after free or before allocation
7791 * then we should never poison the object itself.
7792 */
7793 if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
7794 !s->ctor)
7795 s->flags |= __OBJECT_POISON;
7796 else
7797 s->flags &= ~__OBJECT_POISON;
7798
7799
7800 /*
7801 * If we are Redzoning then check if there is some space between the
7802 * end of the object and the free pointer. If not then add an
7803 * additional word to have some bytes to store Redzone information.
7804 */
7805 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
7806 size += sizeof(void *);
7807 #endif
7808
7809 /*
7810 * With that we have determined the number of bytes in actual use
7811 * by the object and redzoning.
7812 */
7813 s->inuse = size;
7814
7815 if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
7816 (flags & SLAB_POISON) || s->ctor ||
7817 ((flags & SLAB_RED_ZONE) &&
7818 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
7819 /*
7820 * Relocate free pointer after the object if it is not
7821 * permitted to overwrite the first word of the object on
7822 * kmem_cache_free.
7823 *
7824 * This is the case if we do RCU, have a constructor or
7825 * destructor, are poisoning the objects, or are
7826 * redzoning an object smaller than sizeof(void *) or are
7827 * redzoning an object with slub_debug_orig_size() enabled,
7828 * in which case the right redzone may be extended.
7829 *
7830 * The assumption that s->offset >= s->inuse means free
7831 * pointer is outside of the object is used in the
7832 * freeptr_outside_object() function. If that is no
7833 * longer true, the function needs to be modified.
7834 */
7835 s->offset = size;
7836 size += sizeof(void *);
7837 } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
7838 s->offset = args->freeptr_offset;
7839 } else {
7840 /*
7841 * Store freelist pointer near middle of object to keep
7842 * it away from the edges of the object to avoid small
7843 * sized over/underflows from neighboring allocations.
7844 */
7845 s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
7846 }
7847
7848 #ifdef CONFIG_SLUB_DEBUG
7849 if (flags & SLAB_STORE_USER) {
7850 /*
7851 * Need to store information about allocs and frees after
7852 * the object.
7853 */
7854 size += 2 * sizeof(struct track);
7855
7856 /* Save the original kmalloc request size */
7857 if (flags & SLAB_KMALLOC)
7858 size += sizeof(unsigned int);
7859 }
7860 #endif
7861
7862 kasan_cache_create(s, &size, &s->flags);
7863 #ifdef CONFIG_SLUB_DEBUG
7864 if (flags & SLAB_RED_ZONE) {
7865 /*
7866 * Add some empty padding so that we can catch
7867 * overwrites from earlier objects rather than let
7868 * tracking information or the free pointer be
7869 * corrupted if a user writes before the start
7870 * of the object.
7871 */
7872 size += sizeof(void *);
7873
7874 s->red_left_pad = sizeof(void *);
7875 s->red_left_pad = ALIGN(s->red_left_pad, s->align);
7876 size += s->red_left_pad;
7877 }
7878 #endif
7879
7880 /*
7881 * SLUB stores one object immediately after another beginning from
7882 * offset 0. In order to align the objects we have to simply size
7883 * each object to conform to the alignment.
7884 */
7885 size = ALIGN(size, s->align);
7886 s->size = size;
7887 s->reciprocal_size = reciprocal_value(size);
7888 order = calculate_order(size);
7889
7890 if ((int)order < 0)
7891 return 0;
7892
7893 s->allocflags = __GFP_COMP;
7894
7895 if (s->flags & SLAB_CACHE_DMA)
7896 s->allocflags |= GFP_DMA;
7897
7898 if (s->flags & SLAB_CACHE_DMA32)
7899 s->allocflags |= GFP_DMA32;
7900
7901 if (s->flags & SLAB_RECLAIM_ACCOUNT)
7902 s->allocflags |= __GFP_RECLAIMABLE;
7903
7904 /*
7905 * Determine the number of objects per slab
7906 */
7907 s->oo = oo_make(order, size);
7908 s->min = oo_make(get_order(size), size);
7909
7910 return !!oo_objects(s->oo);
7911 }
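/*
 * Editorial sketch of the resulting layout for a debug-enabled cache (field
 * order follows the size accounting above; widths are illustrative and
 * depend on the flags actually set):
 *
 *	[red_left_pad][object (object_size)][right redzone][free pointer]
 *	[alloc track][free track][orig_size (kmalloc caches)][padding to s->align]
 *
 * For a production cache without debug flags the slab object is simply the
 * object plus the free pointer (stored near the middle of the object, or
 * appended after it for RCU/ctor/poisoned caches), rounded up to s->align.
 */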
7912
7913 static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
7914 {
7915 #ifdef CONFIG_SLUB_DEBUG
7916 void *addr = slab_address(slab);
7917 void *p;
7918
7919 if (!slab_add_kunit_errors())
7920 slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
7921
7922 spin_lock(&object_map_lock);
7923 __fill_map(object_map, s, slab);
7924
7925 for_each_object(p, s, addr, slab->objects) {
7926
7927 if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
7928 if (slab_add_kunit_errors())
7929 continue;
7930 pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
7931 print_tracking(s, p);
7932 }
7933 }
7934 spin_unlock(&object_map_lock);
7935
7936 __slab_err(slab);
7937 #endif
7938 }
7939
7940 /*
7941 * Attempt to free all partial slabs on a node.
7942 * This is called from __kmem_cache_shutdown(). We must take list_lock
7943  * because a sysfs file might still access the partial list after shutdown.
7944 */
7945 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
7946 {
7947 LIST_HEAD(discard);
7948 struct slab *slab, *h;
7949
7950 BUG_ON(irqs_disabled());
7951 spin_lock_irq(&n->list_lock);
7952 list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
7953 if (!slab->inuse) {
7954 remove_partial(n, slab);
7955 list_add(&slab->slab_list, &discard);
7956 } else {
7957 list_slab_objects(s, slab);
7958 }
7959 }
7960 spin_unlock_irq(&n->list_lock);
7961
7962 list_for_each_entry_safe(slab, h, &discard, slab_list)
7963 discard_slab(s, slab);
7964 }
7965
7966 bool __kmem_cache_empty(struct kmem_cache *s)
7967 {
7968 int node;
7969 struct kmem_cache_node *n;
7970
7971 for_each_kmem_cache_node(s, node, n)
7972 if (n->nr_partial || node_nr_slabs(n))
7973 return false;
7974 return true;
7975 }
7976
7977 /*
7978 * Release all resources used by a slab cache.
7979 */
7980 int __kmem_cache_shutdown(struct kmem_cache *s)
7981 {
7982 int node;
7983 struct kmem_cache_node *n;
7984
7985 flush_all_cpus_locked(s);
7986
7987 /* we might have rcu sheaves in flight */
7988 if (s->cpu_sheaves)
7989 rcu_barrier();
7990
7991 /* Attempt to free all objects */
7992 for_each_kmem_cache_node(s, node, n) {
7993 if (n->barn)
7994 barn_shrink(s, n->barn);
7995 free_partial(s, n);
7996 if (n->nr_partial || node_nr_slabs(n))
7997 return 1;
7998 }
7999 return 0;
8000 }
8001
8002 #ifdef CONFIG_PRINTK
8003 void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
8004 {
8005 void *base;
8006 int __maybe_unused i;
8007 unsigned int objnr;
8008 void *objp;
8009 void *objp0;
8010 struct kmem_cache *s = slab->slab_cache;
8011 struct track __maybe_unused *trackp;
8012
8013 kpp->kp_ptr = object;
8014 kpp->kp_slab = slab;
8015 kpp->kp_slab_cache = s;
8016 base = slab_address(slab);
8017 objp0 = kasan_reset_tag(object);
8018 #ifdef CONFIG_SLUB_DEBUG
8019 objp = restore_red_left(s, objp0);
8020 #else
8021 objp = objp0;
8022 #endif
8023 objnr = obj_to_index(s, slab, objp);
8024 kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
8025 objp = base + s->size * objnr;
8026 kpp->kp_objp = objp;
8027 if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
8028 || (objp - base) % s->size) ||
8029 !(s->flags & SLAB_STORE_USER))
8030 return;
8031 #ifdef CONFIG_SLUB_DEBUG
8032 objp = fixup_red_left(s, objp);
8033 trackp = get_track(s, objp, TRACK_ALLOC);
8034 kpp->kp_ret = (void *)trackp->addr;
8035 #ifdef CONFIG_STACKDEPOT
8036 {
8037 depot_stack_handle_t handle;
8038 unsigned long *entries;
8039 unsigned int nr_entries;
8040
8041 handle = READ_ONCE(trackp->handle);
8042 if (handle) {
8043 nr_entries = stack_depot_fetch(handle, &entries);
8044 for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
8045 kpp->kp_stack[i] = (void *)entries[i];
8046 }
8047
8048 trackp = get_track(s, objp, TRACK_FREE);
8049 handle = READ_ONCE(trackp->handle);
8050 if (handle) {
8051 nr_entries = stack_depot_fetch(handle, &entries);
8052 for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
8053 kpp->kp_free_stack[i] = (void *)entries[i];
8054 }
8055 }
8056 #endif
8057 #endif
8058 }
8059 #endif
8060
8061 /********************************************************************
8062 * Kmalloc subsystem
8063 *******************************************************************/
8064
8065 static int __init setup_slub_min_order(char *str)
8066 {
8067 get_option(&str, (int *)&slub_min_order);
8068
8069 if (slub_min_order > slub_max_order)
8070 slub_max_order = slub_min_order;
8071
8072 return 1;
8073 }
8074
8075 __setup("slab_min_order=", setup_slub_min_order);
8076 __setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
8077
8078
8079 static int __init setup_slub_max_order(char *str)
8080 {
8081 get_option(&str, (int *)&slub_max_order);
8082 slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
8083
8084 if (slub_min_order > slub_max_order)
8085 slub_min_order = slub_max_order;
8086
8087 return 1;
8088 }
8089
8090 __setup("slab_max_order=", setup_slub_max_order);
8091 __setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
8092
8093 static int __init setup_slub_min_objects(char *str)
8094 {
8095 get_option(&str, (int *)&slub_min_objects);
8096
8097 return 1;
8098 }
8099
8100 __setup("slab_min_objects=", setup_slub_min_objects);
8101 __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
8102
8103 #ifdef CONFIG_NUMA
8104 static int __init setup_slab_strict_numa(char *str)
8105 {
8106 if (nr_node_ids > 1) {
8107 static_branch_enable(&strict_numa);
8108 pr_info("SLUB: Strict NUMA enabled.\n");
8109 } else {
8110 pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
8111 }
8112
8113 return 1;
8114 }
8115
8116 __setup("slab_strict_numa", setup_slab_strict_numa);
8117 #endif
8118
8119
8120 #ifdef CONFIG_HARDENED_USERCOPY
8121 /*
8122 * Rejects incorrectly sized objects and objects that are to be copied
8123 * to/from userspace but do not fall entirely within the containing slab
8124 * cache's usercopy region.
8125 *
8126  * Returns normally if the check passes; otherwise the copy is rejected
8127  * via usercopy_abort(), which reports the name of the offending cache.
8128 */
8129 void __check_heap_object(const void *ptr, unsigned long n,
8130 const struct slab *slab, bool to_user)
8131 {
8132 struct kmem_cache *s;
8133 unsigned int offset;
8134 bool is_kfence = is_kfence_address(ptr);
8135
8136 ptr = kasan_reset_tag(ptr);
8137
8138 /* Find object and usable object size. */
8139 s = slab->slab_cache;
8140
8141 /* Reject impossible pointers. */
8142 if (ptr < slab_address(slab))
8143 usercopy_abort("SLUB object not in SLUB page?!", NULL,
8144 to_user, 0, n);
8145
8146 /* Find offset within object. */
8147 if (is_kfence)
8148 offset = ptr - kfence_object_start(ptr);
8149 else
8150 offset = (ptr - slab_address(slab)) % s->size;
8151
8152 /* Adjust for redzone and reject if within the redzone. */
8153 if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
8154 if (offset < s->red_left_pad)
8155 usercopy_abort("SLUB object in left red zone",
8156 s->name, to_user, offset, n);
8157 offset -= s->red_left_pad;
8158 }
8159
8160 /* Allow address range falling entirely within usercopy region. */
8161 if (offset >= s->useroffset &&
8162 offset - s->useroffset <= s->usersize &&
8163 n <= s->useroffset - offset + s->usersize)
8164 return;
8165
8166 usercopy_abort("SLUB object", s->name, to_user, offset, n);
8167 }
8168 #endif /* CONFIG_HARDENED_USERCOPY */
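/*
 * Editorial worked example (hypothetical cache): with s->useroffset == 16
 * and s->usersize == 64, a copy touching the object at offset 32 with
 * length 40 is allowed, since 32 >= 16, 32 - 16 <= 64 and
 * 40 <= 16 - 32 + 64 = 48. A copy of length 56 at the same offset would
 * exceed the usercopy region and end in usercopy_abort().
 */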
8169
8170 #define SHRINK_PROMOTE_MAX 32
8171
8172 /*
8173 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
8174 * up most to the head of the partial lists. New allocations will then
8175 * fill those up and thus they can be removed from the partial lists.
8176 *
8177 * The slabs with the least items are placed last. This results in them
8178  * being allocated from last, increasing the chance that their remaining
8179  * objects are freed.
8180 */
8181 static int __kmem_cache_do_shrink(struct kmem_cache *s)
8182 {
8183 int node;
8184 int i;
8185 struct kmem_cache_node *n;
8186 struct slab *slab;
8187 struct slab *t;
8188 struct list_head discard;
8189 struct list_head promote[SHRINK_PROMOTE_MAX];
8190 unsigned long flags;
8191 int ret = 0;
8192
8193 for_each_kmem_cache_node(s, node, n) {
8194 INIT_LIST_HEAD(&discard);
8195 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
8196 INIT_LIST_HEAD(promote + i);
8197
8198 if (n->barn)
8199 barn_shrink(s, n->barn);
8200
8201 spin_lock_irqsave(&n->list_lock, flags);
8202
8203 /*
8204 * Build lists of slabs to discard or promote.
8205 *
8206 * Note that concurrent frees may occur while we hold the
8207 * list_lock. slab->inuse here is the upper limit.
8208 */
8209 list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
8210 int free = slab->objects - slab->inuse;
8211
8212 /* Do not reread slab->inuse */
8213 barrier();
8214
8215 /* We do not keep full slabs on the list */
8216 BUG_ON(free <= 0);
8217
8218 if (free == slab->objects) {
8219 list_move(&slab->slab_list, &discard);
8220 slab_clear_node_partial(slab);
8221 n->nr_partial--;
8222 dec_slabs_node(s, node, slab->objects);
8223 } else if (free <= SHRINK_PROMOTE_MAX)
8224 list_move(&slab->slab_list, promote + free - 1);
8225 }
8226
8227 /*
8228 * Promote the slabs filled up most to the head of the
8229 * partial list.
8230 */
8231 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
8232 list_splice(promote + i, &n->partial);
8233
8234 spin_unlock_irqrestore(&n->list_lock, flags);
8235
8236 /* Release empty slabs */
8237 list_for_each_entry_safe(slab, t, &discard, slab_list)
8238 free_slab(s, slab);
8239
8240 if (node_nr_slabs(n))
8241 ret = 1;
8242 }
8243
8244 return ret;
8245 }
8246
8247 int __kmem_cache_shrink(struct kmem_cache *s)
8248 {
8249 flush_all(s);
8250 return __kmem_cache_do_shrink(s);
8251 }
8252
8253 static int slab_mem_going_offline_callback(void)
8254 {
8255 struct kmem_cache *s;
8256
8257 mutex_lock(&slab_mutex);
8258 list_for_each_entry(s, &slab_caches, list) {
8259 flush_all_cpus_locked(s);
8260 __kmem_cache_do_shrink(s);
8261 }
8262 mutex_unlock(&slab_mutex);
8263
8264 return 0;
8265 }
8266
8267 static int slab_mem_going_online_callback(int nid)
8268 {
8269 struct kmem_cache_node *n;
8270 struct kmem_cache *s;
8271 int ret = 0;
8272
8273 /*
8274 * We are bringing a node online. No memory is available yet. We must
8275 * allocate a kmem_cache_node structure in order to bring the node
8276 * online.
8277 */
8278 mutex_lock(&slab_mutex);
8279 list_for_each_entry(s, &slab_caches, list) {
8280 struct node_barn *barn = NULL;
8281
8282 /*
8283 * The structure may already exist if the node was previously
8284 * onlined and offlined.
8285 */
8286 if (get_node(s, nid))
8287 continue;
8288
8289 if (s->cpu_sheaves) {
8290 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
8291
8292 if (!barn) {
8293 ret = -ENOMEM;
8294 goto out;
8295 }
8296 }
8297
8298 /*
8299 		 * XXX: kmem_cache_alloc_node will fall back to other nodes
8300 		 * since memory is not yet available from the node that
8301 		 * is being brought up.
8302 */
8303 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
8304 if (!n) {
8305 kfree(barn);
8306 ret = -ENOMEM;
8307 goto out;
8308 }
8309
8310 init_kmem_cache_node(n, barn);
8311
8312 s->node[nid] = n;
8313 }
8314 /*
8315 * Any cache created after this point will also have kmem_cache_node
8316 * initialized for the new node.
8317 */
8318 node_set(nid, slab_nodes);
8319 out:
8320 mutex_unlock(&slab_mutex);
8321 return ret;
8322 }
8323
8324 static int slab_memory_callback(struct notifier_block *self,
8325 unsigned long action, void *arg)
8326 {
8327 struct node_notify *nn = arg;
8328 int nid = nn->nid;
8329 int ret = 0;
8330
8331 switch (action) {
8332 case NODE_ADDING_FIRST_MEMORY:
8333 ret = slab_mem_going_online_callback(nid);
8334 break;
8335 case NODE_REMOVING_LAST_MEMORY:
8336 ret = slab_mem_going_offline_callback();
8337 break;
8338 }
8339 if (ret)
8340 ret = notifier_from_errno(ret);
8341 else
8342 ret = NOTIFY_OK;
8343 return ret;
8344 }
8345
8346 /********************************************************************
8347 * Basic setup of slabs
8348 *******************************************************************/
8349
8350 /*
8351 * Used for early kmem_cache structures that were allocated using
8352 * the page allocator. Allocate them properly then fix up the pointers
8353 * that may be pointing to the wrong kmem_cache structure.
8354 */
8355
8356 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
8357 {
8358 int node;
8359 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
8360 struct kmem_cache_node *n;
8361
8362 memcpy(s, static_cache, kmem_cache->object_size);
8363
8364 /*
8365 * This runs very early, and only the boot processor is supposed to be
8366 * up. Even if it weren't true, IRQs are not up so we couldn't fire
8367 * IPIs around.
8368 */
8369 __flush_cpu_slab(s, smp_processor_id());
8370 for_each_kmem_cache_node(s, node, n) {
8371 struct slab *p;
8372
8373 list_for_each_entry(p, &n->partial, slab_list)
8374 p->slab_cache = s;
8375
8376 #ifdef CONFIG_SLUB_DEBUG
8377 list_for_each_entry(p, &n->full, slab_list)
8378 p->slab_cache = s;
8379 #endif
8380 }
8381 list_add(&s->list, &slab_caches);
8382 return s;
8383 }
8384
8385 void __init kmem_cache_init(void)
8386 {
8387 static __initdata struct kmem_cache boot_kmem_cache,
8388 boot_kmem_cache_node;
8389 int node;
8390
8391 if (debug_guardpage_minorder())
8392 slub_max_order = 0;
8393
8394 /* Inform pointer hashing choice about slub debugging state. */
8395 hash_pointers_finalize(__slub_debug_enabled());
8396
8397 kmem_cache_node = &boot_kmem_cache_node;
8398 kmem_cache = &boot_kmem_cache;
8399
8400 /*
8401 * Initialize the nodemask for which we will allocate per node
8402 	 * structures. Here we don't need to take slab_mutex yet.
8403 */
8404 for_each_node_state(node, N_MEMORY)
8405 node_set(node, slab_nodes);
8406
8407 create_boot_cache(kmem_cache_node, "kmem_cache_node",
8408 sizeof(struct kmem_cache_node),
8409 SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
8410
8411 hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
8412
8413 /* Able to allocate the per node structures */
8414 slab_state = PARTIAL;
8415
8416 create_boot_cache(kmem_cache, "kmem_cache",
8417 offsetof(struct kmem_cache, node) +
8418 nr_node_ids * sizeof(struct kmem_cache_node *),
8419 SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
8420
8421 kmem_cache = bootstrap(&boot_kmem_cache);
8422 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
8423
8424 /* Now we can use the kmem_cache to allocate kmalloc slabs */
8425 setup_kmalloc_cache_index_table();
8426 create_kmalloc_caches();
8427
8428 /* Setup random freelists for each cache */
8429 init_freelist_randomization();
8430
8431 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
8432 slub_cpu_dead);
8433
8434 pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
8435 cache_line_size(),
8436 slub_min_order, slub_max_order, slub_min_objects,
8437 nr_cpu_ids, nr_node_ids);
8438 }
8439
8440 void __init kmem_cache_init_late(void)
8441 {
8442 #ifndef CONFIG_SLUB_TINY
8443 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
8444 WARN_ON(!flushwq);
8445 #endif
8446 }
8447
8448 struct kmem_cache *
8449 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
8450 slab_flags_t flags, void (*ctor)(void *))
8451 {
8452 struct kmem_cache *s;
8453
8454 s = find_mergeable(size, align, flags, name, ctor);
8455 if (s) {
8456 if (sysfs_slab_alias(s, name))
8457 pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
8458 name);
8459
8460 s->refcount++;
8461
8462 /*
8463 * Adjust the object sizes so that we clear
8464 * the complete object on kzalloc.
8465 */
8466 s->object_size = max(s->object_size, size);
8467 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
8468 }
8469
8470 return s;
8471 }
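/*
 * Editorial example (illustrative only; cache names are hypothetical): two
 * caches that are candidates for aliasing, assuming neither uses a
 * constructor, usercopy region or debug flags that defeat merging. Whether
 * they actually merge is decided by find_mergeable() from size, alignment
 * and flags.
 *
 *	foo_cache = kmem_cache_create("foo_cache", 192, 0, 0, NULL);
 *	bar_cache = kmem_cache_create("bar_cache", 192, 0, 0, NULL);
 *	// bar_cache may come back as an alias of foo_cache, visible as a
 *	// symlink under /sys/kernel/slab/.
 */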
8472
8473 int do_kmem_cache_create(struct kmem_cache *s, const char *name,
8474 unsigned int size, struct kmem_cache_args *args,
8475 slab_flags_t flags)
8476 {
8477 int err = -EINVAL;
8478
8479 s->name = name;
8480 s->size = s->object_size = size;
8481
8482 s->flags = kmem_cache_flags(flags, s->name);
8483 #ifdef CONFIG_SLAB_FREELIST_HARDENED
8484 s->random = get_random_long();
8485 #endif
8486 s->align = args->align;
8487 s->ctor = args->ctor;
8488 #ifdef CONFIG_HARDENED_USERCOPY
8489 s->useroffset = args->useroffset;
8490 s->usersize = args->usersize;
8491 #endif
8492
8493 if (!calculate_sizes(args, s))
8494 goto out;
8495 if (disable_higher_order_debug) {
8496 /*
8497 * Disable debugging flags that store metadata if the min slab
8498 * order increased.
8499 */
8500 if (get_order(s->size) > get_order(s->object_size)) {
8501 s->flags &= ~DEBUG_METADATA_FLAGS;
8502 s->offset = 0;
8503 if (!calculate_sizes(args, s))
8504 goto out;
8505 }
8506 }
8507
8508 #ifdef system_has_freelist_aba
8509 if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
8510 /* Enable fast mode */
8511 s->flags |= __CMPXCHG_DOUBLE;
8512 }
8513 #endif
8514
8515 /*
8516 * The larger the object size is, the more slabs we want on the partial
8517 * list to avoid pounding the page allocator excessively.
8518 */
8519 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
8520 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
8521
8522 set_cpu_partial(s);
8523
8524 if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
8525 && !(s->flags & SLAB_DEBUG_FLAGS)) {
8526 s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
8527 if (!s->cpu_sheaves) {
8528 err = -ENOMEM;
8529 goto out;
8530 }
8531 // TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
8532 s->sheaf_capacity = args->sheaf_capacity;
8533 }
8534
8535 #ifdef CONFIG_NUMA
8536 s->remote_node_defrag_ratio = 1000;
8537 #endif
8538
8539 /* Initialize the pre-computed randomized freelist if slab is up */
8540 if (slab_state >= UP) {
8541 if (init_cache_random_seq(s))
8542 goto out;
8543 }
8544
8545 if (!init_kmem_cache_nodes(s))
8546 goto out;
8547
8548 if (!alloc_kmem_cache_cpus(s))
8549 goto out;
8550
8551 if (s->cpu_sheaves) {
8552 err = init_percpu_sheaves(s);
8553 if (err)
8554 goto out;
8555 }
8556
8557 err = 0;
8558
8559 /* Mutex is not taken during early boot */
8560 if (slab_state <= UP)
8561 goto out;
8562
8563 /*
8564 * Failing to create sysfs files is not critical to SLUB functionality.
8565 * If it fails, proceed with cache creation without these files.
8566 */
8567 if (sysfs_slab_add(s))
8568 pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name);
8569
8570 if (s->flags & SLAB_STORE_USER)
8571 debugfs_slab_add(s);
8572
8573 out:
8574 if (err)
8575 __kmem_cache_release(s);
8576 return err;
8577 }
8578
8579 #ifdef SLAB_SUPPORTS_SYSFS
8580 static int count_inuse(struct slab *slab)
8581 {
8582 return slab->inuse;
8583 }
8584
8585 static int count_total(struct slab *slab)
8586 {
8587 return slab->objects;
8588 }
8589 #endif
8590
8591 #ifdef CONFIG_SLUB_DEBUG
8592 static void validate_slab(struct kmem_cache *s, struct slab *slab,
8593 unsigned long *obj_map)
8594 {
8595 void *p;
8596 void *addr = slab_address(slab);
8597
8598 if (!validate_slab_ptr(slab)) {
8599 slab_err(s, slab, "Not a valid slab page");
8600 return;
8601 }
8602
8603 if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
8604 return;
8605
8606 /* Now we know that a valid freelist exists */
8607 __fill_map(obj_map, s, slab);
8608 for_each_object(p, s, addr, slab->objects) {
8609 u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
8610 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
8611
8612 if (!check_object(s, slab, p, val))
8613 break;
8614 }
8615 }
8616
8617 static int validate_slab_node(struct kmem_cache *s,
8618 struct kmem_cache_node *n, unsigned long *obj_map)
8619 {
8620 unsigned long count = 0;
8621 struct slab *slab;
8622 unsigned long flags;
8623
8624 spin_lock_irqsave(&n->list_lock, flags);
8625
8626 list_for_each_entry(slab, &n->partial, slab_list) {
8627 validate_slab(s, slab, obj_map);
8628 count++;
8629 }
8630 if (count != n->nr_partial) {
8631 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
8632 s->name, count, n->nr_partial);
8633 slab_add_kunit_errors();
8634 }
8635
8636 if (!(s->flags & SLAB_STORE_USER))
8637 goto out;
8638
8639 list_for_each_entry(slab, &n->full, slab_list) {
8640 validate_slab(s, slab, obj_map);
8641 count++;
8642 }
8643 if (count != node_nr_slabs(n)) {
8644 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
8645 s->name, count, node_nr_slabs(n));
8646 slab_add_kunit_errors();
8647 }
8648
8649 out:
8650 spin_unlock_irqrestore(&n->list_lock, flags);
8651 return count;
8652 }
8653
8654 long validate_slab_cache(struct kmem_cache *s)
8655 {
8656 int node;
8657 unsigned long count = 0;
8658 struct kmem_cache_node *n;
8659 unsigned long *obj_map;
8660
8661 obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
8662 if (!obj_map)
8663 return -ENOMEM;
8664
8665 flush_all(s);
8666 for_each_kmem_cache_node(s, node, n)
8667 count += validate_slab_node(s, n, obj_map);
8668
8669 bitmap_free(obj_map);
8670
8671 return count;
8672 }
8673 EXPORT_SYMBOL(validate_slab_cache);
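/*
 * Editorial example (illustrative only): validate_slab_cache() is exported
 * mainly for the SLUB KUnit test. A hypothetical caller only needs to check
 * the sign of the result, which is the number of slabs walked or -ENOMEM if
 * the temporary object bitmap could not be allocated.
 *
 *	long nr = validate_slab_cache(my_cache);
 *
 *	if (nr < 0)
 *		pr_warn("slab validation skipped: %ld\n", nr);
 */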
8674
8675 #ifdef CONFIG_DEBUG_FS
8676 /*
8677 * Generate lists of code addresses where slabcache objects are allocated
8678 * and freed.
8679 */
8680
8681 struct location {
8682 depot_stack_handle_t handle;
8683 unsigned long count;
8684 unsigned long addr;
8685 unsigned long waste;
8686 long long sum_time;
8687 long min_time;
8688 long max_time;
8689 long min_pid;
8690 long max_pid;
8691 DECLARE_BITMAP(cpus, NR_CPUS);
8692 nodemask_t nodes;
8693 };
8694
8695 struct loc_track {
8696 unsigned long max;
8697 unsigned long count;
8698 struct location *loc;
8699 loff_t idx;
8700 };
8701
8702 static struct dentry *slab_debugfs_root;
8703
8704 static void free_loc_track(struct loc_track *t)
8705 {
8706 if (t->max)
8707 free_pages((unsigned long)t->loc,
8708 get_order(sizeof(struct location) * t->max));
8709 }
8710
8711 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
8712 {
8713 struct location *l;
8714 int order;
8715
8716 order = get_order(sizeof(struct location) * max);
8717
8718 l = (void *)__get_free_pages(flags, order);
8719 if (!l)
8720 return 0;
8721
8722 if (t->count) {
8723 memcpy(l, t->loc, sizeof(struct location) * t->count);
8724 free_loc_track(t);
8725 }
8726 t->max = max;
8727 t->loc = l;
8728 return 1;
8729 }
8730
8731 static int add_location(struct loc_track *t, struct kmem_cache *s,
8732 const struct track *track,
8733 unsigned int orig_size)
8734 {
8735 long start, end, pos;
8736 struct location *l;
8737 unsigned long caddr, chandle, cwaste;
8738 unsigned long age = jiffies - track->when;
8739 depot_stack_handle_t handle = 0;
8740 unsigned int waste = s->object_size - orig_size;
8741
8742 #ifdef CONFIG_STACKDEPOT
8743 handle = READ_ONCE(track->handle);
8744 #endif
8745 start = -1;
8746 end = t->count;
8747
8748 for ( ; ; ) {
8749 pos = start + (end - start + 1) / 2;
8750
8751 /*
8752 * There is nothing at "end". If we end up there
8753 		 * we need to add something before end.
8754 */
8755 if (pos == end)
8756 break;
8757
8758 l = &t->loc[pos];
8759 caddr = l->addr;
8760 chandle = l->handle;
8761 cwaste = l->waste;
8762 if ((track->addr == caddr) && (handle == chandle) &&
8763 (waste == cwaste)) {
8764
8765 l->count++;
8766 if (track->when) {
8767 l->sum_time += age;
8768 if (age < l->min_time)
8769 l->min_time = age;
8770 if (age > l->max_time)
8771 l->max_time = age;
8772
8773 if (track->pid < l->min_pid)
8774 l->min_pid = track->pid;
8775 if (track->pid > l->max_pid)
8776 l->max_pid = track->pid;
8777
8778 cpumask_set_cpu(track->cpu,
8779 to_cpumask(l->cpus));
8780 }
8781 node_set(page_to_nid(virt_to_page(track)), l->nodes);
8782 return 1;
8783 }
8784
8785 if (track->addr < caddr)
8786 end = pos;
8787 else if (track->addr == caddr && handle < chandle)
8788 end = pos;
8789 else if (track->addr == caddr && handle == chandle &&
8790 waste < cwaste)
8791 end = pos;
8792 else
8793 start = pos;
8794 }
8795
8796 /*
8797 * Not found. Insert new tracking element.
8798 */
8799 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
8800 return 0;
8801
8802 l = t->loc + pos;
8803 if (pos < t->count)
8804 memmove(l + 1, l,
8805 (t->count - pos) * sizeof(struct location));
8806 t->count++;
8807 l->count = 1;
8808 l->addr = track->addr;
8809 l->sum_time = age;
8810 l->min_time = age;
8811 l->max_time = age;
8812 l->min_pid = track->pid;
8813 l->max_pid = track->pid;
8814 l->handle = handle;
8815 l->waste = waste;
8816 cpumask_clear(to_cpumask(l->cpus));
8817 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
8818 nodes_clear(l->nodes);
8819 node_set(page_to_nid(virt_to_page(track)), l->nodes);
8820 return 1;
8821 }
8822
8823 static void process_slab(struct loc_track *t, struct kmem_cache *s,
8824 struct slab *slab, enum track_item alloc,
8825 unsigned long *obj_map)
8826 {
8827 void *addr = slab_address(slab);
8828 bool is_alloc = (alloc == TRACK_ALLOC);
8829 void *p;
8830
8831 __fill_map(obj_map, s, slab);
8832
8833 for_each_object(p, s, addr, slab->objects)
8834 if (!test_bit(__obj_to_index(s, addr, p), obj_map))
8835 add_location(t, s, get_track(s, p, alloc),
8836 is_alloc ? get_orig_size(s, p) :
8837 s->object_size);
8838 }
8839 #endif /* CONFIG_DEBUG_FS */
8840 #endif /* CONFIG_SLUB_DEBUG */
8841
8842 #ifdef SLAB_SUPPORTS_SYSFS
8843 enum slab_stat_type {
8844 SL_ALL, /* All slabs */
8845 SL_PARTIAL, /* Only partially allocated slabs */
8846 SL_CPU, /* Only slabs used for cpu caches */
8847 SL_OBJECTS, /* Determine allocated objects not slabs */
8848 SL_TOTAL /* Determine object capacity not slabs */
8849 };
8850
8851 #define SO_ALL (1 << SL_ALL)
8852 #define SO_PARTIAL (1 << SL_PARTIAL)
8853 #define SO_CPU (1 << SL_CPU)
8854 #define SO_OBJECTS (1 << SL_OBJECTS)
8855 #define SO_TOTAL (1 << SL_TOTAL)
8856
8857 static ssize_t show_slab_objects(struct kmem_cache *s,
8858 char *buf, unsigned long flags)
8859 {
8860 unsigned long total = 0;
8861 int node;
8862 int x;
8863 unsigned long *nodes;
8864 int len = 0;
8865
8866 nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
8867 if (!nodes)
8868 return -ENOMEM;
8869
8870 if (flags & SO_CPU) {
8871 int cpu;
8872
8873 for_each_possible_cpu(cpu) {
8874 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
8875 cpu);
8876 int node;
8877 struct slab *slab;
8878
8879 slab = READ_ONCE(c->slab);
8880 if (!slab)
8881 continue;
8882
8883 node = slab_nid(slab);
8884 if (flags & SO_TOTAL)
8885 x = slab->objects;
8886 else if (flags & SO_OBJECTS)
8887 x = slab->inuse;
8888 else
8889 x = 1;
8890
8891 total += x;
8892 nodes[node] += x;
8893
8894 #ifdef CONFIG_SLUB_CPU_PARTIAL
8895 slab = slub_percpu_partial_read_once(c);
8896 if (slab) {
8897 node = slab_nid(slab);
8898 if (flags & SO_TOTAL)
8899 WARN_ON_ONCE(1);
8900 else if (flags & SO_OBJECTS)
8901 WARN_ON_ONCE(1);
8902 else
8903 x = data_race(slab->slabs);
8904 total += x;
8905 nodes[node] += x;
8906 }
8907 #endif
8908 }
8909 }
8910
8911 /*
8912 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
8913 	 * already held, as that would conflict with an existing lock order:
8914 *
8915 * mem_hotplug_lock->slab_mutex->kernfs_mutex
8916 *
8917 * We don't really need mem_hotplug_lock (to hold off
8918 * slab_mem_going_offline_callback) here because slab's memory hot
8919 * unplug code doesn't destroy the kmem_cache->node[] data.
8920 */
8921
8922 #ifdef CONFIG_SLUB_DEBUG
8923 if (flags & SO_ALL) {
8924 struct kmem_cache_node *n;
8925
8926 for_each_kmem_cache_node(s, node, n) {
8927
8928 if (flags & SO_TOTAL)
8929 x = node_nr_objs(n);
8930 else if (flags & SO_OBJECTS)
8931 x = node_nr_objs(n) - count_partial(n, count_free);
8932 else
8933 x = node_nr_slabs(n);
8934 total += x;
8935 nodes[node] += x;
8936 }
8937
8938 } else
8939 #endif
8940 if (flags & SO_PARTIAL) {
8941 struct kmem_cache_node *n;
8942
8943 for_each_kmem_cache_node(s, node, n) {
8944 if (flags & SO_TOTAL)
8945 x = count_partial(n, count_total);
8946 else if (flags & SO_OBJECTS)
8947 x = count_partial(n, count_inuse);
8948 else
8949 x = n->nr_partial;
8950 total += x;
8951 nodes[node] += x;
8952 }
8953 }
8954
8955 len += sysfs_emit_at(buf, len, "%lu", total);
8956 #ifdef CONFIG_NUMA
8957 for (node = 0; node < nr_node_ids; node++) {
8958 if (nodes[node])
8959 len += sysfs_emit_at(buf, len, " N%d=%lu",
8960 node, nodes[node]);
8961 }
8962 #endif
8963 len += sysfs_emit_at(buf, len, "\n");
8964 kfree(nodes);
8965
8966 return len;
8967 }
8968
8969 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
8970 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
8971
8972 struct slab_attribute {
8973 struct attribute attr;
8974 ssize_t (*show)(struct kmem_cache *s, char *buf);
8975 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
8976 };
8977
8978 #define SLAB_ATTR_RO(_name) \
8979 static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
8980
8981 #define SLAB_ATTR(_name) \
8982 static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
8983
8984 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
8985 {
8986 return sysfs_emit(buf, "%u\n", s->size);
8987 }
8988 SLAB_ATTR_RO(slab_size);
8989
8990 static ssize_t align_show(struct kmem_cache *s, char *buf)
8991 {
8992 return sysfs_emit(buf, "%u\n", s->align);
8993 }
8994 SLAB_ATTR_RO(align);
8995
8996 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
8997 {
8998 return sysfs_emit(buf, "%u\n", s->object_size);
8999 }
9000 SLAB_ATTR_RO(object_size);
9001
9002 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
9003 {
9004 return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
9005 }
9006 SLAB_ATTR_RO(objs_per_slab);
9007
9008 static ssize_t order_show(struct kmem_cache *s, char *buf)
9009 {
9010 return sysfs_emit(buf, "%u\n", oo_order(s->oo));
9011 }
9012 SLAB_ATTR_RO(order);
9013
9014 static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
9015 {
9016 return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
9017 }
9018 SLAB_ATTR_RO(sheaf_capacity);
9019
9020 static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
9021 {
9022 return sysfs_emit(buf, "%lu\n", s->min_partial);
9023 }
9024
9025 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
9026 size_t length)
9027 {
9028 unsigned long min;
9029 int err;
9030
9031 err = kstrtoul(buf, 10, &min);
9032 if (err)
9033 return err;
9034
9035 s->min_partial = min;
9036 return length;
9037 }
9038 SLAB_ATTR(min_partial);
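/*
 * Usage sketch (the path assumes the "slab" kset registered under
 * kernel_kobj by slab_sysfs_init() below, i.e. /sys/kernel/slab):
 *
 *	cat /sys/kernel/slab/<cache>/min_partial
 *	echo 5 > /sys/kernel/slab/<cache>/min_partial
 */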
9039
9040 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
9041 {
9042 unsigned int nr_partial = 0;
9043 #ifdef CONFIG_SLUB_CPU_PARTIAL
9044 nr_partial = s->cpu_partial;
9045 #endif
9046
9047 return sysfs_emit(buf, "%u\n", nr_partial);
9048 }
9049
9050 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
9051 size_t length)
9052 {
9053 unsigned int objects;
9054 int err;
9055
9056 err = kstrtouint(buf, 10, &objects);
9057 if (err)
9058 return err;
9059 if (objects && !kmem_cache_has_cpu_partial(s))
9060 return -EINVAL;
9061
9062 slub_set_cpu_partial(s, objects);
9063 flush_all(s);
9064 return length;
9065 }
9066 SLAB_ATTR(cpu_partial);
9067
9068 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
9069 {
9070 if (!s->ctor)
9071 return 0;
9072 return sysfs_emit(buf, "%pS\n", s->ctor);
9073 }
9074 SLAB_ATTR_RO(ctor);
9075
9076 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
9077 {
9078 return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
9079 }
9080 SLAB_ATTR_RO(aliases);
9081
9082 static ssize_t partial_show(struct kmem_cache *s, char *buf)
9083 {
9084 return show_slab_objects(s, buf, SO_PARTIAL);
9085 }
9086 SLAB_ATTR_RO(partial);
9087
9088 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
9089 {
9090 return show_slab_objects(s, buf, SO_CPU);
9091 }
9092 SLAB_ATTR_RO(cpu_slabs);
9093
9094 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
9095 {
9096 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
9097 }
9098 SLAB_ATTR_RO(objects_partial);
9099
9100 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
9101 {
9102 int objects = 0;
9103 int slabs = 0;
9104 int cpu __maybe_unused;
9105 int len = 0;
9106
9107 #ifdef CONFIG_SLUB_CPU_PARTIAL
9108 for_each_online_cpu(cpu) {
9109 struct slab *slab;
9110
9111 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
9112
9113 if (slab)
9114 slabs += data_race(slab->slabs);
9115 }
9116 #endif
9117
9118 /* Approximate half-full slabs, see slub_set_cpu_partial() */
9119 objects = (slabs * oo_objects(s->oo)) / 2;
9120 len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
9121
9122 #ifdef CONFIG_SLUB_CPU_PARTIAL
9123 for_each_online_cpu(cpu) {
9124 struct slab *slab;
9125
9126 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
9127 if (slab) {
9128 slabs = data_race(slab->slabs);
9129 objects = (slabs * oo_objects(s->oo)) / 2;
9130 len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
9131 cpu, objects, slabs);
9132 }
9133 }
9134 #endif
9135 len += sysfs_emit_at(buf, len, "\n");
9136
9137 return len;
9138 }
9139 SLAB_ATTR_RO(slabs_cpu_partial);
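/*
 * Illustrative output of slabs_cpu_partial (hypothetical values): the
 * aggregate "objects(slabs)" estimate comes first, followed by one entry
 * per CPU that currently caches partial slabs, e.g.
 *
 *	96(12) C0=40(5) C3=56(7)
 */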
9140
9141 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
9142 {
9143 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
9144 }
9145 SLAB_ATTR_RO(reclaim_account);
9146
9147 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
9148 {
9149 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
9150 }
9151 SLAB_ATTR_RO(hwcache_align);
9152
9153 #ifdef CONFIG_ZONE_DMA
9154 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
9155 {
9156 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
9157 }
9158 SLAB_ATTR_RO(cache_dma);
9159 #endif
9160
9161 #ifdef CONFIG_HARDENED_USERCOPY
9162 static ssize_t usersize_show(struct kmem_cache *s, char *buf)
9163 {
9164 return sysfs_emit(buf, "%u\n", s->usersize);
9165 }
9166 SLAB_ATTR_RO(usersize);
9167 #endif
9168
9169 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
9170 {
9171 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
9172 }
9173 SLAB_ATTR_RO(destroy_by_rcu);
9174
9175 #ifdef CONFIG_SLUB_DEBUG
9176 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
9177 {
9178 return show_slab_objects(s, buf, SO_ALL);
9179 }
9180 SLAB_ATTR_RO(slabs);
9181
9182 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
9183 {
9184 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
9185 }
9186 SLAB_ATTR_RO(total_objects);
9187
9188 static ssize_t objects_show(struct kmem_cache *s, char *buf)
9189 {
9190 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
9191 }
9192 SLAB_ATTR_RO(objects);
9193
9194 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
9195 {
9196 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
9197 }
9198 SLAB_ATTR_RO(sanity_checks);
9199
9200 static ssize_t trace_show(struct kmem_cache *s, char *buf)
9201 {
9202 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
9203 }
9204 SLAB_ATTR_RO(trace);
9205
9206 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
9207 {
9208 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
9209 }
9210
9211 SLAB_ATTR_RO(red_zone);
9212
9213 static ssize_t poison_show(struct kmem_cache *s, char *buf)
9214 {
9215 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
9216 }
9217
9218 SLAB_ATTR_RO(poison);
9219
9220 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
9221 {
9222 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
9223 }
9224
9225 SLAB_ATTR_RO(store_user);
9226
9227 static ssize_t validate_show(struct kmem_cache *s, char *buf)
9228 {
9229 return 0;
9230 }
9231
9232 static ssize_t validate_store(struct kmem_cache *s,
9233 const char *buf, size_t length)
9234 {
9235 int ret = -EINVAL;
9236
9237 if (buf[0] == '1' && kmem_cache_debug(s)) {
9238 ret = validate_slab_cache(s);
9239 if (ret >= 0)
9240 ret = length;
9241 }
9242 return ret;
9243 }
9244 SLAB_ATTR(validate);
9245
9246 #endif /* CONFIG_SLUB_DEBUG */
9247
9248 #ifdef CONFIG_FAILSLAB
9249 static ssize_t failslab_show(struct kmem_cache *s, char *buf)
9250 {
9251 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
9252 }
9253
9254 static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
9255 size_t length)
9256 {
9257 if (s->refcount > 1)
9258 return -EINVAL;
9259
9260 if (buf[0] == '1')
9261 WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
9262 else
9263 WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
9264
9265 return length;
9266 }
9267 SLAB_ATTR(failslab);
9268 #endif
9269
9270 static ssize_t shrink_show(struct kmem_cache *s, char *buf)
9271 {
9272 return 0;
9273 }
9274
9275 static ssize_t shrink_store(struct kmem_cache *s,
9276 const char *buf, size_t length)
9277 {
9278 if (buf[0] == '1')
9279 kmem_cache_shrink(s);
9280 else
9281 return -EINVAL;
9282 return length;
9283 }
9284 SLAB_ATTR(shrink);
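/*
 * Usage sketch: writing '1' triggers kmem_cache_shrink(); other values are
 * rejected with -EINVAL, e.g.
 *
 *	echo 1 > /sys/kernel/slab/<cache>/shrink
 */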
9285
9286 #ifdef CONFIG_NUMA
9287 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
9288 {
9289 return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
9290 }
9291
9292 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
9293 const char *buf, size_t length)
9294 {
9295 unsigned int ratio;
9296 int err;
9297
9298 err = kstrtouint(buf, 10, &ratio);
9299 if (err)
9300 return err;
9301 if (ratio > 100)
9302 return -ERANGE;
9303
9304 s->remote_node_defrag_ratio = ratio * 10;
9305
9306 return length;
9307 }
9308 SLAB_ATTR(remote_node_defrag_ratio);
9309 #endif
9310
9311 #ifdef CONFIG_SLUB_STATS
9312 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
9313 {
9314 unsigned long sum = 0;
9315 int cpu;
9316 int len = 0;
9317 int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
9318
9319 if (!data)
9320 return -ENOMEM;
9321
9322 for_each_online_cpu(cpu) {
9323 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
9324
9325 data[cpu] = x;
9326 sum += x;
9327 }
9328
9329 len += sysfs_emit_at(buf, len, "%lu", sum);
9330
9331 #ifdef CONFIG_SMP
9332 for_each_online_cpu(cpu) {
9333 if (data[cpu])
9334 len += sysfs_emit_at(buf, len, " C%d=%u",
9335 cpu, data[cpu]);
9336 }
9337 #endif
9338 kfree(data);
9339 len += sysfs_emit_at(buf, len, "\n");
9340
9341 return len;
9342 }
9343
9344 static void clear_stat(struct kmem_cache *s, enum stat_item si)
9345 {
9346 int cpu;
9347
9348 for_each_online_cpu(cpu)
9349 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
9350 }
9351
9352 #define STAT_ATTR(si, text) \
9353 static ssize_t text##_show(struct kmem_cache *s, char *buf) \
9354 { \
9355 return show_stat(s, buf, si); \
9356 } \
9357 static ssize_t text##_store(struct kmem_cache *s, \
9358 const char *buf, size_t length) \
9359 { \
9360 if (buf[0] != '0') \
9361 return -EINVAL; \
9362 clear_stat(s, si); \
9363 return length; \
9364 } \
9365 SLAB_ATTR(text); \
9366
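/*
 * Each STAT_ATTR() below thus provides a read/write file: reading emits the
 * summed counter plus the non-zero per-CPU contributions (illustrative
 * form: "1234 C0=600 C1=634"), while writing '0' clears the counter on all
 * online CPUs via clear_stat().
 */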
9367 STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
9368 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
9369 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
9370 STAT_ATTR(FREE_PCS, free_cpu_sheaf);
9371 STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
9372 STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
9373 STAT_ATTR(FREE_FASTPATH, free_fastpath);
9374 STAT_ATTR(FREE_SLOWPATH, free_slowpath);
9375 STAT_ATTR(FREE_FROZEN, free_frozen);
9376 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
9377 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
9378 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
9379 STAT_ATTR(ALLOC_SLAB, alloc_slab);
9380 STAT_ATTR(ALLOC_REFILL, alloc_refill);
9381 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
9382 STAT_ATTR(FREE_SLAB, free_slab);
9383 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
9384 STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
9385 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
9386 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
9387 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
9388 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
9389 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
9390 STAT_ATTR(ORDER_FALLBACK, order_fallback);
9391 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
9392 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
9393 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
9394 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
9395 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
9396 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
9397 STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
9398 STAT_ATTR(SHEAF_REFILL, sheaf_refill);
9399 STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
9400 STAT_ATTR(SHEAF_FREE, sheaf_free);
9401 STAT_ATTR(BARN_GET, barn_get);
9402 STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
9403 STAT_ATTR(BARN_PUT, barn_put);
9404 STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
9405 STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
9406 STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
9407 STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
9408 STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
9409 STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
9410 #endif /* CONFIG_SLUB_STATS */
9411
9412 #ifdef CONFIG_KFENCE
9413 static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
9414 {
9415 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
9416 }
9417
9418 static ssize_t skip_kfence_store(struct kmem_cache *s,
9419 const char *buf, size_t length)
9420 {
9421 int ret = length;
9422
9423 if (buf[0] == '0')
9424 s->flags &= ~SLAB_SKIP_KFENCE;
9425 else if (buf[0] == '1')
9426 s->flags |= SLAB_SKIP_KFENCE;
9427 else
9428 ret = -EINVAL;
9429
9430 return ret;
9431 }
9432 SLAB_ATTR(skip_kfence);
9433 #endif
9434
9435 static struct attribute *slab_attrs[] = {
9436 &slab_size_attr.attr,
9437 &object_size_attr.attr,
9438 &objs_per_slab_attr.attr,
9439 &order_attr.attr,
9440 &sheaf_capacity_attr.attr,
9441 &min_partial_attr.attr,
9442 &cpu_partial_attr.attr,
9443 &objects_partial_attr.attr,
9444 &partial_attr.attr,
9445 &cpu_slabs_attr.attr,
9446 &ctor_attr.attr,
9447 &aliases_attr.attr,
9448 &align_attr.attr,
9449 &hwcache_align_attr.attr,
9450 &reclaim_account_attr.attr,
9451 &destroy_by_rcu_attr.attr,
9452 &shrink_attr.attr,
9453 &slabs_cpu_partial_attr.attr,
9454 #ifdef CONFIG_SLUB_DEBUG
9455 &total_objects_attr.attr,
9456 &objects_attr.attr,
9457 &slabs_attr.attr,
9458 &sanity_checks_attr.attr,
9459 &trace_attr.attr,
9460 &red_zone_attr.attr,
9461 &poison_attr.attr,
9462 &store_user_attr.attr,
9463 &validate_attr.attr,
9464 #endif
9465 #ifdef CONFIG_ZONE_DMA
9466 &cache_dma_attr.attr,
9467 #endif
9468 #ifdef CONFIG_NUMA
9469 &remote_node_defrag_ratio_attr.attr,
9470 #endif
9471 #ifdef CONFIG_SLUB_STATS
9472 &alloc_cpu_sheaf_attr.attr,
9473 &alloc_fastpath_attr.attr,
9474 &alloc_slowpath_attr.attr,
9475 &free_cpu_sheaf_attr.attr,
9476 &free_rcu_sheaf_attr.attr,
9477 &free_rcu_sheaf_fail_attr.attr,
9478 &free_fastpath_attr.attr,
9479 &free_slowpath_attr.attr,
9480 &free_frozen_attr.attr,
9481 &free_add_partial_attr.attr,
9482 &free_remove_partial_attr.attr,
9483 &alloc_from_partial_attr.attr,
9484 &alloc_slab_attr.attr,
9485 &alloc_refill_attr.attr,
9486 &alloc_node_mismatch_attr.attr,
9487 &free_slab_attr.attr,
9488 &cpuslab_flush_attr.attr,
9489 &deactivate_full_attr.attr,
9490 &deactivate_empty_attr.attr,
9491 &deactivate_to_head_attr.attr,
9492 &deactivate_to_tail_attr.attr,
9493 &deactivate_remote_frees_attr.attr,
9494 &deactivate_bypass_attr.attr,
9495 &order_fallback_attr.attr,
9496 &cmpxchg_double_fail_attr.attr,
9497 &cmpxchg_double_cpu_fail_attr.attr,
9498 &cpu_partial_alloc_attr.attr,
9499 &cpu_partial_free_attr.attr,
9500 &cpu_partial_node_attr.attr,
9501 &cpu_partial_drain_attr.attr,
9502 &sheaf_flush_attr.attr,
9503 &sheaf_refill_attr.attr,
9504 &sheaf_alloc_attr.attr,
9505 &sheaf_free_attr.attr,
9506 &barn_get_attr.attr,
9507 &barn_get_fail_attr.attr,
9508 &barn_put_attr.attr,
9509 &barn_put_fail_attr.attr,
9510 &sheaf_prefill_fast_attr.attr,
9511 &sheaf_prefill_slow_attr.attr,
9512 &sheaf_prefill_oversize_attr.attr,
9513 &sheaf_return_fast_attr.attr,
9514 &sheaf_return_slow_attr.attr,
9515 #endif
9516 #ifdef CONFIG_FAILSLAB
9517 &failslab_attr.attr,
9518 #endif
9519 #ifdef CONFIG_HARDENED_USERCOPY
9520 &usersize_attr.attr,
9521 #endif
9522 #ifdef CONFIG_KFENCE
9523 &skip_kfence_attr.attr,
9524 #endif
9525
9526 NULL
9527 };
9528
9529 static const struct attribute_group slab_attr_group = {
9530 .attrs = slab_attrs,
9531 };
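/*
 * The attributes collected above show up as one file each under the cache's
 * kobject directory, e.g. /sys/kernel/slab/<cache>/order (the directory
 * name follows sysfs_slab_add()/create_unique_id() below; the path is
 * illustrative).
 */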
9532
9533 static ssize_t slab_attr_show(struct kobject *kobj,
9534 struct attribute *attr,
9535 char *buf)
9536 {
9537 struct slab_attribute *attribute;
9538 struct kmem_cache *s;
9539
9540 attribute = to_slab_attr(attr);
9541 s = to_slab(kobj);
9542
9543 if (!attribute->show)
9544 return -EIO;
9545
9546 return attribute->show(s, buf);
9547 }
9548
9549 static ssize_t slab_attr_store(struct kobject *kobj,
9550 struct attribute *attr,
9551 const char *buf, size_t len)
9552 {
9553 struct slab_attribute *attribute;
9554 struct kmem_cache *s;
9555
9556 attribute = to_slab_attr(attr);
9557 s = to_slab(kobj);
9558
9559 if (!attribute->store)
9560 return -EIO;
9561
9562 return attribute->store(s, buf, len);
9563 }
9564
9565 static void kmem_cache_release(struct kobject *k)
9566 {
9567 slab_kmem_cache_release(to_slab(k));
9568 }
9569
9570 static const struct sysfs_ops slab_sysfs_ops = {
9571 .show = slab_attr_show,
9572 .store = slab_attr_store,
9573 };
9574
9575 static const struct kobj_type slab_ktype = {
9576 .sysfs_ops = &slab_sysfs_ops,
9577 .release = kmem_cache_release,
9578 };
9579
9580 static struct kset *slab_kset;
9581
9582 static inline struct kset *cache_kset(struct kmem_cache *s)
9583 {
9584 return slab_kset;
9585 }
9586
9587 #define ID_STR_LENGTH 32
9588
9589 /* Create a unique string id for a slab cache:
9590 *
9591 * Format: ":[flags-]size"
9592 */
9593 static char *create_unique_id(struct kmem_cache *s)
9594 {
9595 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
9596 char *p = name;
9597
9598 if (!name)
9599 return ERR_PTR(-ENOMEM);
9600
9601 *p++ = ':';
9602 /*
9603 * First come the flags affecting slab cache operations. We will only
9604 * get here for aliasable slabs so we do not need to support
9605 * too many flags. The flags here must cover all flags that
9606 * are matched during merging to guarantee that the id is
9607 * unique.
9608 */
9609 if (s->flags & SLAB_CACHE_DMA)
9610 *p++ = 'd';
9611 if (s->flags & SLAB_CACHE_DMA32)
9612 *p++ = 'D';
9613 if (s->flags & SLAB_RECLAIM_ACCOUNT)
9614 *p++ = 'a';
9615 if (s->flags & SLAB_CONSISTENCY_CHECKS)
9616 *p++ = 'F';
9617 if (s->flags & SLAB_ACCOUNT)
9618 *p++ = 'A';
9619 if (p != name + 1)
9620 *p++ = '-';
9621 p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
9622
9623 if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
9624 kfree(name);
9625 return ERR_PTR(-EINVAL);
9626 }
9627 kmsan_unpoison_memory(name, p - name);
9628 return name;
9629 }
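/*
 * For example (illustrative): a mergeable cache of size 192 with
 * SLAB_CACHE_DMA and SLAB_ACCOUNT set would get the id ":dA-0000192",
 * while one with none of the matched flags would get ":0000192".
 */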
9630
9631 static int sysfs_slab_add(struct kmem_cache *s)
9632 {
9633 int err;
9634 const char *name;
9635 struct kset *kset = cache_kset(s);
9636 int unmergeable = slab_unmergeable(s);
9637
9638 if (!unmergeable && disable_higher_order_debug &&
9639 (slub_debug & DEBUG_METADATA_FLAGS))
9640 unmergeable = 1;
9641
9642 if (unmergeable) {
9643 /*
9644 * The slab cache can never be merged, so we can use its name as-is.
9645 * This is typically the case in debug situations, where it also lets
9646 * us catch duplicate names easily.
9647 */
9648 sysfs_remove_link(&slab_kset->kobj, s->name);
9649 name = s->name;
9650 } else {
9651 /*
9652 * Create a unique name for the slab as a target
9653 * for the symlinks.
9654 */
9655 name = create_unique_id(s);
9656 if (IS_ERR(name))
9657 return PTR_ERR(name);
9658 }
9659
9660 s->kobj.kset = kset;
9661 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
9662 if (err)
9663 goto out;
9664
9665 err = sysfs_create_group(&s->kobj, &slab_attr_group);
9666 if (err)
9667 goto out_del_kobj;
9668
9669 if (!unmergeable) {
9670 /* Setup first alias */
9671 sysfs_slab_alias(s, s->name);
9672 }
9673 out:
9674 if (!unmergeable)
9675 kfree(name);
9676 return err;
9677 out_del_kobj:
9678 kobject_del(&s->kobj);
9679 goto out;
9680 }
9681
9682 void sysfs_slab_unlink(struct kmem_cache *s)
9683 {
9684 if (s->kobj.state_in_sysfs)
9685 kobject_del(&s->kobj);
9686 }
9687
9688 void sysfs_slab_release(struct kmem_cache *s)
9689 {
9690 kobject_put(&s->kobj);
9691 }
9692
9693 /*
9694 * Need to buffer aliases during bootup until sysfs becomes
9695 * available lest we lose that information.
9696 */
9697 struct saved_alias {
9698 struct kmem_cache *s;
9699 const char *name;
9700 struct saved_alias *next;
9701 };
9702
9703 static struct saved_alias *alias_list;
9704
9705 static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
9706 {
9707 struct saved_alias *al;
9708
9709 if (slab_state == FULL) {
9710 /*
9711 * If we have a leftover link then remove it.
9712 */
9713 sysfs_remove_link(&slab_kset->kobj, name);
9714 /*
9715 * The original cache may have failed to generate its sysfs file.
9716 * In that case sysfs_create_link() returns -ENOENT and the
9717 * symbolic link creation is skipped.
9718 */
9719 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
9720 }
9721
9722 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
9723 if (!al)
9724 return -ENOMEM;
9725
9726 al->s = s;
9727 al->name = name;
9728 al->next = alias_list;
9729 alias_list = al;
9730 kmsan_unpoison_memory(al, sizeof(*al));
9731 return 0;
9732 }
9733
9734 static int __init slab_sysfs_init(void)
9735 {
9736 struct kmem_cache *s;
9737 int err;
9738
9739 mutex_lock(&slab_mutex);
9740
9741 slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
9742 if (!slab_kset) {
9743 mutex_unlock(&slab_mutex);
9744 pr_err("Cannot register slab subsystem.\n");
9745 return -ENOMEM;
9746 }
9747
9748 slab_state = FULL;
9749
9750 list_for_each_entry(s, &slab_caches, list) {
9751 err = sysfs_slab_add(s);
9752 if (err)
9753 pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
9754 s->name);
9755 }
9756
9757 while (alias_list) {
9758 struct saved_alias *al = alias_list;
9759
9760 alias_list = alias_list->next;
9761 err = sysfs_slab_alias(al->s, al->name);
9762 if (err)
9763 pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
9764 al->name);
9765 kfree(al);
9766 }
9767
9768 mutex_unlock(&slab_mutex);
9769 return 0;
9770 }
9771 late_initcall(slab_sysfs_init);
9772 #endif /* SLAB_SUPPORTS_SYSFS */
9773
9774 #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
9775 static int slab_debugfs_show(struct seq_file *seq, void *v)
9776 {
9777 struct loc_track *t = seq->private;
9778 struct location *l;
9779 unsigned long idx;
9780
9781 idx = (unsigned long) t->idx;
9782 if (idx < t->count) {
9783 l = &t->loc[idx];
9784
9785 seq_printf(seq, "%7ld ", l->count);
9786
9787 if (l->addr)
9788 seq_printf(seq, "%pS", (void *)l->addr);
9789 else
9790 seq_puts(seq, "<not-available>");
9791
9792 if (l->waste)
9793 seq_printf(seq, " waste=%lu/%lu",
9794 l->count * l->waste, l->waste);
9795
9796 if (l->sum_time != l->min_time) {
9797 seq_printf(seq, " age=%ld/%llu/%ld",
9798 l->min_time, div_u64(l->sum_time, l->count),
9799 l->max_time);
9800 } else
9801 seq_printf(seq, " age=%ld", l->min_time);
9802
9803 if (l->min_pid != l->max_pid)
9804 seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
9805 else
9806 seq_printf(seq, " pid=%ld",
9807 l->min_pid);
9808
9809 if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
9810 seq_printf(seq, " cpus=%*pbl",
9811 cpumask_pr_args(to_cpumask(l->cpus)));
9812
9813 if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
9814 seq_printf(seq, " nodes=%*pbl",
9815 nodemask_pr_args(&l->nodes));
9816
9817 #ifdef CONFIG_STACKDEPOT
9818 {
9819 depot_stack_handle_t handle;
9820 unsigned long *entries;
9821 unsigned int nr_entries, j;
9822
9823 handle = READ_ONCE(l->handle);
9824 if (handle) {
9825 nr_entries = stack_depot_fetch(handle, &entries);
9826 seq_puts(seq, "\n");
9827 for (j = 0; j < nr_entries; j++)
9828 seq_printf(seq, " %pS\n", (void *)entries[j]);
9829 }
9830 }
9831 #endif
9832 seq_puts(seq, "\n");
9833 }
9834
9835 if (!idx && !t->count)
9836 seq_puts(seq, "No data\n");
9837
9838 return 0;
9839 }
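/*
 * A record emitted above looks roughly like the following (all symbol names
 * and values are hypothetical), with the indented frames only present when
 * a stack depot handle was recorded:
 *
 *	   1523 kmem_cache_alloc+0x1a2/0x2f0 age=12/340/2801 pid=1-1432 cpus=0-3
 *	 some_caller+0x40/0x80
 */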
9840
9841 static void slab_debugfs_stop(struct seq_file *seq, void *v)
9842 {
9843 }
9844
9845 static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
9846 {
9847 struct loc_track *t = seq->private;
9848
9849 t->idx = ++(*ppos);
9850 if (*ppos <= t->count)
9851 return ppos;
9852
9853 return NULL;
9854 }
9855
9856 static int cmp_loc_by_count(const void *a, const void *b)
9857 {
9858 struct location *loc1 = (struct location *)a;
9859 struct location *loc2 = (struct location *)b;
9860
9861 return cmp_int(loc2->count, loc1->count);
9862 }
9863
9864 static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
9865 {
9866 struct loc_track *t = seq->private;
9867
9868 t->idx = *ppos;
9869 return ppos;
9870 }
9871
9872 static const struct seq_operations slab_debugfs_sops = {
9873 .start = slab_debugfs_start,
9874 .next = slab_debugfs_next,
9875 .stop = slab_debugfs_stop,
9876 .show = slab_debugfs_show,
9877 };
9878
9879 static int slab_debug_trace_open(struct inode *inode, struct file *filep)
9880 {
9881
9882 struct kmem_cache_node *n;
9883 enum track_item alloc;
9884 int node;
9885 struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
9886 sizeof(struct loc_track));
9887 struct kmem_cache *s = file_inode(filep)->i_private;
9888 unsigned long *obj_map;
9889
9890 if (!t)
9891 return -ENOMEM;
9892
9893 obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
9894 if (!obj_map) {
9895 seq_release_private(inode, filep);
9896 return -ENOMEM;
9897 }
9898
9899 alloc = debugfs_get_aux_num(filep);
9900
9901 if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
9902 bitmap_free(obj_map);
9903 seq_release_private(inode, filep);
9904 return -ENOMEM;
9905 }
9906
9907 for_each_kmem_cache_node(s, node, n) {
9908 unsigned long flags;
9909 struct slab *slab;
9910
9911 if (!node_nr_slabs(n))
9912 continue;
9913
9914 spin_lock_irqsave(&n->list_lock, flags);
9915 list_for_each_entry(slab, &n->partial, slab_list)
9916 process_slab(t, s, slab, alloc, obj_map);
9917 list_for_each_entry(slab, &n->full, slab_list)
9918 process_slab(t, s, slab, alloc, obj_map);
9919 spin_unlock_irqrestore(&n->list_lock, flags);
9920 }
9921
9922 /* Sort locations by count */
9923 sort(t->loc, t->count, sizeof(struct location),
9924 cmp_loc_by_count, NULL);
9925
9926 bitmap_free(obj_map);
9927 return 0;
9928 }
9929
9930 static int slab_debug_trace_release(struct inode *inode, struct file *file)
9931 {
9932 struct seq_file *seq = file->private_data;
9933 struct loc_track *t = seq->private;
9934
9935 free_loc_track(t);
9936 return seq_release_private(inode, file);
9937 }
9938
9939 static const struct file_operations slab_debugfs_fops = {
9940 .open = slab_debug_trace_open,
9941 .read = seq_read,
9942 .llseek = seq_lseek,
9943 .release = slab_debug_trace_release,
9944 };
9945
9946 static void debugfs_slab_add(struct kmem_cache *s)
9947 {
9948 struct dentry *slab_cache_dir;
9949
9950 if (unlikely(!slab_debugfs_root))
9951 return;
9952
9953 slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
9954
9955 debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
9956 TRACK_ALLOC, &slab_debugfs_fops);
9957
9958 debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
9959 TRACK_FREE, &slab_debugfs_fops);
9960 }
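/*
 * With debugfs mounted at its usual location this results in (illustrative
 * paths):
 *
 *	/sys/kernel/debug/slab/<cache>/alloc_traces
 *	/sys/kernel/debug/slab/<cache>/free_traces
 */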
9961
9962 void debugfs_slab_release(struct kmem_cache *s)
9963 {
9964 debugfs_lookup_and_remove(s->name, slab_debugfs_root);
9965 }
9966
9967 static int __init slab_debugfs_init(void)
9968 {
9969 struct kmem_cache *s;
9970
9971 slab_debugfs_root = debugfs_create_dir("slab", NULL);
9972
9973 list_for_each_entry(s, &slab_caches, list)
9974 if (s->flags & SLAB_STORE_USER)
9975 debugfs_slab_add(s);
9976
9977 return 0;
9978
9979 }
9980 __initcall(slab_debugfs_init);
9981 #endif
9982 /*
9983 * The /proc/slabinfo ABI
9984 */
9985 #ifdef CONFIG_SLUB_DEBUG
9986 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
9987 {
9988 unsigned long nr_slabs = 0;
9989 unsigned long nr_objs = 0;
9990 unsigned long nr_free = 0;
9991 int node;
9992 struct kmem_cache_node *n;
9993
9994 for_each_kmem_cache_node(s, node, n) {
9995 nr_slabs += node_nr_slabs(n);
9996 nr_objs += node_nr_objs(n);
9997 nr_free += count_partial_free_approx(n);
9998 }
9999
10000 sinfo->active_objs = nr_objs - nr_free;
10001 sinfo->num_objs = nr_objs;
10002 sinfo->active_slabs = nr_slabs;
10003 sinfo->num_slabs = nr_slabs;
10004 sinfo->objects_per_slab = oo_objects(s->oo);
10005 sinfo->cache_order = oo_order(s->oo);
10006 }
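/*
 * These fields feed the common /proc/slabinfo formatting in slab_common.c;
 * a resulting line looks roughly like (hypothetical values):
 *
 *	kmalloc-64  181440 181440   64   64    1 : tunables 0 0 0 : slabdata 2835 2835 0
 */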
10007 #endif /* CONFIG_SLUB_DEBUG */
10008