xref: /linux/mm/slub.c (revision c5e8e93897b7bb0a336bf3332f82f8d9f2b33f14)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * SLUB: A slab allocator that limits cache line use instead of queuing
4   * objects in per cpu and per node lists.
5   *
6   * The allocator synchronizes using per slab locks or atomic operations
7   * and only uses a centralized lock to manage a pool of partial slabs.
8   *
9   * (C) 2007 SGI, Christoph Lameter
10   * (C) 2011 Linux Foundation, Christoph Lameter
11   */
12  
13  #include <linux/mm.h>
14  #include <linux/swap.h> /* mm_account_reclaimed_pages() */
15  #include <linux/module.h>
16  #include <linux/bit_spinlock.h>
17  #include <linux/interrupt.h>
18  #include <linux/swab.h>
19  #include <linux/bitops.h>
20  #include <linux/slab.h>
21  #include "slab.h"
22  #include <linux/proc_fs.h>
23  #include <linux/seq_file.h>
24  #include <linux/kasan.h>
25  #include <linux/kmsan.h>
26  #include <linux/cpu.h>
27  #include <linux/cpuset.h>
28  #include <linux/mempolicy.h>
29  #include <linux/ctype.h>
30  #include <linux/stackdepot.h>
31  #include <linux/debugobjects.h>
32  #include <linux/kallsyms.h>
33  #include <linux/kfence.h>
34  #include <linux/memory.h>
35  #include <linux/math64.h>
36  #include <linux/fault-inject.h>
37  #include <linux/kmemleak.h>
38  #include <linux/stacktrace.h>
39  #include <linux/prefetch.h>
40  #include <linux/memcontrol.h>
41  #include <linux/random.h>
42  #include <kunit/test.h>
43  #include <kunit/test-bug.h>
44  #include <linux/sort.h>
45  
46  #include <linux/debugfs.h>
47  #include <trace/events/kmem.h>
48  
49  #include "internal.h"
50  
51  /*
52   * Lock order:
53   *   1. slab_mutex (Global Mutex)
54   *   2. node->list_lock (Spinlock)
55   *   3. kmem_cache->cpu_slab->lock (Local lock)
56   *   4. slab_lock(slab) (Only on some arches)
57   *   5. object_map_lock (Only for debugging)
58   *
59   *   slab_mutex
60   *
61   *   The role of the slab_mutex is to protect the list of all the slabs
62   *   and to synchronize major metadata changes to slab cache structures.
63   *   Also synchronizes memory hotplug callbacks.
64   *
65   *   slab_lock
66   *
67   *   The slab_lock is a wrapper around the page lock, thus it is a bit
68   *   spinlock.
69   *
70   *   The slab_lock is only used on arches that do not have the ability
71   *   to do a cmpxchg_double. It only protects:
72   *
73   *	A. slab->freelist	-> List of free objects in a slab
74   *	B. slab->inuse		-> Number of objects in use
75   *	C. slab->objects	-> Number of objects in slab
76   *	D. slab->frozen		-> frozen state
77   *
78   *   Frozen slabs
79   *
80   *   If a slab is frozen then it is exempt from list management. It is
81   *   the cpu slab which is actively allocated from by the processor that
82   *   froze it and it is not on any list. The processor that froze the
83   *   slab is the one who can perform list operations on the slab. Other
84   *   processors may put objects onto the freelist but the processor that
85   *   froze the slab is the only one that can retrieve the objects from the
86   *   slab's freelist.
87   *
88   *   CPU partial slabs
89   *
90   *   The partially empty slabs cached on the CPU partial list are used
91   *   for performance reasons, which speeds up the allocation process.
92   *   These slabs are not frozen, but are also exempt from list management,
93   *   by clearing the PG_workingset flag when moving out of the node
94   *   partial list. Please see __slab_free() for more details.
95   *
96   *   To sum up, the current scheme is:
97   *   - node partial slab: PG_Workingset && !frozen
98   *   - cpu partial slab: !PG_Workingset && !frozen
99   *   - cpu slab: !PG_Workingset && frozen
100   *   - full slab: !PG_Workingset && !frozen
101   *
102   *   list_lock
103   *
104   *   The list_lock protects the partial and full list on each node and
105   *   the partial slab counter. If taken then no new slabs may be added or
106   *   removed from the lists nor make the number of partial slabs be modified.
107   *   (Note that the total number of slabs is an atomic value that may be
108   *   modified without taking the list lock).
109   *
110   *   The list_lock is a centralized lock and thus we avoid taking it as
111   *   much as possible. As long as SLUB does not have to handle partial
112   *   slabs, operations can continue without any centralized lock. F.e.
113   *   allocating a long series of objects that fill up slabs does not require
114   *   the list lock.
115   *
116   *   For debug caches, all allocations are forced to go through a list_lock
117   *   protected region to serialize against concurrent validation.
118   *
119   *   cpu_slab->lock local lock
120   *
121   *   This locks protect slowpath manipulation of all kmem_cache_cpu fields
122   *   except the stat counters. This is a percpu structure manipulated only by
123   *   the local cpu, so the lock protects against being preempted or interrupted
124   *   by an irq. Fast path operations rely on lockless operations instead.
125   *
126   *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
127   *   which means the lockless fastpath cannot be used as it might interfere with
128   *   an in-progress slow path operations. In this case the local lock is always
129   *   taken but it still utilizes the freelist for the common operations.
130   *
131   *   lockless fastpaths
132   *
133   *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
134   *   are fully lockless when satisfied from the percpu slab (and when
135   *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
136   *   They also don't disable preemption or migration or irqs. They rely on
137   *   the transaction id (tid) field to detect being preempted or moved to
138   *   another cpu.
139   *
140   *   irq, preemption, migration considerations
141   *
142   *   Interrupts are disabled as part of list_lock or local_lock operations, or
143   *   around the slab_lock operation, in order to make the slab allocator safe
144   *   to use in the context of an irq.
145   *
146   *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
147   *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
148   *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
149   *   doesn't have to be revalidated in each section protected by the local lock.
150   *
151   * SLUB assigns one slab for allocation to each processor.
152   * Allocations only occur from these slabs called cpu slabs.
153   *
154   * Slabs with free elements are kept on a partial list and during regular
155   * operations no list for full slabs is used. If an object in a full slab is
156   * freed then the slab will show up again on the partial lists.
157   * We track full slabs for debugging purposes though because otherwise we
158   * cannot scan all objects.
159   *
160   * Slabs are freed when they become empty. Teardown and setup is
161   * minimal so we rely on the page allocators per cpu caches for
162   * fast frees and allocs.
163   *
164   * slab->frozen		The slab is frozen and exempt from list processing.
165   * 			This means that the slab is dedicated to a purpose
166   * 			such as satisfying allocations for a specific
167   * 			processor. Objects may be freed in the slab while
168   * 			it is frozen but slab_free will then skip the usual
169   * 			list operations. It is up to the processor holding
170   * 			the slab to integrate the slab into the slab lists
171   * 			when the slab is no longer needed.
172   *
173   * 			One use of this flag is to mark slabs that are
174   * 			used for allocations. Then such a slab becomes a cpu
175   * 			slab. The cpu slab may be equipped with an additional
176   * 			freelist that allows lockless access to
177   * 			free objects in addition to the regular freelist
178   * 			that requires the slab lock.
179   *
180   * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
181   * 			options set. This moves	slab handling out of
182   * 			the fast path and disables lockless freelists.
183   */
184  
185  /*
186   * We could simply use migrate_disable()/enable() but as long as it's a
187   * function call even on !PREEMPT_RT, use inline preempt_disable() there.
188   */
189  #ifndef CONFIG_PREEMPT_RT
190  #define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
191  #define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
192  #define USE_LOCKLESS_FAST_PATH()	(true)
193  #else
194  #define slub_get_cpu_ptr(var)		\
195  ({					\
196  	migrate_disable();		\
197  	this_cpu_ptr(var);		\
198  })
199  #define slub_put_cpu_ptr(var)		\
200  do {					\
201  	(void)(var);			\
202  	migrate_enable();		\
203  } while (0)
204  #define USE_LOCKLESS_FAST_PATH()	(false)
205  #endif
206  
207  #ifndef CONFIG_SLUB_TINY
208  #define __fastpath_inline __always_inline
209  #else
210  #define __fastpath_inline
211  #endif
212  
213  #ifdef CONFIG_SLUB_DEBUG
214  #ifdef CONFIG_SLUB_DEBUG_ON
215  DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
216  #else
217  DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
218  #endif
219  #endif		/* CONFIG_SLUB_DEBUG */
220  
221  /* Structure holding parameters for get_partial() call chain */
222  struct partial_context {
223  	gfp_t flags;
224  	unsigned int orig_size;
225  	void *object;
226  };
227  
228  static inline bool kmem_cache_debug(struct kmem_cache *s)
229  {
230  	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
231  }
232  
233  static inline bool slub_debug_orig_size(struct kmem_cache *s)
234  {
235  	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
236  			(s->flags & SLAB_KMALLOC));
237  }
238  
239  void *fixup_red_left(struct kmem_cache *s, void *p)
240  {
241  	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
242  		p += s->red_left_pad;
243  
244  	return p;
245  }
246  
247  static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
248  {
249  #ifdef CONFIG_SLUB_CPU_PARTIAL
250  	return !kmem_cache_debug(s);
251  #else
252  	return false;
253  #endif
254  }
255  
256  /*
257   * Issues still to be resolved:
258   *
259   * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
260   *
261   * - Variable sizing of the per node arrays
262   */
263  
264  /* Enable to log cmpxchg failures */
265  #undef SLUB_DEBUG_CMPXCHG
266  
267  #ifndef CONFIG_SLUB_TINY
268  /*
269   * Minimum number of partial slabs. These will be left on the partial
270   * lists even if they are empty. kmem_cache_shrink may reclaim them.
271   */
272  #define MIN_PARTIAL 5
273  
274  /*
275   * Maximum number of desirable partial slabs.
276   * The existence of more partial slabs makes kmem_cache_shrink
277   * sort the partial list by the number of objects in use.
278   */
279  #define MAX_PARTIAL 10
280  #else
281  #define MIN_PARTIAL 0
282  #define MAX_PARTIAL 0
283  #endif
284  
285  #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
286  				SLAB_POISON | SLAB_STORE_USER)
287  
288  /*
289   * These debug flags cannot use CMPXCHG because there might be consistency
290   * issues when checking or reading debug information
291   */
292  #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
293  				SLAB_TRACE)
294  
295  
296  /*
297   * Debugging flags that require metadata to be stored in the slab.  These get
298   * disabled when slab_debug=O is used and a cache's min order increases with
299   * metadata.
300   */
301  #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
302  
303  #define OO_SHIFT	16
304  #define OO_MASK		((1 << OO_SHIFT) - 1)
305  #define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */
306  
307  /* Internal SLUB flags */
308  /* Poison object */
309  #define __OBJECT_POISON		__SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
310  /* Use cmpxchg_double */
311  
312  #ifdef system_has_freelist_aba
313  #define __CMPXCHG_DOUBLE	__SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
314  #else
315  #define __CMPXCHG_DOUBLE	__SLAB_FLAG_UNUSED
316  #endif
317  
318  /*
319   * Tracking user of a slab.
320   */
321  #define TRACK_ADDRS_COUNT 16
322  struct track {
323  	unsigned long addr;	/* Called from address */
324  #ifdef CONFIG_STACKDEPOT
325  	depot_stack_handle_t handle;
326  #endif
327  	int cpu;		/* Was running on cpu */
328  	int pid;		/* Pid context */
329  	unsigned long when;	/* When did the operation occur */
330  };
331  
332  enum track_item { TRACK_ALLOC, TRACK_FREE };
333  
334  #ifdef SLAB_SUPPORTS_SYSFS
335  static int sysfs_slab_add(struct kmem_cache *);
336  static int sysfs_slab_alias(struct kmem_cache *, const char *);
337  #else
338  static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
339  static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
340  							{ return 0; }
341  #endif
342  
343  #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
344  static void debugfs_slab_add(struct kmem_cache *);
345  #else
346  static inline void debugfs_slab_add(struct kmem_cache *s) { }
347  #endif
348  
349  enum stat_item {
350  	ALLOC_FASTPATH,		/* Allocation from cpu slab */
351  	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
352  	FREE_FASTPATH,		/* Free to cpu slab */
353  	FREE_SLOWPATH,		/* Freeing not to cpu slab */
354  	FREE_FROZEN,		/* Freeing to frozen slab */
355  	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
356  	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
357  	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
358  	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
359  	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
360  	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
361  	FREE_SLAB,		/* Slab freed to the page allocator */
362  	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
363  	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
364  	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
365  	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
366  	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
367  	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
368  	DEACTIVATE_BYPASS,	/* Implicit deactivation */
369  	ORDER_FALLBACK,		/* Number of times fallback was necessary */
370  	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
371  	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
372  	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
373  	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
374  	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
375  	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
376  	NR_SLUB_STAT_ITEMS
377  };
378  
379  #ifndef CONFIG_SLUB_TINY
380  /*
381   * When changing the layout, make sure freelist and tid are still compatible
382   * with this_cpu_cmpxchg_double() alignment requirements.
383   */
384  struct kmem_cache_cpu {
385  	union {
386  		struct {
387  			void **freelist;	/* Pointer to next available object */
388  			unsigned long tid;	/* Globally unique transaction id */
389  		};
390  		freelist_aba_t freelist_tid;
391  	};
392  	struct slab *slab;	/* The slab from which we are allocating */
393  #ifdef CONFIG_SLUB_CPU_PARTIAL
394  	struct slab *partial;	/* Partially allocated slabs */
395  #endif
396  	local_lock_t lock;	/* Protects the fields above */
397  #ifdef CONFIG_SLUB_STATS
398  	unsigned int stat[NR_SLUB_STAT_ITEMS];
399  #endif
400  };
401  #endif /* CONFIG_SLUB_TINY */
402  
403  static inline void stat(const struct kmem_cache *s, enum stat_item si)
404  {
405  #ifdef CONFIG_SLUB_STATS
406  	/*
407  	 * The rmw is racy on a preemptible kernel but this is acceptable, so
408  	 * avoid this_cpu_add()'s irq-disable overhead.
409  	 */
410  	raw_cpu_inc(s->cpu_slab->stat[si]);
411  #endif
412  }
413  
414  static inline
415  void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
416  {
417  #ifdef CONFIG_SLUB_STATS
418  	raw_cpu_add(s->cpu_slab->stat[si], v);
419  #endif
420  }
421  
422  /*
423   * The slab lists for all objects.
424   */
425  struct kmem_cache_node {
426  	spinlock_t list_lock;
427  	unsigned long nr_partial;
428  	struct list_head partial;
429  #ifdef CONFIG_SLUB_DEBUG
430  	atomic_long_t nr_slabs;
431  	atomic_long_t total_objects;
432  	struct list_head full;
433  #endif
434  };
435  
436  static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
437  {
438  	return s->node[node];
439  }
440  
441  /*
442   * Iterator over all nodes. The body will be executed for each node that has
443   * a kmem_cache_node structure allocated (which is true for all online nodes)
444   */
445  #define for_each_kmem_cache_node(__s, __node, __n) \
446  	for (__node = 0; __node < nr_node_ids; __node++) \
447  		 if ((__n = get_node(__s, __node)))
448  
449  /*
450   * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
451   * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
452   * differ during memory hotplug/hotremove operations.
453   * Protected by slab_mutex.
454   */
455  static nodemask_t slab_nodes;
456  
457  #ifndef CONFIG_SLUB_TINY
458  /*
459   * Workqueue used for flush_cpu_slab().
460   */
461  static struct workqueue_struct *flushwq;
462  #endif
463  
464  /********************************************************************
465   * 			Core slab cache functions
466   *******************************************************************/
467  
468  /*
469   * Returns freelist pointer (ptr). With hardening, this is obfuscated
470   * with an XOR of the address where the pointer is held and a per-cache
471   * random number.
472   */
473  static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
474  					    void *ptr, unsigned long ptr_addr)
475  {
476  	unsigned long encoded;
477  
478  #ifdef CONFIG_SLAB_FREELIST_HARDENED
479  	encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
480  #else
481  	encoded = (unsigned long)ptr;
482  #endif
483  	return (freeptr_t){.v = encoded};
484  }
485  
486  static inline void *freelist_ptr_decode(const struct kmem_cache *s,
487  					freeptr_t ptr, unsigned long ptr_addr)
488  {
489  	void *decoded;
490  
491  #ifdef CONFIG_SLAB_FREELIST_HARDENED
492  	decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
493  #else
494  	decoded = (void *)ptr.v;
495  #endif
496  	return decoded;
497  }
498  
499  static inline void *get_freepointer(struct kmem_cache *s, void *object)
500  {
501  	unsigned long ptr_addr;
502  	freeptr_t p;
503  
504  	object = kasan_reset_tag(object);
505  	ptr_addr = (unsigned long)object + s->offset;
506  	p = *(freeptr_t *)(ptr_addr);
507  	return freelist_ptr_decode(s, p, ptr_addr);
508  }
509  
510  #ifndef CONFIG_SLUB_TINY
511  static void prefetch_freepointer(const struct kmem_cache *s, void *object)
512  {
513  	prefetchw(object + s->offset);
514  }
515  #endif
516  
517  /*
518   * When running under KMSAN, get_freepointer_safe() may return an uninitialized
519   * pointer value in the case the current thread loses the race for the next
520   * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
521   * slab_alloc_node() will fail, so the uninitialized value won't be used, but
522   * KMSAN will still check all arguments of cmpxchg because of imperfect
523   * handling of inline assembly.
524   * To work around this problem, we apply __no_kmsan_checks to ensure that
525   * get_freepointer_safe() returns initialized memory.
526   */
527  __no_kmsan_checks
528  static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
529  {
530  	unsigned long freepointer_addr;
531  	freeptr_t p;
532  
533  	if (!debug_pagealloc_enabled_static())
534  		return get_freepointer(s, object);
535  
536  	object = kasan_reset_tag(object);
537  	freepointer_addr = (unsigned long)object + s->offset;
538  	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
539  	return freelist_ptr_decode(s, p, freepointer_addr);
540  }
541  
542  static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
543  {
544  	unsigned long freeptr_addr = (unsigned long)object + s->offset;
545  
546  #ifdef CONFIG_SLAB_FREELIST_HARDENED
547  	BUG_ON(object == fp); /* naive detection of double free or corruption */
548  #endif
549  
550  	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
551  	*(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
552  }
553  
554  /*
555   * See comment in calculate_sizes().
556   */
557  static inline bool freeptr_outside_object(struct kmem_cache *s)
558  {
559  	return s->offset >= s->inuse;
560  }
561  
562  /*
563   * Return offset of the end of info block which is inuse + free pointer if
564   * not overlapping with object.
565   */
566  static inline unsigned int get_info_end(struct kmem_cache *s)
567  {
568  	if (freeptr_outside_object(s))
569  		return s->inuse + sizeof(void *);
570  	else
571  		return s->inuse;
572  }
573  
574  /* Loop over all objects in a slab */
575  #define for_each_object(__p, __s, __addr, __objects) \
576  	for (__p = fixup_red_left(__s, __addr); \
577  		__p < (__addr) + (__objects) * (__s)->size; \
578  		__p += (__s)->size)
579  
580  static inline unsigned int order_objects(unsigned int order, unsigned int size)
581  {
582  	return ((unsigned int)PAGE_SIZE << order) / size;
583  }
584  
585  static inline struct kmem_cache_order_objects oo_make(unsigned int order,
586  		unsigned int size)
587  {
588  	struct kmem_cache_order_objects x = {
589  		(order << OO_SHIFT) + order_objects(order, size)
590  	};
591  
592  	return x;
593  }
594  
595  static inline unsigned int oo_order(struct kmem_cache_order_objects x)
596  {
597  	return x.x >> OO_SHIFT;
598  }
599  
600  static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
601  {
602  	return x.x & OO_MASK;
603  }
604  
605  #ifdef CONFIG_SLUB_CPU_PARTIAL
606  static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
607  {
608  	unsigned int nr_slabs;
609  
610  	s->cpu_partial = nr_objects;
611  
612  	/*
613  	 * We take the number of objects but actually limit the number of
614  	 * slabs on the per cpu partial list, in order to limit excessive
615  	 * growth of the list. For simplicity we assume that the slabs will
616  	 * be half-full.
617  	 */
618  	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
619  	s->cpu_partial_slabs = nr_slabs;
620  }
621  
622  static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
623  {
624  	return s->cpu_partial_slabs;
625  }
626  #else
627  static inline void
628  slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
629  {
630  }
631  
632  static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
633  {
634  	return 0;
635  }
636  #endif /* CONFIG_SLUB_CPU_PARTIAL */
637  
638  /*
639   * Per slab locking using the pagelock
640   */
641  static __always_inline void slab_lock(struct slab *slab)
642  {
643  	bit_spin_lock(PG_locked, &slab->__page_flags);
644  }
645  
646  static __always_inline void slab_unlock(struct slab *slab)
647  {
648  	bit_spin_unlock(PG_locked, &slab->__page_flags);
649  }
650  
651  static inline bool
652  __update_freelist_fast(struct slab *slab,
653  		      void *freelist_old, unsigned long counters_old,
654  		      void *freelist_new, unsigned long counters_new)
655  {
656  #ifdef system_has_freelist_aba
657  	freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
658  	freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
659  
660  	return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
661  #else
662  	return false;
663  #endif
664  }
665  
666  static inline bool
667  __update_freelist_slow(struct slab *slab,
668  		      void *freelist_old, unsigned long counters_old,
669  		      void *freelist_new, unsigned long counters_new)
670  {
671  	bool ret = false;
672  
673  	slab_lock(slab);
674  	if (slab->freelist == freelist_old &&
675  	    slab->counters == counters_old) {
676  		slab->freelist = freelist_new;
677  		slab->counters = counters_new;
678  		ret = true;
679  	}
680  	slab_unlock(slab);
681  
682  	return ret;
683  }
684  
685  /*
686   * Interrupts must be disabled (for the fallback code to work right), typically
687   * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
688   * part of bit_spin_lock(), is sufficient because the policy is not to allow any
689   * allocation/ free operation in hardirq context. Therefore nothing can
690   * interrupt the operation.
691   */
692  static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
693  		void *freelist_old, unsigned long counters_old,
694  		void *freelist_new, unsigned long counters_new,
695  		const char *n)
696  {
697  	bool ret;
698  
699  	if (USE_LOCKLESS_FAST_PATH())
700  		lockdep_assert_irqs_disabled();
701  
702  	if (s->flags & __CMPXCHG_DOUBLE) {
703  		ret = __update_freelist_fast(slab, freelist_old, counters_old,
704  				            freelist_new, counters_new);
705  	} else {
706  		ret = __update_freelist_slow(slab, freelist_old, counters_old,
707  				            freelist_new, counters_new);
708  	}
709  	if (likely(ret))
710  		return true;
711  
712  	cpu_relax();
713  	stat(s, CMPXCHG_DOUBLE_FAIL);
714  
715  #ifdef SLUB_DEBUG_CMPXCHG
716  	pr_info("%s %s: cmpxchg double redo ", n, s->name);
717  #endif
718  
719  	return false;
720  }
721  
722  static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
723  		void *freelist_old, unsigned long counters_old,
724  		void *freelist_new, unsigned long counters_new,
725  		const char *n)
726  {
727  	bool ret;
728  
729  	if (s->flags & __CMPXCHG_DOUBLE) {
730  		ret = __update_freelist_fast(slab, freelist_old, counters_old,
731  				            freelist_new, counters_new);
732  	} else {
733  		unsigned long flags;
734  
735  		local_irq_save(flags);
736  		ret = __update_freelist_slow(slab, freelist_old, counters_old,
737  				            freelist_new, counters_new);
738  		local_irq_restore(flags);
739  	}
740  	if (likely(ret))
741  		return true;
742  
743  	cpu_relax();
744  	stat(s, CMPXCHG_DOUBLE_FAIL);
745  
746  #ifdef SLUB_DEBUG_CMPXCHG
747  	pr_info("%s %s: cmpxchg double redo ", n, s->name);
748  #endif
749  
750  	return false;
751  }
752  
753  /*
754   * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
755   * family will round up the real request size to these fixed ones, so
756   * there could be an extra area than what is requested. Save the original
757   * request size in the meta data area, for better debug and sanity check.
758   */
759  static inline void set_orig_size(struct kmem_cache *s,
760  				void *object, unsigned int orig_size)
761  {
762  	void *p = kasan_reset_tag(object);
763  	unsigned int kasan_meta_size;
764  
765  	if (!slub_debug_orig_size(s))
766  		return;
767  
768  	/*
769  	 * KASAN can save its free meta data inside of the object at offset 0.
770  	 * If this meta data size is larger than 'orig_size', it will overlap
771  	 * the data redzone in [orig_size+1, object_size]. Thus, we adjust
772  	 * 'orig_size' to be as at least as big as KASAN's meta data.
773  	 */
774  	kasan_meta_size = kasan_metadata_size(s, true);
775  	if (kasan_meta_size > orig_size)
776  		orig_size = kasan_meta_size;
777  
778  	p += get_info_end(s);
779  	p += sizeof(struct track) * 2;
780  
781  	*(unsigned int *)p = orig_size;
782  }
783  
784  static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
785  {
786  	void *p = kasan_reset_tag(object);
787  
788  	if (!slub_debug_orig_size(s))
789  		return s->object_size;
790  
791  	p += get_info_end(s);
792  	p += sizeof(struct track) * 2;
793  
794  	return *(unsigned int *)p;
795  }
796  
797  #ifdef CONFIG_SLUB_DEBUG
798  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
799  static DEFINE_SPINLOCK(object_map_lock);
800  
801  static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
802  		       struct slab *slab)
803  {
804  	void *addr = slab_address(slab);
805  	void *p;
806  
807  	bitmap_zero(obj_map, slab->objects);
808  
809  	for (p = slab->freelist; p; p = get_freepointer(s, p))
810  		set_bit(__obj_to_index(s, addr, p), obj_map);
811  }
812  
813  #if IS_ENABLED(CONFIG_KUNIT)
814  static bool slab_add_kunit_errors(void)
815  {
816  	struct kunit_resource *resource;
817  
818  	if (!kunit_get_current_test())
819  		return false;
820  
821  	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
822  	if (!resource)
823  		return false;
824  
825  	(*(int *)resource->data)++;
826  	kunit_put_resource(resource);
827  	return true;
828  }
829  
830  static bool slab_in_kunit_test(void)
831  {
832  	struct kunit_resource *resource;
833  
834  	if (!kunit_get_current_test())
835  		return false;
836  
837  	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
838  	if (!resource)
839  		return false;
840  
841  	kunit_put_resource(resource);
842  	return true;
843  }
844  #else
845  static inline bool slab_add_kunit_errors(void) { return false; }
846  static inline bool slab_in_kunit_test(void) { return false; }
847  #endif
848  
849  static inline unsigned int size_from_object(struct kmem_cache *s)
850  {
851  	if (s->flags & SLAB_RED_ZONE)
852  		return s->size - s->red_left_pad;
853  
854  	return s->size;
855  }
856  
857  static inline void *restore_red_left(struct kmem_cache *s, void *p)
858  {
859  	if (s->flags & SLAB_RED_ZONE)
860  		p -= s->red_left_pad;
861  
862  	return p;
863  }
864  
865  /*
866   * Debug settings:
867   */
868  #if defined(CONFIG_SLUB_DEBUG_ON)
869  static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
870  #else
871  static slab_flags_t slub_debug;
872  #endif
873  
874  static char *slub_debug_string;
875  static int disable_higher_order_debug;
876  
877  /*
878   * slub is about to manipulate internal object metadata.  This memory lies
879   * outside the range of the allocated object, so accessing it would normally
880   * be reported by kasan as a bounds error.  metadata_access_enable() is used
881   * to tell kasan that these accesses are OK.
882   */
883  static inline void metadata_access_enable(void)
884  {
885  	kasan_disable_current();
886  	kmsan_disable_current();
887  }
888  
889  static inline void metadata_access_disable(void)
890  {
891  	kmsan_enable_current();
892  	kasan_enable_current();
893  }
894  
895  /*
896   * Object debugging
897   */
898  
899  /* Verify that a pointer has an address that is valid within a slab page */
900  static inline int check_valid_pointer(struct kmem_cache *s,
901  				struct slab *slab, void *object)
902  {
903  	void *base;
904  
905  	if (!object)
906  		return 1;
907  
908  	base = slab_address(slab);
909  	object = kasan_reset_tag(object);
910  	object = restore_red_left(s, object);
911  	if (object < base || object >= base + slab->objects * s->size ||
912  		(object - base) % s->size) {
913  		return 0;
914  	}
915  
916  	return 1;
917  }
918  
919  static void print_section(char *level, char *text, u8 *addr,
920  			  unsigned int length)
921  {
922  	metadata_access_enable();
923  	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
924  			16, 1, kasan_reset_tag((void *)addr), length, 1);
925  	metadata_access_disable();
926  }
927  
928  static struct track *get_track(struct kmem_cache *s, void *object,
929  	enum track_item alloc)
930  {
931  	struct track *p;
932  
933  	p = object + get_info_end(s);
934  
935  	return kasan_reset_tag(p + alloc);
936  }
937  
938  #ifdef CONFIG_STACKDEPOT
939  static noinline depot_stack_handle_t set_track_prepare(void)
940  {
941  	depot_stack_handle_t handle;
942  	unsigned long entries[TRACK_ADDRS_COUNT];
943  	unsigned int nr_entries;
944  
945  	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
946  	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
947  
948  	return handle;
949  }
950  #else
951  static inline depot_stack_handle_t set_track_prepare(void)
952  {
953  	return 0;
954  }
955  #endif
956  
957  static void set_track_update(struct kmem_cache *s, void *object,
958  			     enum track_item alloc, unsigned long addr,
959  			     depot_stack_handle_t handle)
960  {
961  	struct track *p = get_track(s, object, alloc);
962  
963  #ifdef CONFIG_STACKDEPOT
964  	p->handle = handle;
965  #endif
966  	p->addr = addr;
967  	p->cpu = smp_processor_id();
968  	p->pid = current->pid;
969  	p->when = jiffies;
970  }
971  
972  static __always_inline void set_track(struct kmem_cache *s, void *object,
973  				      enum track_item alloc, unsigned long addr)
974  {
975  	depot_stack_handle_t handle = set_track_prepare();
976  
977  	set_track_update(s, object, alloc, addr, handle);
978  }
979  
980  static void init_tracking(struct kmem_cache *s, void *object)
981  {
982  	struct track *p;
983  
984  	if (!(s->flags & SLAB_STORE_USER))
985  		return;
986  
987  	p = get_track(s, object, TRACK_ALLOC);
988  	memset(p, 0, 2*sizeof(struct track));
989  }
990  
991  static void print_track(const char *s, struct track *t, unsigned long pr_time)
992  {
993  	depot_stack_handle_t handle __maybe_unused;
994  
995  	if (!t->addr)
996  		return;
997  
998  	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
999  	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
1000  #ifdef CONFIG_STACKDEPOT
1001  	handle = READ_ONCE(t->handle);
1002  	if (handle)
1003  		stack_depot_print(handle);
1004  	else
1005  		pr_err("object allocation/free stack trace missing\n");
1006  #endif
1007  }
1008  
1009  void print_tracking(struct kmem_cache *s, void *object)
1010  {
1011  	unsigned long pr_time = jiffies;
1012  	if (!(s->flags & SLAB_STORE_USER))
1013  		return;
1014  
1015  	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
1016  	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
1017  }
1018  
1019  static void print_slab_info(const struct slab *slab)
1020  {
1021  	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
1022  	       slab, slab->objects, slab->inuse, slab->freelist,
1023  	       &slab->__page_flags);
1024  }
1025  
1026  void skip_orig_size_check(struct kmem_cache *s, const void *object)
1027  {
1028  	set_orig_size(s, (void *)object, s->object_size);
1029  }
1030  
1031  static void slab_bug(struct kmem_cache *s, char *fmt, ...)
1032  {
1033  	struct va_format vaf;
1034  	va_list args;
1035  
1036  	va_start(args, fmt);
1037  	vaf.fmt = fmt;
1038  	vaf.va = &args;
1039  	pr_err("=============================================================================\n");
1040  	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
1041  	pr_err("-----------------------------------------------------------------------------\n\n");
1042  	va_end(args);
1043  }
1044  
1045  __printf(2, 3)
1046  static void slab_fix(struct kmem_cache *s, char *fmt, ...)
1047  {
1048  	struct va_format vaf;
1049  	va_list args;
1050  
1051  	if (slab_add_kunit_errors())
1052  		return;
1053  
1054  	va_start(args, fmt);
1055  	vaf.fmt = fmt;
1056  	vaf.va = &args;
1057  	pr_err("FIX %s: %pV\n", s->name, &vaf);
1058  	va_end(args);
1059  }
1060  
1061  static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
1062  {
1063  	unsigned int off;	/* Offset of last byte */
1064  	u8 *addr = slab_address(slab);
1065  
1066  	print_tracking(s, p);
1067  
1068  	print_slab_info(slab);
1069  
1070  	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
1071  	       p, p - addr, get_freepointer(s, p));
1072  
1073  	if (s->flags & SLAB_RED_ZONE)
1074  		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
1075  			      s->red_left_pad);
1076  	else if (p > addr + 16)
1077  		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
1078  
1079  	print_section(KERN_ERR,         "Object   ", p,
1080  		      min_t(unsigned int, s->object_size, PAGE_SIZE));
1081  	if (s->flags & SLAB_RED_ZONE)
1082  		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
1083  			s->inuse - s->object_size);
1084  
1085  	off = get_info_end(s);
1086  
1087  	if (s->flags & SLAB_STORE_USER)
1088  		off += 2 * sizeof(struct track);
1089  
1090  	if (slub_debug_orig_size(s))
1091  		off += sizeof(unsigned int);
1092  
1093  	off += kasan_metadata_size(s, false);
1094  
1095  	if (off != size_from_object(s))
1096  		/* Beginning of the filler is the free pointer */
1097  		print_section(KERN_ERR, "Padding  ", p + off,
1098  			      size_from_object(s) - off);
1099  
1100  	dump_stack();
1101  }
1102  
1103  static void object_err(struct kmem_cache *s, struct slab *slab,
1104  			u8 *object, char *reason)
1105  {
1106  	if (slab_add_kunit_errors())
1107  		return;
1108  
1109  	slab_bug(s, "%s", reason);
1110  	print_trailer(s, slab, object);
1111  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1112  }
1113  
1114  static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1115  			       void **freelist, void *nextfree)
1116  {
1117  	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
1118  	    !check_valid_pointer(s, slab, nextfree) && freelist) {
1119  		object_err(s, slab, *freelist, "Freechain corrupt");
1120  		*freelist = NULL;
1121  		slab_fix(s, "Isolate corrupted freechain");
1122  		return true;
1123  	}
1124  
1125  	return false;
1126  }
1127  
1128  static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
1129  			const char *fmt, ...)
1130  {
1131  	va_list args;
1132  	char buf[100];
1133  
1134  	if (slab_add_kunit_errors())
1135  		return;
1136  
1137  	va_start(args, fmt);
1138  	vsnprintf(buf, sizeof(buf), fmt, args);
1139  	va_end(args);
1140  	slab_bug(s, "%s", buf);
1141  	print_slab_info(slab);
1142  	dump_stack();
1143  	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1144  }
1145  
1146  static void init_object(struct kmem_cache *s, void *object, u8 val)
1147  {
1148  	u8 *p = kasan_reset_tag(object);
1149  	unsigned int poison_size = s->object_size;
1150  
1151  	if (s->flags & SLAB_RED_ZONE) {
1152  		/*
1153  		 * Here and below, avoid overwriting the KMSAN shadow. Keeping
1154  		 * the shadow makes it possible to distinguish uninit-value
1155  		 * from use-after-free.
1156  		 */
1157  		memset_no_sanitize_memory(p - s->red_left_pad, val,
1158  					  s->red_left_pad);
1159  
1160  		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
1161  			/*
1162  			 * Redzone the extra allocated space by kmalloc than
1163  			 * requested, and the poison size will be limited to
1164  			 * the original request size accordingly.
1165  			 */
1166  			poison_size = get_orig_size(s, object);
1167  		}
1168  	}
1169  
1170  	if (s->flags & __OBJECT_POISON) {
1171  		memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1);
1172  		memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1);
1173  	}
1174  
1175  	if (s->flags & SLAB_RED_ZONE)
1176  		memset_no_sanitize_memory(p + poison_size, val,
1177  					  s->inuse - poison_size);
1178  }
1179  
1180  static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
1181  						void *from, void *to)
1182  {
1183  	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
1184  	memset(from, data, to - from);
1185  }
1186  
1187  #ifdef CONFIG_KMSAN
1188  #define pad_check_attributes noinline __no_kmsan_checks
1189  #else
1190  #define pad_check_attributes
1191  #endif
1192  
1193  static pad_check_attributes int
1194  check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
1195  		       u8 *object, char *what,
1196  		       u8 *start, unsigned int value, unsigned int bytes)
1197  {
1198  	u8 *fault;
1199  	u8 *end;
1200  	u8 *addr = slab_address(slab);
1201  
1202  	metadata_access_enable();
1203  	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
1204  	metadata_access_disable();
1205  	if (!fault)
1206  		return 1;
1207  
1208  	end = start + bytes;
1209  	while (end > fault && end[-1] == value)
1210  		end--;
1211  
1212  	if (slab_add_kunit_errors())
1213  		goto skip_bug_print;
1214  
1215  	slab_bug(s, "%s overwritten", what);
1216  	pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
1217  					fault, end - 1, fault - addr,
1218  					fault[0], value);
1219  
1220  skip_bug_print:
1221  	restore_bytes(s, what, value, fault, end);
1222  	return 0;
1223  }
1224  
1225  /*
1226   * Object layout:
1227   *
1228   * object address
1229   * 	Bytes of the object to be managed.
1230   * 	If the freepointer may overlay the object then the free
1231   *	pointer is at the middle of the object.
1232   *
1233   * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
1234   * 	0xa5 (POISON_END)
1235   *
1236   * object + s->object_size
1237   * 	Padding to reach word boundary. This is also used for Redzoning.
1238   * 	Padding is extended by another word if Redzoning is enabled and
1239   * 	object_size == inuse.
1240   *
1241   * 	We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
1242   * 	0xcc (SLUB_RED_ACTIVE) for objects in use.
1243   *
1244   * object + s->inuse
1245   * 	Meta data starts here.
1246   *
1247   * 	A. Free pointer (if we cannot overwrite object on free)
1248   * 	B. Tracking data for SLAB_STORE_USER
1249   *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
1250   *	D. Padding to reach required alignment boundary or at minimum
1251   * 		one word if debugging is on to be able to detect writes
1252   * 		before the word boundary.
1253   *
1254   *	Padding is done using 0x5a (POISON_INUSE)
1255   *
1256   * object + s->size
1257   * 	Nothing is used beyond s->size.
1258   *
1259   * If slabcaches are merged then the object_size and inuse boundaries are mostly
1260   * ignored. And therefore no slab options that rely on these boundaries
1261   * may be used with merged slabcaches.
1262   */
1263  
1264  static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
1265  {
1266  	unsigned long off = get_info_end(s);	/* The end of info */
1267  
1268  	if (s->flags & SLAB_STORE_USER) {
1269  		/* We also have user information there */
1270  		off += 2 * sizeof(struct track);
1271  
1272  		if (s->flags & SLAB_KMALLOC)
1273  			off += sizeof(unsigned int);
1274  	}
1275  
1276  	off += kasan_metadata_size(s, false);
1277  
1278  	if (size_from_object(s) == off)
1279  		return 1;
1280  
1281  	return check_bytes_and_report(s, slab, p, "Object padding",
1282  			p + off, POISON_INUSE, size_from_object(s) - off);
1283  }
1284  
1285  /* Check the pad bytes at the end of a slab page */
1286  static pad_check_attributes void
1287  slab_pad_check(struct kmem_cache *s, struct slab *slab)
1288  {
1289  	u8 *start;
1290  	u8 *fault;
1291  	u8 *end;
1292  	u8 *pad;
1293  	int length;
1294  	int remainder;
1295  
1296  	if (!(s->flags & SLAB_POISON))
1297  		return;
1298  
1299  	start = slab_address(slab);
1300  	length = slab_size(slab);
1301  	end = start + length;
1302  	remainder = length % s->size;
1303  	if (!remainder)
1304  		return;
1305  
1306  	pad = end - remainder;
1307  	metadata_access_enable();
1308  	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
1309  	metadata_access_disable();
1310  	if (!fault)
1311  		return;
1312  	while (end > fault && end[-1] == POISON_INUSE)
1313  		end--;
1314  
1315  	slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
1316  			fault, end - 1, fault - start);
1317  	print_section(KERN_ERR, "Padding ", pad, remainder);
1318  
1319  	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
1320  }
1321  
1322  static int check_object(struct kmem_cache *s, struct slab *slab,
1323  					void *object, u8 val)
1324  {
1325  	u8 *p = object;
1326  	u8 *endobject = object + s->object_size;
1327  	unsigned int orig_size, kasan_meta_size;
1328  	int ret = 1;
1329  
1330  	if (s->flags & SLAB_RED_ZONE) {
1331  		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
1332  			object - s->red_left_pad, val, s->red_left_pad))
1333  			ret = 0;
1334  
1335  		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
1336  			endobject, val, s->inuse - s->object_size))
1337  			ret = 0;
1338  
1339  		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
1340  			orig_size = get_orig_size(s, object);
1341  
1342  			if (s->object_size > orig_size  &&
1343  				!check_bytes_and_report(s, slab, object,
1344  					"kmalloc Redzone", p + orig_size,
1345  					val, s->object_size - orig_size)) {
1346  				ret = 0;
1347  			}
1348  		}
1349  	} else {
1350  		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
1351  			if (!check_bytes_and_report(s, slab, p, "Alignment padding",
1352  				endobject, POISON_INUSE,
1353  				s->inuse - s->object_size))
1354  				ret = 0;
1355  		}
1356  	}
1357  
1358  	if (s->flags & SLAB_POISON) {
1359  		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
1360  			/*
1361  			 * KASAN can save its free meta data inside of the
1362  			 * object at offset 0. Thus, skip checking the part of
1363  			 * the redzone that overlaps with the meta data.
1364  			 */
1365  			kasan_meta_size = kasan_metadata_size(s, true);
1366  			if (kasan_meta_size < s->object_size - 1 &&
1367  			    !check_bytes_and_report(s, slab, p, "Poison",
1368  					p + kasan_meta_size, POISON_FREE,
1369  					s->object_size - kasan_meta_size - 1))
1370  				ret = 0;
1371  			if (kasan_meta_size < s->object_size &&
1372  			    !check_bytes_and_report(s, slab, p, "End Poison",
1373  					p + s->object_size - 1, POISON_END, 1))
1374  				ret = 0;
1375  		}
1376  		/*
1377  		 * check_pad_bytes cleans up on its own.
1378  		 */
1379  		if (!check_pad_bytes(s, slab, p))
1380  			ret = 0;
1381  	}
1382  
1383  	/*
1384  	 * Cannot check freepointer while object is allocated if
1385  	 * object and freepointer overlap.
1386  	 */
1387  	if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
1388  	    !check_valid_pointer(s, slab, get_freepointer(s, p))) {
1389  		object_err(s, slab, p, "Freepointer corrupt");
1390  		/*
1391  		 * No choice but to zap it and thus lose the remainder
1392  		 * of the free objects in this slab. May cause
1393  		 * another error because the object count is now wrong.
1394  		 */
1395  		set_freepointer(s, p, NULL);
1396  		ret = 0;
1397  	}
1398  
1399  	if (!ret && !slab_in_kunit_test()) {
1400  		print_trailer(s, slab, object);
1401  		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1402  	}
1403  
1404  	return ret;
1405  }
1406  
1407  static int check_slab(struct kmem_cache *s, struct slab *slab)
1408  {
1409  	int maxobj;
1410  
1411  	if (!folio_test_slab(slab_folio(slab))) {
1412  		slab_err(s, slab, "Not a valid slab page");
1413  		return 0;
1414  	}
1415  
1416  	maxobj = order_objects(slab_order(slab), s->size);
1417  	if (slab->objects > maxobj) {
1418  		slab_err(s, slab, "objects %u > max %u",
1419  			slab->objects, maxobj);
1420  		return 0;
1421  	}
1422  	if (slab->inuse > slab->objects) {
1423  		slab_err(s, slab, "inuse %u > max %u",
1424  			slab->inuse, slab->objects);
1425  		return 0;
1426  	}
1427  	/* Slab_pad_check fixes things up after itself */
1428  	slab_pad_check(s, slab);
1429  	return 1;
1430  }
1431  
1432  /*
1433   * Determine if a certain object in a slab is on the freelist. Must hold the
1434   * slab lock to guarantee that the chains are in a consistent state.
1435   */
1436  static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
1437  {
1438  	int nr = 0;
1439  	void *fp;
1440  	void *object = NULL;
1441  	int max_objects;
1442  
1443  	fp = slab->freelist;
1444  	while (fp && nr <= slab->objects) {
1445  		if (fp == search)
1446  			return 1;
1447  		if (!check_valid_pointer(s, slab, fp)) {
1448  			if (object) {
1449  				object_err(s, slab, object,
1450  					"Freechain corrupt");
1451  				set_freepointer(s, object, NULL);
1452  			} else {
1453  				slab_err(s, slab, "Freepointer corrupt");
1454  				slab->freelist = NULL;
1455  				slab->inuse = slab->objects;
1456  				slab_fix(s, "Freelist cleared");
1457  				return 0;
1458  			}
1459  			break;
1460  		}
1461  		object = fp;
1462  		fp = get_freepointer(s, object);
1463  		nr++;
1464  	}
1465  
1466  	max_objects = order_objects(slab_order(slab), s->size);
1467  	if (max_objects > MAX_OBJS_PER_PAGE)
1468  		max_objects = MAX_OBJS_PER_PAGE;
1469  
1470  	if (slab->objects != max_objects) {
1471  		slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
1472  			 slab->objects, max_objects);
1473  		slab->objects = max_objects;
1474  		slab_fix(s, "Number of objects adjusted");
1475  	}
1476  	if (slab->inuse != slab->objects - nr) {
1477  		slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
1478  			 slab->inuse, slab->objects - nr);
1479  		slab->inuse = slab->objects - nr;
1480  		slab_fix(s, "Object count adjusted");
1481  	}
1482  	return search == NULL;
1483  }
1484  
1485  static void trace(struct kmem_cache *s, struct slab *slab, void *object,
1486  								int alloc)
1487  {
1488  	if (s->flags & SLAB_TRACE) {
1489  		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1490  			s->name,
1491  			alloc ? "alloc" : "free",
1492  			object, slab->inuse,
1493  			slab->freelist);
1494  
1495  		if (!alloc)
1496  			print_section(KERN_INFO, "Object ", (void *)object,
1497  					s->object_size);
1498  
1499  		dump_stack();
1500  	}
1501  }
1502  
1503  /*
1504   * Tracking of fully allocated slabs for debugging purposes.
1505   */
1506  static void add_full(struct kmem_cache *s,
1507  	struct kmem_cache_node *n, struct slab *slab)
1508  {
1509  	if (!(s->flags & SLAB_STORE_USER))
1510  		return;
1511  
1512  	lockdep_assert_held(&n->list_lock);
1513  	list_add(&slab->slab_list, &n->full);
1514  }
1515  
1516  static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
1517  {
1518  	if (!(s->flags & SLAB_STORE_USER))
1519  		return;
1520  
1521  	lockdep_assert_held(&n->list_lock);
1522  	list_del(&slab->slab_list);
1523  }
1524  
1525  static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1526  {
1527  	return atomic_long_read(&n->nr_slabs);
1528  }
1529  
1530  static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1531  {
1532  	struct kmem_cache_node *n = get_node(s, node);
1533  
1534  	atomic_long_inc(&n->nr_slabs);
1535  	atomic_long_add(objects, &n->total_objects);
1536  }
1537  static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1538  {
1539  	struct kmem_cache_node *n = get_node(s, node);
1540  
1541  	atomic_long_dec(&n->nr_slabs);
1542  	atomic_long_sub(objects, &n->total_objects);
1543  }
1544  
1545  /* Object debug checks for alloc/free paths */
1546  static void setup_object_debug(struct kmem_cache *s, void *object)
1547  {
1548  	if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
1549  		return;
1550  
1551  	init_object(s, object, SLUB_RED_INACTIVE);
1552  	init_tracking(s, object);
1553  }
1554  
1555  static
1556  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
1557  {
1558  	if (!kmem_cache_debug_flags(s, SLAB_POISON))
1559  		return;
1560  
1561  	metadata_access_enable();
1562  	memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
1563  	metadata_access_disable();
1564  }
1565  
1566  static inline int alloc_consistency_checks(struct kmem_cache *s,
1567  					struct slab *slab, void *object)
1568  {
1569  	if (!check_slab(s, slab))
1570  		return 0;
1571  
1572  	if (!check_valid_pointer(s, slab, object)) {
1573  		object_err(s, slab, object, "Freelist Pointer check fails");
1574  		return 0;
1575  	}
1576  
1577  	if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
1578  		return 0;
1579  
1580  	return 1;
1581  }
1582  
1583  static noinline bool alloc_debug_processing(struct kmem_cache *s,
1584  			struct slab *slab, void *object, int orig_size)
1585  {
1586  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1587  		if (!alloc_consistency_checks(s, slab, object))
1588  			goto bad;
1589  	}
1590  
1591  	/* Success. Perform special debug activities for allocs */
1592  	trace(s, slab, object, 1);
1593  	set_orig_size(s, object, orig_size);
1594  	init_object(s, object, SLUB_RED_ACTIVE);
1595  	return true;
1596  
1597  bad:
1598  	if (folio_test_slab(slab_folio(slab))) {
1599  		/*
1600  		 * If this is a slab page then lets do the best we can
1601  		 * to avoid issues in the future. Marking all objects
1602  		 * as used avoids touching the remaining objects.
1603  		 */
1604  		slab_fix(s, "Marking all objects used");
1605  		slab->inuse = slab->objects;
1606  		slab->freelist = NULL;
1607  	}
1608  	return false;
1609  }
1610  
1611  static inline int free_consistency_checks(struct kmem_cache *s,
1612  		struct slab *slab, void *object, unsigned long addr)
1613  {
1614  	if (!check_valid_pointer(s, slab, object)) {
1615  		slab_err(s, slab, "Invalid object pointer 0x%p", object);
1616  		return 0;
1617  	}
1618  
1619  	if (on_freelist(s, slab, object)) {
1620  		object_err(s, slab, object, "Object already free");
1621  		return 0;
1622  	}
1623  
1624  	if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
1625  		return 0;
1626  
1627  	if (unlikely(s != slab->slab_cache)) {
1628  		if (!folio_test_slab(slab_folio(slab))) {
1629  			slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
1630  				 object);
1631  		} else if (!slab->slab_cache) {
1632  			pr_err("SLUB <none>: no slab for object 0x%p.\n",
1633  			       object);
1634  			dump_stack();
1635  		} else
1636  			object_err(s, slab, object,
1637  					"page slab pointer corrupt.");
1638  		return 0;
1639  	}
1640  	return 1;
1641  }
1642  
1643  /*
1644   * Parse a block of slab_debug options. Blocks are delimited by ';'
1645   *
1646   * @str:    start of block
1647   * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1648   * @slabs:  return start of list of slabs, or NULL when there's no list
1649   * @init:   assume this is initial parsing and not per-kmem-create parsing
1650   *
1651   * returns the start of next block if there's any, or NULL
1652   */
1653  static char *
1654  parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
1655  {
1656  	bool higher_order_disable = false;
1657  
1658  	/* Skip any completely empty blocks */
1659  	while (*str && *str == ';')
1660  		str++;
1661  
1662  	if (*str == ',') {
1663  		/*
1664  		 * No options but restriction on slabs. This means full
1665  		 * debugging for slabs matching a pattern.
1666  		 */
1667  		*flags = DEBUG_DEFAULT_FLAGS;
1668  		goto check_slabs;
1669  	}
1670  	*flags = 0;
1671  
1672  	/* Determine which debug features should be switched on */
1673  	for (; *str && *str != ',' && *str != ';'; str++) {
1674  		switch (tolower(*str)) {
1675  		case '-':
1676  			*flags = 0;
1677  			break;
1678  		case 'f':
1679  			*flags |= SLAB_CONSISTENCY_CHECKS;
1680  			break;
1681  		case 'z':
1682  			*flags |= SLAB_RED_ZONE;
1683  			break;
1684  		case 'p':
1685  			*flags |= SLAB_POISON;
1686  			break;
1687  		case 'u':
1688  			*flags |= SLAB_STORE_USER;
1689  			break;
1690  		case 't':
1691  			*flags |= SLAB_TRACE;
1692  			break;
1693  		case 'a':
1694  			*flags |= SLAB_FAILSLAB;
1695  			break;
1696  		case 'o':
1697  			/*
1698  			 * Avoid enabling debugging on caches if its minimum
1699  			 * order would increase as a result.
1700  			 */
1701  			higher_order_disable = true;
1702  			break;
1703  		default:
1704  			if (init)
1705  				pr_err("slab_debug option '%c' unknown. skipped\n", *str);
1706  		}
1707  	}
1708  check_slabs:
1709  	if (*str == ',')
1710  		*slabs = ++str;
1711  	else
1712  		*slabs = NULL;
1713  
1714  	/* Skip over the slab list */
1715  	while (*str && *str != ';')
1716  		str++;
1717  
1718  	/* Skip any completely empty blocks */
1719  	while (*str && *str == ';')
1720  		str++;
1721  
1722  	if (init && higher_order_disable)
1723  		disable_higher_order_debug = 1;
1724  
1725  	if (*str)
1726  		return str;
1727  	else
1728  		return NULL;
1729  }
1730  
1731  static int __init setup_slub_debug(char *str)
1732  {
1733  	slab_flags_t flags;
1734  	slab_flags_t global_flags;
1735  	char *saved_str;
1736  	char *slab_list;
1737  	bool global_slub_debug_changed = false;
1738  	bool slab_list_specified = false;
1739  
1740  	global_flags = DEBUG_DEFAULT_FLAGS;
1741  	if (*str++ != '=' || !*str)
1742  		/*
1743  		 * No options specified. Switch on full debugging.
1744  		 */
1745  		goto out;
1746  
1747  	saved_str = str;
1748  	while (str) {
1749  		str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1750  
1751  		if (!slab_list) {
1752  			global_flags = flags;
1753  			global_slub_debug_changed = true;
1754  		} else {
1755  			slab_list_specified = true;
1756  			if (flags & SLAB_STORE_USER)
1757  				stack_depot_request_early_init();
1758  		}
1759  	}
1760  
1761  	/*
1762  	 * For backwards compatibility, a single list of flags with list of
1763  	 * slabs means debugging is only changed for those slabs, so the global
1764  	 * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1765  	 * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
1766  	 * long as there is no option specifying flags without a slab list.
1767  	 */
1768  	if (slab_list_specified) {
1769  		if (!global_slub_debug_changed)
1770  			global_flags = slub_debug;
1771  		slub_debug_string = saved_str;
1772  	}
1773  out:
1774  	slub_debug = global_flags;
1775  	if (slub_debug & SLAB_STORE_USER)
1776  		stack_depot_request_early_init();
1777  	if (slub_debug != 0 || slub_debug_string)
1778  		static_branch_enable(&slub_debug_enabled);
1779  	else
1780  		static_branch_disable(&slub_debug_enabled);
1781  	if ((static_branch_unlikely(&init_on_alloc) ||
1782  	     static_branch_unlikely(&init_on_free)) &&
1783  	    (slub_debug & SLAB_POISON))
1784  		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1785  	return 1;
1786  }
1787  
1788  __setup("slab_debug", setup_slub_debug);
1789  __setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
1790  
1791  /*
1792   * kmem_cache_flags - apply debugging options to the cache
1793   * @flags:		flags to set
1794   * @name:		name of the cache
1795   *
1796   * Debug option(s) are applied to @flags. In addition to the debug
1797   * option(s), if a slab name (or multiple) is specified i.e.
1798   * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1799   * then only the select slabs will receive the debug option(s).
1800   */
1801  slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
1802  {
1803  	char *iter;
1804  	size_t len;
1805  	char *next_block;
1806  	slab_flags_t block_flags;
1807  	slab_flags_t slub_debug_local = slub_debug;
1808  
1809  	if (flags & SLAB_NO_USER_FLAGS)
1810  		return flags;
1811  
1812  	/*
1813  	 * If the slab cache is for debugging (e.g. kmemleak) then
1814  	 * don't store user (stack trace) information by default,
1815  	 * but let the user enable it via the command line below.
1816  	 */
1817  	if (flags & SLAB_NOLEAKTRACE)
1818  		slub_debug_local &= ~SLAB_STORE_USER;
1819  
1820  	len = strlen(name);
1821  	next_block = slub_debug_string;
1822  	/* Go through all blocks of debug options, see if any matches our slab's name */
1823  	while (next_block) {
1824  		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1825  		if (!iter)
1826  			continue;
1827  		/* Found a block that has a slab list, search it */
1828  		while (*iter) {
1829  			char *end, *glob;
1830  			size_t cmplen;
1831  
1832  			end = strchrnul(iter, ',');
1833  			if (next_block && next_block < end)
1834  				end = next_block - 1;
1835  
1836  			glob = strnchr(iter, end - iter, '*');
1837  			if (glob)
1838  				cmplen = glob - iter;
1839  			else
1840  				cmplen = max_t(size_t, len, (end - iter));
1841  
1842  			if (!strncmp(name, iter, cmplen)) {
1843  				flags |= block_flags;
1844  				return flags;
1845  			}
1846  
1847  			if (!*end || *end == ';')
1848  				break;
1849  			iter = end + 1;
1850  		}
1851  	}
1852  
1853  	return flags | slub_debug_local;
1854  }
1855  #else /* !CONFIG_SLUB_DEBUG */
1856  static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
1857  static inline
1858  void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
1859  
1860  static inline bool alloc_debug_processing(struct kmem_cache *s,
1861  	struct slab *slab, void *object, int orig_size) { return true; }
1862  
1863  static inline bool free_debug_processing(struct kmem_cache *s,
1864  	struct slab *slab, void *head, void *tail, int *bulk_cnt,
1865  	unsigned long addr, depot_stack_handle_t handle) { return true; }
1866  
1867  static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
1868  static inline int check_object(struct kmem_cache *s, struct slab *slab,
1869  			void *object, u8 val) { return 1; }
1870  static inline depot_stack_handle_t set_track_prepare(void) { return 0; }
1871  static inline void set_track(struct kmem_cache *s, void *object,
1872  			     enum track_item alloc, unsigned long addr) {}
1873  static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1874  					struct slab *slab) {}
1875  static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1876  					struct slab *slab) {}
1877  slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
1878  {
1879  	return flags;
1880  }
1881  #define slub_debug 0
1882  
1883  #define disable_higher_order_debug 0
1884  
1885  static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1886  							{ return 0; }
1887  static inline void inc_slabs_node(struct kmem_cache *s, int node,
1888  							int objects) {}
1889  static inline void dec_slabs_node(struct kmem_cache *s, int node,
1890  							int objects) {}
1891  #ifndef CONFIG_SLUB_TINY
1892  static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1893  			       void **freelist, void *nextfree)
1894  {
1895  	return false;
1896  }
1897  #endif
1898  #endif /* CONFIG_SLUB_DEBUG */
1899  
1900  #ifdef CONFIG_SLAB_OBJ_EXT
1901  
1902  #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
1903  
1904  static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
1905  {
1906  	struct slabobj_ext *slab_exts;
1907  	struct slab *obj_exts_slab;
1908  
1909  	obj_exts_slab = virt_to_slab(obj_exts);
1910  	slab_exts = slab_obj_exts(obj_exts_slab);
1911  	if (slab_exts) {
1912  		unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
1913  						 obj_exts_slab, obj_exts);
1914  		/* codetag should be NULL */
1915  		WARN_ON(slab_exts[offs].ref.ct);
1916  		set_codetag_empty(&slab_exts[offs].ref);
1917  	}
1918  }
1919  
1920  static inline void mark_failed_objexts_alloc(struct slab *slab)
1921  {
1922  	slab->obj_exts = OBJEXTS_ALLOC_FAIL;
1923  }
1924  
1925  static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
1926  			struct slabobj_ext *vec, unsigned int objects)
1927  {
1928  	/*
1929  	 * If vector previously failed to allocate then we have live
1930  	 * objects with no tag reference. Mark all references in this
1931  	 * vector as empty to avoid warnings later on.
1932  	 */
1933  	if (obj_exts & OBJEXTS_ALLOC_FAIL) {
1934  		unsigned int i;
1935  
1936  		for (i = 0; i < objects; i++)
1937  			set_codetag_empty(&vec[i].ref);
1938  	}
1939  }
1940  
1941  #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
1942  
1943  static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
1944  static inline void mark_failed_objexts_alloc(struct slab *slab) {}
1945  static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
1946  			struct slabobj_ext *vec, unsigned int objects) {}
1947  
1948  #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
1949  
1950  /*
1951   * The allocated objcg pointers array is not accounted directly.
1952   * Moreover, it should not come from DMA buffer and is not readily
1953   * reclaimable. So those GFP bits should be masked off.
1954   */
1955  #define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
1956  				__GFP_ACCOUNT | __GFP_NOFAIL)
1957  
1958  int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
1959  		        gfp_t gfp, bool new_slab)
1960  {
1961  	unsigned int objects = objs_per_slab(s, slab);
1962  	unsigned long new_exts;
1963  	unsigned long old_exts;
1964  	struct slabobj_ext *vec;
1965  
1966  	gfp &= ~OBJCGS_CLEAR_MASK;
1967  	/* Prevent recursive extension vector allocation */
1968  	gfp |= __GFP_NO_OBJ_EXT;
1969  	vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
1970  			   slab_nid(slab));
1971  	if (!vec) {
1972  		/* Mark vectors which failed to allocate */
1973  		if (new_slab)
1974  			mark_failed_objexts_alloc(slab);
1975  
1976  		return -ENOMEM;
1977  	}
1978  
1979  	new_exts = (unsigned long)vec;
1980  #ifdef CONFIG_MEMCG
1981  	new_exts |= MEMCG_DATA_OBJEXTS;
1982  #endif
1983  	old_exts = READ_ONCE(slab->obj_exts);
1984  	handle_failed_objexts_alloc(old_exts, vec, objects);
1985  	if (new_slab) {
1986  		/*
1987  		 * If the slab is brand new and nobody can yet access its
1988  		 * obj_exts, no synchronization is required and obj_exts can
1989  		 * be simply assigned.
1990  		 */
1991  		slab->obj_exts = new_exts;
1992  	} else if ((old_exts & ~OBJEXTS_FLAGS_MASK) ||
1993  		   cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
1994  		/*
1995  		 * If the slab is already in use, somebody can allocate and
1996  		 * assign slabobj_exts in parallel. In this case the existing
1997  		 * objcg vector should be reused.
1998  		 */
1999  		mark_objexts_empty(vec);
2000  		kfree(vec);
2001  		return 0;
2002  	}
2003  
2004  	kmemleak_not_leak(vec);
2005  	return 0;
2006  }
2007  
2008  static inline void free_slab_obj_exts(struct slab *slab)
2009  {
2010  	struct slabobj_ext *obj_exts;
2011  
2012  	obj_exts = slab_obj_exts(slab);
2013  	if (!obj_exts)
2014  		return;
2015  
2016  	/*
2017  	 * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
2018  	 * corresponding extension will be NULL. alloc_tag_sub() will throw a
2019  	 * warning if slab has extensions but the extension of an object is
2020  	 * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
2021  	 * the extension for obj_exts is expected to be NULL.
2022  	 */
2023  	mark_objexts_empty(obj_exts);
2024  	kfree(obj_exts);
2025  	slab->obj_exts = 0;
2026  }
2027  
2028  static inline bool need_slab_obj_ext(void)
2029  {
2030  	if (mem_alloc_profiling_enabled())
2031  		return true;
2032  
2033  	/*
2034  	 * CONFIG_MEMCG creates vector of obj_cgroup objects conditionally
2035  	 * inside memcg_slab_post_alloc_hook. No other users for now.
2036  	 */
2037  	return false;
2038  }
2039  
2040  #else /* CONFIG_SLAB_OBJ_EXT */
2041  
2042  static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
2043  			       gfp_t gfp, bool new_slab)
2044  {
2045  	return 0;
2046  }
2047  
2048  static inline void free_slab_obj_exts(struct slab *slab)
2049  {
2050  }
2051  
2052  static inline bool need_slab_obj_ext(void)
2053  {
2054  	return false;
2055  }
2056  
2057  #endif /* CONFIG_SLAB_OBJ_EXT */
2058  
2059  #ifdef CONFIG_MEM_ALLOC_PROFILING
2060  
2061  static inline struct slabobj_ext *
2062  prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
2063  {
2064  	struct slab *slab;
2065  
2066  	if (!p)
2067  		return NULL;
2068  
2069  	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2070  		return NULL;
2071  
2072  	if (flags & __GFP_NO_OBJ_EXT)
2073  		return NULL;
2074  
2075  	slab = virt_to_slab(p);
2076  	if (!slab_obj_exts(slab) &&
2077  	    WARN(alloc_slab_obj_exts(slab, s, flags, false),
2078  		 "%s, %s: Failed to create slab extension vector!\n",
2079  		 __func__, s->name))
2080  		return NULL;
2081  
2082  	return slab_obj_exts(slab) + obj_to_index(s, slab, p);
2083  }
2084  
2085  static inline void
2086  alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2087  {
2088  	if (need_slab_obj_ext()) {
2089  		struct slabobj_ext *obj_exts;
2090  
2091  		obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
2092  		/*
2093  		 * Currently obj_exts is used only for allocation profiling.
2094  		 * If other users appear then mem_alloc_profiling_enabled()
2095  		 * check should be added before alloc_tag_add().
2096  		 */
2097  		if (likely(obj_exts))
2098  			alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
2099  	}
2100  }
2101  
2102  static inline void
2103  alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2104  			     int objects)
2105  {
2106  	struct slabobj_ext *obj_exts;
2107  	int i;
2108  
2109  	if (!mem_alloc_profiling_enabled())
2110  		return;
2111  
2112  	/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
2113  	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2114  		return;
2115  
2116  	obj_exts = slab_obj_exts(slab);
2117  	if (!obj_exts)
2118  		return;
2119  
2120  	for (i = 0; i < objects; i++) {
2121  		unsigned int off = obj_to_index(s, slab, p[i]);
2122  
2123  		alloc_tag_sub(&obj_exts[off].ref, s->size);
2124  	}
2125  }
2126  
2127  #else /* CONFIG_MEM_ALLOC_PROFILING */
2128  
2129  static inline void
2130  alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2131  {
2132  }
2133  
2134  static inline void
2135  alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2136  			     int objects)
2137  {
2138  }
2139  
2140  #endif /* CONFIG_MEM_ALLOC_PROFILING */
2141  
2142  
2143  #ifdef CONFIG_MEMCG
2144  
2145  static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
2146  
2147  static __fastpath_inline
2148  bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2149  				gfp_t flags, size_t size, void **p)
2150  {
2151  	if (likely(!memcg_kmem_online()))
2152  		return true;
2153  
2154  	if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
2155  		return true;
2156  
2157  	if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
2158  		return true;
2159  
2160  	if (likely(size == 1)) {
2161  		memcg_alloc_abort_single(s, *p);
2162  		*p = NULL;
2163  	} else {
2164  		kmem_cache_free_bulk(s, size, p);
2165  	}
2166  
2167  	return false;
2168  }
2169  
2170  static __fastpath_inline
2171  void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2172  			  int objects)
2173  {
2174  	struct slabobj_ext *obj_exts;
2175  
2176  	if (!memcg_kmem_online())
2177  		return;
2178  
2179  	obj_exts = slab_obj_exts(slab);
2180  	if (likely(!obj_exts))
2181  		return;
2182  
2183  	__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
2184  }
2185  
2186  static __fastpath_inline
2187  bool memcg_slab_post_charge(void *p, gfp_t flags)
2188  {
2189  	struct slabobj_ext *slab_exts;
2190  	struct kmem_cache *s;
2191  	struct folio *folio;
2192  	struct slab *slab;
2193  	unsigned long off;
2194  
2195  	folio = virt_to_folio(p);
2196  	if (!folio_test_slab(folio)) {
2197  		return folio_memcg_kmem(folio) ||
2198  			(__memcg_kmem_charge_page(folio_page(folio, 0), flags,
2199  						  folio_order(folio)) == 0);
2200  	}
2201  
2202  	slab = folio_slab(folio);
2203  	s = slab->slab_cache;
2204  
2205  	/*
2206  	 * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
2207  	 * of slab_obj_exts being allocated from the same slab and thus the slab
2208  	 * becoming effectively unfreeable.
2209  	 */
2210  	if (is_kmalloc_normal(s))
2211  		return true;
2212  
2213  	/* Ignore already charged objects. */
2214  	slab_exts = slab_obj_exts(slab);
2215  	if (slab_exts) {
2216  		off = obj_to_index(s, slab, p);
2217  		if (unlikely(slab_exts[off].objcg))
2218  			return true;
2219  	}
2220  
2221  	return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
2222  }
2223  
2224  #else /* CONFIG_MEMCG */
2225  static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
2226  					      struct list_lru *lru,
2227  					      gfp_t flags, size_t size,
2228  					      void **p)
2229  {
2230  	return true;
2231  }
2232  
2233  static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
2234  					void **p, int objects)
2235  {
2236  }
2237  
2238  static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
2239  {
2240  	return true;
2241  }
2242  #endif /* CONFIG_MEMCG */
2243  
2244  #ifdef CONFIG_SLUB_RCU_DEBUG
2245  static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
2246  
2247  struct rcu_delayed_free {
2248  	struct rcu_head head;
2249  	void *object;
2250  };
2251  #endif
2252  
2253  /*
2254   * Hooks for other subsystems that check memory allocations. In a typical
2255   * production configuration these hooks all should produce no code at all.
2256   *
2257   * Returns true if freeing of the object can proceed, false if its reuse
2258   * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
2259   * to KFENCE.
2260   */
2261  static __always_inline
2262  bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
2263  		    bool after_rcu_delay)
2264  {
2265  	/* Are the object contents still accessible? */
2266  	bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;
2267  
2268  	kmemleak_free_recursive(x, s->flags);
2269  	kmsan_slab_free(s, x);
2270  
2271  	debug_check_no_locks_freed(x, s->object_size);
2272  
2273  	if (!(s->flags & SLAB_DEBUG_OBJECTS))
2274  		debug_check_no_obj_freed(x, s->object_size);
2275  
2276  	/* Use KCSAN to help debug racy use-after-free. */
2277  	if (!still_accessible)
2278  		__kcsan_check_access(x, s->object_size,
2279  				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
2280  
2281  	if (kfence_free(x))
2282  		return false;
2283  
2284  	/*
2285  	 * Give KASAN a chance to notice an invalid free operation before we
2286  	 * modify the object.
2287  	 */
2288  	if (kasan_slab_pre_free(s, x))
2289  		return false;
2290  
2291  #ifdef CONFIG_SLUB_RCU_DEBUG
2292  	if (still_accessible) {
2293  		struct rcu_delayed_free *delayed_free;
2294  
2295  		delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
2296  		if (delayed_free) {
2297  			/*
2298  			 * Let KASAN track our call stack as a "related work
2299  			 * creation", just like if the object had been freed
2300  			 * normally via kfree_rcu().
2301  			 * We have to do this manually because the rcu_head is
2302  			 * not located inside the object.
2303  			 */
2304  			kasan_record_aux_stack_noalloc(x);
2305  
2306  			delayed_free->object = x;
2307  			call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
2308  			return false;
2309  		}
2310  	}
2311  #endif /* CONFIG_SLUB_RCU_DEBUG */
2312  
2313  	/*
2314  	 * As memory initialization might be integrated into KASAN,
2315  	 * kasan_slab_free and initialization memset's must be
2316  	 * kept together to avoid discrepancies in behavior.
2317  	 *
2318  	 * The initialization memset's clear the object and the metadata,
2319  	 * but don't touch the SLAB redzone.
2320  	 *
2321  	 * The object's freepointer is also avoided if stored outside the
2322  	 * object.
2323  	 */
2324  	if (unlikely(init)) {
2325  		int rsize;
2326  		unsigned int inuse, orig_size;
2327  
2328  		inuse = get_info_end(s);
2329  		orig_size = get_orig_size(s, x);
2330  		if (!kasan_has_integrated_init())
2331  			memset(kasan_reset_tag(x), 0, orig_size);
2332  		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
2333  		memset((char *)kasan_reset_tag(x) + inuse, 0,
2334  		       s->size - inuse - rsize);
2335  		/*
2336  		 * Restore orig_size, otherwize kmalloc redzone overwritten
2337  		 * would be reported
2338  		 */
2339  		set_orig_size(s, x, orig_size);
2340  
2341  	}
2342  	/* KASAN might put x into memory quarantine, delaying its reuse. */
2343  	return !kasan_slab_free(s, x, init, still_accessible);
2344  }
2345  
2346  static __fastpath_inline
2347  bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
2348  			     int *cnt)
2349  {
2350  
2351  	void *object;
2352  	void *next = *head;
2353  	void *old_tail = *tail;
2354  	bool init;
2355  
2356  	if (is_kfence_address(next)) {
2357  		slab_free_hook(s, next, false, false);
2358  		return false;
2359  	}
2360  
2361  	/* Head and tail of the reconstructed freelist */
2362  	*head = NULL;
2363  	*tail = NULL;
2364  
2365  	init = slab_want_init_on_free(s);
2366  
2367  	do {
2368  		object = next;
2369  		next = get_freepointer(s, object);
2370  
2371  		/* If object's reuse doesn't have to be delayed */
2372  		if (likely(slab_free_hook(s, object, init, false))) {
2373  			/* Move object to the new freelist */
2374  			set_freepointer(s, object, *head);
2375  			*head = object;
2376  			if (!*tail)
2377  				*tail = object;
2378  		} else {
2379  			/*
2380  			 * Adjust the reconstructed freelist depth
2381  			 * accordingly if object's reuse is delayed.
2382  			 */
2383  			--(*cnt);
2384  		}
2385  	} while (object != old_tail);
2386  
2387  	return *head != NULL;
2388  }
2389  
2390  static void *setup_object(struct kmem_cache *s, void *object)
2391  {
2392  	setup_object_debug(s, object);
2393  	object = kasan_init_slab_obj(s, object);
2394  	if (unlikely(s->ctor)) {
2395  		kasan_unpoison_new_object(s, object);
2396  		s->ctor(object);
2397  		kasan_poison_new_object(s, object);
2398  	}
2399  	return object;
2400  }
2401  
2402  /*
2403   * Slab allocation and freeing
2404   */
2405  static inline struct slab *alloc_slab_page(gfp_t flags, int node,
2406  		struct kmem_cache_order_objects oo)
2407  {
2408  	struct folio *folio;
2409  	struct slab *slab;
2410  	unsigned int order = oo_order(oo);
2411  
2412  	if (node == NUMA_NO_NODE)
2413  		folio = (struct folio *)alloc_pages(flags, order);
2414  	else
2415  		folio = (struct folio *)__alloc_pages_node(node, flags, order);
2416  
2417  	if (!folio)
2418  		return NULL;
2419  
2420  	slab = folio_slab(folio);
2421  	__folio_set_slab(folio);
2422  	/* Make the flag visible before any changes to folio->mapping */
2423  	smp_wmb();
2424  	if (folio_is_pfmemalloc(folio))
2425  		slab_set_pfmemalloc(slab);
2426  
2427  	return slab;
2428  }
2429  
2430  #ifdef CONFIG_SLAB_FREELIST_RANDOM
2431  /* Pre-initialize the random sequence cache */
2432  static int init_cache_random_seq(struct kmem_cache *s)
2433  {
2434  	unsigned int count = oo_objects(s->oo);
2435  	int err;
2436  
2437  	/* Bailout if already initialised */
2438  	if (s->random_seq)
2439  		return 0;
2440  
2441  	err = cache_random_seq_create(s, count, GFP_KERNEL);
2442  	if (err) {
2443  		pr_err("SLUB: Unable to initialize free list for %s\n",
2444  			s->name);
2445  		return err;
2446  	}
2447  
2448  	/* Transform to an offset on the set of pages */
2449  	if (s->random_seq) {
2450  		unsigned int i;
2451  
2452  		for (i = 0; i < count; i++)
2453  			s->random_seq[i] *= s->size;
2454  	}
2455  	return 0;
2456  }
2457  
2458  /* Initialize each random sequence freelist per cache */
2459  static void __init init_freelist_randomization(void)
2460  {
2461  	struct kmem_cache *s;
2462  
2463  	mutex_lock(&slab_mutex);
2464  
2465  	list_for_each_entry(s, &slab_caches, list)
2466  		init_cache_random_seq(s);
2467  
2468  	mutex_unlock(&slab_mutex);
2469  }
2470  
2471  /* Get the next entry on the pre-computed freelist randomized */
2472  static void *next_freelist_entry(struct kmem_cache *s,
2473  				unsigned long *pos, void *start,
2474  				unsigned long page_limit,
2475  				unsigned long freelist_count)
2476  {
2477  	unsigned int idx;
2478  
2479  	/*
2480  	 * If the target page allocation failed, the number of objects on the
2481  	 * page might be smaller than the usual size defined by the cache.
2482  	 */
2483  	do {
2484  		idx = s->random_seq[*pos];
2485  		*pos += 1;
2486  		if (*pos >= freelist_count)
2487  			*pos = 0;
2488  	} while (unlikely(idx >= page_limit));
2489  
2490  	return (char *)start + idx;
2491  }
2492  
2493  /* Shuffle the single linked freelist based on a random pre-computed sequence */
2494  static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
2495  {
2496  	void *start;
2497  	void *cur;
2498  	void *next;
2499  	unsigned long idx, pos, page_limit, freelist_count;
2500  
2501  	if (slab->objects < 2 || !s->random_seq)
2502  		return false;
2503  
2504  	freelist_count = oo_objects(s->oo);
2505  	pos = get_random_u32_below(freelist_count);
2506  
2507  	page_limit = slab->objects * s->size;
2508  	start = fixup_red_left(s, slab_address(slab));
2509  
2510  	/* First entry is used as the base of the freelist */
2511  	cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
2512  	cur = setup_object(s, cur);
2513  	slab->freelist = cur;
2514  
2515  	for (idx = 1; idx < slab->objects; idx++) {
2516  		next = next_freelist_entry(s, &pos, start, page_limit,
2517  			freelist_count);
2518  		next = setup_object(s, next);
2519  		set_freepointer(s, cur, next);
2520  		cur = next;
2521  	}
2522  	set_freepointer(s, cur, NULL);
2523  
2524  	return true;
2525  }
2526  #else
2527  static inline int init_cache_random_seq(struct kmem_cache *s)
2528  {
2529  	return 0;
2530  }
2531  static inline void init_freelist_randomization(void) { }
2532  static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
2533  {
2534  	return false;
2535  }
2536  #endif /* CONFIG_SLAB_FREELIST_RANDOM */
2537  
2538  static __always_inline void account_slab(struct slab *slab, int order,
2539  					 struct kmem_cache *s, gfp_t gfp)
2540  {
2541  	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
2542  		alloc_slab_obj_exts(slab, s, gfp, true);
2543  
2544  	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
2545  			    PAGE_SIZE << order);
2546  }
2547  
2548  static __always_inline void unaccount_slab(struct slab *slab, int order,
2549  					   struct kmem_cache *s)
2550  {
2551  	if (memcg_kmem_online() || need_slab_obj_ext())
2552  		free_slab_obj_exts(slab);
2553  
2554  	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
2555  			    -(PAGE_SIZE << order));
2556  }
2557  
2558  static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
2559  {
2560  	struct slab *slab;
2561  	struct kmem_cache_order_objects oo = s->oo;
2562  	gfp_t alloc_gfp;
2563  	void *start, *p, *next;
2564  	int idx;
2565  	bool shuffle;
2566  
2567  	flags &= gfp_allowed_mask;
2568  
2569  	flags |= s->allocflags;
2570  
2571  	/*
2572  	 * Let the initial higher-order allocation fail under memory pressure
2573  	 * so we fall-back to the minimum order allocation.
2574  	 */
2575  	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
2576  	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
2577  		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
2578  
2579  	slab = alloc_slab_page(alloc_gfp, node, oo);
2580  	if (unlikely(!slab)) {
2581  		oo = s->min;
2582  		alloc_gfp = flags;
2583  		/*
2584  		 * Allocation may have failed due to fragmentation.
2585  		 * Try a lower order alloc if possible
2586  		 */
2587  		slab = alloc_slab_page(alloc_gfp, node, oo);
2588  		if (unlikely(!slab))
2589  			return NULL;
2590  		stat(s, ORDER_FALLBACK);
2591  	}
2592  
2593  	slab->objects = oo_objects(oo);
2594  	slab->inuse = 0;
2595  	slab->frozen = 0;
2596  
2597  	account_slab(slab, oo_order(oo), s, flags);
2598  
2599  	slab->slab_cache = s;
2600  
2601  	kasan_poison_slab(slab);
2602  
2603  	start = slab_address(slab);
2604  
2605  	setup_slab_debug(s, slab, start);
2606  
2607  	shuffle = shuffle_freelist(s, slab);
2608  
2609  	if (!shuffle) {
2610  		start = fixup_red_left(s, start);
2611  		start = setup_object(s, start);
2612  		slab->freelist = start;
2613  		for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
2614  			next = p + s->size;
2615  			next = setup_object(s, next);
2616  			set_freepointer(s, p, next);
2617  			p = next;
2618  		}
2619  		set_freepointer(s, p, NULL);
2620  	}
2621  
2622  	return slab;
2623  }
2624  
2625  static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
2626  {
2627  	if (unlikely(flags & GFP_SLAB_BUG_MASK))
2628  		flags = kmalloc_fix_flags(flags);
2629  
2630  	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
2631  
2632  	return allocate_slab(s,
2633  		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
2634  }
2635  
2636  static void __free_slab(struct kmem_cache *s, struct slab *slab)
2637  {
2638  	struct folio *folio = slab_folio(slab);
2639  	int order = folio_order(folio);
2640  	int pages = 1 << order;
2641  
2642  	__slab_clear_pfmemalloc(slab);
2643  	folio->mapping = NULL;
2644  	/* Make the mapping reset visible before clearing the flag */
2645  	smp_wmb();
2646  	__folio_clear_slab(folio);
2647  	mm_account_reclaimed_pages(pages);
2648  	unaccount_slab(slab, order, s);
2649  	__free_pages(&folio->page, order);
2650  }
2651  
2652  static void rcu_free_slab(struct rcu_head *h)
2653  {
2654  	struct slab *slab = container_of(h, struct slab, rcu_head);
2655  
2656  	__free_slab(slab->slab_cache, slab);
2657  }
2658  
2659  static void free_slab(struct kmem_cache *s, struct slab *slab)
2660  {
2661  	if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
2662  		void *p;
2663  
2664  		slab_pad_check(s, slab);
2665  		for_each_object(p, s, slab_address(slab), slab->objects)
2666  			check_object(s, slab, p, SLUB_RED_INACTIVE);
2667  	}
2668  
2669  	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
2670  		call_rcu(&slab->rcu_head, rcu_free_slab);
2671  	else
2672  		__free_slab(s, slab);
2673  }
2674  
2675  static void discard_slab(struct kmem_cache *s, struct slab *slab)
2676  {
2677  	dec_slabs_node(s, slab_nid(slab), slab->objects);
2678  	free_slab(s, slab);
2679  }
2680  
2681  /*
2682   * SLUB reuses PG_workingset bit to keep track of whether it's on
2683   * the per-node partial list.
2684   */
2685  static inline bool slab_test_node_partial(const struct slab *slab)
2686  {
2687  	return folio_test_workingset(slab_folio(slab));
2688  }
2689  
2690  static inline void slab_set_node_partial(struct slab *slab)
2691  {
2692  	set_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
2693  }
2694  
2695  static inline void slab_clear_node_partial(struct slab *slab)
2696  {
2697  	clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0));
2698  }
2699  
2700  /*
2701   * Management of partially allocated slabs.
2702   */
2703  static inline void
2704  __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
2705  {
2706  	n->nr_partial++;
2707  	if (tail == DEACTIVATE_TO_TAIL)
2708  		list_add_tail(&slab->slab_list, &n->partial);
2709  	else
2710  		list_add(&slab->slab_list, &n->partial);
2711  	slab_set_node_partial(slab);
2712  }
2713  
2714  static inline void add_partial(struct kmem_cache_node *n,
2715  				struct slab *slab, int tail)
2716  {
2717  	lockdep_assert_held(&n->list_lock);
2718  	__add_partial(n, slab, tail);
2719  }
2720  
2721  static inline void remove_partial(struct kmem_cache_node *n,
2722  					struct slab *slab)
2723  {
2724  	lockdep_assert_held(&n->list_lock);
2725  	list_del(&slab->slab_list);
2726  	slab_clear_node_partial(slab);
2727  	n->nr_partial--;
2728  }
2729  
2730  /*
2731   * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
2732   * slab from the n->partial list. Remove only a single object from the slab, do
2733   * the alloc_debug_processing() checks and leave the slab on the list, or move
2734   * it to full list if it was the last free object.
2735   */
2736  static void *alloc_single_from_partial(struct kmem_cache *s,
2737  		struct kmem_cache_node *n, struct slab *slab, int orig_size)
2738  {
2739  	void *object;
2740  
2741  	lockdep_assert_held(&n->list_lock);
2742  
2743  	object = slab->freelist;
2744  	slab->freelist = get_freepointer(s, object);
2745  	slab->inuse++;
2746  
2747  	if (!alloc_debug_processing(s, slab, object, orig_size)) {
2748  		remove_partial(n, slab);
2749  		return NULL;
2750  	}
2751  
2752  	if (slab->inuse == slab->objects) {
2753  		remove_partial(n, slab);
2754  		add_full(s, n, slab);
2755  	}
2756  
2757  	return object;
2758  }
2759  
2760  /*
2761   * Called only for kmem_cache_debug() caches to allocate from a freshly
2762   * allocated slab. Allocate a single object instead of whole freelist
2763   * and put the slab to the partial (or full) list.
2764   */
2765  static void *alloc_single_from_new_slab(struct kmem_cache *s,
2766  					struct slab *slab, int orig_size)
2767  {
2768  	int nid = slab_nid(slab);
2769  	struct kmem_cache_node *n = get_node(s, nid);
2770  	unsigned long flags;
2771  	void *object;
2772  
2773  
2774  	object = slab->freelist;
2775  	slab->freelist = get_freepointer(s, object);
2776  	slab->inuse = 1;
2777  
2778  	if (!alloc_debug_processing(s, slab, object, orig_size))
2779  		/*
2780  		 * It's not really expected that this would fail on a
2781  		 * freshly allocated slab, but a concurrent memory
2782  		 * corruption in theory could cause that.
2783  		 */
2784  		return NULL;
2785  
2786  	spin_lock_irqsave(&n->list_lock, flags);
2787  
2788  	if (slab->inuse == slab->objects)
2789  		add_full(s, n, slab);
2790  	else
2791  		add_partial(n, slab, DEACTIVATE_TO_HEAD);
2792  
2793  	inc_slabs_node(s, nid, slab->objects);
2794  	spin_unlock_irqrestore(&n->list_lock, flags);
2795  
2796  	return object;
2797  }
2798  
2799  #ifdef CONFIG_SLUB_CPU_PARTIAL
2800  static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
2801  #else
2802  static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
2803  				   int drain) { }
2804  #endif
2805  static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
2806  
2807  /*
2808   * Try to allocate a partial slab from a specific node.
2809   */
2810  static struct slab *get_partial_node(struct kmem_cache *s,
2811  				     struct kmem_cache_node *n,
2812  				     struct partial_context *pc)
2813  {
2814  	struct slab *slab, *slab2, *partial = NULL;
2815  	unsigned long flags;
2816  	unsigned int partial_slabs = 0;
2817  
2818  	/*
2819  	 * Racy check. If we mistakenly see no partial slabs then we
2820  	 * just allocate an empty slab. If we mistakenly try to get a
2821  	 * partial slab and there is none available then get_partial()
2822  	 * will return NULL.
2823  	 */
2824  	if (!n || !n->nr_partial)
2825  		return NULL;
2826  
2827  	spin_lock_irqsave(&n->list_lock, flags);
2828  	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
2829  		if (!pfmemalloc_match(slab, pc->flags))
2830  			continue;
2831  
2832  		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
2833  			void *object = alloc_single_from_partial(s, n, slab,
2834  							pc->orig_size);
2835  			if (object) {
2836  				partial = slab;
2837  				pc->object = object;
2838  				break;
2839  			}
2840  			continue;
2841  		}
2842  
2843  		remove_partial(n, slab);
2844  
2845  		if (!partial) {
2846  			partial = slab;
2847  			stat(s, ALLOC_FROM_PARTIAL);
2848  
2849  			if ((slub_get_cpu_partial(s) == 0)) {
2850  				break;
2851  			}
2852  		} else {
2853  			put_cpu_partial(s, slab, 0);
2854  			stat(s, CPU_PARTIAL_NODE);
2855  
2856  			if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
2857  				break;
2858  			}
2859  		}
2860  	}
2861  	spin_unlock_irqrestore(&n->list_lock, flags);
2862  	return partial;
2863  }
2864  
2865  /*
2866   * Get a slab from somewhere. Search in increasing NUMA distances.
2867   */
2868  static struct slab *get_any_partial(struct kmem_cache *s,
2869  				    struct partial_context *pc)
2870  {
2871  #ifdef CONFIG_NUMA
2872  	struct zonelist *zonelist;
2873  	struct zoneref *z;
2874  	struct zone *zone;
2875  	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
2876  	struct slab *slab;
2877  	unsigned int cpuset_mems_cookie;
2878  
2879  	/*
2880  	 * The defrag ratio allows a configuration of the tradeoffs between
2881  	 * inter node defragmentation and node local allocations. A lower
2882  	 * defrag_ratio increases the tendency to do local allocations
2883  	 * instead of attempting to obtain partial slabs from other nodes.
2884  	 *
2885  	 * If the defrag_ratio is set to 0 then kmalloc() always
2886  	 * returns node local objects. If the ratio is higher then kmalloc()
2887  	 * may return off node objects because partial slabs are obtained
2888  	 * from other nodes and filled up.
2889  	 *
2890  	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
2891  	 * (which makes defrag_ratio = 1000) then every (well almost)
2892  	 * allocation will first attempt to defrag slab caches on other nodes.
2893  	 * This means scanning over all nodes to look for partial slabs which
2894  	 * may be expensive if we do it every time we are trying to find a slab
2895  	 * with available objects.
2896  	 */
2897  	if (!s->remote_node_defrag_ratio ||
2898  			get_cycles() % 1024 > s->remote_node_defrag_ratio)
2899  		return NULL;
2900  
2901  	do {
2902  		cpuset_mems_cookie = read_mems_allowed_begin();
2903  		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
2904  		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
2905  			struct kmem_cache_node *n;
2906  
2907  			n = get_node(s, zone_to_nid(zone));
2908  
2909  			if (n && cpuset_zone_allowed(zone, pc->flags) &&
2910  					n->nr_partial > s->min_partial) {
2911  				slab = get_partial_node(s, n, pc);
2912  				if (slab) {
2913  					/*
2914  					 * Don't check read_mems_allowed_retry()
2915  					 * here - if mems_allowed was updated in
2916  					 * parallel, that was a harmless race
2917  					 * between allocation and the cpuset
2918  					 * update
2919  					 */
2920  					return slab;
2921  				}
2922  			}
2923  		}
2924  	} while (read_mems_allowed_retry(cpuset_mems_cookie));
2925  #endif	/* CONFIG_NUMA */
2926  	return NULL;
2927  }
2928  
2929  /*
2930   * Get a partial slab, lock it and return it.
2931   */
2932  static struct slab *get_partial(struct kmem_cache *s, int node,
2933  				struct partial_context *pc)
2934  {
2935  	struct slab *slab;
2936  	int searchnode = node;
2937  
2938  	if (node == NUMA_NO_NODE)
2939  		searchnode = numa_mem_id();
2940  
2941  	slab = get_partial_node(s, get_node(s, searchnode), pc);
2942  	if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
2943  		return slab;
2944  
2945  	return get_any_partial(s, pc);
2946  }
2947  
2948  #ifndef CONFIG_SLUB_TINY
2949  
2950  #ifdef CONFIG_PREEMPTION
2951  /*
2952   * Calculate the next globally unique transaction for disambiguation
2953   * during cmpxchg. The transactions start with the cpu number and are then
2954   * incremented by CONFIG_NR_CPUS.
2955   */
2956  #define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
2957  #else
2958  /*
2959   * No preemption supported therefore also no need to check for
2960   * different cpus.
2961   */
2962  #define TID_STEP 1
2963  #endif /* CONFIG_PREEMPTION */
2964  
2965  static inline unsigned long next_tid(unsigned long tid)
2966  {
2967  	return tid + TID_STEP;
2968  }
2969  
2970  #ifdef SLUB_DEBUG_CMPXCHG
2971  static inline unsigned int tid_to_cpu(unsigned long tid)
2972  {
2973  	return tid % TID_STEP;
2974  }
2975  
2976  static inline unsigned long tid_to_event(unsigned long tid)
2977  {
2978  	return tid / TID_STEP;
2979  }
2980  #endif
2981  
2982  static inline unsigned int init_tid(int cpu)
2983  {
2984  	return cpu;
2985  }
2986  
2987  static inline void note_cmpxchg_failure(const char *n,
2988  		const struct kmem_cache *s, unsigned long tid)
2989  {
2990  #ifdef SLUB_DEBUG_CMPXCHG
2991  	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
2992  
2993  	pr_info("%s %s: cmpxchg redo ", n, s->name);
2994  
2995  #ifdef CONFIG_PREEMPTION
2996  	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
2997  		pr_warn("due to cpu change %d -> %d\n",
2998  			tid_to_cpu(tid), tid_to_cpu(actual_tid));
2999  	else
3000  #endif
3001  	if (tid_to_event(tid) != tid_to_event(actual_tid))
3002  		pr_warn("due to cpu running other code. Event %ld->%ld\n",
3003  			tid_to_event(tid), tid_to_event(actual_tid));
3004  	else
3005  		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
3006  			actual_tid, tid, next_tid(tid));
3007  #endif
3008  	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
3009  }
3010  
3011  static void init_kmem_cache_cpus(struct kmem_cache *s)
3012  {
3013  	int cpu;
3014  	struct kmem_cache_cpu *c;
3015  
3016  	for_each_possible_cpu(cpu) {
3017  		c = per_cpu_ptr(s->cpu_slab, cpu);
3018  		local_lock_init(&c->lock);
3019  		c->tid = init_tid(cpu);
3020  	}
3021  }
3022  
3023  /*
3024   * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
3025   * unfreezes the slabs and puts it on the proper list.
3026   * Assumes the slab has been already safely taken away from kmem_cache_cpu
3027   * by the caller.
3028   */
3029  static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
3030  			    void *freelist)
3031  {
3032  	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
3033  	int free_delta = 0;
3034  	void *nextfree, *freelist_iter, *freelist_tail;
3035  	int tail = DEACTIVATE_TO_HEAD;
3036  	unsigned long flags = 0;
3037  	struct slab new;
3038  	struct slab old;
3039  
3040  	if (READ_ONCE(slab->freelist)) {
3041  		stat(s, DEACTIVATE_REMOTE_FREES);
3042  		tail = DEACTIVATE_TO_TAIL;
3043  	}
3044  
3045  	/*
3046  	 * Stage one: Count the objects on cpu's freelist as free_delta and
3047  	 * remember the last object in freelist_tail for later splicing.
3048  	 */
3049  	freelist_tail = NULL;
3050  	freelist_iter = freelist;
3051  	while (freelist_iter) {
3052  		nextfree = get_freepointer(s, freelist_iter);
3053  
3054  		/*
3055  		 * If 'nextfree' is invalid, it is possible that the object at
3056  		 * 'freelist_iter' is already corrupted.  So isolate all objects
3057  		 * starting at 'freelist_iter' by skipping them.
3058  		 */
3059  		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
3060  			break;
3061  
3062  		freelist_tail = freelist_iter;
3063  		free_delta++;
3064  
3065  		freelist_iter = nextfree;
3066  	}
3067  
3068  	/*
3069  	 * Stage two: Unfreeze the slab while splicing the per-cpu
3070  	 * freelist to the head of slab's freelist.
3071  	 */
3072  	do {
3073  		old.freelist = READ_ONCE(slab->freelist);
3074  		old.counters = READ_ONCE(slab->counters);
3075  		VM_BUG_ON(!old.frozen);
3076  
3077  		/* Determine target state of the slab */
3078  		new.counters = old.counters;
3079  		new.frozen = 0;
3080  		if (freelist_tail) {
3081  			new.inuse -= free_delta;
3082  			set_freepointer(s, freelist_tail, old.freelist);
3083  			new.freelist = freelist;
3084  		} else {
3085  			new.freelist = old.freelist;
3086  		}
3087  	} while (!slab_update_freelist(s, slab,
3088  		old.freelist, old.counters,
3089  		new.freelist, new.counters,
3090  		"unfreezing slab"));
3091  
3092  	/*
3093  	 * Stage three: Manipulate the slab list based on the updated state.
3094  	 */
3095  	if (!new.inuse && n->nr_partial >= s->min_partial) {
3096  		stat(s, DEACTIVATE_EMPTY);
3097  		discard_slab(s, slab);
3098  		stat(s, FREE_SLAB);
3099  	} else if (new.freelist) {
3100  		spin_lock_irqsave(&n->list_lock, flags);
3101  		add_partial(n, slab, tail);
3102  		spin_unlock_irqrestore(&n->list_lock, flags);
3103  		stat(s, tail);
3104  	} else {
3105  		stat(s, DEACTIVATE_FULL);
3106  	}
3107  }
3108  
3109  #ifdef CONFIG_SLUB_CPU_PARTIAL
3110  static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
3111  {
3112  	struct kmem_cache_node *n = NULL, *n2 = NULL;
3113  	struct slab *slab, *slab_to_discard = NULL;
3114  	unsigned long flags = 0;
3115  
3116  	while (partial_slab) {
3117  		slab = partial_slab;
3118  		partial_slab = slab->next;
3119  
3120  		n2 = get_node(s, slab_nid(slab));
3121  		if (n != n2) {
3122  			if (n)
3123  				spin_unlock_irqrestore(&n->list_lock, flags);
3124  
3125  			n = n2;
3126  			spin_lock_irqsave(&n->list_lock, flags);
3127  		}
3128  
3129  		if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
3130  			slab->next = slab_to_discard;
3131  			slab_to_discard = slab;
3132  		} else {
3133  			add_partial(n, slab, DEACTIVATE_TO_TAIL);
3134  			stat(s, FREE_ADD_PARTIAL);
3135  		}
3136  	}
3137  
3138  	if (n)
3139  		spin_unlock_irqrestore(&n->list_lock, flags);
3140  
3141  	while (slab_to_discard) {
3142  		slab = slab_to_discard;
3143  		slab_to_discard = slab_to_discard->next;
3144  
3145  		stat(s, DEACTIVATE_EMPTY);
3146  		discard_slab(s, slab);
3147  		stat(s, FREE_SLAB);
3148  	}
3149  }
3150  
3151  /*
3152   * Put all the cpu partial slabs to the node partial list.
3153   */
3154  static void put_partials(struct kmem_cache *s)
3155  {
3156  	struct slab *partial_slab;
3157  	unsigned long flags;
3158  
3159  	local_lock_irqsave(&s->cpu_slab->lock, flags);
3160  	partial_slab = this_cpu_read(s->cpu_slab->partial);
3161  	this_cpu_write(s->cpu_slab->partial, NULL);
3162  	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3163  
3164  	if (partial_slab)
3165  		__put_partials(s, partial_slab);
3166  }
3167  
3168  static void put_partials_cpu(struct kmem_cache *s,
3169  			     struct kmem_cache_cpu *c)
3170  {
3171  	struct slab *partial_slab;
3172  
3173  	partial_slab = slub_percpu_partial(c);
3174  	c->partial = NULL;
3175  
3176  	if (partial_slab)
3177  		__put_partials(s, partial_slab);
3178  }
3179  
3180  /*
3181   * Put a slab into a partial slab slot if available.
3182   *
3183   * If we did not find a slot then simply move all the partials to the
3184   * per node partial list.
3185   */
3186  static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
3187  {
3188  	struct slab *oldslab;
3189  	struct slab *slab_to_put = NULL;
3190  	unsigned long flags;
3191  	int slabs = 0;
3192  
3193  	local_lock_irqsave(&s->cpu_slab->lock, flags);
3194  
3195  	oldslab = this_cpu_read(s->cpu_slab->partial);
3196  
3197  	if (oldslab) {
3198  		if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
3199  			/*
3200  			 * Partial array is full. Move the existing set to the
3201  			 * per node partial list. Postpone the actual unfreezing
3202  			 * outside of the critical section.
3203  			 */
3204  			slab_to_put = oldslab;
3205  			oldslab = NULL;
3206  		} else {
3207  			slabs = oldslab->slabs;
3208  		}
3209  	}
3210  
3211  	slabs++;
3212  
3213  	slab->slabs = slabs;
3214  	slab->next = oldslab;
3215  
3216  	this_cpu_write(s->cpu_slab->partial, slab);
3217  
3218  	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3219  
3220  	if (slab_to_put) {
3221  		__put_partials(s, slab_to_put);
3222  		stat(s, CPU_PARTIAL_DRAIN);
3223  	}
3224  }
3225  
3226  #else	/* CONFIG_SLUB_CPU_PARTIAL */
3227  
3228  static inline void put_partials(struct kmem_cache *s) { }
3229  static inline void put_partials_cpu(struct kmem_cache *s,
3230  				    struct kmem_cache_cpu *c) { }
3231  
3232  #endif	/* CONFIG_SLUB_CPU_PARTIAL */
3233  
3234  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
3235  {
3236  	unsigned long flags;
3237  	struct slab *slab;
3238  	void *freelist;
3239  
3240  	local_lock_irqsave(&s->cpu_slab->lock, flags);
3241  
3242  	slab = c->slab;
3243  	freelist = c->freelist;
3244  
3245  	c->slab = NULL;
3246  	c->freelist = NULL;
3247  	c->tid = next_tid(c->tid);
3248  
3249  	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3250  
3251  	if (slab) {
3252  		deactivate_slab(s, slab, freelist);
3253  		stat(s, CPUSLAB_FLUSH);
3254  	}
3255  }
3256  
3257  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
3258  {
3259  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3260  	void *freelist = c->freelist;
3261  	struct slab *slab = c->slab;
3262  
3263  	c->slab = NULL;
3264  	c->freelist = NULL;
3265  	c->tid = next_tid(c->tid);
3266  
3267  	if (slab) {
3268  		deactivate_slab(s, slab, freelist);
3269  		stat(s, CPUSLAB_FLUSH);
3270  	}
3271  
3272  	put_partials_cpu(s, c);
3273  }
3274  
3275  struct slub_flush_work {
3276  	struct work_struct work;
3277  	struct kmem_cache *s;
3278  	bool skip;
3279  };
3280  
3281  /*
3282   * Flush cpu slab.
3283   *
3284   * Called from CPU work handler with migration disabled.
3285   */
3286  static void flush_cpu_slab(struct work_struct *w)
3287  {
3288  	struct kmem_cache *s;
3289  	struct kmem_cache_cpu *c;
3290  	struct slub_flush_work *sfw;
3291  
3292  	sfw = container_of(w, struct slub_flush_work, work);
3293  
3294  	s = sfw->s;
3295  	c = this_cpu_ptr(s->cpu_slab);
3296  
3297  	if (c->slab)
3298  		flush_slab(s, c);
3299  
3300  	put_partials(s);
3301  }
3302  
3303  static bool has_cpu_slab(int cpu, struct kmem_cache *s)
3304  {
3305  	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3306  
3307  	return c->slab || slub_percpu_partial(c);
3308  }
3309  
3310  static DEFINE_MUTEX(flush_lock);
3311  static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
3312  
3313  static void flush_all_cpus_locked(struct kmem_cache *s)
3314  {
3315  	struct slub_flush_work *sfw;
3316  	unsigned int cpu;
3317  
3318  	lockdep_assert_cpus_held();
3319  	mutex_lock(&flush_lock);
3320  
3321  	for_each_online_cpu(cpu) {
3322  		sfw = &per_cpu(slub_flush, cpu);
3323  		if (!has_cpu_slab(cpu, s)) {
3324  			sfw->skip = true;
3325  			continue;
3326  		}
3327  		INIT_WORK(&sfw->work, flush_cpu_slab);
3328  		sfw->skip = false;
3329  		sfw->s = s;
3330  		queue_work_on(cpu, flushwq, &sfw->work);
3331  	}
3332  
3333  	for_each_online_cpu(cpu) {
3334  		sfw = &per_cpu(slub_flush, cpu);
3335  		if (sfw->skip)
3336  			continue;
3337  		flush_work(&sfw->work);
3338  	}
3339  
3340  	mutex_unlock(&flush_lock);
3341  }
3342  
3343  static void flush_all(struct kmem_cache *s)
3344  {
3345  	cpus_read_lock();
3346  	flush_all_cpus_locked(s);
3347  	cpus_read_unlock();
3348  }
3349  
3350  /*
3351   * Use the cpu notifier to insure that the cpu slabs are flushed when
3352   * necessary.
3353   */
3354  static int slub_cpu_dead(unsigned int cpu)
3355  {
3356  	struct kmem_cache *s;
3357  
3358  	mutex_lock(&slab_mutex);
3359  	list_for_each_entry(s, &slab_caches, list)
3360  		__flush_cpu_slab(s, cpu);
3361  	mutex_unlock(&slab_mutex);
3362  	return 0;
3363  }
3364  
3365  #else /* CONFIG_SLUB_TINY */
3366  static inline void flush_all_cpus_locked(struct kmem_cache *s) { }
3367  static inline void flush_all(struct kmem_cache *s) { }
3368  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
3369  static inline int slub_cpu_dead(unsigned int cpu) { return 0; }
3370  #endif /* CONFIG_SLUB_TINY */
3371  
3372  /*
3373   * Check if the objects in a per cpu structure fit numa
3374   * locality expectations.
3375   */
3376  static inline int node_match(struct slab *slab, int node)
3377  {
3378  #ifdef CONFIG_NUMA
3379  	if (node != NUMA_NO_NODE && slab_nid(slab) != node)
3380  		return 0;
3381  #endif
3382  	return 1;
3383  }
3384  
3385  #ifdef CONFIG_SLUB_DEBUG
3386  static int count_free(struct slab *slab)
3387  {
3388  	return slab->objects - slab->inuse;
3389  }
3390  
3391  static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
3392  {
3393  	return atomic_long_read(&n->total_objects);
3394  }
3395  
3396  /* Supports checking bulk free of a constructed freelist */
3397  static inline bool free_debug_processing(struct kmem_cache *s,
3398  	struct slab *slab, void *head, void *tail, int *bulk_cnt,
3399  	unsigned long addr, depot_stack_handle_t handle)
3400  {
3401  	bool checks_ok = false;
3402  	void *object = head;
3403  	int cnt = 0;
3404  
3405  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
3406  		if (!check_slab(s, slab))
3407  			goto out;
3408  	}
3409  
3410  	if (slab->inuse < *bulk_cnt) {
3411  		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
3412  			 slab->inuse, *bulk_cnt);
3413  		goto out;
3414  	}
3415  
3416  next_object:
3417  
3418  	if (++cnt > *bulk_cnt)
3419  		goto out_cnt;
3420  
3421  	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
3422  		if (!free_consistency_checks(s, slab, object, addr))
3423  			goto out;
3424  	}
3425  
3426  	if (s->flags & SLAB_STORE_USER)
3427  		set_track_update(s, object, TRACK_FREE, addr, handle);
3428  	trace(s, slab, object, 0);
3429  	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
3430  	init_object(s, object, SLUB_RED_INACTIVE);
3431  
3432  	/* Reached end of constructed freelist yet? */
3433  	if (object != tail) {
3434  		object = get_freepointer(s, object);
3435  		goto next_object;
3436  	}
3437  	checks_ok = true;
3438  
3439  out_cnt:
3440  	if (cnt != *bulk_cnt) {
3441  		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
3442  			 *bulk_cnt, cnt);
3443  		*bulk_cnt = cnt;
3444  	}
3445  
3446  out:
3447  
3448  	if (!checks_ok)
3449  		slab_fix(s, "Object at 0x%p not freed", object);
3450  
3451  	return checks_ok;
3452  }
3453  #endif /* CONFIG_SLUB_DEBUG */
3454  
3455  #if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
3456  static unsigned long count_partial(struct kmem_cache_node *n,
3457  					int (*get_count)(struct slab *))
3458  {
3459  	unsigned long flags;
3460  	unsigned long x = 0;
3461  	struct slab *slab;
3462  
3463  	spin_lock_irqsave(&n->list_lock, flags);
3464  	list_for_each_entry(slab, &n->partial, slab_list)
3465  		x += get_count(slab);
3466  	spin_unlock_irqrestore(&n->list_lock, flags);
3467  	return x;
3468  }
3469  #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
3470  
3471  #ifdef CONFIG_SLUB_DEBUG
3472  #define MAX_PARTIAL_TO_SCAN 10000
3473  
3474  static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
3475  {
3476  	unsigned long flags;
3477  	unsigned long x = 0;
3478  	struct slab *slab;
3479  
3480  	spin_lock_irqsave(&n->list_lock, flags);
3481  	if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
3482  		list_for_each_entry(slab, &n->partial, slab_list)
3483  			x += slab->objects - slab->inuse;
3484  	} else {
3485  		/*
3486  		 * For a long list, approximate the total count of objects in
3487  		 * it to meet the limit on the number of slabs to scan.
3488  		 * Scan from both the list's head and tail for better accuracy.
3489  		 */
3490  		unsigned long scanned = 0;
3491  
3492  		list_for_each_entry(slab, &n->partial, slab_list) {
3493  			x += slab->objects - slab->inuse;
3494  			if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
3495  				break;
3496  		}
3497  		list_for_each_entry_reverse(slab, &n->partial, slab_list) {
3498  			x += slab->objects - slab->inuse;
3499  			if (++scanned == MAX_PARTIAL_TO_SCAN)
3500  				break;
3501  		}
3502  		x = mult_frac(x, n->nr_partial, scanned);
3503  		x = min(x, node_nr_objs(n));
3504  	}
3505  	spin_unlock_irqrestore(&n->list_lock, flags);
3506  	return x;
3507  }
3508  
3509  static noinline void
3510  slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
3511  {
3512  	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
3513  				      DEFAULT_RATELIMIT_BURST);
3514  	int cpu = raw_smp_processor_id();
3515  	int node;
3516  	struct kmem_cache_node *n;
3517  
3518  	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
3519  		return;
3520  
3521  	pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
3522  		cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
3523  	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
3524  		s->name, s->object_size, s->size, oo_order(s->oo),
3525  		oo_order(s->min));
3526  
3527  	if (oo_order(s->min) > get_order(s->object_size))
3528  		pr_warn("  %s debugging increased min order, use slab_debug=O to disable.\n",
3529  			s->name);
3530  
3531  	for_each_kmem_cache_node(s, node, n) {
3532  		unsigned long nr_slabs;
3533  		unsigned long nr_objs;
3534  		unsigned long nr_free;
3535  
3536  		nr_free  = count_partial_free_approx(n);
3537  		nr_slabs = node_nr_slabs(n);
3538  		nr_objs  = node_nr_objs(n);
3539  
3540  		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
3541  			node, nr_slabs, nr_objs, nr_free);
3542  	}
3543  }
3544  #else /* CONFIG_SLUB_DEBUG */
3545  static inline void
3546  slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
3547  #endif
3548  
3549  static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
3550  {
3551  	if (unlikely(slab_test_pfmemalloc(slab)))
3552  		return gfp_pfmemalloc_allowed(gfpflags);
3553  
3554  	return true;
3555  }
3556  
3557  #ifndef CONFIG_SLUB_TINY
3558  static inline bool
3559  __update_cpu_freelist_fast(struct kmem_cache *s,
3560  			   void *freelist_old, void *freelist_new,
3561  			   unsigned long tid)
3562  {
3563  	freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
3564  	freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
3565  
3566  	return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
3567  					     &old.full, new.full);
3568  }
3569  
3570  /*
3571   * Check the slab->freelist and either transfer the freelist to the
3572   * per cpu freelist or deactivate the slab.
3573   *
3574   * The slab is still frozen if the return value is not NULL.
3575   *
3576   * If this function returns NULL then the slab has been unfrozen.
3577   */
3578  static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
3579  {
3580  	struct slab new;
3581  	unsigned long counters;
3582  	void *freelist;
3583  
3584  	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
3585  
3586  	do {
3587  		freelist = slab->freelist;
3588  		counters = slab->counters;
3589  
3590  		new.counters = counters;
3591  
3592  		new.inuse = slab->objects;
3593  		new.frozen = freelist != NULL;
3594  
3595  	} while (!__slab_update_freelist(s, slab,
3596  		freelist, counters,
3597  		NULL, new.counters,
3598  		"get_freelist"));
3599  
3600  	return freelist;
3601  }
3602  
3603  /*
3604   * Freeze the partial slab and return the pointer to the freelist.
3605   */
3606  static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
3607  {
3608  	struct slab new;
3609  	unsigned long counters;
3610  	void *freelist;
3611  
3612  	do {
3613  		freelist = slab->freelist;
3614  		counters = slab->counters;
3615  
3616  		new.counters = counters;
3617  		VM_BUG_ON(new.frozen);
3618  
3619  		new.inuse = slab->objects;
3620  		new.frozen = 1;
3621  
3622  	} while (!slab_update_freelist(s, slab,
3623  		freelist, counters,
3624  		NULL, new.counters,
3625  		"freeze_slab"));
3626  
3627  	return freelist;
3628  }
3629  
3630  /*
3631   * Slow path. The lockless freelist is empty or we need to perform
3632   * debugging duties.
3633   *
3634   * Processing is still very fast if new objects have been freed to the
3635   * regular freelist. In that case we simply take over the regular freelist
3636   * as the lockless freelist and zap the regular freelist.
3637   *
3638   * If that is not working then we fall back to the partial lists. We take the
3639   * first element of the freelist as the object to allocate now and move the
3640   * rest of the freelist to the lockless freelist.
3641   *
3642   * And if we were unable to get a new slab from the partial slab lists then
3643   * we need to allocate a new slab. This is the slowest path since it involves
3644   * a call to the page allocator and the setup of a new slab.
3645   *
3646   * Version of __slab_alloc to use when we know that preemption is
3647   * already disabled (which is the case for bulk allocation).
3648   */
3649  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
3650  			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
3651  {
3652  	void *freelist;
3653  	struct slab *slab;
3654  	unsigned long flags;
3655  	struct partial_context pc;
3656  	bool try_thisnode = true;
3657  
3658  	stat(s, ALLOC_SLOWPATH);
3659  
3660  reread_slab:
3661  
3662  	slab = READ_ONCE(c->slab);
3663  	if (!slab) {
3664  		/*
3665  		 * if the node is not online or has no normal memory, just
3666  		 * ignore the node constraint
3667  		 */
3668  		if (unlikely(node != NUMA_NO_NODE &&
3669  			     !node_isset(node, slab_nodes)))
3670  			node = NUMA_NO_NODE;
3671  		goto new_slab;
3672  	}
3673  
3674  	if (unlikely(!node_match(slab, node))) {
3675  		/*
3676  		 * same as above but node_match() being false already
3677  		 * implies node != NUMA_NO_NODE
3678  		 */
3679  		if (!node_isset(node, slab_nodes)) {
3680  			node = NUMA_NO_NODE;
3681  		} else {
3682  			stat(s, ALLOC_NODE_MISMATCH);
3683  			goto deactivate_slab;
3684  		}
3685  	}
3686  
3687  	/*
3688  	 * By rights, we should be searching for a slab page that was
3689  	 * PFMEMALLOC but right now, we are losing the pfmemalloc
3690  	 * information when the page leaves the per-cpu allocator
3691  	 */
3692  	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
3693  		goto deactivate_slab;
3694  
3695  	/* must check again c->slab in case we got preempted and it changed */
3696  	local_lock_irqsave(&s->cpu_slab->lock, flags);
3697  	if (unlikely(slab != c->slab)) {
3698  		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3699  		goto reread_slab;
3700  	}
3701  	freelist = c->freelist;
3702  	if (freelist)
3703  		goto load_freelist;
3704  
3705  	freelist = get_freelist(s, slab);
3706  
3707  	if (!freelist) {
3708  		c->slab = NULL;
3709  		c->tid = next_tid(c->tid);
3710  		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3711  		stat(s, DEACTIVATE_BYPASS);
3712  		goto new_slab;
3713  	}
3714  
3715  	stat(s, ALLOC_REFILL);
3716  
3717  load_freelist:
3718  
3719  	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
3720  
3721  	/*
3722  	 * freelist is pointing to the list of objects to be used.
3723  	 * slab is pointing to the slab from which the objects are obtained.
3724  	 * That slab must be frozen for per cpu allocations to work.
3725  	 */
3726  	VM_BUG_ON(!c->slab->frozen);
3727  	c->freelist = get_freepointer(s, freelist);
3728  	c->tid = next_tid(c->tid);
3729  	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3730  	return freelist;
3731  
3732  deactivate_slab:
3733  
3734  	local_lock_irqsave(&s->cpu_slab->lock, flags);
3735  	if (slab != c->slab) {
3736  		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3737  		goto reread_slab;
3738  	}
3739  	freelist = c->freelist;
3740  	c->slab = NULL;
3741  	c->freelist = NULL;
3742  	c->tid = next_tid(c->tid);
3743  	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3744  	deactivate_slab(s, slab, freelist);
3745  
3746  new_slab:
3747  
3748  #ifdef CONFIG_SLUB_CPU_PARTIAL
3749  	while (slub_percpu_partial(c)) {
3750  		local_lock_irqsave(&s->cpu_slab->lock, flags);
3751  		if (unlikely(c->slab)) {
3752  			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3753  			goto reread_slab;
3754  		}
3755  		if (unlikely(!slub_percpu_partial(c))) {
3756  			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3757  			/* we were preempted and partial list got empty */
3758  			goto new_objects;
3759  		}
3760  
3761  		slab = slub_percpu_partial(c);
3762  		slub_set_percpu_partial(c, slab);
3763  
3764  		if (likely(node_match(slab, node) &&
3765  			   pfmemalloc_match(slab, gfpflags))) {
3766  			c->slab = slab;
3767  			freelist = get_freelist(s, slab);
3768  			VM_BUG_ON(!freelist);
3769  			stat(s, CPU_PARTIAL_ALLOC);
3770  			goto load_freelist;
3771  		}
3772  
3773  		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3774  
3775  		slab->next = NULL;
3776  		__put_partials(s, slab);
3777  	}
3778  #endif
3779  
3780  new_objects:
3781  
3782  	pc.flags = gfpflags;
3783  	/*
3784  	 * When a preferred node is indicated but no __GFP_THISNODE
3785  	 *
3786  	 * 1) try to get a partial slab from target node only by having
3787  	 *    __GFP_THISNODE in pc.flags for get_partial()
3788  	 * 2) if 1) failed, try to allocate a new slab from target node with
3789  	 *    GPF_NOWAIT | __GFP_THISNODE opportunistically
3790  	 * 3) if 2) failed, retry with original gfpflags which will allow
3791  	 *    get_partial() try partial lists of other nodes before potentially
3792  	 *    allocating new page from other nodes
3793  	 */
3794  	if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
3795  		     && try_thisnode))
3796  		pc.flags = GFP_NOWAIT | __GFP_THISNODE;
3797  
3798  	pc.orig_size = orig_size;
3799  	slab = get_partial(s, node, &pc);
3800  	if (slab) {
3801  		if (kmem_cache_debug(s)) {
3802  			freelist = pc.object;
3803  			/*
3804  			 * For debug caches here we had to go through
3805  			 * alloc_single_from_partial() so just store the
3806  			 * tracking info and return the object.
3807  			 */
3808  			if (s->flags & SLAB_STORE_USER)
3809  				set_track(s, freelist, TRACK_ALLOC, addr);
3810  
3811  			return freelist;
3812  		}
3813  
3814  		freelist = freeze_slab(s, slab);
3815  		goto retry_load_slab;
3816  	}
3817  
3818  	slub_put_cpu_ptr(s->cpu_slab);
3819  	slab = new_slab(s, pc.flags, node);
3820  	c = slub_get_cpu_ptr(s->cpu_slab);
3821  
3822  	if (unlikely(!slab)) {
3823  		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
3824  		    && try_thisnode) {
3825  			try_thisnode = false;
3826  			goto new_objects;
3827  		}
3828  		slab_out_of_memory(s, gfpflags, node);
3829  		return NULL;
3830  	}
3831  
3832  	stat(s, ALLOC_SLAB);
3833  
3834  	if (kmem_cache_debug(s)) {
3835  		freelist = alloc_single_from_new_slab(s, slab, orig_size);
3836  
3837  		if (unlikely(!freelist))
3838  			goto new_objects;
3839  
3840  		if (s->flags & SLAB_STORE_USER)
3841  			set_track(s, freelist, TRACK_ALLOC, addr);
3842  
3843  		return freelist;
3844  	}
3845  
3846  	/*
3847  	 * No other reference to the slab yet so we can
3848  	 * muck around with it freely without cmpxchg
3849  	 */
3850  	freelist = slab->freelist;
3851  	slab->freelist = NULL;
3852  	slab->inuse = slab->objects;
3853  	slab->frozen = 1;
3854  
3855  	inc_slabs_node(s, slab_nid(slab), slab->objects);
3856  
3857  	if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
3858  		/*
3859  		 * For !pfmemalloc_match() case we don't load freelist so that
3860  		 * we don't make further mismatched allocations easier.
3861  		 */
3862  		deactivate_slab(s, slab, get_freepointer(s, freelist));
3863  		return freelist;
3864  	}
3865  
3866  retry_load_slab:
3867  
3868  	local_lock_irqsave(&s->cpu_slab->lock, flags);
3869  	if (unlikely(c->slab)) {
3870  		void *flush_freelist = c->freelist;
3871  		struct slab *flush_slab = c->slab;
3872  
3873  		c->slab = NULL;
3874  		c->freelist = NULL;
3875  		c->tid = next_tid(c->tid);
3876  
3877  		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3878  
3879  		deactivate_slab(s, flush_slab, flush_freelist);
3880  
3881  		stat(s, CPUSLAB_FLUSH);
3882  
3883  		goto retry_load_slab;
3884  	}
3885  	c->slab = slab;
3886  
3887  	goto load_freelist;
3888  }
3889  
3890  /*
3891   * A wrapper for ___slab_alloc() for contexts where preemption is not yet
3892   * disabled. Compensates for possible cpu changes by refetching the per cpu area
3893   * pointer.
3894   */
3895  static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
3896  			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
3897  {
3898  	void *p;
3899  
3900  #ifdef CONFIG_PREEMPT_COUNT
3901  	/*
3902  	 * We may have been preempted and rescheduled on a different
3903  	 * cpu before disabling preemption. Need to reload cpu area
3904  	 * pointer.
3905  	 */
3906  	c = slub_get_cpu_ptr(s->cpu_slab);
3907  #endif
3908  
3909  	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
3910  #ifdef CONFIG_PREEMPT_COUNT
3911  	slub_put_cpu_ptr(s->cpu_slab);
3912  #endif
3913  	return p;
3914  }
3915  
3916  static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
3917  		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
3918  {
3919  	struct kmem_cache_cpu *c;
3920  	struct slab *slab;
3921  	unsigned long tid;
3922  	void *object;
3923  
3924  redo:
3925  	/*
3926  	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
3927  	 * enabled. We may switch back and forth between cpus while
3928  	 * reading from one cpu area. That does not matter as long
3929  	 * as we end up on the original cpu again when doing the cmpxchg.
3930  	 *
3931  	 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
3932  	 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
3933  	 * the tid. If we are preempted and switched to another cpu between the
3934  	 * two reads, it's OK as the two are still associated with the same cpu
3935  	 * and cmpxchg later will validate the cpu.
3936  	 */
3937  	c = raw_cpu_ptr(s->cpu_slab);
3938  	tid = READ_ONCE(c->tid);
3939  
3940  	/*
3941  	 * Irqless object alloc/free algorithm used here depends on sequence
3942  	 * of fetching cpu_slab's data. tid should be fetched before anything
3943  	 * on c to guarantee that object and slab associated with previous tid
3944  	 * won't be used with current tid. If we fetch tid first, object and
3945  	 * slab could be one associated with next tid and our alloc/free
3946  	 * request will be failed. In this case, we will retry. So, no problem.
3947  	 */
3948  	barrier();
3949  
3950  	/*
3951  	 * The transaction ids are globally unique per cpu and per operation on
3952  	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
3953  	 * occurs on the right processor and that there was no operation on the
3954  	 * linked list in between.
3955  	 */
3956  
3957  	object = c->freelist;
3958  	slab = c->slab;
3959  
3960  	if (!USE_LOCKLESS_FAST_PATH() ||
3961  	    unlikely(!object || !slab || !node_match(slab, node))) {
3962  		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
3963  	} else {
3964  		void *next_object = get_freepointer_safe(s, object);
3965  
3966  		/*
3967  		 * The cmpxchg will only match if there was no additional
3968  		 * operation and if we are on the right processor.
3969  		 *
3970  		 * The cmpxchg does the following atomically (without lock
3971  		 * semantics!)
3972  		 * 1. Relocate first pointer to the current per cpu area.
3973  		 * 2. Verify that tid and freelist have not been changed
3974  		 * 3. If they were not changed replace tid and freelist
3975  		 *
3976  		 * Since this is without lock semantics the protection is only
3977  		 * against code executing on this cpu *not* from access by
3978  		 * other cpus.
3979  		 */
3980  		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
3981  			note_cmpxchg_failure("slab_alloc", s, tid);
3982  			goto redo;
3983  		}
3984  		prefetch_freepointer(s, next_object);
3985  		stat(s, ALLOC_FASTPATH);
3986  	}
3987  
3988  	return object;
3989  }
3990  #else /* CONFIG_SLUB_TINY */
3991  static void *__slab_alloc_node(struct kmem_cache *s,
3992  		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
3993  {
3994  	struct partial_context pc;
3995  	struct slab *slab;
3996  	void *object;
3997  
3998  	pc.flags = gfpflags;
3999  	pc.orig_size = orig_size;
4000  	slab = get_partial(s, node, &pc);
4001  
4002  	if (slab)
4003  		return pc.object;
4004  
4005  	slab = new_slab(s, gfpflags, node);
4006  	if (unlikely(!slab)) {
4007  		slab_out_of_memory(s, gfpflags, node);
4008  		return NULL;
4009  	}
4010  
4011  	object = alloc_single_from_new_slab(s, slab, orig_size);
4012  
4013  	return object;
4014  }
4015  #endif /* CONFIG_SLUB_TINY */
4016  
4017  /*
4018   * If the object has been wiped upon free, make sure it's fully initialized by
4019   * zeroing out freelist pointer.
4020   *
4021   * Note that we also wipe custom freelist pointers.
4022   */
4023  static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
4024  						   void *obj)
4025  {
4026  	if (unlikely(slab_want_init_on_free(s)) && obj &&
4027  	    !freeptr_outside_object(s))
4028  		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
4029  			0, sizeof(void *));
4030  }
4031  
4032  static __fastpath_inline
4033  struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
4034  {
4035  	flags &= gfp_allowed_mask;
4036  
4037  	might_alloc(flags);
4038  
4039  	if (unlikely(should_failslab(s, flags)))
4040  		return NULL;
4041  
4042  	return s;
4043  }
4044  
4045  static __fastpath_inline
4046  bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
4047  			  gfp_t flags, size_t size, void **p, bool init,
4048  			  unsigned int orig_size)
4049  {
4050  	unsigned int zero_size = s->object_size;
4051  	bool kasan_init = init;
4052  	size_t i;
4053  	gfp_t init_flags = flags & gfp_allowed_mask;
4054  
4055  	/*
4056  	 * For kmalloc object, the allocated memory size(object_size) is likely
4057  	 * larger than the requested size(orig_size). If redzone check is
4058  	 * enabled for the extra space, don't zero it, as it will be redzoned
4059  	 * soon. The redzone operation for this extra space could be seen as a
4060  	 * replacement of current poisoning under certain debug option, and
4061  	 * won't break other sanity checks.
4062  	 */
4063  	if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
4064  	    (s->flags & SLAB_KMALLOC))
4065  		zero_size = orig_size;
4066  
4067  	/*
4068  	 * When slab_debug is enabled, avoid memory initialization integrated
4069  	 * into KASAN and instead zero out the memory via the memset below with
4070  	 * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
4071  	 * cause false-positive reports. This does not lead to a performance
4072  	 * penalty on production builds, as slab_debug is not intended to be
4073  	 * enabled there.
4074  	 */
4075  	if (__slub_debug_enabled())
4076  		kasan_init = false;
4077  
4078  	/*
4079  	 * As memory initialization might be integrated into KASAN,
4080  	 * kasan_slab_alloc and initialization memset must be
4081  	 * kept together to avoid discrepancies in behavior.
4082  	 *
4083  	 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
4084  	 */
4085  	for (i = 0; i < size; i++) {
4086  		p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
4087  		if (p[i] && init && (!kasan_init ||
4088  				     !kasan_has_integrated_init()))
4089  			memset(p[i], 0, zero_size);
4090  		kmemleak_alloc_recursive(p[i], s->object_size, 1,
4091  					 s->flags, init_flags);
4092  		kmsan_slab_alloc(s, p[i], init_flags);
4093  		alloc_tagging_slab_alloc_hook(s, p[i], flags);
4094  	}
4095  
4096  	return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
4097  }
4098  
4099  /*
4100   * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
4101   * have the fastpath folded into their functions. So no function call
4102   * overhead for requests that can be satisfied on the fastpath.
4103   *
4104   * The fastpath works by first checking if the lockless freelist can be used.
4105   * If not then __slab_alloc is called for slow processing.
4106   *
4107   * Otherwise we can simply pick the next object from the lockless free list.
4108   */
4109  static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
4110  		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
4111  {
4112  	void *object;
4113  	bool init = false;
4114  
4115  	s = slab_pre_alloc_hook(s, gfpflags);
4116  	if (unlikely(!s))
4117  		return NULL;
4118  
4119  	object = kfence_alloc(s, orig_size, gfpflags);
4120  	if (unlikely(object))
4121  		goto out;
4122  
4123  	object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
4124  
4125  	maybe_wipe_obj_freeptr(s, object);
4126  	init = slab_want_init_on_alloc(gfpflags, s);
4127  
4128  out:
4129  	/*
4130  	 * When init equals 'true', like for kzalloc() family, only
4131  	 * @orig_size bytes might be zeroed instead of s->object_size
4132  	 * In case this fails due to memcg_slab_post_alloc_hook(),
4133  	 * object is set to NULL
4134  	 */
4135  	slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
4136  
4137  	return object;
4138  }
4139  
4140  void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
4141  {
4142  	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
4143  				    s->object_size);
4144  
4145  	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
4146  
4147  	return ret;
4148  }
4149  EXPORT_SYMBOL(kmem_cache_alloc_noprof);
4150  
4151  void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
4152  			   gfp_t gfpflags)
4153  {
4154  	void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
4155  				    s->object_size);
4156  
4157  	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
4158  
4159  	return ret;
4160  }
4161  EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
4162  
4163  bool kmem_cache_charge(void *objp, gfp_t gfpflags)
4164  {
4165  	if (!memcg_kmem_online())
4166  		return true;
4167  
4168  	return memcg_slab_post_charge(objp, gfpflags);
4169  }
4170  EXPORT_SYMBOL(kmem_cache_charge);
4171  
4172  /**
4173   * kmem_cache_alloc_node - Allocate an object on the specified node
4174   * @s: The cache to allocate from.
4175   * @gfpflags: See kmalloc().
4176   * @node: node number of the target node.
4177   *
4178   * Identical to kmem_cache_alloc but it will allocate memory on the given
4179   * node, which can improve the performance for cpu bound structures.
4180   *
4181   * Fallback to other node is possible if __GFP_THISNODE is not set.
4182   *
4183   * Return: pointer to the new object or %NULL in case of error
4184   */
4185  void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
4186  {
4187  	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
4188  
4189  	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
4190  
4191  	return ret;
4192  }
4193  EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
4194  
4195  /*
4196   * To avoid unnecessary overhead, we pass through large allocation requests
4197   * directly to the page allocator. We use __GFP_COMP, because we will need to
4198   * know the allocation order to free the pages properly in kfree.
4199   */
4200  static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
4201  {
4202  	struct folio *folio;
4203  	void *ptr = NULL;
4204  	unsigned int order = get_order(size);
4205  
4206  	if (unlikely(flags & GFP_SLAB_BUG_MASK))
4207  		flags = kmalloc_fix_flags(flags);
4208  
4209  	flags |= __GFP_COMP;
4210  	folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
4211  	if (folio) {
4212  		ptr = folio_address(folio);
4213  		lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
4214  				      PAGE_SIZE << order);
4215  	}
4216  
4217  	ptr = kasan_kmalloc_large(ptr, size, flags);
4218  	/* As ptr might get tagged, call kmemleak hook after KASAN. */
4219  	kmemleak_alloc(ptr, size, 1, flags);
4220  	kmsan_kmalloc_large(ptr, size, flags);
4221  
4222  	return ptr;
4223  }
4224  
4225  void *__kmalloc_large_noprof(size_t size, gfp_t flags)
4226  {
4227  	void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
4228  
4229  	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
4230  		      flags, NUMA_NO_NODE);
4231  	return ret;
4232  }
4233  EXPORT_SYMBOL(__kmalloc_large_noprof);
4234  
4235  void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
4236  {
4237  	void *ret = ___kmalloc_large_node(size, flags, node);
4238  
4239  	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
4240  		      flags, node);
4241  	return ret;
4242  }
4243  EXPORT_SYMBOL(__kmalloc_large_node_noprof);
4244  
4245  static __always_inline
4246  void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
4247  			unsigned long caller)
4248  {
4249  	struct kmem_cache *s;
4250  	void *ret;
4251  
4252  	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
4253  		ret = __kmalloc_large_node_noprof(size, flags, node);
4254  		trace_kmalloc(caller, ret, size,
4255  			      PAGE_SIZE << get_order(size), flags, node);
4256  		return ret;
4257  	}
4258  
4259  	if (unlikely(!size))
4260  		return ZERO_SIZE_PTR;
4261  
4262  	s = kmalloc_slab(size, b, flags, caller);
4263  
4264  	ret = slab_alloc_node(s, NULL, flags, node, caller, size);
4265  	ret = kasan_kmalloc(s, ret, size, flags);
4266  	trace_kmalloc(caller, ret, size, s->size, flags, node);
4267  	return ret;
4268  }
4269  void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
4270  {
4271  	return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
4272  }
4273  EXPORT_SYMBOL(__kmalloc_node_noprof);
4274  
4275  void *__kmalloc_noprof(size_t size, gfp_t flags)
4276  {
4277  	return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
4278  }
4279  EXPORT_SYMBOL(__kmalloc_noprof);
4280  
4281  void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
4282  					 int node, unsigned long caller)
4283  {
4284  	return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
4285  
4286  }
4287  EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
4288  
4289  void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
4290  {
4291  	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
4292  					    _RET_IP_, size);
4293  
4294  	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
4295  
4296  	ret = kasan_kmalloc(s, ret, size, gfpflags);
4297  	return ret;
4298  }
4299  EXPORT_SYMBOL(__kmalloc_cache_noprof);
4300  
4301  void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
4302  				  int node, size_t size)
4303  {
4304  	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
4305  
4306  	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
4307  
4308  	ret = kasan_kmalloc(s, ret, size, gfpflags);
4309  	return ret;
4310  }
4311  EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
4312  
4313  static noinline void free_to_partial_list(
4314  	struct kmem_cache *s, struct slab *slab,
4315  	void *head, void *tail, int bulk_cnt,
4316  	unsigned long addr)
4317  {
4318  	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
4319  	struct slab *slab_free = NULL;
4320  	int cnt = bulk_cnt;
4321  	unsigned long flags;
4322  	depot_stack_handle_t handle = 0;
4323  
4324  	if (s->flags & SLAB_STORE_USER)
4325  		handle = set_track_prepare();
4326  
4327  	spin_lock_irqsave(&n->list_lock, flags);
4328  
4329  	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
4330  		void *prior = slab->freelist;
4331  
4332  		/* Perform the actual freeing while we still hold the locks */
4333  		slab->inuse -= cnt;
4334  		set_freepointer(s, tail, prior);
4335  		slab->freelist = head;
4336  
4337  		/*
4338  		 * If the slab is empty, and node's partial list is full,
4339  		 * it should be discarded anyway no matter it's on full or
4340  		 * partial list.
4341  		 */
4342  		if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
4343  			slab_free = slab;
4344  
4345  		if (!prior) {
4346  			/* was on full list */
4347  			remove_full(s, n, slab);
4348  			if (!slab_free) {
4349  				add_partial(n, slab, DEACTIVATE_TO_TAIL);
4350  				stat(s, FREE_ADD_PARTIAL);
4351  			}
4352  		} else if (slab_free) {
4353  			remove_partial(n, slab);
4354  			stat(s, FREE_REMOVE_PARTIAL);
4355  		}
4356  	}
4357  
4358  	if (slab_free) {
4359  		/*
4360  		 * Update the counters while still holding n->list_lock to
4361  		 * prevent spurious validation warnings
4362  		 */
4363  		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
4364  	}
4365  
4366  	spin_unlock_irqrestore(&n->list_lock, flags);
4367  
4368  	if (slab_free) {
4369  		stat(s, FREE_SLAB);
4370  		free_slab(s, slab_free);
4371  	}
4372  }
4373  
4374  /*
4375   * Slow path handling. This may still be called frequently since objects
4376   * have a longer lifetime than the cpu slabs in most processing loads.
4377   *
4378   * So we still attempt to reduce cache line usage. Just take the slab
4379   * lock and free the item. If there is no additional partial slab
4380   * handling required then we can return immediately.
4381   */
4382  static void __slab_free(struct kmem_cache *s, struct slab *slab,
4383  			void *head, void *tail, int cnt,
4384  			unsigned long addr)
4385  
4386  {
4387  	void *prior;
4388  	int was_frozen;
4389  	struct slab new;
4390  	unsigned long counters;
4391  	struct kmem_cache_node *n = NULL;
4392  	unsigned long flags;
4393  	bool on_node_partial;
4394  
4395  	stat(s, FREE_SLOWPATH);
4396  
4397  	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
4398  		free_to_partial_list(s, slab, head, tail, cnt, addr);
4399  		return;
4400  	}
4401  
4402  	do {
4403  		if (unlikely(n)) {
4404  			spin_unlock_irqrestore(&n->list_lock, flags);
4405  			n = NULL;
4406  		}
4407  		prior = slab->freelist;
4408  		counters = slab->counters;
4409  		set_freepointer(s, tail, prior);
4410  		new.counters = counters;
4411  		was_frozen = new.frozen;
4412  		new.inuse -= cnt;
4413  		if ((!new.inuse || !prior) && !was_frozen) {
4414  			/* Needs to be taken off a list */
4415  			if (!kmem_cache_has_cpu_partial(s) || prior) {
4416  
4417  				n = get_node(s, slab_nid(slab));
4418  				/*
4419  				 * Speculatively acquire the list_lock.
4420  				 * If the cmpxchg does not succeed then we may
4421  				 * drop the list_lock without any processing.
4422  				 *
4423  				 * Otherwise the list_lock will synchronize with
4424  				 * other processors updating the list of slabs.
4425  				 */
4426  				spin_lock_irqsave(&n->list_lock, flags);
4427  
4428  				on_node_partial = slab_test_node_partial(slab);
4429  			}
4430  		}
4431  
4432  	} while (!slab_update_freelist(s, slab,
4433  		prior, counters,
4434  		head, new.counters,
4435  		"__slab_free"));
4436  
4437  	if (likely(!n)) {
4438  
4439  		if (likely(was_frozen)) {
4440  			/*
4441  			 * The list lock was not taken therefore no list
4442  			 * activity can be necessary.
4443  			 */
4444  			stat(s, FREE_FROZEN);
4445  		} else if (kmem_cache_has_cpu_partial(s) && !prior) {
4446  			/*
4447  			 * If we started with a full slab then put it onto the
4448  			 * per cpu partial list.
4449  			 */
4450  			put_cpu_partial(s, slab, 1);
4451  			stat(s, CPU_PARTIAL_FREE);
4452  		}
4453  
4454  		return;
4455  	}
4456  
4457  	/*
4458  	 * This slab was partially empty but not on the per-node partial list,
4459  	 * in which case we shouldn't manipulate its list, just return.
4460  	 */
4461  	if (prior && !on_node_partial) {
4462  		spin_unlock_irqrestore(&n->list_lock, flags);
4463  		return;
4464  	}
4465  
4466  	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
4467  		goto slab_empty;
4468  
4469  	/*
4470  	 * Objects left in the slab. If it was not on the partial list before
4471  	 * then add it.
4472  	 */
4473  	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
4474  		add_partial(n, slab, DEACTIVATE_TO_TAIL);
4475  		stat(s, FREE_ADD_PARTIAL);
4476  	}
4477  	spin_unlock_irqrestore(&n->list_lock, flags);
4478  	return;
4479  
4480  slab_empty:
4481  	if (prior) {
4482  		/*
4483  		 * Slab on the partial list.
4484  		 */
4485  		remove_partial(n, slab);
4486  		stat(s, FREE_REMOVE_PARTIAL);
4487  	}
4488  
4489  	spin_unlock_irqrestore(&n->list_lock, flags);
4490  	stat(s, FREE_SLAB);
4491  	discard_slab(s, slab);
4492  }
4493  
4494  #ifndef CONFIG_SLUB_TINY
4495  /*
4496   * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
4497   * can perform fastpath freeing without additional function calls.
4498   *
4499   * The fastpath is only possible if we are freeing to the current cpu slab
4500   * of this processor. This typically the case if we have just allocated
4501   * the item before.
4502   *
4503   * If fastpath is not possible then fall back to __slab_free where we deal
4504   * with all sorts of special processing.
4505   *
4506   * Bulk free of a freelist with several objects (all pointing to the
4507   * same slab) possible by specifying head and tail ptr, plus objects
4508   * count (cnt). Bulk free indicated by tail pointer being set.
4509   */
4510  static __always_inline void do_slab_free(struct kmem_cache *s,
4511  				struct slab *slab, void *head, void *tail,
4512  				int cnt, unsigned long addr)
4513  {
4514  	struct kmem_cache_cpu *c;
4515  	unsigned long tid;
4516  	void **freelist;
4517  
4518  redo:
4519  	/*
4520  	 * Determine the currently cpus per cpu slab.
4521  	 * The cpu may change afterward. However that does not matter since
4522  	 * data is retrieved via this pointer. If we are on the same cpu
4523  	 * during the cmpxchg then the free will succeed.
4524  	 */
4525  	c = raw_cpu_ptr(s->cpu_slab);
4526  	tid = READ_ONCE(c->tid);
4527  
4528  	/* Same with comment on barrier() in __slab_alloc_node() */
4529  	barrier();
4530  
4531  	if (unlikely(slab != c->slab)) {
4532  		__slab_free(s, slab, head, tail, cnt, addr);
4533  		return;
4534  	}
4535  
4536  	if (USE_LOCKLESS_FAST_PATH()) {
4537  		freelist = READ_ONCE(c->freelist);
4538  
4539  		set_freepointer(s, tail, freelist);
4540  
4541  		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
4542  			note_cmpxchg_failure("slab_free", s, tid);
4543  			goto redo;
4544  		}
4545  	} else {
4546  		/* Update the free list under the local lock */
4547  		local_lock(&s->cpu_slab->lock);
4548  		c = this_cpu_ptr(s->cpu_slab);
4549  		if (unlikely(slab != c->slab)) {
4550  			local_unlock(&s->cpu_slab->lock);
4551  			goto redo;
4552  		}
4553  		tid = c->tid;
4554  		freelist = c->freelist;
4555  
4556  		set_freepointer(s, tail, freelist);
4557  		c->freelist = head;
4558  		c->tid = next_tid(tid);
4559  
4560  		local_unlock(&s->cpu_slab->lock);
4561  	}
4562  	stat_add(s, FREE_FASTPATH, cnt);
4563  }
4564  #else /* CONFIG_SLUB_TINY */
4565  static void do_slab_free(struct kmem_cache *s,
4566  				struct slab *slab, void *head, void *tail,
4567  				int cnt, unsigned long addr)
4568  {
4569  	__slab_free(s, slab, head, tail, cnt, addr);
4570  }
4571  #endif /* CONFIG_SLUB_TINY */
4572  
4573  static __fastpath_inline
4574  void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
4575  	       unsigned long addr)
4576  {
4577  	memcg_slab_free_hook(s, slab, &object, 1);
4578  	alloc_tagging_slab_free_hook(s, slab, &object, 1);
4579  
4580  	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
4581  		do_slab_free(s, slab, object, object, 1, addr);
4582  }
4583  
4584  #ifdef CONFIG_MEMCG
4585  /* Do not inline the rare memcg charging failed path into the allocation path */
4586  static noinline
4587  void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
4588  {
4589  	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
4590  		do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
4591  }
4592  #endif
4593  
4594  static __fastpath_inline
4595  void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
4596  		    void *tail, void **p, int cnt, unsigned long addr)
4597  {
4598  	memcg_slab_free_hook(s, slab, p, cnt);
4599  	alloc_tagging_slab_free_hook(s, slab, p, cnt);
4600  	/*
4601  	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
4602  	 * to remove objects, whose reuse must be delayed.
4603  	 */
4604  	if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
4605  		do_slab_free(s, slab, head, tail, cnt, addr);
4606  }
4607  
4608  #ifdef CONFIG_SLUB_RCU_DEBUG
4609  static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
4610  {
4611  	struct rcu_delayed_free *delayed_free =
4612  			container_of(rcu_head, struct rcu_delayed_free, head);
4613  	void *object = delayed_free->object;
4614  	struct slab *slab = virt_to_slab(object);
4615  	struct kmem_cache *s;
4616  
4617  	kfree(delayed_free);
4618  
4619  	if (WARN_ON(is_kfence_address(object)))
4620  		return;
4621  
4622  	/* find the object and the cache again */
4623  	if (WARN_ON(!slab))
4624  		return;
4625  	s = slab->slab_cache;
4626  	if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
4627  		return;
4628  
4629  	/* resume freeing */
4630  	if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
4631  		do_slab_free(s, slab, object, object, 1, _THIS_IP_);
4632  }
4633  #endif /* CONFIG_SLUB_RCU_DEBUG */
4634  
4635  #ifdef CONFIG_KASAN_GENERIC
4636  void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
4637  {
4638  	do_slab_free(cache, virt_to_slab(x), x, x, 1, addr);
4639  }
4640  #endif
4641  
4642  static inline struct kmem_cache *virt_to_cache(const void *obj)
4643  {
4644  	struct slab *slab;
4645  
4646  	slab = virt_to_slab(obj);
4647  	if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
4648  		return NULL;
4649  	return slab->slab_cache;
4650  }
4651  
4652  static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
4653  {
4654  	struct kmem_cache *cachep;
4655  
4656  	if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
4657  	    !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
4658  		return s;
4659  
4660  	cachep = virt_to_cache(x);
4661  	if (WARN(cachep && cachep != s,
4662  		 "%s: Wrong slab cache. %s but object is from %s\n",
4663  		 __func__, s->name, cachep->name))
4664  		print_tracking(cachep, x);
4665  	return cachep;
4666  }
4667  
4668  /**
4669   * kmem_cache_free - Deallocate an object
4670   * @s: The cache the allocation was from.
4671   * @x: The previously allocated object.
4672   *
4673   * Free an object which was previously allocated from this
4674   * cache.
4675   */
4676  void kmem_cache_free(struct kmem_cache *s, void *x)
4677  {
4678  	s = cache_from_obj(s, x);
4679  	if (!s)
4680  		return;
4681  	trace_kmem_cache_free(_RET_IP_, x, s);
4682  	slab_free(s, virt_to_slab(x), x, _RET_IP_);
4683  }
4684  EXPORT_SYMBOL(kmem_cache_free);
4685  
4686  static void free_large_kmalloc(struct folio *folio, void *object)
4687  {
4688  	unsigned int order = folio_order(folio);
4689  
4690  	if (WARN_ON_ONCE(order == 0))
4691  		pr_warn_once("object pointer: 0x%p\n", object);
4692  
4693  	kmemleak_free(object);
4694  	kasan_kfree_large(object);
4695  	kmsan_kfree_large(object);
4696  
4697  	lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
4698  			      -(PAGE_SIZE << order));
4699  	folio_put(folio);
4700  }
4701  
4702  /**
4703   * kfree - free previously allocated memory
4704   * @object: pointer returned by kmalloc() or kmem_cache_alloc()
4705   *
4706   * If @object is NULL, no operation is performed.
4707   */
4708  void kfree(const void *object)
4709  {
4710  	struct folio *folio;
4711  	struct slab *slab;
4712  	struct kmem_cache *s;
4713  	void *x = (void *)object;
4714  
4715  	trace_kfree(_RET_IP_, object);
4716  
4717  	if (unlikely(ZERO_OR_NULL_PTR(object)))
4718  		return;
4719  
4720  	folio = virt_to_folio(object);
4721  	if (unlikely(!folio_test_slab(folio))) {
4722  		free_large_kmalloc(folio, (void *)object);
4723  		return;
4724  	}
4725  
4726  	slab = folio_slab(folio);
4727  	s = slab->slab_cache;
4728  	slab_free(s, slab, x, _RET_IP_);
4729  }
4730  EXPORT_SYMBOL(kfree);
4731  
4732  struct detached_freelist {
4733  	struct slab *slab;
4734  	void *tail;
4735  	void *freelist;
4736  	int cnt;
4737  	struct kmem_cache *s;
4738  };
4739  
4740  /*
4741   * This function progressively scans the array with free objects (with
4742   * a limited look ahead) and extract objects belonging to the same
4743   * slab.  It builds a detached freelist directly within the given
4744   * slab/objects.  This can happen without any need for
4745   * synchronization, because the objects are owned by running process.
4746   * The freelist is build up as a single linked list in the objects.
4747   * The idea is, that this detached freelist can then be bulk
4748   * transferred to the real freelist(s), but only requiring a single
4749   * synchronization primitive.  Look ahead in the array is limited due
4750   * to performance reasons.
4751   */
4752  static inline
4753  int build_detached_freelist(struct kmem_cache *s, size_t size,
4754  			    void **p, struct detached_freelist *df)
4755  {
4756  	int lookahead = 3;
4757  	void *object;
4758  	struct folio *folio;
4759  	size_t same;
4760  
4761  	object = p[--size];
4762  	folio = virt_to_folio(object);
4763  	if (!s) {
4764  		/* Handle kalloc'ed objects */
4765  		if (unlikely(!folio_test_slab(folio))) {
4766  			free_large_kmalloc(folio, object);
4767  			df->slab = NULL;
4768  			return size;
4769  		}
4770  		/* Derive kmem_cache from object */
4771  		df->slab = folio_slab(folio);
4772  		df->s = df->slab->slab_cache;
4773  	} else {
4774  		df->slab = folio_slab(folio);
4775  		df->s = cache_from_obj(s, object); /* Support for memcg */
4776  	}
4777  
4778  	/* Start new detached freelist */
4779  	df->tail = object;
4780  	df->freelist = object;
4781  	df->cnt = 1;
4782  
4783  	if (is_kfence_address(object))
4784  		return size;
4785  
4786  	set_freepointer(df->s, object, NULL);
4787  
4788  	same = size;
4789  	while (size) {
4790  		object = p[--size];
4791  		/* df->slab is always set at this point */
4792  		if (df->slab == virt_to_slab(object)) {
4793  			/* Opportunity build freelist */
4794  			set_freepointer(df->s, object, df->freelist);
4795  			df->freelist = object;
4796  			df->cnt++;
4797  			same--;
4798  			if (size != same)
4799  				swap(p[size], p[same]);
4800  			continue;
4801  		}
4802  
4803  		/* Limit look ahead search */
4804  		if (!--lookahead)
4805  			break;
4806  	}
4807  
4808  	return same;
4809  }
4810  
4811  /*
4812   * Internal bulk free of objects that were not initialised by the post alloc
4813   * hooks and thus should not be processed by the free hooks
4814   */
4815  static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
4816  {
4817  	if (!size)
4818  		return;
4819  
4820  	do {
4821  		struct detached_freelist df;
4822  
4823  		size = build_detached_freelist(s, size, p, &df);
4824  		if (!df.slab)
4825  			continue;
4826  
4827  		if (kfence_free(df.freelist))
4828  			continue;
4829  
4830  		do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
4831  			     _RET_IP_);
4832  	} while (likely(size));
4833  }
4834  
4835  /* Note that interrupts must be enabled when calling this function. */
4836  void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
4837  {
4838  	if (!size)
4839  		return;
4840  
4841  	do {
4842  		struct detached_freelist df;
4843  
4844  		size = build_detached_freelist(s, size, p, &df);
4845  		if (!df.slab)
4846  			continue;
4847  
4848  		slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
4849  			       df.cnt, _RET_IP_);
4850  	} while (likely(size));
4851  }
4852  EXPORT_SYMBOL(kmem_cache_free_bulk);
4853  
4854  #ifndef CONFIG_SLUB_TINY
4855  static inline
4856  int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
4857  			    void **p)
4858  {
4859  	struct kmem_cache_cpu *c;
4860  	unsigned long irqflags;
4861  	int i;
4862  
4863  	/*
4864  	 * Drain objects in the per cpu slab, while disabling local
4865  	 * IRQs, which protects against PREEMPT and interrupts
4866  	 * handlers invoking normal fastpath.
4867  	 */
4868  	c = slub_get_cpu_ptr(s->cpu_slab);
4869  	local_lock_irqsave(&s->cpu_slab->lock, irqflags);
4870  
4871  	for (i = 0; i < size; i++) {
4872  		void *object = kfence_alloc(s, s->object_size, flags);
4873  
4874  		if (unlikely(object)) {
4875  			p[i] = object;
4876  			continue;
4877  		}
4878  
4879  		object = c->freelist;
4880  		if (unlikely(!object)) {
4881  			/*
4882  			 * We may have removed an object from c->freelist using
4883  			 * the fastpath in the previous iteration; in that case,
4884  			 * c->tid has not been bumped yet.
4885  			 * Since ___slab_alloc() may reenable interrupts while
4886  			 * allocating memory, we should bump c->tid now.
4887  			 */
4888  			c->tid = next_tid(c->tid);
4889  
4890  			local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
4891  
4892  			/*
4893  			 * Invoking slow path likely have side-effect
4894  			 * of re-populating per CPU c->freelist
4895  			 */
4896  			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
4897  					    _RET_IP_, c, s->object_size);
4898  			if (unlikely(!p[i]))
4899  				goto error;
4900  
4901  			c = this_cpu_ptr(s->cpu_slab);
4902  			maybe_wipe_obj_freeptr(s, p[i]);
4903  
4904  			local_lock_irqsave(&s->cpu_slab->lock, irqflags);
4905  
4906  			continue; /* goto for-loop */
4907  		}
4908  		c->freelist = get_freepointer(s, object);
4909  		p[i] = object;
4910  		maybe_wipe_obj_freeptr(s, p[i]);
4911  		stat(s, ALLOC_FASTPATH);
4912  	}
4913  	c->tid = next_tid(c->tid);
4914  	local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
4915  	slub_put_cpu_ptr(s->cpu_slab);
4916  
4917  	return i;
4918  
4919  error:
4920  	slub_put_cpu_ptr(s->cpu_slab);
4921  	__kmem_cache_free_bulk(s, i, p);
4922  	return 0;
4923  
4924  }
4925  #else /* CONFIG_SLUB_TINY */
4926  static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
4927  				   size_t size, void **p)
4928  {
4929  	int i;
4930  
4931  	for (i = 0; i < size; i++) {
4932  		void *object = kfence_alloc(s, s->object_size, flags);
4933  
4934  		if (unlikely(object)) {
4935  			p[i] = object;
4936  			continue;
4937  		}
4938  
4939  		p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
4940  					 _RET_IP_, s->object_size);
4941  		if (unlikely(!p[i]))
4942  			goto error;
4943  
4944  		maybe_wipe_obj_freeptr(s, p[i]);
4945  	}
4946  
4947  	return i;
4948  
4949  error:
4950  	__kmem_cache_free_bulk(s, i, p);
4951  	return 0;
4952  }
4953  #endif /* CONFIG_SLUB_TINY */
4954  
4955  /* Note that interrupts must be enabled when calling this function. */
4956  int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
4957  				 void **p)
4958  {
4959  	int i;
4960  
4961  	if (!size)
4962  		return 0;
4963  
4964  	s = slab_pre_alloc_hook(s, flags);
4965  	if (unlikely(!s))
4966  		return 0;
4967  
4968  	i = __kmem_cache_alloc_bulk(s, flags, size, p);
4969  	if (unlikely(i == 0))
4970  		return 0;
4971  
4972  	/*
4973  	 * memcg and kmem_cache debug support and memory initialization.
4974  	 * Done outside of the IRQ disabled fastpath loop.
4975  	 */
4976  	if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
4977  		    slab_want_init_on_alloc(flags, s), s->object_size))) {
4978  		return 0;
4979  	}
4980  	return i;
4981  }
4982  EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
4983  
4984  
4985  /*
4986   * Object placement in a slab is made very easy because we always start at
4987   * offset 0. If we tune the size of the object to the alignment then we can
4988   * get the required alignment by putting one properly sized object after
4989   * another.
4990   *
4991   * Notice that the allocation order determines the sizes of the per cpu
4992   * caches. Each processor has always one slab available for allocations.
4993   * Increasing the allocation order reduces the number of times that slabs
4994   * must be moved on and off the partial lists and is therefore a factor in
4995   * locking overhead.
4996   */
4997  
4998  /*
4999   * Minimum / Maximum order of slab pages. This influences locking overhead
5000   * and slab fragmentation. A higher order reduces the number of partial slabs
5001   * and increases the number of allocations possible without having to
5002   * take the list_lock.
5003   */
5004  static unsigned int slub_min_order;
5005  static unsigned int slub_max_order =
5006  	IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
5007  static unsigned int slub_min_objects;
5008  
5009  /*
5010   * Calculate the order of allocation given an slab object size.
5011   *
5012   * The order of allocation has significant impact on performance and other
5013   * system components. Generally order 0 allocations should be preferred since
5014   * order 0 does not cause fragmentation in the page allocator. Larger objects
5015   * be problematic to put into order 0 slabs because there may be too much
5016   * unused space left. We go to a higher order if more than 1/16th of the slab
5017   * would be wasted.
5018   *
5019   * In order to reach satisfactory performance we must ensure that a minimum
5020   * number of objects is in one slab. Otherwise we may generate too much
5021   * activity on the partial lists which requires taking the list_lock. This is
5022   * less a concern for large slabs though which are rarely used.
5023   *
5024   * slab_max_order specifies the order where we begin to stop considering the
5025   * number of objects in a slab as critical. If we reach slab_max_order then
5026   * we try to keep the page order as low as possible. So we accept more waste
5027   * of space in favor of a small page order.
5028   *
5029   * Higher order allocations also allow the placement of more objects in a
5030   * slab and thereby reduce object handling overhead. If the user has
5031   * requested a higher minimum order then we start with that one instead of
5032   * the smallest order which will fit the object.
5033   */
5034  static inline unsigned int calc_slab_order(unsigned int size,
5035  		unsigned int min_order, unsigned int max_order,
5036  		unsigned int fract_leftover)
5037  {
5038  	unsigned int order;
5039  
5040  	for (order = min_order; order <= max_order; order++) {
5041  
5042  		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
5043  		unsigned int rem;
5044  
5045  		rem = slab_size % size;
5046  
5047  		if (rem <= slab_size / fract_leftover)
5048  			break;
5049  	}
5050  
5051  	return order;
5052  }
5053  
5054  static inline int calculate_order(unsigned int size)
5055  {
5056  	unsigned int order;
5057  	unsigned int min_objects;
5058  	unsigned int max_objects;
5059  	unsigned int min_order;
5060  
5061  	min_objects = slub_min_objects;
5062  	if (!min_objects) {
5063  		/*
5064  		 * Some architectures will only update present cpus when
5065  		 * onlining them, so don't trust the number if it's just 1. But
5066  		 * we also don't want to use nr_cpu_ids always, as on some other
5067  		 * architectures, there can be many possible cpus, but never
5068  		 * onlined. Here we compromise between trying to avoid too high
5069  		 * order on systems that appear larger than they are, and too
5070  		 * low order on systems that appear smaller than they are.
5071  		 */
5072  		unsigned int nr_cpus = num_present_cpus();
5073  		if (nr_cpus <= 1)
5074  			nr_cpus = nr_cpu_ids;
5075  		min_objects = 4 * (fls(nr_cpus) + 1);
5076  	}
5077  	/* min_objects can't be 0 because get_order(0) is undefined */
5078  	max_objects = max(order_objects(slub_max_order, size), 1U);
5079  	min_objects = min(min_objects, max_objects);
5080  
5081  	min_order = max_t(unsigned int, slub_min_order,
5082  			  get_order(min_objects * size));
5083  	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
5084  		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
5085  
5086  	/*
5087  	 * Attempt to find best configuration for a slab. This works by first
5088  	 * attempting to generate a layout with the best possible configuration
5089  	 * and backing off gradually.
5090  	 *
5091  	 * We start with accepting at most 1/16 waste and try to find the
5092  	 * smallest order from min_objects-derived/slab_min_order up to
5093  	 * slab_max_order that will satisfy the constraint. Note that increasing
5094  	 * the order can only result in same or less fractional waste, not more.
5095  	 *
5096  	 * If that fails, we increase the acceptable fraction of waste and try
5097  	 * again. The last iteration with fraction of 1/2 would effectively
5098  	 * accept any waste and give us the order determined by min_objects, as
5099  	 * long as at least single object fits within slab_max_order.
5100  	 */
5101  	for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
5102  		order = calc_slab_order(size, min_order, slub_max_order,
5103  					fraction);
5104  		if (order <= slub_max_order)
5105  			return order;
5106  	}
5107  
5108  	/*
5109  	 * Doh this slab cannot be placed using slab_max_order.
5110  	 */
5111  	order = get_order(size);
5112  	if (order <= MAX_PAGE_ORDER)
5113  		return order;
5114  	return -ENOSYS;
5115  }
5116  
5117  static void
5118  init_kmem_cache_node(struct kmem_cache_node *n)
5119  {
5120  	n->nr_partial = 0;
5121  	spin_lock_init(&n->list_lock);
5122  	INIT_LIST_HEAD(&n->partial);
5123  #ifdef CONFIG_SLUB_DEBUG
5124  	atomic_long_set(&n->nr_slabs, 0);
5125  	atomic_long_set(&n->total_objects, 0);
5126  	INIT_LIST_HEAD(&n->full);
5127  #endif
5128  }
5129  
5130  #ifndef CONFIG_SLUB_TINY
5131  static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
5132  {
5133  	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
5134  			NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
5135  			sizeof(struct kmem_cache_cpu));
5136  
5137  	/*
5138  	 * Must align to double word boundary for the double cmpxchg
5139  	 * instructions to work; see __pcpu_double_call_return_bool().
5140  	 */
5141  	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
5142  				     2 * sizeof(void *));
5143  
5144  	if (!s->cpu_slab)
5145  		return 0;
5146  
5147  	init_kmem_cache_cpus(s);
5148  
5149  	return 1;
5150  }
5151  #else
5152  static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
5153  {
5154  	return 1;
5155  }
5156  #endif /* CONFIG_SLUB_TINY */
5157  
5158  static struct kmem_cache *kmem_cache_node;
5159  
5160  /*
5161   * No kmalloc_node yet so do it by hand. We know that this is the first
5162   * slab on the node for this slabcache. There are no concurrent accesses
5163   * possible.
5164   *
5165   * Note that this function only works on the kmem_cache_node
5166   * when allocating for the kmem_cache_node. This is used for bootstrapping
5167   * memory on a fresh node that has no slab structures yet.
5168   */
5169  static void early_kmem_cache_node_alloc(int node)
5170  {
5171  	struct slab *slab;
5172  	struct kmem_cache_node *n;
5173  
5174  	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
5175  
5176  	slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
5177  
5178  	BUG_ON(!slab);
5179  	if (slab_nid(slab) != node) {
5180  		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
5181  		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
5182  	}
5183  
5184  	n = slab->freelist;
5185  	BUG_ON(!n);
5186  #ifdef CONFIG_SLUB_DEBUG
5187  	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
5188  #endif
5189  	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
5190  	slab->freelist = get_freepointer(kmem_cache_node, n);
5191  	slab->inuse = 1;
5192  	kmem_cache_node->node[node] = n;
5193  	init_kmem_cache_node(n);
5194  	inc_slabs_node(kmem_cache_node, node, slab->objects);
5195  
5196  	/*
5197  	 * No locks need to be taken here as it has just been
5198  	 * initialized and there is no concurrent access.
5199  	 */
5200  	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
5201  }
5202  
5203  static void free_kmem_cache_nodes(struct kmem_cache *s)
5204  {
5205  	int node;
5206  	struct kmem_cache_node *n;
5207  
5208  	for_each_kmem_cache_node(s, node, n) {
5209  		s->node[node] = NULL;
5210  		kmem_cache_free(kmem_cache_node, n);
5211  	}
5212  }
5213  
5214  void __kmem_cache_release(struct kmem_cache *s)
5215  {
5216  	cache_random_seq_destroy(s);
5217  #ifndef CONFIG_SLUB_TINY
5218  	free_percpu(s->cpu_slab);
5219  #endif
5220  	free_kmem_cache_nodes(s);
5221  }
5222  
5223  static int init_kmem_cache_nodes(struct kmem_cache *s)
5224  {
5225  	int node;
5226  
5227  	for_each_node_mask(node, slab_nodes) {
5228  		struct kmem_cache_node *n;
5229  
5230  		if (slab_state == DOWN) {
5231  			early_kmem_cache_node_alloc(node);
5232  			continue;
5233  		}
5234  		n = kmem_cache_alloc_node(kmem_cache_node,
5235  						GFP_KERNEL, node);
5236  
5237  		if (!n) {
5238  			free_kmem_cache_nodes(s);
5239  			return 0;
5240  		}
5241  
5242  		init_kmem_cache_node(n);
5243  		s->node[node] = n;
5244  	}
5245  	return 1;
5246  }
5247  
5248  static void set_cpu_partial(struct kmem_cache *s)
5249  {
5250  #ifdef CONFIG_SLUB_CPU_PARTIAL
5251  	unsigned int nr_objects;
5252  
5253  	/*
5254  	 * cpu_partial determined the maximum number of objects kept in the
5255  	 * per cpu partial lists of a processor.
5256  	 *
5257  	 * Per cpu partial lists mainly contain slabs that just have one
5258  	 * object freed. If they are used for allocation then they can be
5259  	 * filled up again with minimal effort. The slab will never hit the
5260  	 * per node partial lists and therefore no locking will be required.
5261  	 *
5262  	 * For backwards compatibility reasons, this is determined as number
5263  	 * of objects, even though we now limit maximum number of pages, see
5264  	 * slub_set_cpu_partial()
5265  	 */
5266  	if (!kmem_cache_has_cpu_partial(s))
5267  		nr_objects = 0;
5268  	else if (s->size >= PAGE_SIZE)
5269  		nr_objects = 6;
5270  	else if (s->size >= 1024)
5271  		nr_objects = 24;
5272  	else if (s->size >= 256)
5273  		nr_objects = 52;
5274  	else
5275  		nr_objects = 120;
5276  
5277  	slub_set_cpu_partial(s, nr_objects);
5278  #endif
5279  }
5280  
5281  /*
5282   * calculate_sizes() determines the order and the distribution of data within
5283   * a slab object.
5284   */
5285  static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
5286  {
5287  	slab_flags_t flags = s->flags;
5288  	unsigned int size = s->object_size;
5289  	unsigned int order;
5290  
5291  	/*
5292  	 * Round up object size to the next word boundary. We can only
5293  	 * place the free pointer at word boundaries and this determines
5294  	 * the possible location of the free pointer.
5295  	 */
5296  	size = ALIGN(size, sizeof(void *));
5297  
5298  #ifdef CONFIG_SLUB_DEBUG
5299  	/*
5300  	 * Determine if we can poison the object itself. If the user of
5301  	 * the slab may touch the object after free or before allocation
5302  	 * then we should never poison the object itself.
5303  	 */
5304  	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
5305  			!s->ctor)
5306  		s->flags |= __OBJECT_POISON;
5307  	else
5308  		s->flags &= ~__OBJECT_POISON;
5309  
5310  
5311  	/*
5312  	 * If we are Redzoning then check if there is some space between the
5313  	 * end of the object and the free pointer. If not then add an
5314  	 * additional word to have some bytes to store Redzone information.
5315  	 */
5316  	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
5317  		size += sizeof(void *);
5318  #endif
5319  
5320  	/*
5321  	 * With that we have determined the number of bytes in actual use
5322  	 * by the object and redzoning.
5323  	 */
5324  	s->inuse = size;
5325  
5326  	if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
5327  	    (flags & SLAB_POISON) || s->ctor ||
5328  	    ((flags & SLAB_RED_ZONE) &&
5329  	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
5330  		/*
5331  		 * Relocate free pointer after the object if it is not
5332  		 * permitted to overwrite the first word of the object on
5333  		 * kmem_cache_free.
5334  		 *
5335  		 * This is the case if we do RCU, have a constructor or
5336  		 * destructor, are poisoning the objects, or are
5337  		 * redzoning an object smaller than sizeof(void *) or are
5338  		 * redzoning an object with slub_debug_orig_size() enabled,
5339  		 * in which case the right redzone may be extended.
5340  		 *
5341  		 * The assumption that s->offset >= s->inuse means free
5342  		 * pointer is outside of the object is used in the
5343  		 * freeptr_outside_object() function. If that is no
5344  		 * longer true, the function needs to be modified.
5345  		 */
5346  		s->offset = size;
5347  		size += sizeof(void *);
5348  	} else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
5349  		s->offset = args->freeptr_offset;
5350  	} else {
5351  		/*
5352  		 * Store freelist pointer near middle of object to keep
5353  		 * it away from the edges of the object to avoid small
5354  		 * sized over/underflows from neighboring allocations.
5355  		 */
5356  		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
5357  	}
5358  
5359  #ifdef CONFIG_SLUB_DEBUG
5360  	if (flags & SLAB_STORE_USER) {
5361  		/*
5362  		 * Need to store information about allocs and frees after
5363  		 * the object.
5364  		 */
5365  		size += 2 * sizeof(struct track);
5366  
5367  		/* Save the original kmalloc request size */
5368  		if (flags & SLAB_KMALLOC)
5369  			size += sizeof(unsigned int);
5370  	}
5371  #endif
5372  
5373  	kasan_cache_create(s, &size, &s->flags);
5374  #ifdef CONFIG_SLUB_DEBUG
5375  	if (flags & SLAB_RED_ZONE) {
5376  		/*
5377  		 * Add some empty padding so that we can catch
5378  		 * overwrites from earlier objects rather than let
5379  		 * tracking information or the free pointer be
5380  		 * corrupted if a user writes before the start
5381  		 * of the object.
5382  		 */
5383  		size += sizeof(void *);
5384  
5385  		s->red_left_pad = sizeof(void *);
5386  		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
5387  		size += s->red_left_pad;
5388  	}
5389  #endif
5390  
5391  	/*
5392  	 * SLUB stores one object immediately after another beginning from
5393  	 * offset 0. In order to align the objects we have to simply size
5394  	 * each object to conform to the alignment.
5395  	 */
5396  	size = ALIGN(size, s->align);
5397  	s->size = size;
5398  	s->reciprocal_size = reciprocal_value(size);
5399  	order = calculate_order(size);
5400  
5401  	if ((int)order < 0)
5402  		return 0;
5403  
5404  	s->allocflags = __GFP_COMP;
5405  
5406  	if (s->flags & SLAB_CACHE_DMA)
5407  		s->allocflags |= GFP_DMA;
5408  
5409  	if (s->flags & SLAB_CACHE_DMA32)
5410  		s->allocflags |= GFP_DMA32;
5411  
5412  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
5413  		s->allocflags |= __GFP_RECLAIMABLE;
5414  
5415  	/*
5416  	 * Determine the number of objects per slab
5417  	 */
5418  	s->oo = oo_make(order, size);
5419  	s->min = oo_make(get_order(size), size);
5420  
5421  	return !!oo_objects(s->oo);
5422  }
5423  
5424  static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
5425  			      const char *text)
5426  {
5427  #ifdef CONFIG_SLUB_DEBUG
5428  	void *addr = slab_address(slab);
5429  	void *p;
5430  
5431  	slab_err(s, slab, text, s->name);
5432  
5433  	spin_lock(&object_map_lock);
5434  	__fill_map(object_map, s, slab);
5435  
5436  	for_each_object(p, s, addr, slab->objects) {
5437  
5438  		if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
5439  			pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
5440  			print_tracking(s, p);
5441  		}
5442  	}
5443  	spin_unlock(&object_map_lock);
5444  #endif
5445  }
5446  
5447  /*
5448   * Attempt to free all partial slabs on a node.
5449   * This is called from __kmem_cache_shutdown(). We must take list_lock
5450   * because sysfs file might still access partial list after the shutdowning.
5451   */
5452  static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
5453  {
5454  	LIST_HEAD(discard);
5455  	struct slab *slab, *h;
5456  
5457  	BUG_ON(irqs_disabled());
5458  	spin_lock_irq(&n->list_lock);
5459  	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
5460  		if (!slab->inuse) {
5461  			remove_partial(n, slab);
5462  			list_add(&slab->slab_list, &discard);
5463  		} else {
5464  			list_slab_objects(s, slab,
5465  			  "Objects remaining in %s on __kmem_cache_shutdown()");
5466  		}
5467  	}
5468  	spin_unlock_irq(&n->list_lock);
5469  
5470  	list_for_each_entry_safe(slab, h, &discard, slab_list)
5471  		discard_slab(s, slab);
5472  }
5473  
5474  bool __kmem_cache_empty(struct kmem_cache *s)
5475  {
5476  	int node;
5477  	struct kmem_cache_node *n;
5478  
5479  	for_each_kmem_cache_node(s, node, n)
5480  		if (n->nr_partial || node_nr_slabs(n))
5481  			return false;
5482  	return true;
5483  }
5484  
5485  /*
5486   * Release all resources used by a slab cache.
5487   */
5488  int __kmem_cache_shutdown(struct kmem_cache *s)
5489  {
5490  	int node;
5491  	struct kmem_cache_node *n;
5492  
5493  	flush_all_cpus_locked(s);
5494  	/* Attempt to free all objects */
5495  	for_each_kmem_cache_node(s, node, n) {
5496  		free_partial(s, n);
5497  		if (n->nr_partial || node_nr_slabs(n))
5498  			return 1;
5499  	}
5500  	return 0;
5501  }
5502  
5503  #ifdef CONFIG_PRINTK
5504  void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
5505  {
5506  	void *base;
5507  	int __maybe_unused i;
5508  	unsigned int objnr;
5509  	void *objp;
5510  	void *objp0;
5511  	struct kmem_cache *s = slab->slab_cache;
5512  	struct track __maybe_unused *trackp;
5513  
5514  	kpp->kp_ptr = object;
5515  	kpp->kp_slab = slab;
5516  	kpp->kp_slab_cache = s;
5517  	base = slab_address(slab);
5518  	objp0 = kasan_reset_tag(object);
5519  #ifdef CONFIG_SLUB_DEBUG
5520  	objp = restore_red_left(s, objp0);
5521  #else
5522  	objp = objp0;
5523  #endif
5524  	objnr = obj_to_index(s, slab, objp);
5525  	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
5526  	objp = base + s->size * objnr;
5527  	kpp->kp_objp = objp;
5528  	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
5529  			 || (objp - base) % s->size) ||
5530  	    !(s->flags & SLAB_STORE_USER))
5531  		return;
5532  #ifdef CONFIG_SLUB_DEBUG
5533  	objp = fixup_red_left(s, objp);
5534  	trackp = get_track(s, objp, TRACK_ALLOC);
5535  	kpp->kp_ret = (void *)trackp->addr;
5536  #ifdef CONFIG_STACKDEPOT
5537  	{
5538  		depot_stack_handle_t handle;
5539  		unsigned long *entries;
5540  		unsigned int nr_entries;
5541  
5542  		handle = READ_ONCE(trackp->handle);
5543  		if (handle) {
5544  			nr_entries = stack_depot_fetch(handle, &entries);
5545  			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
5546  				kpp->kp_stack[i] = (void *)entries[i];
5547  		}
5548  
5549  		trackp = get_track(s, objp, TRACK_FREE);
5550  		handle = READ_ONCE(trackp->handle);
5551  		if (handle) {
5552  			nr_entries = stack_depot_fetch(handle, &entries);
5553  			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
5554  				kpp->kp_free_stack[i] = (void *)entries[i];
5555  		}
5556  	}
5557  #endif
5558  #endif
5559  }
5560  #endif
5561  
5562  /********************************************************************
5563   *		Kmalloc subsystem
5564   *******************************************************************/
5565  
5566  static int __init setup_slub_min_order(char *str)
5567  {
5568  	get_option(&str, (int *)&slub_min_order);
5569  
5570  	if (slub_min_order > slub_max_order)
5571  		slub_max_order = slub_min_order;
5572  
5573  	return 1;
5574  }
5575  
5576  __setup("slab_min_order=", setup_slub_min_order);
5577  __setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
5578  
5579  
5580  static int __init setup_slub_max_order(char *str)
5581  {
5582  	get_option(&str, (int *)&slub_max_order);
5583  	slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
5584  
5585  	if (slub_min_order > slub_max_order)
5586  		slub_min_order = slub_max_order;
5587  
5588  	return 1;
5589  }
5590  
5591  __setup("slab_max_order=", setup_slub_max_order);
5592  __setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
5593  
5594  static int __init setup_slub_min_objects(char *str)
5595  {
5596  	get_option(&str, (int *)&slub_min_objects);
5597  
5598  	return 1;
5599  }
5600  
5601  __setup("slab_min_objects=", setup_slub_min_objects);
5602  __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
5603  
5604  #ifdef CONFIG_HARDENED_USERCOPY
5605  /*
5606   * Rejects incorrectly sized objects and objects that are to be copied
5607   * to/from userspace but do not fall entirely within the containing slab
5608   * cache's usercopy region.
5609   *
5610   * Returns NULL if check passes, otherwise const char * to name of cache
5611   * to indicate an error.
5612   */
5613  void __check_heap_object(const void *ptr, unsigned long n,
5614  			 const struct slab *slab, bool to_user)
5615  {
5616  	struct kmem_cache *s;
5617  	unsigned int offset;
5618  	bool is_kfence = is_kfence_address(ptr);
5619  
5620  	ptr = kasan_reset_tag(ptr);
5621  
5622  	/* Find object and usable object size. */
5623  	s = slab->slab_cache;
5624  
5625  	/* Reject impossible pointers. */
5626  	if (ptr < slab_address(slab))
5627  		usercopy_abort("SLUB object not in SLUB page?!", NULL,
5628  			       to_user, 0, n);
5629  
5630  	/* Find offset within object. */
5631  	if (is_kfence)
5632  		offset = ptr - kfence_object_start(ptr);
5633  	else
5634  		offset = (ptr - slab_address(slab)) % s->size;
5635  
5636  	/* Adjust for redzone and reject if within the redzone. */
5637  	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
5638  		if (offset < s->red_left_pad)
5639  			usercopy_abort("SLUB object in left red zone",
5640  				       s->name, to_user, offset, n);
5641  		offset -= s->red_left_pad;
5642  	}
5643  
5644  	/* Allow address range falling entirely within usercopy region. */
5645  	if (offset >= s->useroffset &&
5646  	    offset - s->useroffset <= s->usersize &&
5647  	    n <= s->useroffset - offset + s->usersize)
5648  		return;
5649  
5650  	usercopy_abort("SLUB object", s->name, to_user, offset, n);
5651  }
5652  #endif /* CONFIG_HARDENED_USERCOPY */
5653  
5654  #define SHRINK_PROMOTE_MAX 32
5655  
5656  /*
5657   * kmem_cache_shrink discards empty slabs and promotes the slabs filled
5658   * up most to the head of the partial lists. New allocations will then
5659   * fill those up and thus they can be removed from the partial lists.
5660   *
5661   * The slabs with the least items are placed last. This results in them
5662   * being allocated from last increasing the chance that the last objects
5663   * are freed in them.
5664   */
5665  static int __kmem_cache_do_shrink(struct kmem_cache *s)
5666  {
5667  	int node;
5668  	int i;
5669  	struct kmem_cache_node *n;
5670  	struct slab *slab;
5671  	struct slab *t;
5672  	struct list_head discard;
5673  	struct list_head promote[SHRINK_PROMOTE_MAX];
5674  	unsigned long flags;
5675  	int ret = 0;
5676  
5677  	for_each_kmem_cache_node(s, node, n) {
5678  		INIT_LIST_HEAD(&discard);
5679  		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
5680  			INIT_LIST_HEAD(promote + i);
5681  
5682  		spin_lock_irqsave(&n->list_lock, flags);
5683  
5684  		/*
5685  		 * Build lists of slabs to discard or promote.
5686  		 *
5687  		 * Note that concurrent frees may occur while we hold the
5688  		 * list_lock. slab->inuse here is the upper limit.
5689  		 */
5690  		list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
5691  			int free = slab->objects - slab->inuse;
5692  
5693  			/* Do not reread slab->inuse */
5694  			barrier();
5695  
5696  			/* We do not keep full slabs on the list */
5697  			BUG_ON(free <= 0);
5698  
5699  			if (free == slab->objects) {
5700  				list_move(&slab->slab_list, &discard);
5701  				slab_clear_node_partial(slab);
5702  				n->nr_partial--;
5703  				dec_slabs_node(s, node, slab->objects);
5704  			} else if (free <= SHRINK_PROMOTE_MAX)
5705  				list_move(&slab->slab_list, promote + free - 1);
5706  		}
5707  
5708  		/*
5709  		 * Promote the slabs filled up most to the head of the
5710  		 * partial list.
5711  		 */
5712  		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
5713  			list_splice(promote + i, &n->partial);
5714  
5715  		spin_unlock_irqrestore(&n->list_lock, flags);
5716  
5717  		/* Release empty slabs */
5718  		list_for_each_entry_safe(slab, t, &discard, slab_list)
5719  			free_slab(s, slab);
5720  
5721  		if (node_nr_slabs(n))
5722  			ret = 1;
5723  	}
5724  
5725  	return ret;
5726  }
5727  
5728  int __kmem_cache_shrink(struct kmem_cache *s)
5729  {
5730  	flush_all(s);
5731  	return __kmem_cache_do_shrink(s);
5732  }
5733  
5734  static int slab_mem_going_offline_callback(void *arg)
5735  {
5736  	struct kmem_cache *s;
5737  
5738  	mutex_lock(&slab_mutex);
5739  	list_for_each_entry(s, &slab_caches, list) {
5740  		flush_all_cpus_locked(s);
5741  		__kmem_cache_do_shrink(s);
5742  	}
5743  	mutex_unlock(&slab_mutex);
5744  
5745  	return 0;
5746  }
5747  
5748  static void slab_mem_offline_callback(void *arg)
5749  {
5750  	struct memory_notify *marg = arg;
5751  	int offline_node;
5752  
5753  	offline_node = marg->status_change_nid_normal;
5754  
5755  	/*
5756  	 * If the node still has available memory. we need kmem_cache_node
5757  	 * for it yet.
5758  	 */
5759  	if (offline_node < 0)
5760  		return;
5761  
5762  	mutex_lock(&slab_mutex);
5763  	node_clear(offline_node, slab_nodes);
5764  	/*
5765  	 * We no longer free kmem_cache_node structures here, as it would be
5766  	 * racy with all get_node() users, and infeasible to protect them with
5767  	 * slab_mutex.
5768  	 */
5769  	mutex_unlock(&slab_mutex);
5770  }
5771  
5772  static int slab_mem_going_online_callback(void *arg)
5773  {
5774  	struct kmem_cache_node *n;
5775  	struct kmem_cache *s;
5776  	struct memory_notify *marg = arg;
5777  	int nid = marg->status_change_nid_normal;
5778  	int ret = 0;
5779  
5780  	/*
5781  	 * If the node's memory is already available, then kmem_cache_node is
5782  	 * already created. Nothing to do.
5783  	 */
5784  	if (nid < 0)
5785  		return 0;
5786  
5787  	/*
5788  	 * We are bringing a node online. No memory is available yet. We must
5789  	 * allocate a kmem_cache_node structure in order to bring the node
5790  	 * online.
5791  	 */
5792  	mutex_lock(&slab_mutex);
5793  	list_for_each_entry(s, &slab_caches, list) {
5794  		/*
5795  		 * The structure may already exist if the node was previously
5796  		 * onlined and offlined.
5797  		 */
5798  		if (get_node(s, nid))
5799  			continue;
5800  		/*
5801  		 * XXX: kmem_cache_alloc_node will fallback to other nodes
5802  		 *      since memory is not yet available from the node that
5803  		 *      is brought up.
5804  		 */
5805  		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
5806  		if (!n) {
5807  			ret = -ENOMEM;
5808  			goto out;
5809  		}
5810  		init_kmem_cache_node(n);
5811  		s->node[nid] = n;
5812  	}
5813  	/*
5814  	 * Any cache created after this point will also have kmem_cache_node
5815  	 * initialized for the new node.
5816  	 */
5817  	node_set(nid, slab_nodes);
5818  out:
5819  	mutex_unlock(&slab_mutex);
5820  	return ret;
5821  }
5822  
5823  static int slab_memory_callback(struct notifier_block *self,
5824  				unsigned long action, void *arg)
5825  {
5826  	int ret = 0;
5827  
5828  	switch (action) {
5829  	case MEM_GOING_ONLINE:
5830  		ret = slab_mem_going_online_callback(arg);
5831  		break;
5832  	case MEM_GOING_OFFLINE:
5833  		ret = slab_mem_going_offline_callback(arg);
5834  		break;
5835  	case MEM_OFFLINE:
5836  	case MEM_CANCEL_ONLINE:
5837  		slab_mem_offline_callback(arg);
5838  		break;
5839  	case MEM_ONLINE:
5840  	case MEM_CANCEL_OFFLINE:
5841  		break;
5842  	}
5843  	if (ret)
5844  		ret = notifier_from_errno(ret);
5845  	else
5846  		ret = NOTIFY_OK;
5847  	return ret;
5848  }
5849  
5850  /********************************************************************
5851   *			Basic setup of slabs
5852   *******************************************************************/
5853  
5854  /*
5855   * Used for early kmem_cache structures that were allocated using
5856   * the page allocator. Allocate them properly then fix up the pointers
5857   * that may be pointing to the wrong kmem_cache structure.
5858   */
5859  
5860  static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
5861  {
5862  	int node;
5863  	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
5864  	struct kmem_cache_node *n;
5865  
5866  	memcpy(s, static_cache, kmem_cache->object_size);
5867  
5868  	/*
5869  	 * This runs very early, and only the boot processor is supposed to be
5870  	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
5871  	 * IPIs around.
5872  	 */
5873  	__flush_cpu_slab(s, smp_processor_id());
5874  	for_each_kmem_cache_node(s, node, n) {
5875  		struct slab *p;
5876  
5877  		list_for_each_entry(p, &n->partial, slab_list)
5878  			p->slab_cache = s;
5879  
5880  #ifdef CONFIG_SLUB_DEBUG
5881  		list_for_each_entry(p, &n->full, slab_list)
5882  			p->slab_cache = s;
5883  #endif
5884  	}
5885  	list_add(&s->list, &slab_caches);
5886  	return s;
5887  }
5888  
5889  void __init kmem_cache_init(void)
5890  {
5891  	static __initdata struct kmem_cache boot_kmem_cache,
5892  		boot_kmem_cache_node;
5893  	int node;
5894  
5895  	if (debug_guardpage_minorder())
5896  		slub_max_order = 0;
5897  
5898  	/* Print slub debugging pointers without hashing */
5899  	if (__slub_debug_enabled())
5900  		no_hash_pointers_enable(NULL);
5901  
5902  	kmem_cache_node = &boot_kmem_cache_node;
5903  	kmem_cache = &boot_kmem_cache;
5904  
5905  	/*
5906  	 * Initialize the nodemask for which we will allocate per node
5907  	 * structures. Here we don't need taking slab_mutex yet.
5908  	 */
5909  	for_each_node_state(node, N_NORMAL_MEMORY)
5910  		node_set(node, slab_nodes);
5911  
5912  	create_boot_cache(kmem_cache_node, "kmem_cache_node",
5913  			sizeof(struct kmem_cache_node),
5914  			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
5915  
5916  	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
5917  
5918  	/* Able to allocate the per node structures */
5919  	slab_state = PARTIAL;
5920  
5921  	create_boot_cache(kmem_cache, "kmem_cache",
5922  			offsetof(struct kmem_cache, node) +
5923  				nr_node_ids * sizeof(struct kmem_cache_node *),
5924  			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
5925  
5926  	kmem_cache = bootstrap(&boot_kmem_cache);
5927  	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
5928  
5929  	/* Now we can use the kmem_cache to allocate kmalloc slabs */
5930  	setup_kmalloc_cache_index_table();
5931  	create_kmalloc_caches();
5932  
5933  	/* Setup random freelists for each cache */
5934  	init_freelist_randomization();
5935  
5936  	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
5937  				  slub_cpu_dead);
5938  
5939  	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
5940  		cache_line_size(),
5941  		slub_min_order, slub_max_order, slub_min_objects,
5942  		nr_cpu_ids, nr_node_ids);
5943  }
5944  
5945  void __init kmem_cache_init_late(void)
5946  {
5947  #ifndef CONFIG_SLUB_TINY
5948  	flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
5949  	WARN_ON(!flushwq);
5950  #endif
5951  }
5952  
5953  struct kmem_cache *
5954  __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
5955  		   slab_flags_t flags, void (*ctor)(void *))
5956  {
5957  	struct kmem_cache *s;
5958  
5959  	s = find_mergeable(size, align, flags, name, ctor);
5960  	if (s) {
5961  		if (sysfs_slab_alias(s, name))
5962  			return NULL;
5963  
5964  		s->refcount++;
5965  
5966  		/*
5967  		 * Adjust the object sizes so that we clear
5968  		 * the complete object on kzalloc.
5969  		 */
5970  		s->object_size = max(s->object_size, size);
5971  		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
5972  	}
5973  
5974  	return s;
5975  }
5976  
5977  int do_kmem_cache_create(struct kmem_cache *s, const char *name,
5978  			 unsigned int size, struct kmem_cache_args *args,
5979  			 slab_flags_t flags)
5980  {
5981  	int err = -EINVAL;
5982  
5983  	s->name = name;
5984  	s->size = s->object_size = size;
5985  
5986  	s->flags = kmem_cache_flags(flags, s->name);
5987  #ifdef CONFIG_SLAB_FREELIST_HARDENED
5988  	s->random = get_random_long();
5989  #endif
5990  	s->align = args->align;
5991  	s->ctor = args->ctor;
5992  #ifdef CONFIG_HARDENED_USERCOPY
5993  	s->useroffset = args->useroffset;
5994  	s->usersize = args->usersize;
5995  #endif
5996  
5997  	if (!calculate_sizes(args, s))
5998  		goto out;
5999  	if (disable_higher_order_debug) {
6000  		/*
6001  		 * Disable debugging flags that store metadata if the min slab
6002  		 * order increased.
6003  		 */
6004  		if (get_order(s->size) > get_order(s->object_size)) {
6005  			s->flags &= ~DEBUG_METADATA_FLAGS;
6006  			s->offset = 0;
6007  			if (!calculate_sizes(args, s))
6008  				goto out;
6009  		}
6010  	}
6011  
6012  #ifdef system_has_freelist_aba
6013  	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
6014  		/* Enable fast mode */
6015  		s->flags |= __CMPXCHG_DOUBLE;
6016  	}
6017  #endif
6018  
6019  	/*
6020  	 * The larger the object size is, the more slabs we want on the partial
6021  	 * list to avoid pounding the page allocator excessively.
6022  	 */
6023  	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
6024  	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
6025  
6026  	set_cpu_partial(s);
6027  
6028  #ifdef CONFIG_NUMA
6029  	s->remote_node_defrag_ratio = 1000;
6030  #endif
6031  
6032  	/* Initialize the pre-computed randomized freelist if slab is up */
6033  	if (slab_state >= UP) {
6034  		if (init_cache_random_seq(s))
6035  			goto out;
6036  	}
6037  
6038  	if (!init_kmem_cache_nodes(s))
6039  		goto out;
6040  
6041  	if (!alloc_kmem_cache_cpus(s))
6042  		goto out;
6043  
6044  	/* Mutex is not taken during early boot */
6045  	if (slab_state <= UP) {
6046  		err = 0;
6047  		goto out;
6048  	}
6049  
6050  	err = sysfs_slab_add(s);
6051  	if (err)
6052  		goto out;
6053  
6054  	if (s->flags & SLAB_STORE_USER)
6055  		debugfs_slab_add(s);
6056  
6057  out:
6058  	if (err)
6059  		__kmem_cache_release(s);
6060  	return err;
6061  }
6062  
6063  #ifdef SLAB_SUPPORTS_SYSFS
6064  static int count_inuse(struct slab *slab)
6065  {
6066  	return slab->inuse;
6067  }
6068  
6069  static int count_total(struct slab *slab)
6070  {
6071  	return slab->objects;
6072  }
6073  #endif
6074  
6075  #ifdef CONFIG_SLUB_DEBUG
6076  static void validate_slab(struct kmem_cache *s, struct slab *slab,
6077  			  unsigned long *obj_map)
6078  {
6079  	void *p;
6080  	void *addr = slab_address(slab);
6081  
6082  	if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
6083  		return;
6084  
6085  	/* Now we know that a valid freelist exists */
6086  	__fill_map(obj_map, s, slab);
6087  	for_each_object(p, s, addr, slab->objects) {
6088  		u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
6089  			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
6090  
6091  		if (!check_object(s, slab, p, val))
6092  			break;
6093  	}
6094  }
6095  
6096  static int validate_slab_node(struct kmem_cache *s,
6097  		struct kmem_cache_node *n, unsigned long *obj_map)
6098  {
6099  	unsigned long count = 0;
6100  	struct slab *slab;
6101  	unsigned long flags;
6102  
6103  	spin_lock_irqsave(&n->list_lock, flags);
6104  
6105  	list_for_each_entry(slab, &n->partial, slab_list) {
6106  		validate_slab(s, slab, obj_map);
6107  		count++;
6108  	}
6109  	if (count != n->nr_partial) {
6110  		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
6111  		       s->name, count, n->nr_partial);
6112  		slab_add_kunit_errors();
6113  	}
6114  
6115  	if (!(s->flags & SLAB_STORE_USER))
6116  		goto out;
6117  
6118  	list_for_each_entry(slab, &n->full, slab_list) {
6119  		validate_slab(s, slab, obj_map);
6120  		count++;
6121  	}
6122  	if (count != node_nr_slabs(n)) {
6123  		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
6124  		       s->name, count, node_nr_slabs(n));
6125  		slab_add_kunit_errors();
6126  	}
6127  
6128  out:
6129  	spin_unlock_irqrestore(&n->list_lock, flags);
6130  	return count;
6131  }
6132  
6133  long validate_slab_cache(struct kmem_cache *s)
6134  {
6135  	int node;
6136  	unsigned long count = 0;
6137  	struct kmem_cache_node *n;
6138  	unsigned long *obj_map;
6139  
6140  	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
6141  	if (!obj_map)
6142  		return -ENOMEM;
6143  
6144  	flush_all(s);
6145  	for_each_kmem_cache_node(s, node, n)
6146  		count += validate_slab_node(s, n, obj_map);
6147  
6148  	bitmap_free(obj_map);
6149  
6150  	return count;
6151  }
6152  EXPORT_SYMBOL(validate_slab_cache);
6153  
6154  #ifdef CONFIG_DEBUG_FS
6155  /*
6156   * Generate lists of code addresses where slabcache objects are allocated
6157   * and freed.
6158   */
6159  
6160  struct location {
6161  	depot_stack_handle_t handle;
6162  	unsigned long count;
6163  	unsigned long addr;
6164  	unsigned long waste;
6165  	long long sum_time;
6166  	long min_time;
6167  	long max_time;
6168  	long min_pid;
6169  	long max_pid;
6170  	DECLARE_BITMAP(cpus, NR_CPUS);
6171  	nodemask_t nodes;
6172  };
6173  
6174  struct loc_track {
6175  	unsigned long max;
6176  	unsigned long count;
6177  	struct location *loc;
6178  	loff_t idx;
6179  };
6180  
6181  static struct dentry *slab_debugfs_root;
6182  
6183  static void free_loc_track(struct loc_track *t)
6184  {
6185  	if (t->max)
6186  		free_pages((unsigned long)t->loc,
6187  			get_order(sizeof(struct location) * t->max));
6188  }
6189  
6190  static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
6191  {
6192  	struct location *l;
6193  	int order;
6194  
6195  	order = get_order(sizeof(struct location) * max);
6196  
6197  	l = (void *)__get_free_pages(flags, order);
6198  	if (!l)
6199  		return 0;
6200  
6201  	if (t->count) {
6202  		memcpy(l, t->loc, sizeof(struct location) * t->count);
6203  		free_loc_track(t);
6204  	}
6205  	t->max = max;
6206  	t->loc = l;
6207  	return 1;
6208  }
6209  
6210  static int add_location(struct loc_track *t, struct kmem_cache *s,
6211  				const struct track *track,
6212  				unsigned int orig_size)
6213  {
6214  	long start, end, pos;
6215  	struct location *l;
6216  	unsigned long caddr, chandle, cwaste;
6217  	unsigned long age = jiffies - track->when;
6218  	depot_stack_handle_t handle = 0;
6219  	unsigned int waste = s->object_size - orig_size;
6220  
6221  #ifdef CONFIG_STACKDEPOT
6222  	handle = READ_ONCE(track->handle);
6223  #endif
6224  	start = -1;
6225  	end = t->count;
6226  
6227  	for ( ; ; ) {
6228  		pos = start + (end - start + 1) / 2;
6229  
6230  		/*
6231  		 * There is nothing at "end". If we end up there
6232  		 * we need to add something to before end.
6233  		 */
6234  		if (pos == end)
6235  			break;
6236  
6237  		l = &t->loc[pos];
6238  		caddr = l->addr;
6239  		chandle = l->handle;
6240  		cwaste = l->waste;
6241  		if ((track->addr == caddr) && (handle == chandle) &&
6242  			(waste == cwaste)) {
6243  
6244  			l->count++;
6245  			if (track->when) {
6246  				l->sum_time += age;
6247  				if (age < l->min_time)
6248  					l->min_time = age;
6249  				if (age > l->max_time)
6250  					l->max_time = age;
6251  
6252  				if (track->pid < l->min_pid)
6253  					l->min_pid = track->pid;
6254  				if (track->pid > l->max_pid)
6255  					l->max_pid = track->pid;
6256  
6257  				cpumask_set_cpu(track->cpu,
6258  						to_cpumask(l->cpus));
6259  			}
6260  			node_set(page_to_nid(virt_to_page(track)), l->nodes);
6261  			return 1;
6262  		}
6263  
6264  		if (track->addr < caddr)
6265  			end = pos;
6266  		else if (track->addr == caddr && handle < chandle)
6267  			end = pos;
6268  		else if (track->addr == caddr && handle == chandle &&
6269  				waste < cwaste)
6270  			end = pos;
6271  		else
6272  			start = pos;
6273  	}
6274  
6275  	/*
6276  	 * Not found. Insert new tracking element.
6277  	 */
6278  	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
6279  		return 0;
6280  
6281  	l = t->loc + pos;
6282  	if (pos < t->count)
6283  		memmove(l + 1, l,
6284  			(t->count - pos) * sizeof(struct location));
6285  	t->count++;
6286  	l->count = 1;
6287  	l->addr = track->addr;
6288  	l->sum_time = age;
6289  	l->min_time = age;
6290  	l->max_time = age;
6291  	l->min_pid = track->pid;
6292  	l->max_pid = track->pid;
6293  	l->handle = handle;
6294  	l->waste = waste;
6295  	cpumask_clear(to_cpumask(l->cpus));
6296  	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
6297  	nodes_clear(l->nodes);
6298  	node_set(page_to_nid(virt_to_page(track)), l->nodes);
6299  	return 1;
6300  }
6301  
6302  static void process_slab(struct loc_track *t, struct kmem_cache *s,
6303  		struct slab *slab, enum track_item alloc,
6304  		unsigned long *obj_map)
6305  {
6306  	void *addr = slab_address(slab);
6307  	bool is_alloc = (alloc == TRACK_ALLOC);
6308  	void *p;
6309  
6310  	__fill_map(obj_map, s, slab);
6311  
6312  	for_each_object(p, s, addr, slab->objects)
6313  		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
6314  			add_location(t, s, get_track(s, p, alloc),
6315  				     is_alloc ? get_orig_size(s, p) :
6316  						s->object_size);
6317  }
6318  #endif  /* CONFIG_DEBUG_FS   */
6319  #endif	/* CONFIG_SLUB_DEBUG */
6320  
6321  #ifdef SLAB_SUPPORTS_SYSFS
6322  enum slab_stat_type {
6323  	SL_ALL,			/* All slabs */
6324  	SL_PARTIAL,		/* Only partially allocated slabs */
6325  	SL_CPU,			/* Only slabs used for cpu caches */
6326  	SL_OBJECTS,		/* Determine allocated objects not slabs */
6327  	SL_TOTAL		/* Determine object capacity not slabs */
6328  };
6329  
6330  #define SO_ALL		(1 << SL_ALL)
6331  #define SO_PARTIAL	(1 << SL_PARTIAL)
6332  #define SO_CPU		(1 << SL_CPU)
6333  #define SO_OBJECTS	(1 << SL_OBJECTS)
6334  #define SO_TOTAL	(1 << SL_TOTAL)
6335  
6336  static ssize_t show_slab_objects(struct kmem_cache *s,
6337  				 char *buf, unsigned long flags)
6338  {
6339  	unsigned long total = 0;
6340  	int node;
6341  	int x;
6342  	unsigned long *nodes;
6343  	int len = 0;
6344  
6345  	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
6346  	if (!nodes)
6347  		return -ENOMEM;
6348  
6349  	if (flags & SO_CPU) {
6350  		int cpu;
6351  
6352  		for_each_possible_cpu(cpu) {
6353  			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
6354  							       cpu);
6355  			int node;
6356  			struct slab *slab;
6357  
6358  			slab = READ_ONCE(c->slab);
6359  			if (!slab)
6360  				continue;
6361  
6362  			node = slab_nid(slab);
6363  			if (flags & SO_TOTAL)
6364  				x = slab->objects;
6365  			else if (flags & SO_OBJECTS)
6366  				x = slab->inuse;
6367  			else
6368  				x = 1;
6369  
6370  			total += x;
6371  			nodes[node] += x;
6372  
6373  #ifdef CONFIG_SLUB_CPU_PARTIAL
6374  			slab = slub_percpu_partial_read_once(c);
6375  			if (slab) {
6376  				node = slab_nid(slab);
6377  				if (flags & SO_TOTAL)
6378  					WARN_ON_ONCE(1);
6379  				else if (flags & SO_OBJECTS)
6380  					WARN_ON_ONCE(1);
6381  				else
6382  					x = data_race(slab->slabs);
6383  				total += x;
6384  				nodes[node] += x;
6385  			}
6386  #endif
6387  		}
6388  	}
6389  
6390  	/*
6391  	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
6392  	 * already held which will conflict with an existing lock order:
6393  	 *
6394  	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
6395  	 *
6396  	 * We don't really need mem_hotplug_lock (to hold off
6397  	 * slab_mem_going_offline_callback) here because slab's memory hot
6398  	 * unplug code doesn't destroy the kmem_cache->node[] data.
6399  	 */
6400  
6401  #ifdef CONFIG_SLUB_DEBUG
6402  	if (flags & SO_ALL) {
6403  		struct kmem_cache_node *n;
6404  
6405  		for_each_kmem_cache_node(s, node, n) {
6406  
6407  			if (flags & SO_TOTAL)
6408  				x = node_nr_objs(n);
6409  			else if (flags & SO_OBJECTS)
6410  				x = node_nr_objs(n) - count_partial(n, count_free);
6411  			else
6412  				x = node_nr_slabs(n);
6413  			total += x;
6414  			nodes[node] += x;
6415  		}
6416  
6417  	} else
6418  #endif
6419  	if (flags & SO_PARTIAL) {
6420  		struct kmem_cache_node *n;
6421  
6422  		for_each_kmem_cache_node(s, node, n) {
6423  			if (flags & SO_TOTAL)
6424  				x = count_partial(n, count_total);
6425  			else if (flags & SO_OBJECTS)
6426  				x = count_partial(n, count_inuse);
6427  			else
6428  				x = n->nr_partial;
6429  			total += x;
6430  			nodes[node] += x;
6431  		}
6432  	}
6433  
6434  	len += sysfs_emit_at(buf, len, "%lu", total);
6435  #ifdef CONFIG_NUMA
6436  	for (node = 0; node < nr_node_ids; node++) {
6437  		if (nodes[node])
6438  			len += sysfs_emit_at(buf, len, " N%d=%lu",
6439  					     node, nodes[node]);
6440  	}
6441  #endif
6442  	len += sysfs_emit_at(buf, len, "\n");
6443  	kfree(nodes);
6444  
6445  	return len;
6446  }
6447  
6448  #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
6449  #define to_slab(n) container_of(n, struct kmem_cache, kobj)
6450  
6451  struct slab_attribute {
6452  	struct attribute attr;
6453  	ssize_t (*show)(struct kmem_cache *s, char *buf);
6454  	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
6455  };
6456  
6457  #define SLAB_ATTR_RO(_name) \
6458  	static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
6459  
6460  #define SLAB_ATTR(_name) \
6461  	static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
6462  
6463  static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
6464  {
6465  	return sysfs_emit(buf, "%u\n", s->size);
6466  }
6467  SLAB_ATTR_RO(slab_size);
6468  
6469  static ssize_t align_show(struct kmem_cache *s, char *buf)
6470  {
6471  	return sysfs_emit(buf, "%u\n", s->align);
6472  }
6473  SLAB_ATTR_RO(align);
6474  
6475  static ssize_t object_size_show(struct kmem_cache *s, char *buf)
6476  {
6477  	return sysfs_emit(buf, "%u\n", s->object_size);
6478  }
6479  SLAB_ATTR_RO(object_size);
6480  
6481  static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
6482  {
6483  	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
6484  }
6485  SLAB_ATTR_RO(objs_per_slab);
6486  
6487  static ssize_t order_show(struct kmem_cache *s, char *buf)
6488  {
6489  	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
6490  }
6491  SLAB_ATTR_RO(order);
6492  
6493  static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
6494  {
6495  	return sysfs_emit(buf, "%lu\n", s->min_partial);
6496  }
6497  
6498  static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
6499  				 size_t length)
6500  {
6501  	unsigned long min;
6502  	int err;
6503  
6504  	err = kstrtoul(buf, 10, &min);
6505  	if (err)
6506  		return err;
6507  
6508  	s->min_partial = min;
6509  	return length;
6510  }
6511  SLAB_ATTR(min_partial);
6512  
6513  static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
6514  {
6515  	unsigned int nr_partial = 0;
6516  #ifdef CONFIG_SLUB_CPU_PARTIAL
6517  	nr_partial = s->cpu_partial;
6518  #endif
6519  
6520  	return sysfs_emit(buf, "%u\n", nr_partial);
6521  }
6522  
6523  static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
6524  				 size_t length)
6525  {
6526  	unsigned int objects;
6527  	int err;
6528  
6529  	err = kstrtouint(buf, 10, &objects);
6530  	if (err)
6531  		return err;
6532  	if (objects && !kmem_cache_has_cpu_partial(s))
6533  		return -EINVAL;
6534  
6535  	slub_set_cpu_partial(s, objects);
6536  	flush_all(s);
6537  	return length;
6538  }
6539  SLAB_ATTR(cpu_partial);
6540  
6541  static ssize_t ctor_show(struct kmem_cache *s, char *buf)
6542  {
6543  	if (!s->ctor)
6544  		return 0;
6545  	return sysfs_emit(buf, "%pS\n", s->ctor);
6546  }
6547  SLAB_ATTR_RO(ctor);
6548  
6549  static ssize_t aliases_show(struct kmem_cache *s, char *buf)
6550  {
6551  	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
6552  }
6553  SLAB_ATTR_RO(aliases);
6554  
6555  static ssize_t partial_show(struct kmem_cache *s, char *buf)
6556  {
6557  	return show_slab_objects(s, buf, SO_PARTIAL);
6558  }
6559  SLAB_ATTR_RO(partial);
6560  
6561  static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
6562  {
6563  	return show_slab_objects(s, buf, SO_CPU);
6564  }
6565  SLAB_ATTR_RO(cpu_slabs);
6566  
6567  static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
6568  {
6569  	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
6570  }
6571  SLAB_ATTR_RO(objects_partial);
6572  
6573  static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
6574  {
6575  	int objects = 0;
6576  	int slabs = 0;
6577  	int cpu __maybe_unused;
6578  	int len = 0;
6579  
6580  #ifdef CONFIG_SLUB_CPU_PARTIAL
6581  	for_each_online_cpu(cpu) {
6582  		struct slab *slab;
6583  
6584  		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
6585  
6586  		if (slab)
6587  			slabs += data_race(slab->slabs);
6588  	}
6589  #endif
6590  
6591  	/* Approximate half-full slabs, see slub_set_cpu_partial() */
6592  	objects = (slabs * oo_objects(s->oo)) / 2;
6593  	len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
6594  
6595  #ifdef CONFIG_SLUB_CPU_PARTIAL
6596  	for_each_online_cpu(cpu) {
6597  		struct slab *slab;
6598  
6599  		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
6600  		if (slab) {
6601  			slabs = data_race(slab->slabs);
6602  			objects = (slabs * oo_objects(s->oo)) / 2;
6603  			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
6604  					     cpu, objects, slabs);
6605  		}
6606  	}
6607  #endif
6608  	len += sysfs_emit_at(buf, len, "\n");
6609  
6610  	return len;
6611  }
6612  SLAB_ATTR_RO(slabs_cpu_partial);
6613  
6614  static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
6615  {
6616  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
6617  }
6618  SLAB_ATTR_RO(reclaim_account);
6619  
6620  static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
6621  {
6622  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
6623  }
6624  SLAB_ATTR_RO(hwcache_align);
6625  
6626  #ifdef CONFIG_ZONE_DMA
6627  static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
6628  {
6629  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
6630  }
6631  SLAB_ATTR_RO(cache_dma);
6632  #endif
6633  
6634  #ifdef CONFIG_HARDENED_USERCOPY
6635  static ssize_t usersize_show(struct kmem_cache *s, char *buf)
6636  {
6637  	return sysfs_emit(buf, "%u\n", s->usersize);
6638  }
6639  SLAB_ATTR_RO(usersize);
6640  #endif
6641  
6642  static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
6643  {
6644  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
6645  }
6646  SLAB_ATTR_RO(destroy_by_rcu);
6647  
6648  #ifdef CONFIG_SLUB_DEBUG
6649  static ssize_t slabs_show(struct kmem_cache *s, char *buf)
6650  {
6651  	return show_slab_objects(s, buf, SO_ALL);
6652  }
6653  SLAB_ATTR_RO(slabs);
6654  
6655  static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
6656  {
6657  	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
6658  }
6659  SLAB_ATTR_RO(total_objects);
6660  
6661  static ssize_t objects_show(struct kmem_cache *s, char *buf)
6662  {
6663  	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
6664  }
6665  SLAB_ATTR_RO(objects);
6666  
6667  static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
6668  {
6669  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
6670  }
6671  SLAB_ATTR_RO(sanity_checks);
6672  
6673  static ssize_t trace_show(struct kmem_cache *s, char *buf)
6674  {
6675  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
6676  }
6677  SLAB_ATTR_RO(trace);
6678  
6679  static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
6680  {
6681  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
6682  }
6683  
6684  SLAB_ATTR_RO(red_zone);
6685  
6686  static ssize_t poison_show(struct kmem_cache *s, char *buf)
6687  {
6688  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
6689  }
6690  
6691  SLAB_ATTR_RO(poison);
6692  
6693  static ssize_t store_user_show(struct kmem_cache *s, char *buf)
6694  {
6695  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
6696  }
6697  
6698  SLAB_ATTR_RO(store_user);
6699  
6700  static ssize_t validate_show(struct kmem_cache *s, char *buf)
6701  {
6702  	return 0;
6703  }
6704  
6705  static ssize_t validate_store(struct kmem_cache *s,
6706  			const char *buf, size_t length)
6707  {
6708  	int ret = -EINVAL;
6709  
6710  	if (buf[0] == '1' && kmem_cache_debug(s)) {
6711  		ret = validate_slab_cache(s);
6712  		if (ret >= 0)
6713  			ret = length;
6714  	}
6715  	return ret;
6716  }
6717  SLAB_ATTR(validate);
6718  
6719  #endif /* CONFIG_SLUB_DEBUG */
6720  
6721  #ifdef CONFIG_FAILSLAB
6722  static ssize_t failslab_show(struct kmem_cache *s, char *buf)
6723  {
6724  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
6725  }
6726  
6727  static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
6728  				size_t length)
6729  {
6730  	if (s->refcount > 1)
6731  		return -EINVAL;
6732  
6733  	if (buf[0] == '1')
6734  		WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
6735  	else
6736  		WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
6737  
6738  	return length;
6739  }
6740  SLAB_ATTR(failslab);
6741  #endif
6742  
6743  static ssize_t shrink_show(struct kmem_cache *s, char *buf)
6744  {
6745  	return 0;
6746  }
6747  
6748  static ssize_t shrink_store(struct kmem_cache *s,
6749  			const char *buf, size_t length)
6750  {
6751  	if (buf[0] == '1')
6752  		kmem_cache_shrink(s);
6753  	else
6754  		return -EINVAL;
6755  	return length;
6756  }
6757  SLAB_ATTR(shrink);
6758  
6759  #ifdef CONFIG_NUMA
6760  static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
6761  {
6762  	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
6763  }
6764  
6765  static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
6766  				const char *buf, size_t length)
6767  {
6768  	unsigned int ratio;
6769  	int err;
6770  
6771  	err = kstrtouint(buf, 10, &ratio);
6772  	if (err)
6773  		return err;
6774  	if (ratio > 100)
6775  		return -ERANGE;
6776  
6777  	s->remote_node_defrag_ratio = ratio * 10;
6778  
6779  	return length;
6780  }
6781  SLAB_ATTR(remote_node_defrag_ratio);
6782  #endif
6783  
6784  #ifdef CONFIG_SLUB_STATS
6785  static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
6786  {
6787  	unsigned long sum  = 0;
6788  	int cpu;
6789  	int len = 0;
6790  	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
6791  
6792  	if (!data)
6793  		return -ENOMEM;
6794  
6795  	for_each_online_cpu(cpu) {
6796  		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
6797  
6798  		data[cpu] = x;
6799  		sum += x;
6800  	}
6801  
6802  	len += sysfs_emit_at(buf, len, "%lu", sum);
6803  
6804  #ifdef CONFIG_SMP
6805  	for_each_online_cpu(cpu) {
6806  		if (data[cpu])
6807  			len += sysfs_emit_at(buf, len, " C%d=%u",
6808  					     cpu, data[cpu]);
6809  	}
6810  #endif
6811  	kfree(data);
6812  	len += sysfs_emit_at(buf, len, "\n");
6813  
6814  	return len;
6815  }
6816  
6817  static void clear_stat(struct kmem_cache *s, enum stat_item si)
6818  {
6819  	int cpu;
6820  
6821  	for_each_online_cpu(cpu)
6822  		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
6823  }
6824  
6825  #define STAT_ATTR(si, text) 					\
6826  static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
6827  {								\
6828  	return show_stat(s, buf, si);				\
6829  }								\
6830  static ssize_t text##_store(struct kmem_cache *s,		\
6831  				const char *buf, size_t length)	\
6832  {								\
6833  	if (buf[0] != '0')					\
6834  		return -EINVAL;					\
6835  	clear_stat(s, si);					\
6836  	return length;						\
6837  }								\
6838  SLAB_ATTR(text);						\
6839  
6840  STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
6841  STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
6842  STAT_ATTR(FREE_FASTPATH, free_fastpath);
6843  STAT_ATTR(FREE_SLOWPATH, free_slowpath);
6844  STAT_ATTR(FREE_FROZEN, free_frozen);
6845  STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
6846  STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
6847  STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
6848  STAT_ATTR(ALLOC_SLAB, alloc_slab);
6849  STAT_ATTR(ALLOC_REFILL, alloc_refill);
6850  STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
6851  STAT_ATTR(FREE_SLAB, free_slab);
6852  STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
6853  STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
6854  STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
6855  STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
6856  STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
6857  STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
6858  STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
6859  STAT_ATTR(ORDER_FALLBACK, order_fallback);
6860  STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
6861  STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
6862  STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
6863  STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
6864  STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
6865  STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
6866  #endif	/* CONFIG_SLUB_STATS */
6867  
6868  #ifdef CONFIG_KFENCE
6869  static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
6870  {
6871  	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
6872  }
6873  
6874  static ssize_t skip_kfence_store(struct kmem_cache *s,
6875  			const char *buf, size_t length)
6876  {
6877  	int ret = length;
6878  
6879  	if (buf[0] == '0')
6880  		s->flags &= ~SLAB_SKIP_KFENCE;
6881  	else if (buf[0] == '1')
6882  		s->flags |= SLAB_SKIP_KFENCE;
6883  	else
6884  		ret = -EINVAL;
6885  
6886  	return ret;
6887  }
6888  SLAB_ATTR(skip_kfence);
6889  #endif
6890  
6891  static struct attribute *slab_attrs[] = {
6892  	&slab_size_attr.attr,
6893  	&object_size_attr.attr,
6894  	&objs_per_slab_attr.attr,
6895  	&order_attr.attr,
6896  	&min_partial_attr.attr,
6897  	&cpu_partial_attr.attr,
6898  	&objects_partial_attr.attr,
6899  	&partial_attr.attr,
6900  	&cpu_slabs_attr.attr,
6901  	&ctor_attr.attr,
6902  	&aliases_attr.attr,
6903  	&align_attr.attr,
6904  	&hwcache_align_attr.attr,
6905  	&reclaim_account_attr.attr,
6906  	&destroy_by_rcu_attr.attr,
6907  	&shrink_attr.attr,
6908  	&slabs_cpu_partial_attr.attr,
6909  #ifdef CONFIG_SLUB_DEBUG
6910  	&total_objects_attr.attr,
6911  	&objects_attr.attr,
6912  	&slabs_attr.attr,
6913  	&sanity_checks_attr.attr,
6914  	&trace_attr.attr,
6915  	&red_zone_attr.attr,
6916  	&poison_attr.attr,
6917  	&store_user_attr.attr,
6918  	&validate_attr.attr,
6919  #endif
6920  #ifdef CONFIG_ZONE_DMA
6921  	&cache_dma_attr.attr,
6922  #endif
6923  #ifdef CONFIG_NUMA
6924  	&remote_node_defrag_ratio_attr.attr,
6925  #endif
6926  #ifdef CONFIG_SLUB_STATS
6927  	&alloc_fastpath_attr.attr,
6928  	&alloc_slowpath_attr.attr,
6929  	&free_fastpath_attr.attr,
6930  	&free_slowpath_attr.attr,
6931  	&free_frozen_attr.attr,
6932  	&free_add_partial_attr.attr,
6933  	&free_remove_partial_attr.attr,
6934  	&alloc_from_partial_attr.attr,
6935  	&alloc_slab_attr.attr,
6936  	&alloc_refill_attr.attr,
6937  	&alloc_node_mismatch_attr.attr,
6938  	&free_slab_attr.attr,
6939  	&cpuslab_flush_attr.attr,
6940  	&deactivate_full_attr.attr,
6941  	&deactivate_empty_attr.attr,
6942  	&deactivate_to_head_attr.attr,
6943  	&deactivate_to_tail_attr.attr,
6944  	&deactivate_remote_frees_attr.attr,
6945  	&deactivate_bypass_attr.attr,
6946  	&order_fallback_attr.attr,
6947  	&cmpxchg_double_fail_attr.attr,
6948  	&cmpxchg_double_cpu_fail_attr.attr,
6949  	&cpu_partial_alloc_attr.attr,
6950  	&cpu_partial_free_attr.attr,
6951  	&cpu_partial_node_attr.attr,
6952  	&cpu_partial_drain_attr.attr,
6953  #endif
6954  #ifdef CONFIG_FAILSLAB
6955  	&failslab_attr.attr,
6956  #endif
6957  #ifdef CONFIG_HARDENED_USERCOPY
6958  	&usersize_attr.attr,
6959  #endif
6960  #ifdef CONFIG_KFENCE
6961  	&skip_kfence_attr.attr,
6962  #endif
6963  
6964  	NULL
6965  };
6966  
6967  static const struct attribute_group slab_attr_group = {
6968  	.attrs = slab_attrs,
6969  };
6970  
6971  static ssize_t slab_attr_show(struct kobject *kobj,
6972  				struct attribute *attr,
6973  				char *buf)
6974  {
6975  	struct slab_attribute *attribute;
6976  	struct kmem_cache *s;
6977  
6978  	attribute = to_slab_attr(attr);
6979  	s = to_slab(kobj);
6980  
6981  	if (!attribute->show)
6982  		return -EIO;
6983  
6984  	return attribute->show(s, buf);
6985  }
6986  
6987  static ssize_t slab_attr_store(struct kobject *kobj,
6988  				struct attribute *attr,
6989  				const char *buf, size_t len)
6990  {
6991  	struct slab_attribute *attribute;
6992  	struct kmem_cache *s;
6993  
6994  	attribute = to_slab_attr(attr);
6995  	s = to_slab(kobj);
6996  
6997  	if (!attribute->store)
6998  		return -EIO;
6999  
7000  	return attribute->store(s, buf, len);
7001  }
7002  
7003  static void kmem_cache_release(struct kobject *k)
7004  {
7005  	slab_kmem_cache_release(to_slab(k));
7006  }
7007  
7008  static const struct sysfs_ops slab_sysfs_ops = {
7009  	.show = slab_attr_show,
7010  	.store = slab_attr_store,
7011  };
7012  
7013  static const struct kobj_type slab_ktype = {
7014  	.sysfs_ops = &slab_sysfs_ops,
7015  	.release = kmem_cache_release,
7016  };
7017  
7018  static struct kset *slab_kset;
7019  
7020  static inline struct kset *cache_kset(struct kmem_cache *s)
7021  {
7022  	return slab_kset;
7023  }
7024  
7025  #define ID_STR_LENGTH 32
7026  
7027  /* Create a unique string id for a slab cache:
7028   *
7029   * Format	:[flags-]size
7030   */
7031  static char *create_unique_id(struct kmem_cache *s)
7032  {
7033  	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
7034  	char *p = name;
7035  
7036  	if (!name)
7037  		return ERR_PTR(-ENOMEM);
7038  
7039  	*p++ = ':';
7040  	/*
7041  	 * First flags affecting slabcache operations. We will only
7042  	 * get here for aliasable slabs so we do not need to support
7043  	 * too many flags. The flags here must cover all flags that
7044  	 * are matched during merging to guarantee that the id is
7045  	 * unique.
7046  	 */
7047  	if (s->flags & SLAB_CACHE_DMA)
7048  		*p++ = 'd';
7049  	if (s->flags & SLAB_CACHE_DMA32)
7050  		*p++ = 'D';
7051  	if (s->flags & SLAB_RECLAIM_ACCOUNT)
7052  		*p++ = 'a';
7053  	if (s->flags & SLAB_CONSISTENCY_CHECKS)
7054  		*p++ = 'F';
7055  	if (s->flags & SLAB_ACCOUNT)
7056  		*p++ = 'A';
7057  	if (p != name + 1)
7058  		*p++ = '-';
7059  	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
7060  
7061  	if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
7062  		kfree(name);
7063  		return ERR_PTR(-EINVAL);
7064  	}
7065  	kmsan_unpoison_memory(name, p - name);
7066  	return name;
7067  }
7068  
7069  static int sysfs_slab_add(struct kmem_cache *s)
7070  {
7071  	int err;
7072  	const char *name;
7073  	struct kset *kset = cache_kset(s);
7074  	int unmergeable = slab_unmergeable(s);
7075  
7076  	if (!unmergeable && disable_higher_order_debug &&
7077  			(slub_debug & DEBUG_METADATA_FLAGS))
7078  		unmergeable = 1;
7079  
7080  	if (unmergeable) {
7081  		/*
7082  		 * Slabcache can never be merged so we can use the name proper.
7083  		 * This is typically the case for debug situations. In that
7084  		 * case we can catch duplicate names easily.
7085  		 */
7086  		sysfs_remove_link(&slab_kset->kobj, s->name);
7087  		name = s->name;
7088  	} else {
7089  		/*
7090  		 * Create a unique name for the slab as a target
7091  		 * for the symlinks.
7092  		 */
7093  		name = create_unique_id(s);
7094  		if (IS_ERR(name))
7095  			return PTR_ERR(name);
7096  	}
7097  
7098  	s->kobj.kset = kset;
7099  	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
7100  	if (err)
7101  		goto out;
7102  
7103  	err = sysfs_create_group(&s->kobj, &slab_attr_group);
7104  	if (err)
7105  		goto out_del_kobj;
7106  
7107  	if (!unmergeable) {
7108  		/* Setup first alias */
7109  		sysfs_slab_alias(s, s->name);
7110  	}
7111  out:
7112  	if (!unmergeable)
7113  		kfree(name);
7114  	return err;
7115  out_del_kobj:
7116  	kobject_del(&s->kobj);
7117  	goto out;
7118  }
7119  
7120  void sysfs_slab_unlink(struct kmem_cache *s)
7121  {
7122  	kobject_del(&s->kobj);
7123  }
7124  
7125  void sysfs_slab_release(struct kmem_cache *s)
7126  {
7127  	kobject_put(&s->kobj);
7128  }
7129  
7130  /*
7131   * Need to buffer aliases during bootup until sysfs becomes
7132   * available lest we lose that information.
7133   */
7134  struct saved_alias {
7135  	struct kmem_cache *s;
7136  	const char *name;
7137  	struct saved_alias *next;
7138  };
7139  
7140  static struct saved_alias *alias_list;
7141  
7142  static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
7143  {
7144  	struct saved_alias *al;
7145  
7146  	if (slab_state == FULL) {
7147  		/*
7148  		 * If we have a leftover link then remove it.
7149  		 */
7150  		sysfs_remove_link(&slab_kset->kobj, name);
7151  		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
7152  	}
7153  
7154  	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
7155  	if (!al)
7156  		return -ENOMEM;
7157  
7158  	al->s = s;
7159  	al->name = name;
7160  	al->next = alias_list;
7161  	alias_list = al;
7162  	kmsan_unpoison_memory(al, sizeof(*al));
7163  	return 0;
7164  }
7165  
7166  static int __init slab_sysfs_init(void)
7167  {
7168  	struct kmem_cache *s;
7169  	int err;
7170  
7171  	mutex_lock(&slab_mutex);
7172  
7173  	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
7174  	if (!slab_kset) {
7175  		mutex_unlock(&slab_mutex);
7176  		pr_err("Cannot register slab subsystem.\n");
7177  		return -ENOMEM;
7178  	}
7179  
7180  	slab_state = FULL;
7181  
7182  	list_for_each_entry(s, &slab_caches, list) {
7183  		err = sysfs_slab_add(s);
7184  		if (err)
7185  			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
7186  			       s->name);
7187  	}
7188  
7189  	while (alias_list) {
7190  		struct saved_alias *al = alias_list;
7191  
7192  		alias_list = alias_list->next;
7193  		err = sysfs_slab_alias(al->s, al->name);
7194  		if (err)
7195  			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
7196  			       al->name);
7197  		kfree(al);
7198  	}
7199  
7200  	mutex_unlock(&slab_mutex);
7201  	return 0;
7202  }
7203  late_initcall(slab_sysfs_init);
7204  #endif /* SLAB_SUPPORTS_SYSFS */
7205  
7206  #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
7207  static int slab_debugfs_show(struct seq_file *seq, void *v)
7208  {
7209  	struct loc_track *t = seq->private;
7210  	struct location *l;
7211  	unsigned long idx;
7212  
7213  	idx = (unsigned long) t->idx;
7214  	if (idx < t->count) {
7215  		l = &t->loc[idx];
7216  
7217  		seq_printf(seq, "%7ld ", l->count);
7218  
7219  		if (l->addr)
7220  			seq_printf(seq, "%pS", (void *)l->addr);
7221  		else
7222  			seq_puts(seq, "<not-available>");
7223  
7224  		if (l->waste)
7225  			seq_printf(seq, " waste=%lu/%lu",
7226  				l->count * l->waste, l->waste);
7227  
7228  		if (l->sum_time != l->min_time) {
7229  			seq_printf(seq, " age=%ld/%llu/%ld",
7230  				l->min_time, div_u64(l->sum_time, l->count),
7231  				l->max_time);
7232  		} else
7233  			seq_printf(seq, " age=%ld", l->min_time);
7234  
7235  		if (l->min_pid != l->max_pid)
7236  			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
7237  		else
7238  			seq_printf(seq, " pid=%ld",
7239  				l->min_pid);
7240  
7241  		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
7242  			seq_printf(seq, " cpus=%*pbl",
7243  				 cpumask_pr_args(to_cpumask(l->cpus)));
7244  
7245  		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
7246  			seq_printf(seq, " nodes=%*pbl",
7247  				 nodemask_pr_args(&l->nodes));
7248  
7249  #ifdef CONFIG_STACKDEPOT
7250  		{
7251  			depot_stack_handle_t handle;
7252  			unsigned long *entries;
7253  			unsigned int nr_entries, j;
7254  
7255  			handle = READ_ONCE(l->handle);
7256  			if (handle) {
7257  				nr_entries = stack_depot_fetch(handle, &entries);
7258  				seq_puts(seq, "\n");
7259  				for (j = 0; j < nr_entries; j++)
7260  					seq_printf(seq, "        %pS\n", (void *)entries[j]);
7261  			}
7262  		}
7263  #endif
7264  		seq_puts(seq, "\n");
7265  	}
7266  
7267  	if (!idx && !t->count)
7268  		seq_puts(seq, "No data\n");
7269  
7270  	return 0;
7271  }
7272  
7273  static void slab_debugfs_stop(struct seq_file *seq, void *v)
7274  {
7275  }
7276  
7277  static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
7278  {
7279  	struct loc_track *t = seq->private;
7280  
7281  	t->idx = ++(*ppos);
7282  	if (*ppos <= t->count)
7283  		return ppos;
7284  
7285  	return NULL;
7286  }
7287  
7288  static int cmp_loc_by_count(const void *a, const void *b, const void *data)
7289  {
7290  	struct location *loc1 = (struct location *)a;
7291  	struct location *loc2 = (struct location *)b;
7292  
7293  	if (loc1->count > loc2->count)
7294  		return -1;
7295  	else
7296  		return 1;
7297  }
7298  
7299  static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
7300  {
7301  	struct loc_track *t = seq->private;
7302  
7303  	t->idx = *ppos;
7304  	return ppos;
7305  }
7306  
7307  static const struct seq_operations slab_debugfs_sops = {
7308  	.start  = slab_debugfs_start,
7309  	.next   = slab_debugfs_next,
7310  	.stop   = slab_debugfs_stop,
7311  	.show   = slab_debugfs_show,
7312  };
7313  
7314  static int slab_debug_trace_open(struct inode *inode, struct file *filep)
7315  {
7316  
7317  	struct kmem_cache_node *n;
7318  	enum track_item alloc;
7319  	int node;
7320  	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
7321  						sizeof(struct loc_track));
7322  	struct kmem_cache *s = file_inode(filep)->i_private;
7323  	unsigned long *obj_map;
7324  
7325  	if (!t)
7326  		return -ENOMEM;
7327  
7328  	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
7329  	if (!obj_map) {
7330  		seq_release_private(inode, filep);
7331  		return -ENOMEM;
7332  	}
7333  
7334  	if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
7335  		alloc = TRACK_ALLOC;
7336  	else
7337  		alloc = TRACK_FREE;
7338  
7339  	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
7340  		bitmap_free(obj_map);
7341  		seq_release_private(inode, filep);
7342  		return -ENOMEM;
7343  	}
7344  
7345  	for_each_kmem_cache_node(s, node, n) {
7346  		unsigned long flags;
7347  		struct slab *slab;
7348  
7349  		if (!node_nr_slabs(n))
7350  			continue;
7351  
7352  		spin_lock_irqsave(&n->list_lock, flags);
7353  		list_for_each_entry(slab, &n->partial, slab_list)
7354  			process_slab(t, s, slab, alloc, obj_map);
7355  		list_for_each_entry(slab, &n->full, slab_list)
7356  			process_slab(t, s, slab, alloc, obj_map);
7357  		spin_unlock_irqrestore(&n->list_lock, flags);
7358  	}
7359  
7360  	/* Sort locations by count */
7361  	sort_r(t->loc, t->count, sizeof(struct location),
7362  		cmp_loc_by_count, NULL, NULL);
7363  
7364  	bitmap_free(obj_map);
7365  	return 0;
7366  }
7367  
7368  static int slab_debug_trace_release(struct inode *inode, struct file *file)
7369  {
7370  	struct seq_file *seq = file->private_data;
7371  	struct loc_track *t = seq->private;
7372  
7373  	free_loc_track(t);
7374  	return seq_release_private(inode, file);
7375  }
7376  
7377  static const struct file_operations slab_debugfs_fops = {
7378  	.open    = slab_debug_trace_open,
7379  	.read    = seq_read,
7380  	.llseek  = seq_lseek,
7381  	.release = slab_debug_trace_release,
7382  };
7383  
7384  static void debugfs_slab_add(struct kmem_cache *s)
7385  {
7386  	struct dentry *slab_cache_dir;
7387  
7388  	if (unlikely(!slab_debugfs_root))
7389  		return;
7390  
7391  	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
7392  
7393  	debugfs_create_file("alloc_traces", 0400,
7394  		slab_cache_dir, s, &slab_debugfs_fops);
7395  
7396  	debugfs_create_file("free_traces", 0400,
7397  		slab_cache_dir, s, &slab_debugfs_fops);
7398  }
7399  
7400  void debugfs_slab_release(struct kmem_cache *s)
7401  {
7402  	debugfs_lookup_and_remove(s->name, slab_debugfs_root);
7403  }
7404  
7405  static int __init slab_debugfs_init(void)
7406  {
7407  	struct kmem_cache *s;
7408  
7409  	slab_debugfs_root = debugfs_create_dir("slab", NULL);
7410  
7411  	list_for_each_entry(s, &slab_caches, list)
7412  		if (s->flags & SLAB_STORE_USER)
7413  			debugfs_slab_add(s);
7414  
7415  	return 0;
7416  
7417  }
7418  __initcall(slab_debugfs_init);
7419  #endif
7420  /*
7421   * The /proc/slabinfo ABI
7422   */
7423  #ifdef CONFIG_SLUB_DEBUG
7424  void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
7425  {
7426  	unsigned long nr_slabs = 0;
7427  	unsigned long nr_objs = 0;
7428  	unsigned long nr_free = 0;
7429  	int node;
7430  	struct kmem_cache_node *n;
7431  
7432  	for_each_kmem_cache_node(s, node, n) {
7433  		nr_slabs += node_nr_slabs(n);
7434  		nr_objs += node_nr_objs(n);
7435  		nr_free += count_partial_free_approx(n);
7436  	}
7437  
7438  	sinfo->active_objs = nr_objs - nr_free;
7439  	sinfo->num_objs = nr_objs;
7440  	sinfo->active_slabs = nr_slabs;
7441  	sinfo->num_slabs = nr_slabs;
7442  	sinfo->objects_per_slab = oo_objects(s->oo);
7443  	sinfo->cache_order = oo_order(s->oo);
7444  }
7445  #endif /* CONFIG_SLUB_DEBUG */
7446