1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * SLUB: A slab allocator that limits cache line use instead of queuing 4 * objects in per cpu and per node lists. 5 * 6 * The allocator synchronizes using per slab locks or atomic operations 7 * and only uses a centralized lock to manage a pool of partial slabs. 8 * 9 * (C) 2007 SGI, Christoph Lameter 10 * (C) 2011 Linux Foundation, Christoph Lameter 11 */ 12 13 #include <linux/mm.h> 14 #include <linux/swap.h> /* mm_account_reclaimed_pages() */ 15 #include <linux/module.h> 16 #include <linux/bit_spinlock.h> 17 #include <linux/interrupt.h> 18 #include <linux/swab.h> 19 #include <linux/bitops.h> 20 #include <linux/slab.h> 21 #include "slab.h" 22 #include <linux/proc_fs.h> 23 #include <linux/seq_file.h> 24 #include <linux/kasan.h> 25 #include <linux/kmsan.h> 26 #include <linux/cpu.h> 27 #include <linux/cpuset.h> 28 #include <linux/mempolicy.h> 29 #include <linux/ctype.h> 30 #include <linux/stackdepot.h> 31 #include <linux/debugobjects.h> 32 #include <linux/kallsyms.h> 33 #include <linux/kfence.h> 34 #include <linux/memory.h> 35 #include <linux/math64.h> 36 #include <linux/fault-inject.h> 37 #include <linux/stacktrace.h> 38 #include <linux/prefetch.h> 39 #include <linux/memcontrol.h> 40 #include <linux/random.h> 41 #include <kunit/test.h> 42 #include <kunit/test-bug.h> 43 #include <linux/sort.h> 44 45 #include <linux/debugfs.h> 46 #include <trace/events/kmem.h> 47 48 #include "internal.h" 49 50 /* 51 * Lock order: 52 * 1. slab_mutex (Global Mutex) 53 * 2. node->list_lock (Spinlock) 54 * 3. kmem_cache->cpu_slab->lock (Local lock) 55 * 4. slab_lock(slab) (Only on some arches) 56 * 5. object_map_lock (Only for debugging) 57 * 58 * slab_mutex 59 * 60 * The role of the slab_mutex is to protect the list of all the slabs 61 * and to synchronize major metadata changes to slab cache structures. 62 * Also synchronizes memory hotplug callbacks. 
 *
 *   slab_lock
 *
 *   The slab_lock is a wrapper around the page lock, thus it is a bit
 *   spinlock.
 *
 *   The slab_lock is only used on arches that do not have the ability
 *   to do a cmpxchg_double. It only protects:
 *
 *	A. slab->freelist	-> List of free objects in a slab
 *	B. slab->inuse		-> Number of objects in use
 *	C. slab->objects	-> Number of objects in slab
 *	D. slab->frozen		-> frozen state
 *
 *   Frozen slabs
 *
 *   If a slab is frozen then it is exempt from list management. It is not
 *   on any list except the per cpu partial list. The processor that froze the
 *   slab is the one who can perform list operations on the slab. Other
 *   processors may put objects onto the freelist but the processor that
 *   froze the slab is the only one that can retrieve the objects from the
 *   slab's freelist.
 *
 *   list_lock
 *
 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no new slabs may be added to or
 *   removed from the lists, nor may the number of partial slabs be modified.
 *   (Note that the total number of slabs is an atomic value that may be
 *   modified without taking the list lock).
 *
 *   The list_lock is a centralized lock and thus we avoid taking it as
 *   much as possible. As long as SLUB does not have to handle partial
 *   slabs, operations can continue without any centralized lock. E.g.
 *   allocating a long series of objects that fill up slabs does not require
 *   the list lock.
 *
 *   For debug caches, all allocations are forced to go through a list_lock
 *   protected region to serialize against concurrent validation.
 *
 *   cpu_slab->lock local lock
 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
 *   except the stat counters. This is a percpu structure manipulated only by
 *   the local cpu, so the lock protects against being preempted or interrupted
 *   by an irq.
 *   Fast path operations rely on lockless operations instead.
 *
 *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
 *   which means the lockless fastpath cannot be used as it might interfere with
 *   an in-progress slow path operation. In this case the local lock is always
 *   taken but it still utilizes the freelist for the common operations.
 *
 *   lockless fastpaths
 *
 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
 *   are fully lockless when satisfied from the percpu slab (and when
 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
 *   They also don't disable preemption or migration or irqs. They rely on
 *   the transaction id (tid) field to detect being preempted or moved to
 *   another cpu.
 *
 *   irq, preemption, migration considerations
 *
 *   Interrupts are disabled as part of list_lock or local_lock operations, or
 *   around the slab_lock operation, in order to make the slab allocator safe
 *   to use in the context of an irq.
 *
 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
 *   doesn't have to be revalidated in each section protected by the local lock.
 *
 * SLUB assigns one slab for allocation to each processor.
 * Allocations only occur from these slabs called cpu slabs.
 *
 * Slabs with free elements are kept on a partial list and during regular
 * operations no list for full slabs is used. If an object in a full slab is
 * freed then the slab will show up again on the partial lists.
 * We track full slabs for debugging purposes though because otherwise we
 * cannot scan all objects.
 *
 * Slabs are freed when they become empty.
Teardown and setup is 145 * minimal so we rely on the page allocators per cpu caches for 146 * fast frees and allocs. 147 * 148 * slab->frozen The slab is frozen and exempt from list processing. 149 * This means that the slab is dedicated to a purpose 150 * such as satisfying allocations for a specific 151 * processor. Objects may be freed in the slab while 152 * it is frozen but slab_free will then skip the usual 153 * list operations. It is up to the processor holding 154 * the slab to integrate the slab into the slab lists 155 * when the slab is no longer needed. 156 * 157 * One use of this flag is to mark slabs that are 158 * used for allocations. Then such a slab becomes a cpu 159 * slab. The cpu slab may be equipped with an additional 160 * freelist that allows lockless access to 161 * free objects in addition to the regular freelist 162 * that requires the slab lock. 163 * 164 * SLAB_DEBUG_FLAGS Slab requires special handling due to debug 165 * options set. This moves slab handling out of 166 * the fast path and disables lockless freelists. 167 */ 168 169 /* 170 * We could simply use migrate_disable()/enable() but as long as it's a 171 * function call even on !PREEMPT_RT, use inline preempt_disable() there. 
172 */ 173 #ifndef CONFIG_PREEMPT_RT 174 #define slub_get_cpu_ptr(var) get_cpu_ptr(var) 175 #define slub_put_cpu_ptr(var) put_cpu_ptr(var) 176 #define USE_LOCKLESS_FAST_PATH() (true) 177 #else 178 #define slub_get_cpu_ptr(var) \ 179 ({ \ 180 migrate_disable(); \ 181 this_cpu_ptr(var); \ 182 }) 183 #define slub_put_cpu_ptr(var) \ 184 do { \ 185 (void)(var); \ 186 migrate_enable(); \ 187 } while (0) 188 #define USE_LOCKLESS_FAST_PATH() (false) 189 #endif 190 191 #ifndef CONFIG_SLUB_TINY 192 #define __fastpath_inline __always_inline 193 #else 194 #define __fastpath_inline 195 #endif 196 197 #ifdef CONFIG_SLUB_DEBUG 198 #ifdef CONFIG_SLUB_DEBUG_ON 199 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); 200 #else 201 DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); 202 #endif 203 #endif /* CONFIG_SLUB_DEBUG */ 204 205 /* Structure holding parameters for get_partial() call chain */ 206 struct partial_context { 207 struct slab **slab; 208 gfp_t flags; 209 unsigned int orig_size; 210 }; 211 212 static inline bool kmem_cache_debug(struct kmem_cache *s) 213 { 214 return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); 215 } 216 217 static inline bool slub_debug_orig_size(struct kmem_cache *s) 218 { 219 return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && 220 (s->flags & SLAB_KMALLOC)); 221 } 222 223 void *fixup_red_left(struct kmem_cache *s, void *p) 224 { 225 if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) 226 p += s->red_left_pad; 227 228 return p; 229 } 230 231 static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 232 { 233 #ifdef CONFIG_SLUB_CPU_PARTIAL 234 return !kmem_cache_debug(s); 235 #else 236 return false; 237 #endif 238 } 239 240 /* 241 * Issues still to be resolved: 242 * 243 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 244 * 245 * - Variable sizing of the per node arrays 246 */ 247 248 /* Enable to log cmpxchg failures */ 249 #undef SLUB_DEBUG_CMPXCHG 250 251 #ifndef CONFIG_SLUB_TINY 252 /* 253 * Minimum number of partial slabs. 
These will be left on the partial 254 * lists even if they are empty. kmem_cache_shrink may reclaim them. 255 */ 256 #define MIN_PARTIAL 5 257 258 /* 259 * Maximum number of desirable partial slabs. 260 * The existence of more partial slabs makes kmem_cache_shrink 261 * sort the partial list by the number of objects in use. 262 */ 263 #define MAX_PARTIAL 10 264 #else 265 #define MIN_PARTIAL 0 266 #define MAX_PARTIAL 0 267 #endif 268 269 #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ 270 SLAB_POISON | SLAB_STORE_USER) 271 272 /* 273 * These debug flags cannot use CMPXCHG because there might be consistency 274 * issues when checking or reading debug information 275 */ 276 #define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ 277 SLAB_TRACE) 278 279 280 /* 281 * Debugging flags that require metadata to be stored in the slab. These get 282 * disabled when slub_debug=O is used and a cache's min order increases with 283 * metadata. 284 */ 285 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 286 287 #define OO_SHIFT 16 288 #define OO_MASK ((1 << OO_SHIFT) - 1) 289 #define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */ 290 291 /* Internal SLUB flags */ 292 /* Poison object */ 293 #define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) 294 /* Use cmpxchg_double */ 295 296 #ifdef system_has_freelist_aba 297 #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) 298 #else 299 #define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U) 300 #endif 301 302 /* 303 * Tracking user of a slab. 
304 */ 305 #define TRACK_ADDRS_COUNT 16 306 struct track { 307 unsigned long addr; /* Called from address */ 308 #ifdef CONFIG_STACKDEPOT 309 depot_stack_handle_t handle; 310 #endif 311 int cpu; /* Was running on cpu */ 312 int pid; /* Pid context */ 313 unsigned long when; /* When did the operation occur */ 314 }; 315 316 enum track_item { TRACK_ALLOC, TRACK_FREE }; 317 318 #ifdef SLAB_SUPPORTS_SYSFS 319 static int sysfs_slab_add(struct kmem_cache *); 320 static int sysfs_slab_alias(struct kmem_cache *, const char *); 321 #else 322 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 323 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 324 { return 0; } 325 #endif 326 327 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) 328 static void debugfs_slab_add(struct kmem_cache *); 329 #else 330 static inline void debugfs_slab_add(struct kmem_cache *s) { } 331 #endif 332 333 static inline void stat(const struct kmem_cache *s, enum stat_item si) 334 { 335 #ifdef CONFIG_SLUB_STATS 336 /* 337 * The rmw is racy on a preemptible kernel but this is acceptable, so 338 * avoid this_cpu_add()'s irq-disable overhead. 339 */ 340 raw_cpu_inc(s->cpu_slab->stat[si]); 341 #endif 342 } 343 344 /* 345 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. 346 * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily 347 * differ during memory hotplug/hotremove operations. 348 * Protected by slab_mutex. 349 */ 350 static nodemask_t slab_nodes; 351 352 #ifndef CONFIG_SLUB_TINY 353 /* 354 * Workqueue used for flush_cpu_slab(). 
355 */ 356 static struct workqueue_struct *flushwq; 357 #endif 358 359 /******************************************************************** 360 * Core slab cache functions 361 *******************************************************************/ 362 363 /* 364 * freeptr_t represents a SLUB freelist pointer, which might be encoded 365 * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. 366 */ 367 typedef struct { unsigned long v; } freeptr_t; 368 369 /* 370 * Returns freelist pointer (ptr). With hardening, this is obfuscated 371 * with an XOR of the address where the pointer is held and a per-cache 372 * random number. 373 */ 374 static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s, 375 void *ptr, unsigned long ptr_addr) 376 { 377 unsigned long encoded; 378 379 #ifdef CONFIG_SLAB_FREELIST_HARDENED 380 encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr); 381 #else 382 encoded = (unsigned long)ptr; 383 #endif 384 return (freeptr_t){.v = encoded}; 385 } 386 387 static inline void *freelist_ptr_decode(const struct kmem_cache *s, 388 freeptr_t ptr, unsigned long ptr_addr) 389 { 390 void *decoded; 391 392 #ifdef CONFIG_SLAB_FREELIST_HARDENED 393 decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr)); 394 #else 395 decoded = (void *)ptr.v; 396 #endif 397 return decoded; 398 } 399 400 static inline void *get_freepointer(struct kmem_cache *s, void *object) 401 { 402 unsigned long ptr_addr; 403 freeptr_t p; 404 405 object = kasan_reset_tag(object); 406 ptr_addr = (unsigned long)object + s->offset; 407 p = *(freeptr_t *)(ptr_addr); 408 return freelist_ptr_decode(s, p, ptr_addr); 409 } 410 411 #ifndef CONFIG_SLUB_TINY 412 static void prefetch_freepointer(const struct kmem_cache *s, void *object) 413 { 414 prefetchw(object + s->offset); 415 } 416 #endif 417 418 /* 419 * When running under KMSAN, get_freepointer_safe() may return an uninitialized 420 * pointer value in the case the current thread loses the race for the next 421 * memory 
chunk in the freelist. In that case this_cpu_cmpxchg_double() in 422 * slab_alloc_node() will fail, so the uninitialized value won't be used, but 423 * KMSAN will still check all arguments of cmpxchg because of imperfect 424 * handling of inline assembly. 425 * To work around this problem, we apply __no_kmsan_checks to ensure that 426 * get_freepointer_safe() returns initialized memory. 427 */ 428 __no_kmsan_checks 429 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 430 { 431 unsigned long freepointer_addr; 432 freeptr_t p; 433 434 if (!debug_pagealloc_enabled_static()) 435 return get_freepointer(s, object); 436 437 object = kasan_reset_tag(object); 438 freepointer_addr = (unsigned long)object + s->offset; 439 copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); 440 return freelist_ptr_decode(s, p, freepointer_addr); 441 } 442 443 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 444 { 445 unsigned long freeptr_addr = (unsigned long)object + s->offset; 446 447 #ifdef CONFIG_SLAB_FREELIST_HARDENED 448 BUG_ON(object == fp); /* naive detection of double free or corruption */ 449 #endif 450 451 freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); 452 *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr); 453 } 454 455 /* Loop over all objects in a slab */ 456 #define for_each_object(__p, __s, __addr, __objects) \ 457 for (__p = fixup_red_left(__s, __addr); \ 458 __p < (__addr) + (__objects) * (__s)->size; \ 459 __p += (__s)->size) 460 461 static inline unsigned int order_objects(unsigned int order, unsigned int size) 462 { 463 return ((unsigned int)PAGE_SIZE << order) / size; 464 } 465 466 static inline struct kmem_cache_order_objects oo_make(unsigned int order, 467 unsigned int size) 468 { 469 struct kmem_cache_order_objects x = { 470 (order << OO_SHIFT) + order_objects(order, size) 471 }; 472 473 return x; 474 } 475 476 static inline unsigned int 
oo_order(struct kmem_cache_order_objects x) 477 { 478 return x.x >> OO_SHIFT; 479 } 480 481 static inline unsigned int oo_objects(struct kmem_cache_order_objects x) 482 { 483 return x.x & OO_MASK; 484 } 485 486 #ifdef CONFIG_SLUB_CPU_PARTIAL 487 static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 488 { 489 unsigned int nr_slabs; 490 491 s->cpu_partial = nr_objects; 492 493 /* 494 * We take the number of objects but actually limit the number of 495 * slabs on the per cpu partial list, in order to limit excessive 496 * growth of the list. For simplicity we assume that the slabs will 497 * be half-full. 498 */ 499 nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); 500 s->cpu_partial_slabs = nr_slabs; 501 } 502 #else 503 static inline void 504 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 505 { 506 } 507 #endif /* CONFIG_SLUB_CPU_PARTIAL */ 508 509 /* 510 * Per slab locking using the pagelock 511 */ 512 static __always_inline void slab_lock(struct slab *slab) 513 { 514 struct page *page = slab_page(slab); 515 516 VM_BUG_ON_PAGE(PageTail(page), page); 517 bit_spin_lock(PG_locked, &page->flags); 518 } 519 520 static __always_inline void slab_unlock(struct slab *slab) 521 { 522 struct page *page = slab_page(slab); 523 524 VM_BUG_ON_PAGE(PageTail(page), page); 525 __bit_spin_unlock(PG_locked, &page->flags); 526 } 527 528 static inline bool 529 __update_freelist_fast(struct slab *slab, 530 void *freelist_old, unsigned long counters_old, 531 void *freelist_new, unsigned long counters_new) 532 { 533 #ifdef system_has_freelist_aba 534 freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; 535 freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; 536 537 return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); 538 #else 539 return false; 540 #endif 541 } 542 543 static inline bool 544 __update_freelist_slow(struct slab *slab, 545 void *freelist_old, 
unsigned long counters_old, 546 void *freelist_new, unsigned long counters_new) 547 { 548 bool ret = false; 549 550 slab_lock(slab); 551 if (slab->freelist == freelist_old && 552 slab->counters == counters_old) { 553 slab->freelist = freelist_new; 554 slab->counters = counters_new; 555 ret = true; 556 } 557 slab_unlock(slab); 558 559 return ret; 560 } 561 562 /* 563 * Interrupts must be disabled (for the fallback code to work right), typically 564 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is 565 * part of bit_spin_lock(), is sufficient because the policy is not to allow any 566 * allocation/ free operation in hardirq context. Therefore nothing can 567 * interrupt the operation. 568 */ 569 static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, 570 void *freelist_old, unsigned long counters_old, 571 void *freelist_new, unsigned long counters_new, 572 const char *n) 573 { 574 bool ret; 575 576 if (USE_LOCKLESS_FAST_PATH()) 577 lockdep_assert_irqs_disabled(); 578 579 if (s->flags & __CMPXCHG_DOUBLE) { 580 ret = __update_freelist_fast(slab, freelist_old, counters_old, 581 freelist_new, counters_new); 582 } else { 583 ret = __update_freelist_slow(slab, freelist_old, counters_old, 584 freelist_new, counters_new); 585 } 586 if (likely(ret)) 587 return true; 588 589 cpu_relax(); 590 stat(s, CMPXCHG_DOUBLE_FAIL); 591 592 #ifdef SLUB_DEBUG_CMPXCHG 593 pr_info("%s %s: cmpxchg double redo ", n, s->name); 594 #endif 595 596 return false; 597 } 598 599 static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, 600 void *freelist_old, unsigned long counters_old, 601 void *freelist_new, unsigned long counters_new, 602 const char *n) 603 { 604 bool ret; 605 606 if (s->flags & __CMPXCHG_DOUBLE) { 607 ret = __update_freelist_fast(slab, freelist_old, counters_old, 608 freelist_new, counters_new); 609 } else { 610 unsigned long flags; 611 612 local_irq_save(flags); 613 ret = __update_freelist_slow(slab, 
freelist_old, counters_old, 614 freelist_new, counters_new); 615 local_irq_restore(flags); 616 } 617 if (likely(ret)) 618 return true; 619 620 cpu_relax(); 621 stat(s, CMPXCHG_DOUBLE_FAIL); 622 623 #ifdef SLUB_DEBUG_CMPXCHG 624 pr_info("%s %s: cmpxchg double redo ", n, s->name); 625 #endif 626 627 return false; 628 } 629 630 #ifdef CONFIG_SLUB_DEBUG 631 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; 632 static DEFINE_SPINLOCK(object_map_lock); 633 634 static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, 635 struct slab *slab) 636 { 637 void *addr = slab_address(slab); 638 void *p; 639 640 bitmap_zero(obj_map, slab->objects); 641 642 for (p = slab->freelist; p; p = get_freepointer(s, p)) 643 set_bit(__obj_to_index(s, addr, p), obj_map); 644 } 645 646 #if IS_ENABLED(CONFIG_KUNIT) 647 static bool slab_add_kunit_errors(void) 648 { 649 struct kunit_resource *resource; 650 651 if (!kunit_get_current_test()) 652 return false; 653 654 resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); 655 if (!resource) 656 return false; 657 658 (*(int *)resource->data)++; 659 kunit_put_resource(resource); 660 return true; 661 } 662 #else 663 static inline bool slab_add_kunit_errors(void) { return false; } 664 #endif 665 666 static inline unsigned int size_from_object(struct kmem_cache *s) 667 { 668 if (s->flags & SLAB_RED_ZONE) 669 return s->size - s->red_left_pad; 670 671 return s->size; 672 } 673 674 static inline void *restore_red_left(struct kmem_cache *s, void *p) 675 { 676 if (s->flags & SLAB_RED_ZONE) 677 p -= s->red_left_pad; 678 679 return p; 680 } 681 682 /* 683 * Debug settings: 684 */ 685 #if defined(CONFIG_SLUB_DEBUG_ON) 686 static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; 687 #else 688 static slab_flags_t slub_debug; 689 #endif 690 691 static char *slub_debug_string; 692 static int disable_higher_order_debug; 693 694 /* 695 * slub is about to manipulate internal object metadata. 
This memory lies 696 * outside the range of the allocated object, so accessing it would normally 697 * be reported by kasan as a bounds error. metadata_access_enable() is used 698 * to tell kasan that these accesses are OK. 699 */ 700 static inline void metadata_access_enable(void) 701 { 702 kasan_disable_current(); 703 } 704 705 static inline void metadata_access_disable(void) 706 { 707 kasan_enable_current(); 708 } 709 710 /* 711 * Object debugging 712 */ 713 714 /* Verify that a pointer has an address that is valid within a slab page */ 715 static inline int check_valid_pointer(struct kmem_cache *s, 716 struct slab *slab, void *object) 717 { 718 void *base; 719 720 if (!object) 721 return 1; 722 723 base = slab_address(slab); 724 object = kasan_reset_tag(object); 725 object = restore_red_left(s, object); 726 if (object < base || object >= base + slab->objects * s->size || 727 (object - base) % s->size) { 728 return 0; 729 } 730 731 return 1; 732 } 733 734 static void print_section(char *level, char *text, u8 *addr, 735 unsigned int length) 736 { 737 metadata_access_enable(); 738 print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 739 16, 1, kasan_reset_tag((void *)addr), length, 1); 740 metadata_access_disable(); 741 } 742 743 /* 744 * See comment in calculate_sizes(). 745 */ 746 static inline bool freeptr_outside_object(struct kmem_cache *s) 747 { 748 return s->offset >= s->inuse; 749 } 750 751 /* 752 * Return offset of the end of info block which is inuse + free pointer if 753 * not overlapping with object. 
754 */ 755 static inline unsigned int get_info_end(struct kmem_cache *s) 756 { 757 if (freeptr_outside_object(s)) 758 return s->inuse + sizeof(void *); 759 else 760 return s->inuse; 761 } 762 763 static struct track *get_track(struct kmem_cache *s, void *object, 764 enum track_item alloc) 765 { 766 struct track *p; 767 768 p = object + get_info_end(s); 769 770 return kasan_reset_tag(p + alloc); 771 } 772 773 #ifdef CONFIG_STACKDEPOT 774 static noinline depot_stack_handle_t set_track_prepare(void) 775 { 776 depot_stack_handle_t handle; 777 unsigned long entries[TRACK_ADDRS_COUNT]; 778 unsigned int nr_entries; 779 780 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); 781 handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); 782 783 return handle; 784 } 785 #else 786 static inline depot_stack_handle_t set_track_prepare(void) 787 { 788 return 0; 789 } 790 #endif 791 792 static void set_track_update(struct kmem_cache *s, void *object, 793 enum track_item alloc, unsigned long addr, 794 depot_stack_handle_t handle) 795 { 796 struct track *p = get_track(s, object, alloc); 797 798 #ifdef CONFIG_STACKDEPOT 799 p->handle = handle; 800 #endif 801 p->addr = addr; 802 p->cpu = smp_processor_id(); 803 p->pid = current->pid; 804 p->when = jiffies; 805 } 806 807 static __always_inline void set_track(struct kmem_cache *s, void *object, 808 enum track_item alloc, unsigned long addr) 809 { 810 depot_stack_handle_t handle = set_track_prepare(); 811 812 set_track_update(s, object, alloc, addr, handle); 813 } 814 815 static void init_tracking(struct kmem_cache *s, void *object) 816 { 817 struct track *p; 818 819 if (!(s->flags & SLAB_STORE_USER)) 820 return; 821 822 p = get_track(s, object, TRACK_ALLOC); 823 memset(p, 0, 2*sizeof(struct track)); 824 } 825 826 static void print_track(const char *s, struct track *t, unsigned long pr_time) 827 { 828 depot_stack_handle_t handle __maybe_unused; 829 830 if (!t->addr) 831 return; 832 833 pr_err("%s in %pS age=%lu cpu=%u 
pid=%d\n", 834 s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); 835 #ifdef CONFIG_STACKDEPOT 836 handle = READ_ONCE(t->handle); 837 if (handle) 838 stack_depot_print(handle); 839 else 840 pr_err("object allocation/free stack trace missing\n"); 841 #endif 842 } 843 844 void print_tracking(struct kmem_cache *s, void *object) 845 { 846 unsigned long pr_time = jiffies; 847 if (!(s->flags & SLAB_STORE_USER)) 848 return; 849 850 print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time); 851 print_track("Freed", get_track(s, object, TRACK_FREE), pr_time); 852 } 853 854 static void print_slab_info(const struct slab *slab) 855 { 856 struct folio *folio = (struct folio *)slab_folio(slab); 857 858 pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", 859 slab, slab->objects, slab->inuse, slab->freelist, 860 folio_flags(folio, 0)); 861 } 862 863 /* 864 * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API 865 * family will round up the real request size to these fixed ones, so 866 * there could be an extra area than what is requested. Save the original 867 * request size in the meta data area, for better debug and sanity check. 868 */ 869 static inline void set_orig_size(struct kmem_cache *s, 870 void *object, unsigned int orig_size) 871 { 872 void *p = kasan_reset_tag(object); 873 unsigned int kasan_meta_size; 874 875 if (!slub_debug_orig_size(s)) 876 return; 877 878 /* 879 * KASAN can save its free meta data inside of the object at offset 0. 880 * If this meta data size is larger than 'orig_size', it will overlap 881 * the data redzone in [orig_size+1, object_size]. Thus, we adjust 882 * 'orig_size' to be as at least as big as KASAN's meta data. 
883 */ 884 kasan_meta_size = kasan_metadata_size(s, true); 885 if (kasan_meta_size > orig_size) 886 orig_size = kasan_meta_size; 887 888 p += get_info_end(s); 889 p += sizeof(struct track) * 2; 890 891 *(unsigned int *)p = orig_size; 892 } 893 894 static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) 895 { 896 void *p = kasan_reset_tag(object); 897 898 if (!slub_debug_orig_size(s)) 899 return s->object_size; 900 901 p += get_info_end(s); 902 p += sizeof(struct track) * 2; 903 904 return *(unsigned int *)p; 905 } 906 907 void skip_orig_size_check(struct kmem_cache *s, const void *object) 908 { 909 set_orig_size(s, (void *)object, s->object_size); 910 } 911 912 static void slab_bug(struct kmem_cache *s, char *fmt, ...) 913 { 914 struct va_format vaf; 915 va_list args; 916 917 va_start(args, fmt); 918 vaf.fmt = fmt; 919 vaf.va = &args; 920 pr_err("=============================================================================\n"); 921 pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); 922 pr_err("-----------------------------------------------------------------------------\n\n"); 923 va_end(args); 924 } 925 926 __printf(2, 3) 927 static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
928 { 929 struct va_format vaf; 930 va_list args; 931 932 if (slab_add_kunit_errors()) 933 return; 934 935 va_start(args, fmt); 936 vaf.fmt = fmt; 937 vaf.va = &args; 938 pr_err("FIX %s: %pV\n", s->name, &vaf); 939 va_end(args); 940 } 941 942 static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) 943 { 944 unsigned int off; /* Offset of last byte */ 945 u8 *addr = slab_address(slab); 946 947 print_tracking(s, p); 948 949 print_slab_info(slab); 950 951 pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", 952 p, p - addr, get_freepointer(s, p)); 953 954 if (s->flags & SLAB_RED_ZONE) 955 print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, 956 s->red_left_pad); 957 else if (p > addr + 16) 958 print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); 959 960 print_section(KERN_ERR, "Object ", p, 961 min_t(unsigned int, s->object_size, PAGE_SIZE)); 962 if (s->flags & SLAB_RED_ZONE) 963 print_section(KERN_ERR, "Redzone ", p + s->object_size, 964 s->inuse - s->object_size); 965 966 off = get_info_end(s); 967 968 if (s->flags & SLAB_STORE_USER) 969 off += 2 * sizeof(struct track); 970 971 if (slub_debug_orig_size(s)) 972 off += sizeof(unsigned int); 973 974 off += kasan_metadata_size(s, false); 975 976 if (off != size_from_object(s)) 977 /* Beginning of the filler is the free pointer */ 978 print_section(KERN_ERR, "Padding ", p + off, 979 size_from_object(s) - off); 980 981 dump_stack(); 982 } 983 984 static void object_err(struct kmem_cache *s, struct slab *slab, 985 u8 *object, char *reason) 986 { 987 if (slab_add_kunit_errors()) 988 return; 989 990 slab_bug(s, "%s", reason); 991 print_trailer(s, slab, object); 992 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 993 } 994 995 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 996 void **freelist, void *nextfree) 997 { 998 if ((s->flags & SLAB_CONSISTENCY_CHECKS) && 999 !check_valid_pointer(s, slab, nextfree) && freelist) { 1000 object_err(s, slab, *freelist, "Freechain corrupt"); 1001 
*freelist = NULL; 1002 slab_fix(s, "Isolate corrupted freechain"); 1003 return true; 1004 } 1005 1006 return false; 1007 } 1008 1009 static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, 1010 const char *fmt, ...) 1011 { 1012 va_list args; 1013 char buf[100]; 1014 1015 if (slab_add_kunit_errors()) 1016 return; 1017 1018 va_start(args, fmt); 1019 vsnprintf(buf, sizeof(buf), fmt, args); 1020 va_end(args); 1021 slab_bug(s, "%s", buf); 1022 print_slab_info(slab); 1023 dump_stack(); 1024 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 1025 } 1026 1027 static void init_object(struct kmem_cache *s, void *object, u8 val) 1028 { 1029 u8 *p = kasan_reset_tag(object); 1030 unsigned int poison_size = s->object_size; 1031 1032 if (s->flags & SLAB_RED_ZONE) { 1033 memset(p - s->red_left_pad, val, s->red_left_pad); 1034 1035 if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { 1036 /* 1037 * Redzone the extra allocated space by kmalloc than 1038 * requested, and the poison size will be limited to 1039 * the original request size accordingly. 
1040 */ 1041 poison_size = get_orig_size(s, object); 1042 } 1043 } 1044 1045 if (s->flags & __OBJECT_POISON) { 1046 memset(p, POISON_FREE, poison_size - 1); 1047 p[poison_size - 1] = POISON_END; 1048 } 1049 1050 if (s->flags & SLAB_RED_ZONE) 1051 memset(p + poison_size, val, s->inuse - poison_size); 1052 } 1053 1054 static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 1055 void *from, void *to) 1056 { 1057 slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); 1058 memset(from, data, to - from); 1059 } 1060 1061 static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, 1062 u8 *object, char *what, 1063 u8 *start, unsigned int value, unsigned int bytes) 1064 { 1065 u8 *fault; 1066 u8 *end; 1067 u8 *addr = slab_address(slab); 1068 1069 metadata_access_enable(); 1070 fault = memchr_inv(kasan_reset_tag(start), value, bytes); 1071 metadata_access_disable(); 1072 if (!fault) 1073 return 1; 1074 1075 end = start + bytes; 1076 while (end > fault && end[-1] == value) 1077 end--; 1078 1079 if (slab_add_kunit_errors()) 1080 goto skip_bug_print; 1081 1082 slab_bug(s, "%s overwritten", what); 1083 pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", 1084 fault, end - 1, fault - addr, 1085 fault[0], value); 1086 print_trailer(s, slab, object); 1087 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 1088 1089 skip_bug_print: 1090 restore_bytes(s, what, value, fault, end); 1091 return 0; 1092 } 1093 1094 /* 1095 * Object layout: 1096 * 1097 * object address 1098 * Bytes of the object to be managed. 1099 * If the freepointer may overlay the object then the free 1100 * pointer is at the middle of the object. 1101 * 1102 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 1103 * 0xa5 (POISON_END) 1104 * 1105 * object + s->object_size 1106 * Padding to reach word boundary. This is also used for Redzoning. 1107 * Padding is extended by another word if Redzoning is enabled and 1108 * object_size == inuse. 
1109 * 1110 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 1111 * 0xcc (RED_ACTIVE) for objects in use. 1112 * 1113 * object + s->inuse 1114 * Meta data starts here. 1115 * 1116 * A. Free pointer (if we cannot overwrite object on free) 1117 * B. Tracking data for SLAB_STORE_USER 1118 * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) 1119 * D. Padding to reach required alignment boundary or at minimum 1120 * one word if debugging is on to be able to detect writes 1121 * before the word boundary. 1122 * 1123 * Padding is done using 0x5a (POISON_INUSE) 1124 * 1125 * object + s->size 1126 * Nothing is used beyond s->size. 1127 * 1128 * If slabcaches are merged then the object_size and inuse boundaries are mostly 1129 * ignored. And therefore no slab options that rely on these boundaries 1130 * may be used with merged slabcaches. 1131 */ 1132 1133 static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) 1134 { 1135 unsigned long off = get_info_end(s); /* The end of info */ 1136 1137 if (s->flags & SLAB_STORE_USER) { 1138 /* We also have user information there */ 1139 off += 2 * sizeof(struct track); 1140 1141 if (s->flags & SLAB_KMALLOC) 1142 off += sizeof(unsigned int); 1143 } 1144 1145 off += kasan_metadata_size(s, false); 1146 1147 if (size_from_object(s) == off) 1148 return 1; 1149 1150 return check_bytes_and_report(s, slab, p, "Object padding", 1151 p + off, POISON_INUSE, size_from_object(s) - off); 1152 } 1153 1154 /* Check the pad bytes at the end of a slab page */ 1155 static void slab_pad_check(struct kmem_cache *s, struct slab *slab) 1156 { 1157 u8 *start; 1158 u8 *fault; 1159 u8 *end; 1160 u8 *pad; 1161 int length; 1162 int remainder; 1163 1164 if (!(s->flags & SLAB_POISON)) 1165 return; 1166 1167 start = slab_address(slab); 1168 length = slab_size(slab); 1169 end = start + length; 1170 remainder = length % s->size; 1171 if (!remainder) 1172 return; 1173 1174 pad = end - remainder; 1175 
metadata_access_enable(); 1176 fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); 1177 metadata_access_disable(); 1178 if (!fault) 1179 return; 1180 while (end > fault && end[-1] == POISON_INUSE) 1181 end--; 1182 1183 slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", 1184 fault, end - 1, fault - start); 1185 print_section(KERN_ERR, "Padding ", pad, remainder); 1186 1187 restore_bytes(s, "slab padding", POISON_INUSE, fault, end); 1188 } 1189 1190 static int check_object(struct kmem_cache *s, struct slab *slab, 1191 void *object, u8 val) 1192 { 1193 u8 *p = object; 1194 u8 *endobject = object + s->object_size; 1195 unsigned int orig_size, kasan_meta_size; 1196 1197 if (s->flags & SLAB_RED_ZONE) { 1198 if (!check_bytes_and_report(s, slab, object, "Left Redzone", 1199 object - s->red_left_pad, val, s->red_left_pad)) 1200 return 0; 1201 1202 if (!check_bytes_and_report(s, slab, object, "Right Redzone", 1203 endobject, val, s->inuse - s->object_size)) 1204 return 0; 1205 1206 if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { 1207 orig_size = get_orig_size(s, object); 1208 1209 if (s->object_size > orig_size && 1210 !check_bytes_and_report(s, slab, object, 1211 "kmalloc Redzone", p + orig_size, 1212 val, s->object_size - orig_size)) { 1213 return 0; 1214 } 1215 } 1216 } else { 1217 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { 1218 check_bytes_and_report(s, slab, p, "Alignment padding", 1219 endobject, POISON_INUSE, 1220 s->inuse - s->object_size); 1221 } 1222 } 1223 1224 if (s->flags & SLAB_POISON) { 1225 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) { 1226 /* 1227 * KASAN can save its free meta data inside of the 1228 * object at offset 0. Thus, skip checking the part of 1229 * the redzone that overlaps with the meta data. 
1230 */ 1231 kasan_meta_size = kasan_metadata_size(s, true); 1232 if (kasan_meta_size < s->object_size - 1 && 1233 !check_bytes_and_report(s, slab, p, "Poison", 1234 p + kasan_meta_size, POISON_FREE, 1235 s->object_size - kasan_meta_size - 1)) 1236 return 0; 1237 if (kasan_meta_size < s->object_size && 1238 !check_bytes_and_report(s, slab, p, "End Poison", 1239 p + s->object_size - 1, POISON_END, 1)) 1240 return 0; 1241 } 1242 /* 1243 * check_pad_bytes cleans up on its own. 1244 */ 1245 check_pad_bytes(s, slab, p); 1246 } 1247 1248 if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) 1249 /* 1250 * Object and freepointer overlap. Cannot check 1251 * freepointer while object is allocated. 1252 */ 1253 return 1; 1254 1255 /* Check free pointer validity */ 1256 if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { 1257 object_err(s, slab, p, "Freepointer corrupt"); 1258 /* 1259 * No choice but to zap it and thus lose the remainder 1260 * of the free objects in this slab. May cause 1261 * another error because the object count is now wrong. 1262 */ 1263 set_freepointer(s, p, NULL); 1264 return 0; 1265 } 1266 return 1; 1267 } 1268 1269 static int check_slab(struct kmem_cache *s, struct slab *slab) 1270 { 1271 int maxobj; 1272 1273 if (!folio_test_slab(slab_folio(slab))) { 1274 slab_err(s, slab, "Not a valid slab page"); 1275 return 0; 1276 } 1277 1278 maxobj = order_objects(slab_order(slab), s->size); 1279 if (slab->objects > maxobj) { 1280 slab_err(s, slab, "objects %u > max %u", 1281 slab->objects, maxobj); 1282 return 0; 1283 } 1284 if (slab->inuse > slab->objects) { 1285 slab_err(s, slab, "inuse %u > max %u", 1286 slab->inuse, slab->objects); 1287 return 0; 1288 } 1289 /* Slab_pad_check fixes things up after itself */ 1290 slab_pad_check(s, slab); 1291 return 1; 1292 } 1293 1294 /* 1295 * Determine if a certain object in a slab is on the freelist. Must hold the 1296 * slab lock to guarantee that the chains are in a consistent state. 
1297 */ 1298 static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) 1299 { 1300 int nr = 0; 1301 void *fp; 1302 void *object = NULL; 1303 int max_objects; 1304 1305 fp = slab->freelist; 1306 while (fp && nr <= slab->objects) { 1307 if (fp == search) 1308 return 1; 1309 if (!check_valid_pointer(s, slab, fp)) { 1310 if (object) { 1311 object_err(s, slab, object, 1312 "Freechain corrupt"); 1313 set_freepointer(s, object, NULL); 1314 } else { 1315 slab_err(s, slab, "Freepointer corrupt"); 1316 slab->freelist = NULL; 1317 slab->inuse = slab->objects; 1318 slab_fix(s, "Freelist cleared"); 1319 return 0; 1320 } 1321 break; 1322 } 1323 object = fp; 1324 fp = get_freepointer(s, object); 1325 nr++; 1326 } 1327 1328 max_objects = order_objects(slab_order(slab), s->size); 1329 if (max_objects > MAX_OBJS_PER_PAGE) 1330 max_objects = MAX_OBJS_PER_PAGE; 1331 1332 if (slab->objects != max_objects) { 1333 slab_err(s, slab, "Wrong number of objects. Found %d but should be %d", 1334 slab->objects, max_objects); 1335 slab->objects = max_objects; 1336 slab_fix(s, "Number of objects adjusted"); 1337 } 1338 if (slab->inuse != slab->objects - nr) { 1339 slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d", 1340 slab->inuse, slab->objects - nr); 1341 slab->inuse = slab->objects - nr; 1342 slab_fix(s, "Object count adjusted"); 1343 } 1344 return search == NULL; 1345 } 1346 1347 static void trace(struct kmem_cache *s, struct slab *slab, void *object, 1348 int alloc) 1349 { 1350 if (s->flags & SLAB_TRACE) { 1351 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", 1352 s->name, 1353 alloc ? "alloc" : "free", 1354 object, slab->inuse, 1355 slab->freelist); 1356 1357 if (!alloc) 1358 print_section(KERN_INFO, "Object ", (void *)object, 1359 s->object_size); 1360 1361 dump_stack(); 1362 } 1363 } 1364 1365 /* 1366 * Tracking of fully allocated slabs for debugging purposes. 
1367 */ 1368 static void add_full(struct kmem_cache *s, 1369 struct kmem_cache_node *n, struct slab *slab) 1370 { 1371 if (!(s->flags & SLAB_STORE_USER)) 1372 return; 1373 1374 lockdep_assert_held(&n->list_lock); 1375 list_add(&slab->slab_list, &n->full); 1376 } 1377 1378 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) 1379 { 1380 if (!(s->flags & SLAB_STORE_USER)) 1381 return; 1382 1383 lockdep_assert_held(&n->list_lock); 1384 list_del(&slab->slab_list); 1385 } 1386 1387 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1388 { 1389 return atomic_long_read(&n->nr_slabs); 1390 } 1391 1392 static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) 1393 { 1394 struct kmem_cache_node *n = get_node(s, node); 1395 1396 /* 1397 * May be called early in order to allocate a slab for the 1398 * kmem_cache_node structure. Solve the chicken-egg 1399 * dilemma by deferring the increment of the count during 1400 * bootstrap (see early_kmem_cache_node_alloc). 
1401 */ 1402 if (likely(n)) { 1403 atomic_long_inc(&n->nr_slabs); 1404 atomic_long_add(objects, &n->total_objects); 1405 } 1406 } 1407 static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) 1408 { 1409 struct kmem_cache_node *n = get_node(s, node); 1410 1411 atomic_long_dec(&n->nr_slabs); 1412 atomic_long_sub(objects, &n->total_objects); 1413 } 1414 1415 /* Object debug checks for alloc/free paths */ 1416 static void setup_object_debug(struct kmem_cache *s, void *object) 1417 { 1418 if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) 1419 return; 1420 1421 init_object(s, object, SLUB_RED_INACTIVE); 1422 init_tracking(s, object); 1423 } 1424 1425 static 1426 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) 1427 { 1428 if (!kmem_cache_debug_flags(s, SLAB_POISON)) 1429 return; 1430 1431 metadata_access_enable(); 1432 memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab)); 1433 metadata_access_disable(); 1434 } 1435 1436 static inline int alloc_consistency_checks(struct kmem_cache *s, 1437 struct slab *slab, void *object) 1438 { 1439 if (!check_slab(s, slab)) 1440 return 0; 1441 1442 if (!check_valid_pointer(s, slab, object)) { 1443 object_err(s, slab, object, "Freelist Pointer check fails"); 1444 return 0; 1445 } 1446 1447 if (!check_object(s, slab, object, SLUB_RED_INACTIVE)) 1448 return 0; 1449 1450 return 1; 1451 } 1452 1453 static noinline bool alloc_debug_processing(struct kmem_cache *s, 1454 struct slab *slab, void *object, int orig_size) 1455 { 1456 if (s->flags & SLAB_CONSISTENCY_CHECKS) { 1457 if (!alloc_consistency_checks(s, slab, object)) 1458 goto bad; 1459 } 1460 1461 /* Success. 
Perform special debug activities for allocs */ 1462 trace(s, slab, object, 1); 1463 set_orig_size(s, object, orig_size); 1464 init_object(s, object, SLUB_RED_ACTIVE); 1465 return true; 1466 1467 bad: 1468 if (folio_test_slab(slab_folio(slab))) { 1469 /* 1470 * If this is a slab page then lets do the best we can 1471 * to avoid issues in the future. Marking all objects 1472 * as used avoids touching the remaining objects. 1473 */ 1474 slab_fix(s, "Marking all objects used"); 1475 slab->inuse = slab->objects; 1476 slab->freelist = NULL; 1477 } 1478 return false; 1479 } 1480 1481 static inline int free_consistency_checks(struct kmem_cache *s, 1482 struct slab *slab, void *object, unsigned long addr) 1483 { 1484 if (!check_valid_pointer(s, slab, object)) { 1485 slab_err(s, slab, "Invalid object pointer 0x%p", object); 1486 return 0; 1487 } 1488 1489 if (on_freelist(s, slab, object)) { 1490 object_err(s, slab, object, "Object already free"); 1491 return 0; 1492 } 1493 1494 if (!check_object(s, slab, object, SLUB_RED_ACTIVE)) 1495 return 0; 1496 1497 if (unlikely(s != slab->slab_cache)) { 1498 if (!folio_test_slab(slab_folio(slab))) { 1499 slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", 1500 object); 1501 } else if (!slab->slab_cache) { 1502 pr_err("SLUB <none>: no slab for object 0x%p.\n", 1503 object); 1504 dump_stack(); 1505 } else 1506 object_err(s, slab, object, 1507 "page slab pointer corrupt."); 1508 return 0; 1509 } 1510 return 1; 1511 } 1512 1513 /* 1514 * Parse a block of slub_debug options. 
Blocks are delimited by ';' 1515 * 1516 * @str: start of block 1517 * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified 1518 * @slabs: return start of list of slabs, or NULL when there's no list 1519 * @init: assume this is initial parsing and not per-kmem-create parsing 1520 * 1521 * returns the start of next block if there's any, or NULL 1522 */ 1523 static char * 1524 parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) 1525 { 1526 bool higher_order_disable = false; 1527 1528 /* Skip any completely empty blocks */ 1529 while (*str && *str == ';') 1530 str++; 1531 1532 if (*str == ',') { 1533 /* 1534 * No options but restriction on slabs. This means full 1535 * debugging for slabs matching a pattern. 1536 */ 1537 *flags = DEBUG_DEFAULT_FLAGS; 1538 goto check_slabs; 1539 } 1540 *flags = 0; 1541 1542 /* Determine which debug features should be switched on */ 1543 for (; *str && *str != ',' && *str != ';'; str++) { 1544 switch (tolower(*str)) { 1545 case '-': 1546 *flags = 0; 1547 break; 1548 case 'f': 1549 *flags |= SLAB_CONSISTENCY_CHECKS; 1550 break; 1551 case 'z': 1552 *flags |= SLAB_RED_ZONE; 1553 break; 1554 case 'p': 1555 *flags |= SLAB_POISON; 1556 break; 1557 case 'u': 1558 *flags |= SLAB_STORE_USER; 1559 break; 1560 case 't': 1561 *flags |= SLAB_TRACE; 1562 break; 1563 case 'a': 1564 *flags |= SLAB_FAILSLAB; 1565 break; 1566 case 'o': 1567 /* 1568 * Avoid enabling debugging on caches if its minimum 1569 * order would increase as a result. 1570 */ 1571 higher_order_disable = true; 1572 break; 1573 default: 1574 if (init) 1575 pr_err("slub_debug option '%c' unknown. 
skipped\n", *str); 1576 } 1577 } 1578 check_slabs: 1579 if (*str == ',') 1580 *slabs = ++str; 1581 else 1582 *slabs = NULL; 1583 1584 /* Skip over the slab list */ 1585 while (*str && *str != ';') 1586 str++; 1587 1588 /* Skip any completely empty blocks */ 1589 while (*str && *str == ';') 1590 str++; 1591 1592 if (init && higher_order_disable) 1593 disable_higher_order_debug = 1; 1594 1595 if (*str) 1596 return str; 1597 else 1598 return NULL; 1599 } 1600 1601 static int __init setup_slub_debug(char *str) 1602 { 1603 slab_flags_t flags; 1604 slab_flags_t global_flags; 1605 char *saved_str; 1606 char *slab_list; 1607 bool global_slub_debug_changed = false; 1608 bool slab_list_specified = false; 1609 1610 global_flags = DEBUG_DEFAULT_FLAGS; 1611 if (*str++ != '=' || !*str) 1612 /* 1613 * No options specified. Switch on full debugging. 1614 */ 1615 goto out; 1616 1617 saved_str = str; 1618 while (str) { 1619 str = parse_slub_debug_flags(str, &flags, &slab_list, true); 1620 1621 if (!slab_list) { 1622 global_flags = flags; 1623 global_slub_debug_changed = true; 1624 } else { 1625 slab_list_specified = true; 1626 if (flags & SLAB_STORE_USER) 1627 stack_depot_request_early_init(); 1628 } 1629 } 1630 1631 /* 1632 * For backwards compatibility, a single list of flags with list of 1633 * slabs means debugging is only changed for those slabs, so the global 1634 * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending 1635 * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as 1636 * long as there is no option specifying flags without a slab list. 
1637 */ 1638 if (slab_list_specified) { 1639 if (!global_slub_debug_changed) 1640 global_flags = slub_debug; 1641 slub_debug_string = saved_str; 1642 } 1643 out: 1644 slub_debug = global_flags; 1645 if (slub_debug & SLAB_STORE_USER) 1646 stack_depot_request_early_init(); 1647 if (slub_debug != 0 || slub_debug_string) 1648 static_branch_enable(&slub_debug_enabled); 1649 else 1650 static_branch_disable(&slub_debug_enabled); 1651 if ((static_branch_unlikely(&init_on_alloc) || 1652 static_branch_unlikely(&init_on_free)) && 1653 (slub_debug & SLAB_POISON)) 1654 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); 1655 return 1; 1656 } 1657 1658 __setup("slub_debug", setup_slub_debug); 1659 1660 /* 1661 * kmem_cache_flags - apply debugging options to the cache 1662 * @object_size: the size of an object without meta data 1663 * @flags: flags to set 1664 * @name: name of the cache 1665 * 1666 * Debug option(s) are applied to @flags. In addition to the debug 1667 * option(s), if a slab name (or multiple) is specified i.e. 1668 * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... 1669 * then only the select slabs will receive the debug option(s). 1670 */ 1671 slab_flags_t kmem_cache_flags(unsigned int object_size, 1672 slab_flags_t flags, const char *name) 1673 { 1674 char *iter; 1675 size_t len; 1676 char *next_block; 1677 slab_flags_t block_flags; 1678 slab_flags_t slub_debug_local = slub_debug; 1679 1680 if (flags & SLAB_NO_USER_FLAGS) 1681 return flags; 1682 1683 /* 1684 * If the slab cache is for debugging (e.g. kmemleak) then 1685 * don't store user (stack trace) information by default, 1686 * but let the user enable it via the command line below. 
1687 */ 1688 if (flags & SLAB_NOLEAKTRACE) 1689 slub_debug_local &= ~SLAB_STORE_USER; 1690 1691 len = strlen(name); 1692 next_block = slub_debug_string; 1693 /* Go through all blocks of debug options, see if any matches our slab's name */ 1694 while (next_block) { 1695 next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false); 1696 if (!iter) 1697 continue; 1698 /* Found a block that has a slab list, search it */ 1699 while (*iter) { 1700 char *end, *glob; 1701 size_t cmplen; 1702 1703 end = strchrnul(iter, ','); 1704 if (next_block && next_block < end) 1705 end = next_block - 1; 1706 1707 glob = strnchr(iter, end - iter, '*'); 1708 if (glob) 1709 cmplen = glob - iter; 1710 else 1711 cmplen = max_t(size_t, len, (end - iter)); 1712 1713 if (!strncmp(name, iter, cmplen)) { 1714 flags |= block_flags; 1715 return flags; 1716 } 1717 1718 if (!*end || *end == ';') 1719 break; 1720 iter = end + 1; 1721 } 1722 } 1723 1724 return flags | slub_debug_local; 1725 } 1726 #else /* !CONFIG_SLUB_DEBUG */ 1727 static inline void setup_object_debug(struct kmem_cache *s, void *object) {} 1728 static inline 1729 void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} 1730 1731 static inline bool alloc_debug_processing(struct kmem_cache *s, 1732 struct slab *slab, void *object, int orig_size) { return true; } 1733 1734 static inline bool free_debug_processing(struct kmem_cache *s, 1735 struct slab *slab, void *head, void *tail, int *bulk_cnt, 1736 unsigned long addr, depot_stack_handle_t handle) { return true; } 1737 1738 static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} 1739 static inline int check_object(struct kmem_cache *s, struct slab *slab, 1740 void *object, u8 val) { return 1; } 1741 static inline depot_stack_handle_t set_track_prepare(void) { return 0; } 1742 static inline void set_track(struct kmem_cache *s, void *object, 1743 enum track_item alloc, unsigned long addr) {} 1744 static inline void 
add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1745 struct slab *slab) {} 1746 static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1747 struct slab *slab) {} 1748 slab_flags_t kmem_cache_flags(unsigned int object_size, 1749 slab_flags_t flags, const char *name) 1750 { 1751 return flags; 1752 } 1753 #define slub_debug 0 1754 1755 #define disable_higher_order_debug 0 1756 1757 static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1758 { return 0; } 1759 static inline void inc_slabs_node(struct kmem_cache *s, int node, 1760 int objects) {} 1761 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1762 int objects) {} 1763 1764 #ifndef CONFIG_SLUB_TINY 1765 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 1766 void **freelist, void *nextfree) 1767 { 1768 return false; 1769 } 1770 #endif 1771 #endif /* CONFIG_SLUB_DEBUG */ 1772 1773 /* 1774 * Hooks for other subsystems that check memory allocations. In a typical 1775 * production configuration these hooks all should produce no code at all. 1776 */ 1777 static __always_inline bool slab_free_hook(struct kmem_cache *s, 1778 void *x, bool init) 1779 { 1780 kmemleak_free_recursive(x, s->flags); 1781 kmsan_slab_free(s, x); 1782 1783 debug_check_no_locks_freed(x, s->object_size); 1784 1785 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1786 debug_check_no_obj_freed(x, s->object_size); 1787 1788 /* Use KCSAN to help debug racy use-after-free. */ 1789 if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) 1790 __kcsan_check_access(x, s->object_size, 1791 KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); 1792 1793 /* 1794 * As memory initialization might be integrated into KASAN, 1795 * kasan_slab_free and initialization memset's must be 1796 * kept together to avoid discrepancies in behavior. 1797 * 1798 * The initialization memset's clear the object and the metadata, 1799 * but don't touch the SLAB redzone. 
1800 */ 1801 if (init) { 1802 int rsize; 1803 1804 if (!kasan_has_integrated_init()) 1805 memset(kasan_reset_tag(x), 0, s->object_size); 1806 rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; 1807 memset((char *)kasan_reset_tag(x) + s->inuse, 0, 1808 s->size - s->inuse - rsize); 1809 } 1810 /* KASAN might put x into memory quarantine, delaying its reuse. */ 1811 return kasan_slab_free(s, x, init); 1812 } 1813 1814 static inline bool slab_free_freelist_hook(struct kmem_cache *s, 1815 void **head, void **tail, 1816 int *cnt) 1817 { 1818 1819 void *object; 1820 void *next = *head; 1821 void *old_tail = *tail ? *tail : *head; 1822 1823 if (is_kfence_address(next)) { 1824 slab_free_hook(s, next, false); 1825 return true; 1826 } 1827 1828 /* Head and tail of the reconstructed freelist */ 1829 *head = NULL; 1830 *tail = NULL; 1831 1832 do { 1833 object = next; 1834 next = get_freepointer(s, object); 1835 1836 /* If object's reuse doesn't have to be delayed */ 1837 if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { 1838 /* Move object to the new freelist */ 1839 set_freepointer(s, object, *head); 1840 *head = object; 1841 if (!*tail) 1842 *tail = object; 1843 } else { 1844 /* 1845 * Adjust the reconstructed freelist depth 1846 * accordingly if object's reuse is delayed. 
1847 */ 1848 --(*cnt); 1849 } 1850 } while (object != old_tail); 1851 1852 if (*head == *tail) 1853 *tail = NULL; 1854 1855 return *head != NULL; 1856 } 1857 1858 static void *setup_object(struct kmem_cache *s, void *object) 1859 { 1860 setup_object_debug(s, object); 1861 object = kasan_init_slab_obj(s, object); 1862 if (unlikely(s->ctor)) { 1863 kasan_unpoison_object_data(s, object); 1864 s->ctor(object); 1865 kasan_poison_object_data(s, object); 1866 } 1867 return object; 1868 } 1869 1870 /* 1871 * Slab allocation and freeing 1872 */ 1873 static inline struct slab *alloc_slab_page(gfp_t flags, int node, 1874 struct kmem_cache_order_objects oo) 1875 { 1876 struct folio *folio; 1877 struct slab *slab; 1878 unsigned int order = oo_order(oo); 1879 1880 if (node == NUMA_NO_NODE) 1881 folio = (struct folio *)alloc_pages(flags, order); 1882 else 1883 folio = (struct folio *)__alloc_pages_node(node, flags, order); 1884 1885 if (!folio) 1886 return NULL; 1887 1888 slab = folio_slab(folio); 1889 __folio_set_slab(folio); 1890 /* Make the flag visible before any changes to folio->mapping */ 1891 smp_wmb(); 1892 if (folio_is_pfmemalloc(folio)) 1893 slab_set_pfmemalloc(slab); 1894 1895 return slab; 1896 } 1897 1898 #ifdef CONFIG_SLAB_FREELIST_RANDOM 1899 /* Pre-initialize the random sequence cache */ 1900 static int init_cache_random_seq(struct kmem_cache *s) 1901 { 1902 unsigned int count = oo_objects(s->oo); 1903 int err; 1904 1905 /* Bailout if already initialised */ 1906 if (s->random_seq) 1907 return 0; 1908 1909 err = cache_random_seq_create(s, count, GFP_KERNEL); 1910 if (err) { 1911 pr_err("SLUB: Unable to initialize free list for %s\n", 1912 s->name); 1913 return err; 1914 } 1915 1916 /* Transform to an offset on the set of pages */ 1917 if (s->random_seq) { 1918 unsigned int i; 1919 1920 for (i = 0; i < count; i++) 1921 s->random_seq[i] *= s->size; 1922 } 1923 return 0; 1924 } 1925 1926 /* Initialize each random sequence freelist per cache */ 1927 static void __init 
init_freelist_randomization(void) 1928 { 1929 struct kmem_cache *s; 1930 1931 mutex_lock(&slab_mutex); 1932 1933 list_for_each_entry(s, &slab_caches, list) 1934 init_cache_random_seq(s); 1935 1936 mutex_unlock(&slab_mutex); 1937 } 1938 1939 /* Get the next entry on the pre-computed freelist randomized */ 1940 static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, 1941 unsigned long *pos, void *start, 1942 unsigned long page_limit, 1943 unsigned long freelist_count) 1944 { 1945 unsigned int idx; 1946 1947 /* 1948 * If the target page allocation failed, the number of objects on the 1949 * page might be smaller than the usual size defined by the cache. 1950 */ 1951 do { 1952 idx = s->random_seq[*pos]; 1953 *pos += 1; 1954 if (*pos >= freelist_count) 1955 *pos = 0; 1956 } while (unlikely(idx >= page_limit)); 1957 1958 return (char *)start + idx; 1959 } 1960 1961 /* Shuffle the single linked freelist based on a random pre-computed sequence */ 1962 static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) 1963 { 1964 void *start; 1965 void *cur; 1966 void *next; 1967 unsigned long idx, pos, page_limit, freelist_count; 1968 1969 if (slab->objects < 2 || !s->random_seq) 1970 return false; 1971 1972 freelist_count = oo_objects(s->oo); 1973 pos = get_random_u32_below(freelist_count); 1974 1975 page_limit = slab->objects * s->size; 1976 start = fixup_red_left(s, slab_address(slab)); 1977 1978 /* First entry is used as the base of the freelist */ 1979 cur = next_freelist_entry(s, slab, &pos, start, page_limit, 1980 freelist_count); 1981 cur = setup_object(s, cur); 1982 slab->freelist = cur; 1983 1984 for (idx = 1; idx < slab->objects; idx++) { 1985 next = next_freelist_entry(s, slab, &pos, start, page_limit, 1986 freelist_count); 1987 next = setup_object(s, next); 1988 set_freepointer(s, cur, next); 1989 cur = next; 1990 } 1991 set_freepointer(s, cur, NULL); 1992 1993 return true; 1994 } 1995 #else 1996 static inline int 
init_cache_random_seq(struct kmem_cache *s) 1997 { 1998 return 0; 1999 } 2000 static inline void init_freelist_randomization(void) { } 2001 static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) 2002 { 2003 return false; 2004 } 2005 #endif /* CONFIG_SLAB_FREELIST_RANDOM */ 2006 2007 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 2008 { 2009 struct slab *slab; 2010 struct kmem_cache_order_objects oo = s->oo; 2011 gfp_t alloc_gfp; 2012 void *start, *p, *next; 2013 int idx; 2014 bool shuffle; 2015 2016 flags &= gfp_allowed_mask; 2017 2018 flags |= s->allocflags; 2019 2020 /* 2021 * Let the initial higher-order allocation fail under memory pressure 2022 * so we fall-back to the minimum order allocation. 2023 */ 2024 alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; 2025 if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) 2026 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; 2027 2028 slab = alloc_slab_page(alloc_gfp, node, oo); 2029 if (unlikely(!slab)) { 2030 oo = s->min; 2031 alloc_gfp = flags; 2032 /* 2033 * Allocation may have failed due to fragmentation. 
2034 * Try a lower order alloc if possible 2035 */ 2036 slab = alloc_slab_page(alloc_gfp, node, oo); 2037 if (unlikely(!slab)) 2038 return NULL; 2039 stat(s, ORDER_FALLBACK); 2040 } 2041 2042 slab->objects = oo_objects(oo); 2043 slab->inuse = 0; 2044 slab->frozen = 0; 2045 2046 account_slab(slab, oo_order(oo), s, flags); 2047 2048 slab->slab_cache = s; 2049 2050 kasan_poison_slab(slab); 2051 2052 start = slab_address(slab); 2053 2054 setup_slab_debug(s, slab, start); 2055 2056 shuffle = shuffle_freelist(s, slab); 2057 2058 if (!shuffle) { 2059 start = fixup_red_left(s, start); 2060 start = setup_object(s, start); 2061 slab->freelist = start; 2062 for (idx = 0, p = start; idx < slab->objects - 1; idx++) { 2063 next = p + s->size; 2064 next = setup_object(s, next); 2065 set_freepointer(s, p, next); 2066 p = next; 2067 } 2068 set_freepointer(s, p, NULL); 2069 } 2070 2071 return slab; 2072 } 2073 2074 static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) 2075 { 2076 if (unlikely(flags & GFP_SLAB_BUG_MASK)) 2077 flags = kmalloc_fix_flags(flags); 2078 2079 WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); 2080 2081 return allocate_slab(s, 2082 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 2083 } 2084 2085 static void __free_slab(struct kmem_cache *s, struct slab *slab) 2086 { 2087 struct folio *folio = slab_folio(slab); 2088 int order = folio_order(folio); 2089 int pages = 1 << order; 2090 2091 __slab_clear_pfmemalloc(slab); 2092 folio->mapping = NULL; 2093 /* Make the mapping reset visible before clearing the flag */ 2094 smp_wmb(); 2095 __folio_clear_slab(folio); 2096 mm_account_reclaimed_pages(pages); 2097 unaccount_slab(slab, order, s); 2098 __free_pages(&folio->page, order); 2099 } 2100 2101 static void rcu_free_slab(struct rcu_head *h) 2102 { 2103 struct slab *slab = container_of(h, struct slab, rcu_head); 2104 2105 __free_slab(slab->slab_cache, slab); 2106 } 2107 2108 static void free_slab(struct kmem_cache *s, struct slab *slab) 2109 { 
2110 if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { 2111 void *p; 2112 2113 slab_pad_check(s, slab); 2114 for_each_object(p, s, slab_address(slab), slab->objects) 2115 check_object(s, slab, p, SLUB_RED_INACTIVE); 2116 } 2117 2118 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) 2119 call_rcu(&slab->rcu_head, rcu_free_slab); 2120 else 2121 __free_slab(s, slab); 2122 } 2123 2124 static void discard_slab(struct kmem_cache *s, struct slab *slab) 2125 { 2126 dec_slabs_node(s, slab_nid(slab), slab->objects); 2127 free_slab(s, slab); 2128 } 2129 2130 /* 2131 * Management of partially allocated slabs. 2132 */ 2133 static inline void 2134 __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) 2135 { 2136 n->nr_partial++; 2137 if (tail == DEACTIVATE_TO_TAIL) 2138 list_add_tail(&slab->slab_list, &n->partial); 2139 else 2140 list_add(&slab->slab_list, &n->partial); 2141 } 2142 2143 static inline void add_partial(struct kmem_cache_node *n, 2144 struct slab *slab, int tail) 2145 { 2146 lockdep_assert_held(&n->list_lock); 2147 __add_partial(n, slab, tail); 2148 } 2149 2150 static inline void remove_partial(struct kmem_cache_node *n, 2151 struct slab *slab) 2152 { 2153 lockdep_assert_held(&n->list_lock); 2154 list_del(&slab->slab_list); 2155 n->nr_partial--; 2156 } 2157 2158 /* 2159 * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a 2160 * slab from the n->partial list. Remove only a single object from the slab, do 2161 * the alloc_debug_processing() checks and leave the slab on the list, or move 2162 * it to full list if it was the last free object. 
2163 */ 2164 static void *alloc_single_from_partial(struct kmem_cache *s, 2165 struct kmem_cache_node *n, struct slab *slab, int orig_size) 2166 { 2167 void *object; 2168 2169 lockdep_assert_held(&n->list_lock); 2170 2171 object = slab->freelist; 2172 slab->freelist = get_freepointer(s, object); 2173 slab->inuse++; 2174 2175 if (!alloc_debug_processing(s, slab, object, orig_size)) { 2176 remove_partial(n, slab); 2177 return NULL; 2178 } 2179 2180 if (slab->inuse == slab->objects) { 2181 remove_partial(n, slab); 2182 add_full(s, n, slab); 2183 } 2184 2185 return object; 2186 } 2187 2188 /* 2189 * Called only for kmem_cache_debug() caches to allocate from a freshly 2190 * allocated slab. Allocate a single object instead of whole freelist 2191 * and put the slab to the partial (or full) list. 2192 */ 2193 static void *alloc_single_from_new_slab(struct kmem_cache *s, 2194 struct slab *slab, int orig_size) 2195 { 2196 int nid = slab_nid(slab); 2197 struct kmem_cache_node *n = get_node(s, nid); 2198 unsigned long flags; 2199 void *object; 2200 2201 2202 object = slab->freelist; 2203 slab->freelist = get_freepointer(s, object); 2204 slab->inuse = 1; 2205 2206 if (!alloc_debug_processing(s, slab, object, orig_size)) 2207 /* 2208 * It's not really expected that this would fail on a 2209 * freshly allocated slab, but a concurrent memory 2210 * corruption in theory could cause that. 2211 */ 2212 return NULL; 2213 2214 spin_lock_irqsave(&n->list_lock, flags); 2215 2216 if (slab->inuse == slab->objects) 2217 add_full(s, n, slab); 2218 else 2219 add_partial(n, slab, DEACTIVATE_TO_HEAD); 2220 2221 inc_slabs_node(s, nid, slab->objects); 2222 spin_unlock_irqrestore(&n->list_lock, flags); 2223 2224 return object; 2225 } 2226 2227 /* 2228 * Remove slab from the partial list, freeze it and 2229 * return the pointer to the freelist. 2230 * 2231 * Returns a list of objects or NULL if it fails. 
*/
static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct slab *slab,
		int mode)
{
	void *freelist;
	unsigned long counters;
	struct slab new;

	/* The caller must hold the node's list_lock. */
	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = slab->freelist;
	counters = slab->counters;
	new.counters = counters;
	if (mode) {
		/*
		 * Non-zero mode: take every free object. The slab is marked
		 * as fully in use and its own freelist is emptied.
		 */
		new.inuse = slab->objects;
		new.freelist = NULL;
	} else {
		/* Zero mode: only freeze; the freelist stays on the slab. */
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	/* On a failed update the slab is left on the partial list. */
	if (!__slab_update_freelist(s, slab,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, slab);
	WARN_ON(!freelist);
	return freelist;
}

/*
 * put_cpu_partial() only has a real implementation with
 * CONFIG_SLUB_CPU_PARTIAL; otherwise it is an empty stub.
 */
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
				   int drain) { }
#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);

/*
 * Try to allocate a partial slab from a specific node.
 *
 * Walks n->partial under n->list_lock. For CONFIG_SLUB_TINY or debug caches
 * a single object is taken with alloc_single_from_partial(). Otherwise the
 * first suitable slab is acquired with its whole freelist and stored in
 * *pc->slab, and further slabs may be frozen and handed to put_cpu_partial().
 *
 * Returns the object/freelist obtained, or NULL if nothing was obtained.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
			      struct partial_context *pc)
{
	struct slab *slab, *slab2;
	void *object = NULL;
	unsigned long flags;
	unsigned int partial_slabs = 0;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock_irqsave(&n->list_lock, flags);
	/* _safe iteration: slabs are unlinked from n->partial inside the loop. */
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		void *t;

		/* Skip slabs whose pfmemalloc state does not fit pc->flags. */
		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
			/* Single-object path; on failure try the next slab. */
			object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
			if (object)
				break;
			continue;
		}

		/*
		 * The first acquired slab (object still NULL) is taken with
		 * mode != 0, i.e. including all its free objects; later ones
		 * are only frozen.
		 */
		t = acquire_slab(s, n, slab, object == NULL);
		if (!t)
			break;

		if (!object) {
			*pc->slab = slab;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;
		} else {
			put_cpu_partial(s, slab, 0);
			stat(s, CPU_PARTIAL_NODE);
			partial_slabs++;
		}
#ifdef CONFIG_SLUB_CPU_PARTIAL
		/*
		 * Stop when the cache has no cpu partial support or once more
		 * than half of cpu_partial_slabs have been collected.
		 */
		if (!kmem_cache_has_cpu_partial(s)
			|| partial_slabs > s->cpu_partial_slabs / 2)
			break;
#else
		/* Without cpu partial lists one slab is all we take. */
		break;
#endif

	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return object;
}

/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 *
 * Without CONFIG_NUMA this always returns NULL.
 */
static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
	void *object;
	unsigned int cpuset_mems_cookie;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
	 * (which makes defrag_ratio = 1000) then every (well almost)
	 * allocation will first attempt to defrag slab caches on other nodes.
	 * This means scanning over all nodes to look for partial slabs which
	 * may be expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	/* Rescan if mems_allowed changed while we were walking the zonelist. */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			/*
			 * Only take from nodes the cpuset allows and that
			 * hold more than min_partial partial slabs.
			 */
			if (n && cpuset_zone_allowed(zone, pc->flags) &&
					n->nr_partial > s->min_partial) {
				object = get_partial_node(s, n, pc);
				if (object) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
					 */
					return object;
				}
			}
		}
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif	/* CONFIG_NUMA */
	return NULL;
}

/*
 * Get a partial slab, lock it and return it.
 *
 * Tries @node first, or the node returned by numa_mem_id() when @node is
 * NUMA_NO_NODE. Other nodes are only searched, via get_any_partial(), when
 * no specific node was requested.
 */
static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
		searchnode = numa_mem_id();

	object = get_partial_node(s, get_node(s, searchnode), pc);
	/* An explicit node request never falls back to other nodes. */
	if (object || node != NUMA_NO_NODE)
		return object;

	return get_any_partial(s, pc);
}

#ifndef CONFIG_SLUB_TINY

#ifdef CONFIG_PREEMPTION
/*
 * Calculate the next globally unique transaction for disambiguation
 * during cmpxchg.
The transactions start with the cpu number and are then 2428 * incremented by CONFIG_NR_CPUS. 2429 */ 2430 #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 2431 #else 2432 /* 2433 * No preemption supported therefore also no need to check for 2434 * different cpus. 2435 */ 2436 #define TID_STEP 1 2437 #endif /* CONFIG_PREEMPTION */ 2438 2439 static inline unsigned long next_tid(unsigned long tid) 2440 { 2441 return tid + TID_STEP; 2442 } 2443 2444 #ifdef SLUB_DEBUG_CMPXCHG 2445 static inline unsigned int tid_to_cpu(unsigned long tid) 2446 { 2447 return tid % TID_STEP; 2448 } 2449 2450 static inline unsigned long tid_to_event(unsigned long tid) 2451 { 2452 return tid / TID_STEP; 2453 } 2454 #endif 2455 2456 static inline unsigned int init_tid(int cpu) 2457 { 2458 return cpu; 2459 } 2460 2461 static inline void note_cmpxchg_failure(const char *n, 2462 const struct kmem_cache *s, unsigned long tid) 2463 { 2464 #ifdef SLUB_DEBUG_CMPXCHG 2465 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 2466 2467 pr_info("%s %s: cmpxchg redo ", n, s->name); 2468 2469 #ifdef CONFIG_PREEMPTION 2470 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 2471 pr_warn("due to cpu change %d -> %d\n", 2472 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 2473 else 2474 #endif 2475 if (tid_to_event(tid) != tid_to_event(actual_tid)) 2476 pr_warn("due to cpu running other code. Event %ld->%ld\n", 2477 tid_to_event(tid), tid_to_event(actual_tid)); 2478 else 2479 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", 2480 actual_tid, tid, next_tid(tid)); 2481 #endif 2482 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 2483 } 2484 2485 static void init_kmem_cache_cpus(struct kmem_cache *s) 2486 { 2487 int cpu; 2488 struct kmem_cache_cpu *c; 2489 2490 for_each_possible_cpu(cpu) { 2491 c = per_cpu_ptr(s->cpu_slab, cpu); 2492 local_lock_init(&c->lock); 2493 c->tid = init_tid(cpu); 2494 } 2495 } 2496 2497 /* 2498 * Finishes removing the cpu slab. 
* Merges cpu's freelist with slab's freelist,
 * unfreezes the slabs and puts it on the proper list.
 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 * by the caller.
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
			    void *freelist)
{
	/*
	 * Outcome for the slab: M_PARTIAL puts it on the node partial list,
	 * M_FREE discards it, M_FULL_NOLIST leaves it on no list.
	 */
	enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST };
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	int free_delta = 0;
	enum slab_modes mode = M_NONE;
	void *nextfree, *freelist_iter, *freelist_tail;
	int tail = DEACTIVATE_TO_HEAD;
	unsigned long flags = 0;
	struct slab new;
	struct slab old;

	/*
	 * A non-empty slab->freelist is counted as remote frees; such a slab
	 * is queued at the tail of the partial list instead of the head.
	 */
	if (slab->freelist) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Count the objects on cpu's freelist as free_delta and
	 * remember the last object in freelist_tail for later splicing.
	 */
	freelist_tail = NULL;
	freelist_iter = freelist;
	while (freelist_iter) {
		nextfree = get_freepointer(s, freelist_iter);

		/*
		 * If 'nextfree' is invalid, it is possible that the object at
		 * 'freelist_iter' is already corrupted.  So isolate all objects
		 * starting at 'freelist_iter' by skipping them.
		 */
		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
			break;

		freelist_tail = freelist_iter;
		free_delta++;

		freelist_iter = nextfree;
	}

	/*
	 * Stage two: Unfreeze the slab while splicing the per-cpu
	 * freelist to the head of slab's freelist.
	 *
	 * Ensure that the slab is unfrozen while the list presence
	 * reflects the actual number of objects during unfreeze.
	 *
	 * We first perform cmpxchg holding lock and insert to list
	 * when it succeed. If there is mismatch then the slab is not
	 * unfrozen and number of objects in the slab may have changed.
	 * Then release lock and retry cmpxchg again.
	 */
redo:

	old.freelist = READ_ONCE(slab->freelist);
	old.counters = READ_ONCE(slab->counters);
	VM_BUG_ON(!old.frozen);

	/* Determine target state of the slab */
	new.counters = old.counters;
	if (freelist_tail) {
		/*
		 * Chain the slab's current freelist behind the last counted
		 * cpu object, so the cpu list becomes the new head.
		 */
		new.inuse -= free_delta;
		set_freepointer(s, freelist_tail, old.freelist);
		new.freelist = freelist;
	} else
		new.freelist = old.freelist;

	new.frozen = 0;

	if (!new.inuse && n->nr_partial >= s->min_partial) {
		/* Empty slab and the node has enough partial slabs. */
		mode = M_FREE;
	} else if (new.freelist) {
		mode = M_PARTIAL;
		/*
		 * Taking the spinlock removes the possibility that
		 * acquire_slab() will see a slab that is frozen
		 */
		spin_lock_irqsave(&n->list_lock, flags);
	} else {
		mode = M_FULL_NOLIST;
	}


	if (!slab_update_freelist(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab")) {
		/* Slab changed under us: drop the lock if taken and retry. */
		if (mode == M_PARTIAL)
			spin_unlock_irqrestore(&n->list_lock, flags);
		goto redo;
	}


	if (mode == M_PARTIAL) {
		add_partial(n, slab, tail);
		spin_unlock_irqrestore(&n->list_lock, flags);
		/* 'tail' is also passed as the statistics item to bump. */
		stat(s, tail);
	} else if (mode == M_FREE) {
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	} else if (mode == M_FULL_NOLIST) {
		stat(s, DEACTIVATE_FULL);
	}
}

#ifdef CONFIG_SLUB_CPU_PARTIAL
/*
 * Unfreeze every slab on the chain starting at @partial_slab (linked via
 * slab->next). Each slab is either added to the tail of its node's partial
 * list or, when it is empty and the node already holds at least min_partial
 * partial slabs, discarded once all list locks have been dropped.
 */
static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
{
	struct kmem_cache_node *n = NULL, *n2 = NULL;
	struct slab *slab, *slab_to_discard = NULL;
	unsigned long flags = 0;

	while (partial_slab) {
		struct slab new;
		struct slab old;

		slab = partial_slab;
		partial_slab = slab->next;

		/* Switch list_lock only when the slab's node changes. */
		n2 = get_node(s, slab_nid(slab));
		if (n != n2) {
			if (n)
				spin_unlock_irqrestore(&n->list_lock, flags);

			n = n2;
			spin_lock_irqsave(&n->list_lock, flags);
		}

		/* Clear the frozen bit; freelist and other counters are kept. */
		do {

			old.freelist = slab->freelist;
			old.counters = slab->counters;
			VM_BUG_ON(!old.frozen);

			new.counters = old.counters;
			new.freelist = old.freelist;

			new.frozen = 0;

		} while (!__slab_update_freelist(s, slab,
				old.freelist, old.counters,
				new.freelist, new.counters,
				"unfreezing slab"));

		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
			/* Queue for discard; slab->next is reused as the link. */
			slab->next = slab_to_discard;
			slab_to_discard = slab;
		} else {
			add_partial(n, slab, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
		}
	}

	if (n)
		spin_unlock_irqrestore(&n->list_lock, flags);

	/* Discard the queued empty slabs with no list_lock held. */
	while (slab_to_discard) {
		slab = slab_to_discard;
		slab_to_discard = slab_to_discard->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	}
}

/*
 * Unfreeze all the cpu partial slabs.
 *
 * The current cpu's partial list is detached under the cpu_slab local lock;
 * the unfreezing itself happens after the lock has been released.
 */
static void unfreeze_partials(struct kmem_cache *s)
{
	struct slab *partial_slab;
	unsigned long flags;

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	partial_slab = this_cpu_read(s->cpu_slab->partial);
	this_cpu_write(s->cpu_slab->partial, NULL);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);

	if (partial_slab)
		__unfreeze_partials(s, partial_slab);
}

/*
 * Variant of unfreeze_partials() that works on the given kmem_cache_cpu @c.
 * It detaches the list without taking the cpu_slab local lock.
 */
static void unfreeze_partials_cpu(struct kmem_cache *s,
				  struct kmem_cache_cpu *c)
{
	struct slab *partial_slab;

	partial_slab = slub_percpu_partial(c);
	c->partial = NULL;

	if (partial_slab)
		__unfreeze_partials(s, partial_slab);
}

/*
 * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
 * partial slab slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
2706 */ 2707 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) 2708 { 2709 struct slab *oldslab; 2710 struct slab *slab_to_unfreeze = NULL; 2711 unsigned long flags; 2712 int slabs = 0; 2713 2714 local_lock_irqsave(&s->cpu_slab->lock, flags); 2715 2716 oldslab = this_cpu_read(s->cpu_slab->partial); 2717 2718 if (oldslab) { 2719 if (drain && oldslab->slabs >= s->cpu_partial_slabs) { 2720 /* 2721 * Partial array is full. Move the existing set to the 2722 * per node partial list. Postpone the actual unfreezing 2723 * outside of the critical section. 2724 */ 2725 slab_to_unfreeze = oldslab; 2726 oldslab = NULL; 2727 } else { 2728 slabs = oldslab->slabs; 2729 } 2730 } 2731 2732 slabs++; 2733 2734 slab->slabs = slabs; 2735 slab->next = oldslab; 2736 2737 this_cpu_write(s->cpu_slab->partial, slab); 2738 2739 local_unlock_irqrestore(&s->cpu_slab->lock, flags); 2740 2741 if (slab_to_unfreeze) { 2742 __unfreeze_partials(s, slab_to_unfreeze); 2743 stat(s, CPU_PARTIAL_DRAIN); 2744 } 2745 } 2746 2747 #else /* CONFIG_SLUB_CPU_PARTIAL */ 2748 2749 static inline void unfreeze_partials(struct kmem_cache *s) { } 2750 static inline void unfreeze_partials_cpu(struct kmem_cache *s, 2751 struct kmem_cache_cpu *c) { } 2752 2753 #endif /* CONFIG_SLUB_CPU_PARTIAL */ 2754 2755 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 2756 { 2757 unsigned long flags; 2758 struct slab *slab; 2759 void *freelist; 2760 2761 local_lock_irqsave(&s->cpu_slab->lock, flags); 2762 2763 slab = c->slab; 2764 freelist = c->freelist; 2765 2766 c->slab = NULL; 2767 c->freelist = NULL; 2768 c->tid = next_tid(c->tid); 2769 2770 local_unlock_irqrestore(&s->cpu_slab->lock, flags); 2771 2772 if (slab) { 2773 deactivate_slab(s, slab, freelist); 2774 stat(s, CPUSLAB_FLUSH); 2775 } 2776 } 2777 2778 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 2779 { 2780 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2781 void *freelist = 
c->freelist; 2782 struct slab *slab = c->slab; 2783 2784 c->slab = NULL; 2785 c->freelist = NULL; 2786 c->tid = next_tid(c->tid); 2787 2788 if (slab) { 2789 deactivate_slab(s, slab, freelist); 2790 stat(s, CPUSLAB_FLUSH); 2791 } 2792 2793 unfreeze_partials_cpu(s, c); 2794 } 2795 2796 struct slub_flush_work { 2797 struct work_struct work; 2798 struct kmem_cache *s; 2799 bool skip; 2800 }; 2801 2802 /* 2803 * Flush cpu slab. 2804 * 2805 * Called from CPU work handler with migration disabled. 2806 */ 2807 static void flush_cpu_slab(struct work_struct *w) 2808 { 2809 struct kmem_cache *s; 2810 struct kmem_cache_cpu *c; 2811 struct slub_flush_work *sfw; 2812 2813 sfw = container_of(w, struct slub_flush_work, work); 2814 2815 s = sfw->s; 2816 c = this_cpu_ptr(s->cpu_slab); 2817 2818 if (c->slab) 2819 flush_slab(s, c); 2820 2821 unfreeze_partials(s); 2822 } 2823 2824 static bool has_cpu_slab(int cpu, struct kmem_cache *s) 2825 { 2826 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2827 2828 return c->slab || slub_percpu_partial(c); 2829 } 2830 2831 static DEFINE_MUTEX(flush_lock); 2832 static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); 2833 2834 static void flush_all_cpus_locked(struct kmem_cache *s) 2835 { 2836 struct slub_flush_work *sfw; 2837 unsigned int cpu; 2838 2839 lockdep_assert_cpus_held(); 2840 mutex_lock(&flush_lock); 2841 2842 for_each_online_cpu(cpu) { 2843 sfw = &per_cpu(slub_flush, cpu); 2844 if (!has_cpu_slab(cpu, s)) { 2845 sfw->skip = true; 2846 continue; 2847 } 2848 INIT_WORK(&sfw->work, flush_cpu_slab); 2849 sfw->skip = false; 2850 sfw->s = s; 2851 queue_work_on(cpu, flushwq, &sfw->work); 2852 } 2853 2854 for_each_online_cpu(cpu) { 2855 sfw = &per_cpu(slub_flush, cpu); 2856 if (sfw->skip) 2857 continue; 2858 flush_work(&sfw->work); 2859 } 2860 2861 mutex_unlock(&flush_lock); 2862 } 2863 2864 static void flush_all(struct kmem_cache *s) 2865 { 2866 cpus_read_lock(); 2867 flush_all_cpus_locked(s); 2868 cpus_read_unlock(); 2869 } 2870 
2871 /* 2872 * Use the cpu notifier to insure that the cpu slabs are flushed when 2873 * necessary. 2874 */ 2875 static int slub_cpu_dead(unsigned int cpu) 2876 { 2877 struct kmem_cache *s; 2878 2879 mutex_lock(&slab_mutex); 2880 list_for_each_entry(s, &slab_caches, list) 2881 __flush_cpu_slab(s, cpu); 2882 mutex_unlock(&slab_mutex); 2883 return 0; 2884 } 2885 2886 #else /* CONFIG_SLUB_TINY */ 2887 static inline void flush_all_cpus_locked(struct kmem_cache *s) { } 2888 static inline void flush_all(struct kmem_cache *s) { } 2889 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } 2890 static inline int slub_cpu_dead(unsigned int cpu) { return 0; } 2891 #endif /* CONFIG_SLUB_TINY */ 2892 2893 /* 2894 * Check if the objects in a per cpu structure fit numa 2895 * locality expectations. 2896 */ 2897 static inline int node_match(struct slab *slab, int node) 2898 { 2899 #ifdef CONFIG_NUMA 2900 if (node != NUMA_NO_NODE && slab_nid(slab) != node) 2901 return 0; 2902 #endif 2903 return 1; 2904 } 2905 2906 #ifdef CONFIG_SLUB_DEBUG 2907 static int count_free(struct slab *slab) 2908 { 2909 return slab->objects - slab->inuse; 2910 } 2911 2912 static inline unsigned long node_nr_objs(struct kmem_cache_node *n) 2913 { 2914 return atomic_long_read(&n->total_objects); 2915 } 2916 2917 /* Supports checking bulk free of a constructed freelist */ 2918 static inline bool free_debug_processing(struct kmem_cache *s, 2919 struct slab *slab, void *head, void *tail, int *bulk_cnt, 2920 unsigned long addr, depot_stack_handle_t handle) 2921 { 2922 bool checks_ok = false; 2923 void *object = head; 2924 int cnt = 0; 2925 2926 if (s->flags & SLAB_CONSISTENCY_CHECKS) { 2927 if (!check_slab(s, slab)) 2928 goto out; 2929 } 2930 2931 if (slab->inuse < *bulk_cnt) { 2932 slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", 2933 slab->inuse, *bulk_cnt); 2934 goto out; 2935 } 2936 2937 next_object: 2938 2939 if (++cnt > *bulk_cnt) 2940 goto out_cnt; 2941 2942 if 
(s->flags & SLAB_CONSISTENCY_CHECKS) { 2943 if (!free_consistency_checks(s, slab, object, addr)) 2944 goto out; 2945 } 2946 2947 if (s->flags & SLAB_STORE_USER) 2948 set_track_update(s, object, TRACK_FREE, addr, handle); 2949 trace(s, slab, object, 0); 2950 /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ 2951 init_object(s, object, SLUB_RED_INACTIVE); 2952 2953 /* Reached end of constructed freelist yet? */ 2954 if (object != tail) { 2955 object = get_freepointer(s, object); 2956 goto next_object; 2957 } 2958 checks_ok = true; 2959 2960 out_cnt: 2961 if (cnt != *bulk_cnt) { 2962 slab_err(s, slab, "Bulk free expected %d objects but found %d\n", 2963 *bulk_cnt, cnt); 2964 *bulk_cnt = cnt; 2965 } 2966 2967 out: 2968 2969 if (!checks_ok) 2970 slab_fix(s, "Object at 0x%p not freed", object); 2971 2972 return checks_ok; 2973 } 2974 #endif /* CONFIG_SLUB_DEBUG */ 2975 2976 #if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) 2977 static unsigned long count_partial(struct kmem_cache_node *n, 2978 int (*get_count)(struct slab *)) 2979 { 2980 unsigned long flags; 2981 unsigned long x = 0; 2982 struct slab *slab; 2983 2984 spin_lock_irqsave(&n->list_lock, flags); 2985 list_for_each_entry(slab, &n->partial, slab_list) 2986 x += get_count(slab); 2987 spin_unlock_irqrestore(&n->list_lock, flags); 2988 return x; 2989 } 2990 #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ 2991 2992 #ifdef CONFIG_SLUB_DEBUG 2993 static noinline void 2994 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) 2995 { 2996 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 2997 DEFAULT_RATELIMIT_BURST); 2998 int node; 2999 struct kmem_cache_node *n; 3000 3001 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 3002 return; 3003 3004 pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", 3005 nid, gfpflags, &gfpflags); 3006 pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: 
%u\n", 3007 s->name, s->object_size, s->size, oo_order(s->oo), 3008 oo_order(s->min)); 3009 3010 if (oo_order(s->min) > get_order(s->object_size)) 3011 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", 3012 s->name); 3013 3014 for_each_kmem_cache_node(s, node, n) { 3015 unsigned long nr_slabs; 3016 unsigned long nr_objs; 3017 unsigned long nr_free; 3018 3019 nr_free = count_partial(n, count_free); 3020 nr_slabs = node_nr_slabs(n); 3021 nr_objs = node_nr_objs(n); 3022 3023 pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", 3024 node, nr_slabs, nr_objs, nr_free); 3025 } 3026 } 3027 #else /* CONFIG_SLUB_DEBUG */ 3028 static inline void 3029 slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } 3030 #endif 3031 3032 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) 3033 { 3034 if (unlikely(slab_test_pfmemalloc(slab))) 3035 return gfp_pfmemalloc_allowed(gfpflags); 3036 3037 return true; 3038 } 3039 3040 #ifndef CONFIG_SLUB_TINY 3041 static inline bool 3042 __update_cpu_freelist_fast(struct kmem_cache *s, 3043 void *freelist_old, void *freelist_new, 3044 unsigned long tid) 3045 { 3046 freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; 3047 freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; 3048 3049 return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, 3050 &old.full, new.full); 3051 } 3052 3053 /* 3054 * Check the slab->freelist and either transfer the freelist to the 3055 * per cpu freelist or deactivate the slab. 3056 * 3057 * The slab is still frozen if the return value is not NULL. 3058 * 3059 * If this function returns NULL then the slab has been unfrozen. 
3060 */ 3061 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 3062 { 3063 struct slab new; 3064 unsigned long counters; 3065 void *freelist; 3066 3067 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 3068 3069 do { 3070 freelist = slab->freelist; 3071 counters = slab->counters; 3072 3073 new.counters = counters; 3074 VM_BUG_ON(!new.frozen); 3075 3076 new.inuse = slab->objects; 3077 new.frozen = freelist != NULL; 3078 3079 } while (!__slab_update_freelist(s, slab, 3080 freelist, counters, 3081 NULL, new.counters, 3082 "get_freelist")); 3083 3084 return freelist; 3085 } 3086 3087 /* 3088 * Slow path. The lockless freelist is empty or we need to perform 3089 * debugging duties. 3090 * 3091 * Processing is still very fast if new objects have been freed to the 3092 * regular freelist. In that case we simply take over the regular freelist 3093 * as the lockless freelist and zap the regular freelist. 3094 * 3095 * If that is not working then we fall back to the partial lists. We take the 3096 * first element of the freelist as the object to allocate now and move the 3097 * rest of the freelist to the lockless freelist. 3098 * 3099 * And if we were unable to get a new slab from the partial slab lists then 3100 * we need to allocate a new slab. This is the slowest path since it involves 3101 * a call to the page allocator and the setup of a new slab. 3102 * 3103 * Version of __slab_alloc to use when we know that preemption is 3104 * already disabled (which is the case for bulk allocation). 
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *freelist;
	struct slab *slab;
	unsigned long flags;
	struct partial_context pc;

	stat(s, ALLOC_SLOWPATH);

reread_slab:

	slab = READ_ONCE(c->slab);
	if (!slab) {
		/*
		 * if the node is not online or has no normal memory, just
		 * ignore the node constraint
		 */
		if (unlikely(node != NUMA_NO_NODE &&
			     !node_isset(node, slab_nodes)))
			node = NUMA_NO_NODE;
		goto new_slab;
	}
redo:

	if (unlikely(!node_match(slab, node))) {
		/*
		 * same as above but node_match() being false already
		 * implies node != NUMA_NO_NODE
		 */
		if (!node_isset(node, slab_nodes)) {
			node = NUMA_NO_NODE;
		} else {
			stat(s, ALLOC_NODE_MISMATCH);
			goto deactivate_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
		goto deactivate_slab;

	/* must check again c->slab in case we got preempted and it changed */
	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (unlikely(slab != c->slab)) {
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		goto reread_slab;
	}
	/* The per-cpu freelist may have been refilled in the meantime. */
	freelist = c->freelist;
	if (freelist)
		goto load_freelist;

	/* Otherwise try to take over the slab's own freelist. */
	freelist = get_freelist(s, slab);

	if (!freelist) {
		/* Nothing free in the cpu slab: drop it and look elsewhere. */
		c->slab = NULL;
		c->tid = next_tid(c->tid);
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	/*
	 * freelist is pointing to the list of objects to be used.
	 * slab is pointing to the slab from which the objects are obtained.
	 * That slab must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->slab->frozen);
	/* Return the first object; the rest becomes the per-cpu freelist. */
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	return freelist;

deactivate_slab:

	/* Detach the cpu slab under the lock, deactivate it after unlocking. */
	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (slab != c->slab) {
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		goto reread_slab;
	}
	freelist = c->freelist;
	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);
	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
	deactivate_slab(s, slab, freelist);

new_slab:

	/* First choice: a slab from this cpu's partial list. */
	if (slub_percpu_partial(c)) {
		local_lock_irqsave(&s->cpu_slab->lock, flags);
		if (unlikely(c->slab)) {
			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
			goto reread_slab;
		}
		if (unlikely(!slub_percpu_partial(c))) {
			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
			/* we were preempted and partial list got empty */
			goto new_objects;
		}

		slab = c->slab = slub_percpu_partial(c);
		/*
		 * NOTE(review): presumably this advances the per-cpu partial
		 * list past 'slab' - confirm against the macro definition.
		 */
		slub_set_percpu_partial(c, slab);
		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
	}

new_objects:

	/* Second choice: a partial slab from the node lists. */
	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	freelist = get_partial(s, node, &pc);
	if (freelist)
		goto check_new_slab;

	/*
	 * Last resort: allocate a new slab. The per-cpu pointer is released
	 * around new_slab() and re-fetched afterwards, so 'c' may change.
	 */
	slub_put_cpu_ptr(s->cpu_slab);
	slab = new_slab(s, gfpflags, node);
	c = slub_get_cpu_ptr(s->cpu_slab);

	if (unlikely(!slab)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	stat(s, ALLOC_SLAB);

	if (kmem_cache_debug(s)) {
		/* Debug caches take a single object and never load a cpu slab. */
		freelist = alloc_single_from_new_slab(s, slab, orig_size);

		if (unlikely(!freelist))
			goto new_objects;

		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

	/*
	 * No other reference to the slab yet so we can
	 * muck around with it freely without cmpxchg
	 */
	freelist = slab->freelist;
	slab->freelist = NULL;
	slab->inuse = slab->objects;
	slab->frozen = 1;

	inc_slabs_node(s, slab_nid(slab), slab->objects);

check_new_slab:

	if (kmem_cache_debug(s)) {
		/*
		 * For debug caches here we had to go through
		 * alloc_single_from_partial() so just store the tracking info
		 * and return the object
		 */
		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr);

		return freelist;
	}

	if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
		/*
		 * For !pfmemalloc_match() case we don't load freelist so that
		 * we don't make further mismatched allocations easier.
		 */
		deactivate_slab(s, slab, get_freepointer(s, freelist));
		return freelist;
	}

retry_load_slab:

	local_lock_irqsave(&s->cpu_slab->lock, flags);
	if (unlikely(c->slab)) {
		/*
		 * A cpu slab got installed in the meantime: detach it,
		 * deactivate it outside the lock and try again.
		 */
		void *flush_freelist = c->freelist;
		struct slab *flush_slab = c->slab;

		c->slab = NULL;
		c->freelist = NULL;
		c->tid = next_tid(c->tid);

		local_unlock_irqrestore(&s->cpu_slab->lock, flags);

		deactivate_slab(s, flush_slab, flush_freelist);

		stat(s, CPUSLAB_FLUSH);

		goto retry_load_slab;
	}
	c->slab = slab;

	/* The lock stays held; load_freelist releases it. */
	goto load_freelist;
}

/*
 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
 * disabled. Compensates for possible cpu changes by refetching the per cpu area
 * pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	void *p;

#ifdef CONFIG_PREEMPT_COUNT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling preemption. Need to reload cpu area
	 * pointer.
	 */
	c = slub_get_cpu_ptr(s->cpu_slab);
#endif

	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
	/* Pairs with the slub_get_cpu_ptr() call above. */
	slub_put_cpu_ptr(s->cpu_slab);
#endif
	return p;
}

/*
 * Allocate one object from @s, preferring the per cpu freelist.
 *
 * The head of the per cpu freelist is popped with
 * __update_cpu_freelist_fast() when USE_LOCKLESS_FAST_PATH() is true, both
 * the per cpu freelist and the per cpu slab are set, and node_match() accepts
 * the slab for @node. In every other case the request is handed over to
 * __slab_alloc().
 *
 * Returns the object taken from the freelist, or whatever __slab_alloc()
 * returned.
 */
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct kmem_cache_cpu *c;
	struct slab *slab;
	unsigned long tid;
	void *object;

redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
	 * same cpu. We first read the kmem_cache_cpu pointer and use it to
	 * read the tid. If we are preempted and switched to another cpu
	 * between the two reads, it's OK as the two are still associated with
	 * the same cpu and the cmpxchg later will validate the cpu.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/*
	 * The irqless object alloc/free algorithm used here depends on the
	 * sequence in which cpu_slab's data is fetched. tid should be fetched
	 * before anything else on c to guarantee that an object and slab
	 * associated with a previous tid won't be used with the current tid.
	 * If we fetch tid first, the object and slab could be ones associated
	 * with the next tid and our alloc/free request will fail. In that
	 * case we simply retry, so this is not a problem.
	 */
	barrier();

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they can guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

	object = c->freelist;
	slab = c->slab;

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
			/* Lost the race on this cpu: start over from the tid read. */
			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	return object;
}
#else /* CONFIG_SLUB_TINY */
/*
 * CONFIG_SLUB_TINY variant: there is no per cpu fastpath here. Try
 * get_partial() first; if it yields no object, allocate a fresh slab with
 * new_slab() and take a single object from it.
 *
 * Returns NULL (after calling slab_out_of_memory()) when no new slab could be
 * allocated.
 */
static void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct partial_context pc;
	struct slab *slab;
	void *object;

	pc.flags = gfpflags;
	pc.slab = &slab;
	pc.orig_size = orig_size;
	object = get_partial(s, node, &pc);

	if (object)
		return object;

	slab = new_slab(s, gfpflags, node);
	if (unlikely(!slab)) {
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	object = alloc_single_from_new_slab(s, slab, orig_size);

	return object;
}
#endif /* CONFIG_SLUB_TINY */

/*
 * If the object has been wiped upon free, make sure it's fully initialized by
 * zeroing out freelist pointer.
 *
 * Only the sizeof(void *) bytes at s->offset are cleared, and only when
 * slab_want_init_on_free() is true and @obj is not NULL.
 */
static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
						   void *obj)
{
	if (unlikely(slab_want_init_on_free(s)) && obj)
		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
			0, sizeof(void *));
}

/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
3463 */ 3464 static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, 3465 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 3466 { 3467 void *object; 3468 struct obj_cgroup *objcg = NULL; 3469 bool init = false; 3470 3471 s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); 3472 if (!s) 3473 return NULL; 3474 3475 object = kfence_alloc(s, orig_size, gfpflags); 3476 if (unlikely(object)) 3477 goto out; 3478 3479 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); 3480 3481 maybe_wipe_obj_freeptr(s, object); 3482 init = slab_want_init_on_alloc(gfpflags, s); 3483 3484 out: 3485 /* 3486 * When init equals 'true', like for kzalloc() family, only 3487 * @orig_size bytes might be zeroed instead of s->object_size 3488 */ 3489 slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); 3490 3491 return object; 3492 } 3493 3494 static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, 3495 gfp_t gfpflags, unsigned long addr, size_t orig_size) 3496 { 3497 return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); 3498 } 3499 3500 static __fastpath_inline 3501 void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, 3502 gfp_t gfpflags) 3503 { 3504 void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); 3505 3506 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); 3507 3508 return ret; 3509 } 3510 3511 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 3512 { 3513 return __kmem_cache_alloc_lru(s, NULL, gfpflags); 3514 } 3515 EXPORT_SYMBOL(kmem_cache_alloc); 3516 3517 void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, 3518 gfp_t gfpflags) 3519 { 3520 return __kmem_cache_alloc_lru(s, lru, gfpflags); 3521 } 3522 EXPORT_SYMBOL(kmem_cache_alloc_lru); 3523 3524 void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, 3525 int node, size_t orig_size, 3526 unsigned long caller) 3527 { 3528 
return slab_alloc_node(s, NULL, gfpflags, node, 3529 caller, orig_size); 3530 } 3531 3532 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 3533 { 3534 void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); 3535 3536 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); 3537 3538 return ret; 3539 } 3540 EXPORT_SYMBOL(kmem_cache_alloc_node); 3541 3542 static noinline void free_to_partial_list( 3543 struct kmem_cache *s, struct slab *slab, 3544 void *head, void *tail, int bulk_cnt, 3545 unsigned long addr) 3546 { 3547 struct kmem_cache_node *n = get_node(s, slab_nid(slab)); 3548 struct slab *slab_free = NULL; 3549 int cnt = bulk_cnt; 3550 unsigned long flags; 3551 depot_stack_handle_t handle = 0; 3552 3553 if (s->flags & SLAB_STORE_USER) 3554 handle = set_track_prepare(); 3555 3556 spin_lock_irqsave(&n->list_lock, flags); 3557 3558 if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) { 3559 void *prior = slab->freelist; 3560 3561 /* Perform the actual freeing while we still hold the locks */ 3562 slab->inuse -= cnt; 3563 set_freepointer(s, tail, prior); 3564 slab->freelist = head; 3565 3566 /* 3567 * If the slab is empty, and node's partial list is full, 3568 * it should be discarded anyway no matter it's on full or 3569 * partial list. 
3570 */ 3571 if (slab->inuse == 0 && n->nr_partial >= s->min_partial) 3572 slab_free = slab; 3573 3574 if (!prior) { 3575 /* was on full list */ 3576 remove_full(s, n, slab); 3577 if (!slab_free) { 3578 add_partial(n, slab, DEACTIVATE_TO_TAIL); 3579 stat(s, FREE_ADD_PARTIAL); 3580 } 3581 } else if (slab_free) { 3582 remove_partial(n, slab); 3583 stat(s, FREE_REMOVE_PARTIAL); 3584 } 3585 } 3586 3587 if (slab_free) { 3588 /* 3589 * Update the counters while still holding n->list_lock to 3590 * prevent spurious validation warnings 3591 */ 3592 dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); 3593 } 3594 3595 spin_unlock_irqrestore(&n->list_lock, flags); 3596 3597 if (slab_free) { 3598 stat(s, FREE_SLAB); 3599 free_slab(s, slab_free); 3600 } 3601 } 3602 3603 /* 3604 * Slow path handling. This may still be called frequently since objects 3605 * have a longer lifetime than the cpu slabs in most processing loads. 3606 * 3607 * So we still attempt to reduce cache line usage. Just take the slab 3608 * lock and free the item. If there is no additional partial slab 3609 * handling required then we can return immediately. 
*/
static void __slab_free(struct kmem_cache *s, struct slab *slab,
			void *head, void *tail, int cnt,
			unsigned long addr)

{
	void *prior;
	int was_frozen;
	struct slab new;
	unsigned long counters;
	struct kmem_cache_node *n = NULL;
	unsigned long flags;

	stat(s, FREE_SLOWPATH);

	if (kfence_free(head))
		return;

	/* Tiny and debug caches always free under the node's list_lock. */
	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		free_to_partial_list(s, slab, head, tail, cnt, addr);
		return;
	}

	do {
		/* A previous iteration took the list_lock but lost the cmpxchg. */
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = slab->freelist;
		counters = slab->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {

			if (kmem_cache_has_cpu_partial(s) && !prior) {

				/*
				 * Slab was on no list before and will be
				 * partially empty.
				 * We can defer the list move and instead
				 * freeze it.
				 */
				new.frozen = 1;

			} else { /* Needs to be taken off a list */

				n = get_node(s, slab_nid(slab));
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

			}
		}

	} while (!slab_update_freelist(s, slab,
		prior, counters,
		head, new.counters,
		"__slab_free"));

	if (likely(!n)) {

		if (likely(was_frozen)) {
			/*
			 * The list lock was not taken therefore no list
			 * activity can be necessary.
			 */
			stat(s, FREE_FROZEN);
		} else if (new.frozen) {
			/*
			 * If we just froze the slab then put it onto the
			 * per cpu partial list.
			 */
			put_cpu_partial(s, slab, 1);
			stat(s, CPU_PARTIAL_FREE);
		}

		return;
	}

	/* From here on n->list_lock is held. */
	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it.
	 */
	if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
		remove_full(s, n, slab);
		add_partial(n, slab, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	if (prior) {
		/*
		 * Slab on the partial list.
		 */
		remove_partial(n, slab);
		stat(s, FREE_REMOVE_PARTIAL);
	} else {
		/* Slab must be on the full list */
		remove_full(s, n, slab);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, slab);
}

#ifndef CONFIG_SLUB_TINY
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 *
 * Bulk free of a freelist with several objects (all pointing to the
 * same slab) possible by specifying head and tail ptr, plus objects
 * count (cnt). Bulk free indicated by tail pointer being set.
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	/* A NULL @tail means a single object: @head is then also the tail. */
	void *tail_obj = tail ? : head;
	struct kmem_cache_cpu *c;
	unsigned long tid;
	void **freelist;

redo:
	/*
	 * Determine the current cpu's per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
	 * during the cmpxchg then the free will succeed.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/* Same with comment on barrier() in slab_alloc_node() */
	barrier();

	/* Not the current cpu slab: take the slow path. */
	if (unlikely(slab != c->slab)) {
		__slab_free(s, slab, head, tail_obj, cnt, addr);
		return;
	}

	if (USE_LOCKLESS_FAST_PATH()) {
		freelist = READ_ONCE(c->freelist);

		set_freepointer(s, tail_obj, freelist);

		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
	} else {
		/* Update the free list under the local lock */
		local_lock(&s->cpu_slab->lock);
		c = this_cpu_ptr(s->cpu_slab);
		/* Recheck under the lock; the cpu slab may have changed. */
		if (unlikely(slab != c->slab)) {
			local_unlock(&s->cpu_slab->lock);
			goto redo;
		}
		tid = c->tid;
		freelist = c->freelist;

		set_freepointer(s, tail_obj, freelist);
		c->freelist = head;
		c->tid = next_tid(tid);

		local_unlock(&s->cpu_slab->lock);
	}
	stat(s, FREE_FASTPATH);
}
#else /* CONFIG_SLUB_TINY */
/* CONFIG_SLUB_TINY variant: no per cpu fastpath, always use __slab_free(). */
static void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	void *tail_obj = tail ? : head;

	__slab_free(s, slab, head, tail_obj, cnt, addr);
}
#endif /* CONFIG_SLUB_TINY */

/*
 * Run the memcg and freelist free hooks, then free whatever
 * slab_free_freelist_hook() left on the list. Nothing is freed when that
 * hook returns false.
 */
static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab,
				      void *head, void *tail, void **p, int cnt,
				      unsigned long addr)
{
	memcg_slab_free_hook(s, slab, p, cnt);
	/*
	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
	 * to remove objects, whose reuse must be delayed.
	 */
	if (slab_free_freelist_hook(s, &head, &tail, &cnt))
		do_slab_free(s, slab, head, tail, cnt, addr);
}

#ifdef CONFIG_KASAN_GENERIC
/* Free a single object via do_slab_free() directly, bypassing the hooks in slab_free(). */
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
	do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
}
#endif

/* Free the single object @x to @s, recording @caller as the freeing address. */
void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
{
	slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
}

/*
 * Free the single object @x. The cache actually used is the one returned by
 * cache_from_obj(); nothing is freed when that returns NULL.
 */
void kmem_cache_free(struct kmem_cache *s, void *x)
{
	s = cache_from_obj(s, x);
	if (!s)
		return;
	trace_kmem_cache_free(_RET_IP_, x, s);
	slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);

/* A chain of objects from one slab, as filled in by build_detached_freelist(). */
struct detached_freelist {
	struct slab *slab;	/* slab the objects belong to, NULL if nothing to free */
	void *tail;		/* last object of the chain */
	void *freelist;		/* first object of the chain */
	int cnt;		/* number of objects in the chain */
	struct kmem_cache *s;	/* cache the objects are freed to */
};

/*
 * This function progressively scans the array with free objects (with
 * a limited look ahead) and extract objects belonging to the same
 * slab. It builds a detached freelist directly within the given
 * slab/objects. This can happen without any need for
 * synchronization, because the objects are owned by running process.
 * The freelist is built up as a single linked list in the objects.
 * The idea is, that this detached freelist can then be bulk
 * transferred to the real freelist(s), but only requiring a single
 * synchronization primitive.
Look ahead in the array is limited due 3864 * to performance reasons. 3865 */ 3866 static inline 3867 int build_detached_freelist(struct kmem_cache *s, size_t size, 3868 void **p, struct detached_freelist *df) 3869 { 3870 int lookahead = 3; 3871 void *object; 3872 struct folio *folio; 3873 size_t same; 3874 3875 object = p[--size]; 3876 folio = virt_to_folio(object); 3877 if (!s) { 3878 /* Handle kalloc'ed objects */ 3879 if (unlikely(!folio_test_slab(folio))) { 3880 free_large_kmalloc(folio, object); 3881 df->slab = NULL; 3882 return size; 3883 } 3884 /* Derive kmem_cache from object */ 3885 df->slab = folio_slab(folio); 3886 df->s = df->slab->slab_cache; 3887 } else { 3888 df->slab = folio_slab(folio); 3889 df->s = cache_from_obj(s, object); /* Support for memcg */ 3890 } 3891 3892 /* Start new detached freelist */ 3893 df->tail = object; 3894 df->freelist = object; 3895 df->cnt = 1; 3896 3897 if (is_kfence_address(object)) 3898 return size; 3899 3900 set_freepointer(df->s, object, NULL); 3901 3902 same = size; 3903 while (size) { 3904 object = p[--size]; 3905 /* df->slab is always set at this point */ 3906 if (df->slab == virt_to_slab(object)) { 3907 /* Opportunity build freelist */ 3908 set_freepointer(df->s, object, df->freelist); 3909 df->freelist = object; 3910 df->cnt++; 3911 same--; 3912 if (size != same) 3913 swap(p[size], p[same]); 3914 continue; 3915 } 3916 3917 /* Limit look ahead search */ 3918 if (!--lookahead) 3919 break; 3920 } 3921 3922 return same; 3923 } 3924 3925 /* Note that interrupts must be enabled when calling this function. 
*/ 3926 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 3927 { 3928 if (!size) 3929 return; 3930 3931 do { 3932 struct detached_freelist df; 3933 3934 size = build_detached_freelist(s, size, p, &df); 3935 if (!df.slab) 3936 continue; 3937 3938 slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt, 3939 _RET_IP_); 3940 } while (likely(size)); 3941 } 3942 EXPORT_SYMBOL(kmem_cache_free_bulk); 3943 3944 #ifndef CONFIG_SLUB_TINY 3945 static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 3946 size_t size, void **p, struct obj_cgroup *objcg) 3947 { 3948 struct kmem_cache_cpu *c; 3949 unsigned long irqflags; 3950 int i; 3951 3952 /* 3953 * Drain objects in the per cpu slab, while disabling local 3954 * IRQs, which protects against PREEMPT and interrupts 3955 * handlers invoking normal fastpath. 3956 */ 3957 c = slub_get_cpu_ptr(s->cpu_slab); 3958 local_lock_irqsave(&s->cpu_slab->lock, irqflags); 3959 3960 for (i = 0; i < size; i++) { 3961 void *object = kfence_alloc(s, s->object_size, flags); 3962 3963 if (unlikely(object)) { 3964 p[i] = object; 3965 continue; 3966 } 3967 3968 object = c->freelist; 3969 if (unlikely(!object)) { 3970 /* 3971 * We may have removed an object from c->freelist using 3972 * the fastpath in the previous iteration; in that case, 3973 * c->tid has not been bumped yet. 3974 * Since ___slab_alloc() may reenable interrupts while 3975 * allocating memory, we should bump c->tid now. 
3976 */ 3977 c->tid = next_tid(c->tid); 3978 3979 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 3980 3981 /* 3982 * Invoking slow path likely have side-effect 3983 * of re-populating per CPU c->freelist 3984 */ 3985 p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, 3986 _RET_IP_, c, s->object_size); 3987 if (unlikely(!p[i])) 3988 goto error; 3989 3990 c = this_cpu_ptr(s->cpu_slab); 3991 maybe_wipe_obj_freeptr(s, p[i]); 3992 3993 local_lock_irqsave(&s->cpu_slab->lock, irqflags); 3994 3995 continue; /* goto for-loop */ 3996 } 3997 c->freelist = get_freepointer(s, object); 3998 p[i] = object; 3999 maybe_wipe_obj_freeptr(s, p[i]); 4000 } 4001 c->tid = next_tid(c->tid); 4002 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 4003 slub_put_cpu_ptr(s->cpu_slab); 4004 4005 return i; 4006 4007 error: 4008 slub_put_cpu_ptr(s->cpu_slab); 4009 slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); 4010 kmem_cache_free_bulk(s, i, p); 4011 return 0; 4012 4013 } 4014 #else /* CONFIG_SLUB_TINY */ 4015 static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 4016 size_t size, void **p, struct obj_cgroup *objcg) 4017 { 4018 int i; 4019 4020 for (i = 0; i < size; i++) { 4021 void *object = kfence_alloc(s, s->object_size, flags); 4022 4023 if (unlikely(object)) { 4024 p[i] = object; 4025 continue; 4026 } 4027 4028 p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, 4029 _RET_IP_, s->object_size); 4030 if (unlikely(!p[i])) 4031 goto error; 4032 4033 maybe_wipe_obj_freeptr(s, p[i]); 4034 } 4035 4036 return i; 4037 4038 error: 4039 slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); 4040 kmem_cache_free_bulk(s, i, p); 4041 return 0; 4042 } 4043 #endif /* CONFIG_SLUB_TINY */ 4044 4045 /* Note that interrupts must be enabled when calling this function. 
*/ 4046 int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 4047 void **p) 4048 { 4049 int i; 4050 struct obj_cgroup *objcg = NULL; 4051 4052 if (!size) 4053 return 0; 4054 4055 /* memcg and kmem_cache debug support */ 4056 s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); 4057 if (unlikely(!s)) 4058 return 0; 4059 4060 i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); 4061 4062 /* 4063 * memcg and kmem_cache debug support and memory initialization. 4064 * Done outside of the IRQ disabled fastpath loop. 4065 */ 4066 if (i != 0) 4067 slab_post_alloc_hook(s, objcg, flags, size, p, 4068 slab_want_init_on_alloc(flags, s), s->object_size); 4069 return i; 4070 } 4071 EXPORT_SYMBOL(kmem_cache_alloc_bulk); 4072 4073 4074 /* 4075 * Object placement in a slab is made very easy because we always start at 4076 * offset 0. If we tune the size of the object to the alignment then we can 4077 * get the required alignment by putting one properly sized object after 4078 * another. 4079 * 4080 * Notice that the allocation order determines the sizes of the per cpu 4081 * caches. Each processor has always one slab available for allocations. 4082 * Increasing the allocation order reduces the number of times that slabs 4083 * must be moved on and off the partial lists and is therefore a factor in 4084 * locking overhead. 4085 */ 4086 4087 /* 4088 * Minimum / Maximum order of slab pages. This influences locking overhead 4089 * and slab fragmentation. A higher order reduces the number of partial slabs 4090 * and increases the number of allocations possible without having to 4091 * take the list_lock. 4092 */ 4093 static unsigned int slub_min_order; 4094 static unsigned int slub_max_order = 4095 IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; 4096 static unsigned int slub_min_objects; 4097 4098 /* 4099 * Calculate the order of allocation given an slab object size. 
 *
 * The order of allocation has significant impact on performance and other
 * system components. Generally order 0 allocations should be preferred since
 * order 0 does not cause fragmentation in the page allocator. Larger objects
 * may be problematic to put into order 0 slabs because there may be too much
 * unused space left. We go to a higher order if more than 1/16th of the slab
 * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
 * number of objects is in one slab. Otherwise we may generate too much
 * activity on the partial lists which requires taking the list_lock. This is
 * less of a concern for large slabs though, which are rarely used.
 *
 * slub_max_order specifies the order where we begin to stop considering the
 * number of objects in a slab as critical. If we reach slub_max_order then
 * we try to keep the page order as low as possible. So we accept more waste
 * of space in favor of a small page order.
 *
 * Higher order allocations also allow the placement of more objects in a
 * slab and thereby reduce object handling overhead. If the user has
 * requested a higher minimum order then we start with that one instead of
 * the smallest order which will fit the object.
4122 */ 4123 static inline unsigned int calc_slab_order(unsigned int size, 4124 unsigned int min_order, unsigned int max_order, 4125 unsigned int fract_leftover) 4126 { 4127 unsigned int order; 4128 4129 for (order = min_order; order <= max_order; order++) { 4130 4131 unsigned int slab_size = (unsigned int)PAGE_SIZE << order; 4132 unsigned int rem; 4133 4134 rem = slab_size % size; 4135 4136 if (rem <= slab_size / fract_leftover) 4137 break; 4138 } 4139 4140 return order; 4141 } 4142 4143 static inline int calculate_order(unsigned int size) 4144 { 4145 unsigned int order; 4146 unsigned int min_objects; 4147 unsigned int max_objects; 4148 unsigned int min_order; 4149 4150 min_objects = slub_min_objects; 4151 if (!min_objects) { 4152 /* 4153 * Some architectures will only update present cpus when 4154 * onlining them, so don't trust the number if it's just 1. But 4155 * we also don't want to use nr_cpu_ids always, as on some other 4156 * architectures, there can be many possible cpus, but never 4157 * onlined. Here we compromise between trying to avoid too high 4158 * order on systems that appear larger than they are, and too 4159 * low order on systems that appear smaller than they are. 4160 */ 4161 unsigned int nr_cpus = num_present_cpus(); 4162 if (nr_cpus <= 1) 4163 nr_cpus = nr_cpu_ids; 4164 min_objects = 4 * (fls(nr_cpus) + 1); 4165 } 4166 /* min_objects can't be 0 because get_order(0) is undefined */ 4167 max_objects = max(order_objects(slub_max_order, size), 1U); 4168 min_objects = min(min_objects, max_objects); 4169 4170 min_order = max_t(unsigned int, slub_min_order, 4171 get_order(min_objects * size)); 4172 if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) 4173 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 4174 4175 /* 4176 * Attempt to find best configuration for a slab. This works by first 4177 * attempting to generate a layout with the best possible configuration 4178 * and backing off gradually. 
4179 * 4180 * We start with accepting at most 1/16 waste and try to find the 4181 * smallest order from min_objects-derived/slub_min_order up to 4182 * slub_max_order that will satisfy the constraint. Note that increasing 4183 * the order can only result in same or less fractional waste, not more. 4184 * 4185 * If that fails, we increase the acceptable fraction of waste and try 4186 * again. The last iteration with fraction of 1/2 would effectively 4187 * accept any waste and give us the order determined by min_objects, as 4188 * long as at least single object fits within slub_max_order. 4189 */ 4190 for (unsigned int fraction = 16; fraction > 1; fraction /= 2) { 4191 order = calc_slab_order(size, min_order, slub_max_order, 4192 fraction); 4193 if (order <= slub_max_order) 4194 return order; 4195 } 4196 4197 /* 4198 * Doh this slab cannot be placed using slub_max_order. 4199 */ 4200 order = get_order(size); 4201 if (order <= MAX_ORDER) 4202 return order; 4203 return -ENOSYS; 4204 } 4205 4206 static void 4207 init_kmem_cache_node(struct kmem_cache_node *n) 4208 { 4209 n->nr_partial = 0; 4210 spin_lock_init(&n->list_lock); 4211 INIT_LIST_HEAD(&n->partial); 4212 #ifdef CONFIG_SLUB_DEBUG 4213 atomic_long_set(&n->nr_slabs, 0); 4214 atomic_long_set(&n->total_objects, 0); 4215 INIT_LIST_HEAD(&n->full); 4216 #endif 4217 } 4218 4219 #ifndef CONFIG_SLUB_TINY 4220 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 4221 { 4222 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 4223 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * 4224 sizeof(struct kmem_cache_cpu)); 4225 4226 /* 4227 * Must align to double word boundary for the double cmpxchg 4228 * instructions to work; see __pcpu_double_call_return_bool(). 
4229 */ 4230 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 4231 2 * sizeof(void *)); 4232 4233 if (!s->cpu_slab) 4234 return 0; 4235 4236 init_kmem_cache_cpus(s); 4237 4238 return 1; 4239 } 4240 #else 4241 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 4242 { 4243 return 1; 4244 } 4245 #endif /* CONFIG_SLUB_TINY */ 4246 4247 static struct kmem_cache *kmem_cache_node; 4248 4249 /* 4250 * No kmalloc_node yet so do it by hand. We know that this is the first 4251 * slab on the node for this slabcache. There are no concurrent accesses 4252 * possible. 4253 * 4254 * Note that this function only works on the kmem_cache_node 4255 * when allocating for the kmem_cache_node. This is used for bootstrapping 4256 * memory on a fresh node that has no slab structures yet. 4257 */ 4258 static void early_kmem_cache_node_alloc(int node) 4259 { 4260 struct slab *slab; 4261 struct kmem_cache_node *n; 4262 4263 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 4264 4265 slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); 4266 4267 BUG_ON(!slab); 4268 inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); 4269 if (slab_nid(slab) != node) { 4270 pr_err("SLUB: Unable to allocate memory from node %d\n", node); 4271 pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); 4272 } 4273 4274 n = slab->freelist; 4275 BUG_ON(!n); 4276 #ifdef CONFIG_SLUB_DEBUG 4277 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 4278 init_tracking(kmem_cache_node, n); 4279 #endif 4280 n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); 4281 slab->freelist = get_freepointer(kmem_cache_node, n); 4282 slab->inuse = 1; 4283 kmem_cache_node->node[node] = n; 4284 init_kmem_cache_node(n); 4285 inc_slabs_node(kmem_cache_node, node, slab->objects); 4286 4287 /* 4288 * No locks need to be taken here as it has just been 4289 * initialized and there is no concurrent access. 
4290 */ 4291 __add_partial(n, slab, DEACTIVATE_TO_HEAD); 4292 } 4293 4294 static void free_kmem_cache_nodes(struct kmem_cache *s) 4295 { 4296 int node; 4297 struct kmem_cache_node *n; 4298 4299 for_each_kmem_cache_node(s, node, n) { 4300 s->node[node] = NULL; 4301 kmem_cache_free(kmem_cache_node, n); 4302 } 4303 } 4304 4305 void __kmem_cache_release(struct kmem_cache *s) 4306 { 4307 cache_random_seq_destroy(s); 4308 #ifndef CONFIG_SLUB_TINY 4309 free_percpu(s->cpu_slab); 4310 #endif 4311 free_kmem_cache_nodes(s); 4312 } 4313 4314 static int init_kmem_cache_nodes(struct kmem_cache *s) 4315 { 4316 int node; 4317 4318 for_each_node_mask(node, slab_nodes) { 4319 struct kmem_cache_node *n; 4320 4321 if (slab_state == DOWN) { 4322 early_kmem_cache_node_alloc(node); 4323 continue; 4324 } 4325 n = kmem_cache_alloc_node(kmem_cache_node, 4326 GFP_KERNEL, node); 4327 4328 if (!n) { 4329 free_kmem_cache_nodes(s); 4330 return 0; 4331 } 4332 4333 init_kmem_cache_node(n); 4334 s->node[node] = n; 4335 } 4336 return 1; 4337 } 4338 4339 static void set_cpu_partial(struct kmem_cache *s) 4340 { 4341 #ifdef CONFIG_SLUB_CPU_PARTIAL 4342 unsigned int nr_objects; 4343 4344 /* 4345 * cpu_partial determined the maximum number of objects kept in the 4346 * per cpu partial lists of a processor. 4347 * 4348 * Per cpu partial lists mainly contain slabs that just have one 4349 * object freed. If they are used for allocation then they can be 4350 * filled up again with minimal effort. The slab will never hit the 4351 * per node partial lists and therefore no locking will be required. 
4352 * 4353 * For backwards compatibility reasons, this is determined as number 4354 * of objects, even though we now limit maximum number of pages, see 4355 * slub_set_cpu_partial() 4356 */ 4357 if (!kmem_cache_has_cpu_partial(s)) 4358 nr_objects = 0; 4359 else if (s->size >= PAGE_SIZE) 4360 nr_objects = 6; 4361 else if (s->size >= 1024) 4362 nr_objects = 24; 4363 else if (s->size >= 256) 4364 nr_objects = 52; 4365 else 4366 nr_objects = 120; 4367 4368 slub_set_cpu_partial(s, nr_objects); 4369 #endif 4370 } 4371 4372 /* 4373 * calculate_sizes() determines the order and the distribution of data within 4374 * a slab object. 4375 */ 4376 static int calculate_sizes(struct kmem_cache *s) 4377 { 4378 slab_flags_t flags = s->flags; 4379 unsigned int size = s->object_size; 4380 unsigned int order; 4381 4382 /* 4383 * Round up object size to the next word boundary. We can only 4384 * place the free pointer at word boundaries and this determines 4385 * the possible location of the free pointer. 4386 */ 4387 size = ALIGN(size, sizeof(void *)); 4388 4389 #ifdef CONFIG_SLUB_DEBUG 4390 /* 4391 * Determine if we can poison the object itself. If the user of 4392 * the slab may touch the object after free or before allocation 4393 * then we should never poison the object itself. 4394 */ 4395 if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && 4396 !s->ctor) 4397 s->flags |= __OBJECT_POISON; 4398 else 4399 s->flags &= ~__OBJECT_POISON; 4400 4401 4402 /* 4403 * If we are Redzoning then check if there is some space between the 4404 * end of the object and the free pointer. If not then add an 4405 * additional word to have some bytes to store Redzone information. 4406 */ 4407 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 4408 size += sizeof(void *); 4409 #endif 4410 4411 /* 4412 * With that we have determined the number of bytes in actual use 4413 * by the object and redzoning. 
4414 */ 4415 s->inuse = size; 4416 4417 if (slub_debug_orig_size(s) || 4418 (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || 4419 ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || 4420 s->ctor) { 4421 /* 4422 * Relocate free pointer after the object if it is not 4423 * permitted to overwrite the first word of the object on 4424 * kmem_cache_free. 4425 * 4426 * This is the case if we do RCU, have a constructor or 4427 * destructor, are poisoning the objects, or are 4428 * redzoning an object smaller than sizeof(void *). 4429 * 4430 * The assumption that s->offset >= s->inuse means free 4431 * pointer is outside of the object is used in the 4432 * freeptr_outside_object() function. If that is no 4433 * longer true, the function needs to be modified. 4434 */ 4435 s->offset = size; 4436 size += sizeof(void *); 4437 } else { 4438 /* 4439 * Store freelist pointer near middle of object to keep 4440 * it away from the edges of the object to avoid small 4441 * sized over/underflows from neighboring allocations. 4442 */ 4443 s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); 4444 } 4445 4446 #ifdef CONFIG_SLUB_DEBUG 4447 if (flags & SLAB_STORE_USER) { 4448 /* 4449 * Need to store information about allocs and frees after 4450 * the object. 4451 */ 4452 size += 2 * sizeof(struct track); 4453 4454 /* Save the original kmalloc request size */ 4455 if (flags & SLAB_KMALLOC) 4456 size += sizeof(unsigned int); 4457 } 4458 #endif 4459 4460 kasan_cache_create(s, &size, &s->flags); 4461 #ifdef CONFIG_SLUB_DEBUG 4462 if (flags & SLAB_RED_ZONE) { 4463 /* 4464 * Add some empty padding so that we can catch 4465 * overwrites from earlier objects rather than let 4466 * tracking information or the free pointer be 4467 * corrupted if a user writes before the start 4468 * of the object. 
4469 */ 4470 size += sizeof(void *); 4471 4472 s->red_left_pad = sizeof(void *); 4473 s->red_left_pad = ALIGN(s->red_left_pad, s->align); 4474 size += s->red_left_pad; 4475 } 4476 #endif 4477 4478 /* 4479 * SLUB stores one object immediately after another beginning from 4480 * offset 0. In order to align the objects we have to simply size 4481 * each object to conform to the alignment. 4482 */ 4483 size = ALIGN(size, s->align); 4484 s->size = size; 4485 s->reciprocal_size = reciprocal_value(size); 4486 order = calculate_order(size); 4487 4488 if ((int)order < 0) 4489 return 0; 4490 4491 s->allocflags = 0; 4492 if (order) 4493 s->allocflags |= __GFP_COMP; 4494 4495 if (s->flags & SLAB_CACHE_DMA) 4496 s->allocflags |= GFP_DMA; 4497 4498 if (s->flags & SLAB_CACHE_DMA32) 4499 s->allocflags |= GFP_DMA32; 4500 4501 if (s->flags & SLAB_RECLAIM_ACCOUNT) 4502 s->allocflags |= __GFP_RECLAIMABLE; 4503 4504 /* 4505 * Determine the number of objects per slab 4506 */ 4507 s->oo = oo_make(order, size); 4508 s->min = oo_make(get_order(size), size); 4509 4510 return !!oo_objects(s->oo); 4511 } 4512 4513 static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) 4514 { 4515 s->flags = kmem_cache_flags(s->size, flags, s->name); 4516 #ifdef CONFIG_SLAB_FREELIST_HARDENED 4517 s->random = get_random_long(); 4518 #endif 4519 4520 if (!calculate_sizes(s)) 4521 goto error; 4522 if (disable_higher_order_debug) { 4523 /* 4524 * Disable debugging flags that store metadata if the min slab 4525 * order increased. 
4526 */ 4527 if (get_order(s->size) > get_order(s->object_size)) { 4528 s->flags &= ~DEBUG_METADATA_FLAGS; 4529 s->offset = 0; 4530 if (!calculate_sizes(s)) 4531 goto error; 4532 } 4533 } 4534 4535 #ifdef system_has_freelist_aba 4536 if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { 4537 /* Enable fast mode */ 4538 s->flags |= __CMPXCHG_DOUBLE; 4539 } 4540 #endif 4541 4542 /* 4543 * The larger the object size is, the more slabs we want on the partial 4544 * list to avoid pounding the page allocator excessively. 4545 */ 4546 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); 4547 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); 4548 4549 set_cpu_partial(s); 4550 4551 #ifdef CONFIG_NUMA 4552 s->remote_node_defrag_ratio = 1000; 4553 #endif 4554 4555 /* Initialize the pre-computed randomized freelist if slab is up */ 4556 if (slab_state >= UP) { 4557 if (init_cache_random_seq(s)) 4558 goto error; 4559 } 4560 4561 if (!init_kmem_cache_nodes(s)) 4562 goto error; 4563 4564 if (alloc_kmem_cache_cpus(s)) 4565 return 0; 4566 4567 error: 4568 __kmem_cache_release(s); 4569 return -EINVAL; 4570 } 4571 4572 static void list_slab_objects(struct kmem_cache *s, struct slab *slab, 4573 const char *text) 4574 { 4575 #ifdef CONFIG_SLUB_DEBUG 4576 void *addr = slab_address(slab); 4577 void *p; 4578 4579 slab_err(s, slab, text, s->name); 4580 4581 spin_lock(&object_map_lock); 4582 __fill_map(object_map, s, slab); 4583 4584 for_each_object(p, s, addr, slab->objects) { 4585 4586 if (!test_bit(__obj_to_index(s, addr, p), object_map)) { 4587 pr_err("Object 0x%p @offset=%tu\n", p, p - addr); 4588 print_tracking(s, p); 4589 } 4590 } 4591 spin_unlock(&object_map_lock); 4592 #endif 4593 } 4594 4595 /* 4596 * Attempt to free all partial slabs on a node. 4597 * This is called from __kmem_cache_shutdown(). We must take list_lock 4598 * because sysfs file might still access partial list after the shutdowning. 
4599 */ 4600 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 4601 { 4602 LIST_HEAD(discard); 4603 struct slab *slab, *h; 4604 4605 BUG_ON(irqs_disabled()); 4606 spin_lock_irq(&n->list_lock); 4607 list_for_each_entry_safe(slab, h, &n->partial, slab_list) { 4608 if (!slab->inuse) { 4609 remove_partial(n, slab); 4610 list_add(&slab->slab_list, &discard); 4611 } else { 4612 list_slab_objects(s, slab, 4613 "Objects remaining in %s on __kmem_cache_shutdown()"); 4614 } 4615 } 4616 spin_unlock_irq(&n->list_lock); 4617 4618 list_for_each_entry_safe(slab, h, &discard, slab_list) 4619 discard_slab(s, slab); 4620 } 4621 4622 bool __kmem_cache_empty(struct kmem_cache *s) 4623 { 4624 int node; 4625 struct kmem_cache_node *n; 4626 4627 for_each_kmem_cache_node(s, node, n) 4628 if (n->nr_partial || node_nr_slabs(n)) 4629 return false; 4630 return true; 4631 } 4632 4633 /* 4634 * Release all resources used by a slab cache. 4635 */ 4636 int __kmem_cache_shutdown(struct kmem_cache *s) 4637 { 4638 int node; 4639 struct kmem_cache_node *n; 4640 4641 flush_all_cpus_locked(s); 4642 /* Attempt to free all objects */ 4643 for_each_kmem_cache_node(s, node, n) { 4644 free_partial(s, n); 4645 if (n->nr_partial || node_nr_slabs(n)) 4646 return 1; 4647 } 4648 return 0; 4649 } 4650 4651 #ifdef CONFIG_PRINTK 4652 void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) 4653 { 4654 void *base; 4655 int __maybe_unused i; 4656 unsigned int objnr; 4657 void *objp; 4658 void *objp0; 4659 struct kmem_cache *s = slab->slab_cache; 4660 struct track __maybe_unused *trackp; 4661 4662 kpp->kp_ptr = object; 4663 kpp->kp_slab = slab; 4664 kpp->kp_slab_cache = s; 4665 base = slab_address(slab); 4666 objp0 = kasan_reset_tag(object); 4667 #ifdef CONFIG_SLUB_DEBUG 4668 objp = restore_red_left(s, objp0); 4669 #else 4670 objp = objp0; 4671 #endif 4672 objnr = obj_to_index(s, slab, objp); 4673 kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); 4674 
objp = base + s->size * objnr; 4675 kpp->kp_objp = objp; 4676 if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size 4677 || (objp - base) % s->size) || 4678 !(s->flags & SLAB_STORE_USER)) 4679 return; 4680 #ifdef CONFIG_SLUB_DEBUG 4681 objp = fixup_red_left(s, objp); 4682 trackp = get_track(s, objp, TRACK_ALLOC); 4683 kpp->kp_ret = (void *)trackp->addr; 4684 #ifdef CONFIG_STACKDEPOT 4685 { 4686 depot_stack_handle_t handle; 4687 unsigned long *entries; 4688 unsigned int nr_entries; 4689 4690 handle = READ_ONCE(trackp->handle); 4691 if (handle) { 4692 nr_entries = stack_depot_fetch(handle, &entries); 4693 for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) 4694 kpp->kp_stack[i] = (void *)entries[i]; 4695 } 4696 4697 trackp = get_track(s, objp, TRACK_FREE); 4698 handle = READ_ONCE(trackp->handle); 4699 if (handle) { 4700 nr_entries = stack_depot_fetch(handle, &entries); 4701 for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) 4702 kpp->kp_free_stack[i] = (void *)entries[i]; 4703 } 4704 } 4705 #endif 4706 #endif 4707 } 4708 #endif 4709 4710 /******************************************************************** 4711 * Kmalloc subsystem 4712 *******************************************************************/ 4713 4714 static int __init setup_slub_min_order(char *str) 4715 { 4716 get_option(&str, (int *)&slub_min_order); 4717 4718 if (slub_min_order > slub_max_order) 4719 slub_max_order = slub_min_order; 4720 4721 return 1; 4722 } 4723 4724 __setup("slub_min_order=", setup_slub_min_order); 4725 4726 static int __init setup_slub_max_order(char *str) 4727 { 4728 get_option(&str, (int *)&slub_max_order); 4729 slub_max_order = min_t(unsigned int, slub_max_order, MAX_ORDER); 4730 4731 if (slub_min_order > slub_max_order) 4732 slub_min_order = slub_max_order; 4733 4734 return 1; 4735 } 4736 4737 __setup("slub_max_order=", setup_slub_max_order); 4738 4739 static int __init setup_slub_min_objects(char *str) 4740 { 4741 get_option(&str, (int 
*)&slub_min_objects); 4742 4743 return 1; 4744 } 4745 4746 __setup("slub_min_objects=", setup_slub_min_objects); 4747 4748 #ifdef CONFIG_HARDENED_USERCOPY 4749 /* 4750 * Rejects incorrectly sized objects and objects that are to be copied 4751 * to/from userspace but do not fall entirely within the containing slab 4752 * cache's usercopy region. 4753 * 4754 * Returns NULL if check passes, otherwise const char * to name of cache 4755 * to indicate an error. 4756 */ 4757 void __check_heap_object(const void *ptr, unsigned long n, 4758 const struct slab *slab, bool to_user) 4759 { 4760 struct kmem_cache *s; 4761 unsigned int offset; 4762 bool is_kfence = is_kfence_address(ptr); 4763 4764 ptr = kasan_reset_tag(ptr); 4765 4766 /* Find object and usable object size. */ 4767 s = slab->slab_cache; 4768 4769 /* Reject impossible pointers. */ 4770 if (ptr < slab_address(slab)) 4771 usercopy_abort("SLUB object not in SLUB page?!", NULL, 4772 to_user, 0, n); 4773 4774 /* Find offset within object. */ 4775 if (is_kfence) 4776 offset = ptr - kfence_object_start(ptr); 4777 else 4778 offset = (ptr - slab_address(slab)) % s->size; 4779 4780 /* Adjust for redzone and reject if within the redzone. */ 4781 if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { 4782 if (offset < s->red_left_pad) 4783 usercopy_abort("SLUB object in left red zone", 4784 s->name, to_user, offset, n); 4785 offset -= s->red_left_pad; 4786 } 4787 4788 /* Allow address range falling entirely within usercopy region. */ 4789 if (offset >= s->useroffset && 4790 offset - s->useroffset <= s->usersize && 4791 n <= s->useroffset - offset + s->usersize) 4792 return; 4793 4794 usercopy_abort("SLUB object", s->name, to_user, offset, n); 4795 } 4796 #endif /* CONFIG_HARDENED_USERCOPY */ 4797 4798 #define SHRINK_PROMOTE_MAX 32 4799 4800 /* 4801 * kmem_cache_shrink discards empty slabs and promotes the slabs filled 4802 * up most to the head of the partial lists. 
New allocations will then 4803 * fill those up and thus they can be removed from the partial lists. 4804 * 4805 * The slabs with the least items are placed last. This results in them 4806 * being allocated from last increasing the chance that the last objects 4807 * are freed in them. 4808 */ 4809 static int __kmem_cache_do_shrink(struct kmem_cache *s) 4810 { 4811 int node; 4812 int i; 4813 struct kmem_cache_node *n; 4814 struct slab *slab; 4815 struct slab *t; 4816 struct list_head discard; 4817 struct list_head promote[SHRINK_PROMOTE_MAX]; 4818 unsigned long flags; 4819 int ret = 0; 4820 4821 for_each_kmem_cache_node(s, node, n) { 4822 INIT_LIST_HEAD(&discard); 4823 for (i = 0; i < SHRINK_PROMOTE_MAX; i++) 4824 INIT_LIST_HEAD(promote + i); 4825 4826 spin_lock_irqsave(&n->list_lock, flags); 4827 4828 /* 4829 * Build lists of slabs to discard or promote. 4830 * 4831 * Note that concurrent frees may occur while we hold the 4832 * list_lock. slab->inuse here is the upper limit. 4833 */ 4834 list_for_each_entry_safe(slab, t, &n->partial, slab_list) { 4835 int free = slab->objects - slab->inuse; 4836 4837 /* Do not reread slab->inuse */ 4838 barrier(); 4839 4840 /* We do not keep full slabs on the list */ 4841 BUG_ON(free <= 0); 4842 4843 if (free == slab->objects) { 4844 list_move(&slab->slab_list, &discard); 4845 n->nr_partial--; 4846 dec_slabs_node(s, node, slab->objects); 4847 } else if (free <= SHRINK_PROMOTE_MAX) 4848 list_move(&slab->slab_list, promote + free - 1); 4849 } 4850 4851 /* 4852 * Promote the slabs filled up most to the head of the 4853 * partial list. 
4854 */ 4855 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) 4856 list_splice(promote + i, &n->partial); 4857 4858 spin_unlock_irqrestore(&n->list_lock, flags); 4859 4860 /* Release empty slabs */ 4861 list_for_each_entry_safe(slab, t, &discard, slab_list) 4862 free_slab(s, slab); 4863 4864 if (node_nr_slabs(n)) 4865 ret = 1; 4866 } 4867 4868 return ret; 4869 } 4870 4871 int __kmem_cache_shrink(struct kmem_cache *s) 4872 { 4873 flush_all(s); 4874 return __kmem_cache_do_shrink(s); 4875 } 4876 4877 static int slab_mem_going_offline_callback(void *arg) 4878 { 4879 struct kmem_cache *s; 4880 4881 mutex_lock(&slab_mutex); 4882 list_for_each_entry(s, &slab_caches, list) { 4883 flush_all_cpus_locked(s); 4884 __kmem_cache_do_shrink(s); 4885 } 4886 mutex_unlock(&slab_mutex); 4887 4888 return 0; 4889 } 4890 4891 static void slab_mem_offline_callback(void *arg) 4892 { 4893 struct memory_notify *marg = arg; 4894 int offline_node; 4895 4896 offline_node = marg->status_change_nid_normal; 4897 4898 /* 4899 * If the node still has available memory. we need kmem_cache_node 4900 * for it yet. 4901 */ 4902 if (offline_node < 0) 4903 return; 4904 4905 mutex_lock(&slab_mutex); 4906 node_clear(offline_node, slab_nodes); 4907 /* 4908 * We no longer free kmem_cache_node structures here, as it would be 4909 * racy with all get_node() users, and infeasible to protect them with 4910 * slab_mutex. 4911 */ 4912 mutex_unlock(&slab_mutex); 4913 } 4914 4915 static int slab_mem_going_online_callback(void *arg) 4916 { 4917 struct kmem_cache_node *n; 4918 struct kmem_cache *s; 4919 struct memory_notify *marg = arg; 4920 int nid = marg->status_change_nid_normal; 4921 int ret = 0; 4922 4923 /* 4924 * If the node's memory is already available, then kmem_cache_node is 4925 * already created. Nothing to do. 4926 */ 4927 if (nid < 0) 4928 return 0; 4929 4930 /* 4931 * We are bringing a node online. No memory is available yet. 
We must 4932 * allocate a kmem_cache_node structure in order to bring the node 4933 * online. 4934 */ 4935 mutex_lock(&slab_mutex); 4936 list_for_each_entry(s, &slab_caches, list) { 4937 /* 4938 * The structure may already exist if the node was previously 4939 * onlined and offlined. 4940 */ 4941 if (get_node(s, nid)) 4942 continue; 4943 /* 4944 * XXX: kmem_cache_alloc_node will fallback to other nodes 4945 * since memory is not yet available from the node that 4946 * is brought up. 4947 */ 4948 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 4949 if (!n) { 4950 ret = -ENOMEM; 4951 goto out; 4952 } 4953 init_kmem_cache_node(n); 4954 s->node[nid] = n; 4955 } 4956 /* 4957 * Any cache created after this point will also have kmem_cache_node 4958 * initialized for the new node. 4959 */ 4960 node_set(nid, slab_nodes); 4961 out: 4962 mutex_unlock(&slab_mutex); 4963 return ret; 4964 } 4965 4966 static int slab_memory_callback(struct notifier_block *self, 4967 unsigned long action, void *arg) 4968 { 4969 int ret = 0; 4970 4971 switch (action) { 4972 case MEM_GOING_ONLINE: 4973 ret = slab_mem_going_online_callback(arg); 4974 break; 4975 case MEM_GOING_OFFLINE: 4976 ret = slab_mem_going_offline_callback(arg); 4977 break; 4978 case MEM_OFFLINE: 4979 case MEM_CANCEL_ONLINE: 4980 slab_mem_offline_callback(arg); 4981 break; 4982 case MEM_ONLINE: 4983 case MEM_CANCEL_OFFLINE: 4984 break; 4985 } 4986 if (ret) 4987 ret = notifier_from_errno(ret); 4988 else 4989 ret = NOTIFY_OK; 4990 return ret; 4991 } 4992 4993 /******************************************************************** 4994 * Basic setup of slabs 4995 *******************************************************************/ 4996 4997 /* 4998 * Used for early kmem_cache structures that were allocated using 4999 * the page allocator. Allocate them properly then fix up the pointers 5000 * that may be pointing to the wrong kmem_cache structure. 
5001 */ 5002 5003 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) 5004 { 5005 int node; 5006 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 5007 struct kmem_cache_node *n; 5008 5009 memcpy(s, static_cache, kmem_cache->object_size); 5010 5011 /* 5012 * This runs very early, and only the boot processor is supposed to be 5013 * up. Even if it weren't true, IRQs are not up so we couldn't fire 5014 * IPIs around. 5015 */ 5016 __flush_cpu_slab(s, smp_processor_id()); 5017 for_each_kmem_cache_node(s, node, n) { 5018 struct slab *p; 5019 5020 list_for_each_entry(p, &n->partial, slab_list) 5021 p->slab_cache = s; 5022 5023 #ifdef CONFIG_SLUB_DEBUG 5024 list_for_each_entry(p, &n->full, slab_list) 5025 p->slab_cache = s; 5026 #endif 5027 } 5028 list_add(&s->list, &slab_caches); 5029 return s; 5030 } 5031 5032 void __init kmem_cache_init(void) 5033 { 5034 static __initdata struct kmem_cache boot_kmem_cache, 5035 boot_kmem_cache_node; 5036 int node; 5037 5038 if (debug_guardpage_minorder()) 5039 slub_max_order = 0; 5040 5041 /* Print slub debugging pointers without hashing */ 5042 if (__slub_debug_enabled()) 5043 no_hash_pointers_enable(NULL); 5044 5045 kmem_cache_node = &boot_kmem_cache_node; 5046 kmem_cache = &boot_kmem_cache; 5047 5048 /* 5049 * Initialize the nodemask for which we will allocate per node 5050 * structures. Here we don't need taking slab_mutex yet. 
5051 */ 5052 for_each_node_state(node, N_NORMAL_MEMORY) 5053 node_set(node, slab_nodes); 5054 5055 create_boot_cache(kmem_cache_node, "kmem_cache_node", 5056 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); 5057 5058 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 5059 5060 /* Able to allocate the per node structures */ 5061 slab_state = PARTIAL; 5062 5063 create_boot_cache(kmem_cache, "kmem_cache", 5064 offsetof(struct kmem_cache, node) + 5065 nr_node_ids * sizeof(struct kmem_cache_node *), 5066 SLAB_HWCACHE_ALIGN, 0, 0); 5067 5068 kmem_cache = bootstrap(&boot_kmem_cache); 5069 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 5070 5071 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 5072 setup_kmalloc_cache_index_table(); 5073 create_kmalloc_caches(0); 5074 5075 /* Setup random freelists for each cache */ 5076 init_freelist_randomization(); 5077 5078 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, 5079 slub_cpu_dead); 5080 5081 pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", 5082 cache_line_size(), 5083 slub_min_order, slub_max_order, slub_min_objects, 5084 nr_cpu_ids, nr_node_ids); 5085 } 5086 5087 void __init kmem_cache_init_late(void) 5088 { 5089 #ifndef CONFIG_SLUB_TINY 5090 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); 5091 WARN_ON(!flushwq); 5092 #endif 5093 } 5094 5095 struct kmem_cache * 5096 __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, 5097 slab_flags_t flags, void (*ctor)(void *)) 5098 { 5099 struct kmem_cache *s; 5100 5101 s = find_mergeable(size, align, flags, name, ctor); 5102 if (s) { 5103 if (sysfs_slab_alias(s, name)) 5104 return NULL; 5105 5106 s->refcount++; 5107 5108 /* 5109 * Adjust the object sizes so that we clear 5110 * the complete object on kzalloc. 
5111 */ 5112 s->object_size = max(s->object_size, size); 5113 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); 5114 } 5115 5116 return s; 5117 } 5118 5119 int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) 5120 { 5121 int err; 5122 5123 err = kmem_cache_open(s, flags); 5124 if (err) 5125 return err; 5126 5127 /* Mutex is not taken during early boot */ 5128 if (slab_state <= UP) 5129 return 0; 5130 5131 err = sysfs_slab_add(s); 5132 if (err) { 5133 __kmem_cache_release(s); 5134 return err; 5135 } 5136 5137 if (s->flags & SLAB_STORE_USER) 5138 debugfs_slab_add(s); 5139 5140 return 0; 5141 } 5142 5143 #ifdef SLAB_SUPPORTS_SYSFS 5144 static int count_inuse(struct slab *slab) 5145 { 5146 return slab->inuse; 5147 } 5148 5149 static int count_total(struct slab *slab) 5150 { 5151 return slab->objects; 5152 } 5153 #endif 5154 5155 #ifdef CONFIG_SLUB_DEBUG 5156 static void validate_slab(struct kmem_cache *s, struct slab *slab, 5157 unsigned long *obj_map) 5158 { 5159 void *p; 5160 void *addr = slab_address(slab); 5161 5162 if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) 5163 return; 5164 5165 /* Now we know that a valid freelist exists */ 5166 __fill_map(obj_map, s, slab); 5167 for_each_object(p, s, addr, slab->objects) { 5168 u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ? 
5169 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; 5170 5171 if (!check_object(s, slab, p, val)) 5172 break; 5173 } 5174 } 5175 5176 static int validate_slab_node(struct kmem_cache *s, 5177 struct kmem_cache_node *n, unsigned long *obj_map) 5178 { 5179 unsigned long count = 0; 5180 struct slab *slab; 5181 unsigned long flags; 5182 5183 spin_lock_irqsave(&n->list_lock, flags); 5184 5185 list_for_each_entry(slab, &n->partial, slab_list) { 5186 validate_slab(s, slab, obj_map); 5187 count++; 5188 } 5189 if (count != n->nr_partial) { 5190 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", 5191 s->name, count, n->nr_partial); 5192 slab_add_kunit_errors(); 5193 } 5194 5195 if (!(s->flags & SLAB_STORE_USER)) 5196 goto out; 5197 5198 list_for_each_entry(slab, &n->full, slab_list) { 5199 validate_slab(s, slab, obj_map); 5200 count++; 5201 } 5202 if (count != node_nr_slabs(n)) { 5203 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", 5204 s->name, count, node_nr_slabs(n)); 5205 slab_add_kunit_errors(); 5206 } 5207 5208 out: 5209 spin_unlock_irqrestore(&n->list_lock, flags); 5210 return count; 5211 } 5212 5213 long validate_slab_cache(struct kmem_cache *s) 5214 { 5215 int node; 5216 unsigned long count = 0; 5217 struct kmem_cache_node *n; 5218 unsigned long *obj_map; 5219 5220 obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); 5221 if (!obj_map) 5222 return -ENOMEM; 5223 5224 flush_all(s); 5225 for_each_kmem_cache_node(s, node, n) 5226 count += validate_slab_node(s, n, obj_map); 5227 5228 bitmap_free(obj_map); 5229 5230 return count; 5231 } 5232 EXPORT_SYMBOL(validate_slab_cache); 5233 5234 #ifdef CONFIG_DEBUG_FS 5235 /* 5236 * Generate lists of code addresses where slabcache objects are allocated 5237 * and freed. 
5238 */ 5239 5240 struct location { 5241 depot_stack_handle_t handle; 5242 unsigned long count; 5243 unsigned long addr; 5244 unsigned long waste; 5245 long long sum_time; 5246 long min_time; 5247 long max_time; 5248 long min_pid; 5249 long max_pid; 5250 DECLARE_BITMAP(cpus, NR_CPUS); 5251 nodemask_t nodes; 5252 }; 5253 5254 struct loc_track { 5255 unsigned long max; 5256 unsigned long count; 5257 struct location *loc; 5258 loff_t idx; 5259 }; 5260 5261 static struct dentry *slab_debugfs_root; 5262 5263 static void free_loc_track(struct loc_track *t) 5264 { 5265 if (t->max) 5266 free_pages((unsigned long)t->loc, 5267 get_order(sizeof(struct location) * t->max)); 5268 } 5269 5270 static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) 5271 { 5272 struct location *l; 5273 int order; 5274 5275 order = get_order(sizeof(struct location) * max); 5276 5277 l = (void *)__get_free_pages(flags, order); 5278 if (!l) 5279 return 0; 5280 5281 if (t->count) { 5282 memcpy(l, t->loc, sizeof(struct location) * t->count); 5283 free_loc_track(t); 5284 } 5285 t->max = max; 5286 t->loc = l; 5287 return 1; 5288 } 5289 5290 static int add_location(struct loc_track *t, struct kmem_cache *s, 5291 const struct track *track, 5292 unsigned int orig_size) 5293 { 5294 long start, end, pos; 5295 struct location *l; 5296 unsigned long caddr, chandle, cwaste; 5297 unsigned long age = jiffies - track->when; 5298 depot_stack_handle_t handle = 0; 5299 unsigned int waste = s->object_size - orig_size; 5300 5301 #ifdef CONFIG_STACKDEPOT 5302 handle = READ_ONCE(track->handle); 5303 #endif 5304 start = -1; 5305 end = t->count; 5306 5307 for ( ; ; ) { 5308 pos = start + (end - start + 1) / 2; 5309 5310 /* 5311 * There is nothing at "end". If we end up there 5312 * we need to add something to before end. 
5313 */ 5314 if (pos == end) 5315 break; 5316 5317 l = &t->loc[pos]; 5318 caddr = l->addr; 5319 chandle = l->handle; 5320 cwaste = l->waste; 5321 if ((track->addr == caddr) && (handle == chandle) && 5322 (waste == cwaste)) { 5323 5324 l->count++; 5325 if (track->when) { 5326 l->sum_time += age; 5327 if (age < l->min_time) 5328 l->min_time = age; 5329 if (age > l->max_time) 5330 l->max_time = age; 5331 5332 if (track->pid < l->min_pid) 5333 l->min_pid = track->pid; 5334 if (track->pid > l->max_pid) 5335 l->max_pid = track->pid; 5336 5337 cpumask_set_cpu(track->cpu, 5338 to_cpumask(l->cpus)); 5339 } 5340 node_set(page_to_nid(virt_to_page(track)), l->nodes); 5341 return 1; 5342 } 5343 5344 if (track->addr < caddr) 5345 end = pos; 5346 else if (track->addr == caddr && handle < chandle) 5347 end = pos; 5348 else if (track->addr == caddr && handle == chandle && 5349 waste < cwaste) 5350 end = pos; 5351 else 5352 start = pos; 5353 } 5354 5355 /* 5356 * Not found. Insert new tracking element. 5357 */ 5358 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) 5359 return 0; 5360 5361 l = t->loc + pos; 5362 if (pos < t->count) 5363 memmove(l + 1, l, 5364 (t->count - pos) * sizeof(struct location)); 5365 t->count++; 5366 l->count = 1; 5367 l->addr = track->addr; 5368 l->sum_time = age; 5369 l->min_time = age; 5370 l->max_time = age; 5371 l->min_pid = track->pid; 5372 l->max_pid = track->pid; 5373 l->handle = handle; 5374 l->waste = waste; 5375 cpumask_clear(to_cpumask(l->cpus)); 5376 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); 5377 nodes_clear(l->nodes); 5378 node_set(page_to_nid(virt_to_page(track)), l->nodes); 5379 return 1; 5380 } 5381 5382 static void process_slab(struct loc_track *t, struct kmem_cache *s, 5383 struct slab *slab, enum track_item alloc, 5384 unsigned long *obj_map) 5385 { 5386 void *addr = slab_address(slab); 5387 bool is_alloc = (alloc == TRACK_ALLOC); 5388 void *p; 5389 5390 __fill_map(obj_map, s, slab); 5391 5392 
for_each_object(p, s, addr, slab->objects) 5393 if (!test_bit(__obj_to_index(s, addr, p), obj_map)) 5394 add_location(t, s, get_track(s, p, alloc), 5395 is_alloc ? get_orig_size(s, p) : 5396 s->object_size); 5397 } 5398 #endif /* CONFIG_DEBUG_FS */ 5399 #endif /* CONFIG_SLUB_DEBUG */ 5400 5401 #ifdef SLAB_SUPPORTS_SYSFS 5402 enum slab_stat_type { 5403 SL_ALL, /* All slabs */ 5404 SL_PARTIAL, /* Only partially allocated slabs */ 5405 SL_CPU, /* Only slabs used for cpu caches */ 5406 SL_OBJECTS, /* Determine allocated objects not slabs */ 5407 SL_TOTAL /* Determine object capacity not slabs */ 5408 }; 5409 5410 #define SO_ALL (1 << SL_ALL) 5411 #define SO_PARTIAL (1 << SL_PARTIAL) 5412 #define SO_CPU (1 << SL_CPU) 5413 #define SO_OBJECTS (1 << SL_OBJECTS) 5414 #define SO_TOTAL (1 << SL_TOTAL) 5415 5416 static ssize_t show_slab_objects(struct kmem_cache *s, 5417 char *buf, unsigned long flags) 5418 { 5419 unsigned long total = 0; 5420 int node; 5421 int x; 5422 unsigned long *nodes; 5423 int len = 0; 5424 5425 nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); 5426 if (!nodes) 5427 return -ENOMEM; 5428 5429 if (flags & SO_CPU) { 5430 int cpu; 5431 5432 for_each_possible_cpu(cpu) { 5433 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, 5434 cpu); 5435 int node; 5436 struct slab *slab; 5437 5438 slab = READ_ONCE(c->slab); 5439 if (!slab) 5440 continue; 5441 5442 node = slab_nid(slab); 5443 if (flags & SO_TOTAL) 5444 x = slab->objects; 5445 else if (flags & SO_OBJECTS) 5446 x = slab->inuse; 5447 else 5448 x = 1; 5449 5450 total += x; 5451 nodes[node] += x; 5452 5453 #ifdef CONFIG_SLUB_CPU_PARTIAL 5454 slab = slub_percpu_partial_read_once(c); 5455 if (slab) { 5456 node = slab_nid(slab); 5457 if (flags & SO_TOTAL) 5458 WARN_ON_ONCE(1); 5459 else if (flags & SO_OBJECTS) 5460 WARN_ON_ONCE(1); 5461 else 5462 x = slab->slabs; 5463 total += x; 5464 nodes[node] += x; 5465 } 5466 #endif 5467 } 5468 } 5469 5470 /* 5471 * It is impossible to take "mem_hotplug_lock" 
here with "kernfs_mutex" 5472 * already held which will conflict with an existing lock order: 5473 * 5474 * mem_hotplug_lock->slab_mutex->kernfs_mutex 5475 * 5476 * We don't really need mem_hotplug_lock (to hold off 5477 * slab_mem_going_offline_callback) here because slab's memory hot 5478 * unplug code doesn't destroy the kmem_cache->node[] data. 5479 */ 5480 5481 #ifdef CONFIG_SLUB_DEBUG 5482 if (flags & SO_ALL) { 5483 struct kmem_cache_node *n; 5484 5485 for_each_kmem_cache_node(s, node, n) { 5486 5487 if (flags & SO_TOTAL) 5488 x = node_nr_objs(n); 5489 else if (flags & SO_OBJECTS) 5490 x = node_nr_objs(n) - count_partial(n, count_free); 5491 else 5492 x = node_nr_slabs(n); 5493 total += x; 5494 nodes[node] += x; 5495 } 5496 5497 } else 5498 #endif 5499 if (flags & SO_PARTIAL) { 5500 struct kmem_cache_node *n; 5501 5502 for_each_kmem_cache_node(s, node, n) { 5503 if (flags & SO_TOTAL) 5504 x = count_partial(n, count_total); 5505 else if (flags & SO_OBJECTS) 5506 x = count_partial(n, count_inuse); 5507 else 5508 x = n->nr_partial; 5509 total += x; 5510 nodes[node] += x; 5511 } 5512 } 5513 5514 len += sysfs_emit_at(buf, len, "%lu", total); 5515 #ifdef CONFIG_NUMA 5516 for (node = 0; node < nr_node_ids; node++) { 5517 if (nodes[node]) 5518 len += sysfs_emit_at(buf, len, " N%d=%lu", 5519 node, nodes[node]); 5520 } 5521 #endif 5522 len += sysfs_emit_at(buf, len, "\n"); 5523 kfree(nodes); 5524 5525 return len; 5526 } 5527 5528 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 5529 #define to_slab(n) container_of(n, struct kmem_cache, kobj) 5530 5531 struct slab_attribute { 5532 struct attribute attr; 5533 ssize_t (*show)(struct kmem_cache *s, char *buf); 5534 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); 5535 }; 5536 5537 #define SLAB_ATTR_RO(_name) \ 5538 static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) 5539 5540 #define SLAB_ATTR(_name) \ 5541 static struct slab_attribute _name##_attr = 
__ATTR_RW_MODE(_name, 0600) 5542 5543 static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 5544 { 5545 return sysfs_emit(buf, "%u\n", s->size); 5546 } 5547 SLAB_ATTR_RO(slab_size); 5548 5549 static ssize_t align_show(struct kmem_cache *s, char *buf) 5550 { 5551 return sysfs_emit(buf, "%u\n", s->align); 5552 } 5553 SLAB_ATTR_RO(align); 5554 5555 static ssize_t object_size_show(struct kmem_cache *s, char *buf) 5556 { 5557 return sysfs_emit(buf, "%u\n", s->object_size); 5558 } 5559 SLAB_ATTR_RO(object_size); 5560 5561 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 5562 { 5563 return sysfs_emit(buf, "%u\n", oo_objects(s->oo)); 5564 } 5565 SLAB_ATTR_RO(objs_per_slab); 5566 5567 static ssize_t order_show(struct kmem_cache *s, char *buf) 5568 { 5569 return sysfs_emit(buf, "%u\n", oo_order(s->oo)); 5570 } 5571 SLAB_ATTR_RO(order); 5572 5573 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 5574 { 5575 return sysfs_emit(buf, "%lu\n", s->min_partial); 5576 } 5577 5578 static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, 5579 size_t length) 5580 { 5581 unsigned long min; 5582 int err; 5583 5584 err = kstrtoul(buf, 10, &min); 5585 if (err) 5586 return err; 5587 5588 s->min_partial = min; 5589 return length; 5590 } 5591 SLAB_ATTR(min_partial); 5592 5593 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 5594 { 5595 unsigned int nr_partial = 0; 5596 #ifdef CONFIG_SLUB_CPU_PARTIAL 5597 nr_partial = s->cpu_partial; 5598 #endif 5599 5600 return sysfs_emit(buf, "%u\n", nr_partial); 5601 } 5602 5603 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 5604 size_t length) 5605 { 5606 unsigned int objects; 5607 int err; 5608 5609 err = kstrtouint(buf, 10, &objects); 5610 if (err) 5611 return err; 5612 if (objects && !kmem_cache_has_cpu_partial(s)) 5613 return -EINVAL; 5614 5615 slub_set_cpu_partial(s, objects); 5616 flush_all(s); 5617 return length; 5618 } 5619 SLAB_ATTR(cpu_partial); 5620 
5621 static ssize_t ctor_show(struct kmem_cache *s, char *buf) 5622 { 5623 if (!s->ctor) 5624 return 0; 5625 return sysfs_emit(buf, "%pS\n", s->ctor); 5626 } 5627 SLAB_ATTR_RO(ctor); 5628 5629 static ssize_t aliases_show(struct kmem_cache *s, char *buf) 5630 { 5631 return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); 5632 } 5633 SLAB_ATTR_RO(aliases); 5634 5635 static ssize_t partial_show(struct kmem_cache *s, char *buf) 5636 { 5637 return show_slab_objects(s, buf, SO_PARTIAL); 5638 } 5639 SLAB_ATTR_RO(partial); 5640 5641 static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) 5642 { 5643 return show_slab_objects(s, buf, SO_CPU); 5644 } 5645 SLAB_ATTR_RO(cpu_slabs); 5646 5647 static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) 5648 { 5649 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); 5650 } 5651 SLAB_ATTR_RO(objects_partial); 5652 5653 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 5654 { 5655 int objects = 0; 5656 int slabs = 0; 5657 int cpu __maybe_unused; 5658 int len = 0; 5659 5660 #ifdef CONFIG_SLUB_CPU_PARTIAL 5661 for_each_online_cpu(cpu) { 5662 struct slab *slab; 5663 5664 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); 5665 5666 if (slab) 5667 slabs += slab->slabs; 5668 } 5669 #endif 5670 5671 /* Approximate half-full slabs, see slub_set_cpu_partial() */ 5672 objects = (slabs * oo_objects(s->oo)) / 2; 5673 len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); 5674 5675 #ifdef CONFIG_SLUB_CPU_PARTIAL 5676 for_each_online_cpu(cpu) { 5677 struct slab *slab; 5678 5679 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); 5680 if (slab) { 5681 slabs = READ_ONCE(slab->slabs); 5682 objects = (slabs * oo_objects(s->oo)) / 2; 5683 len += sysfs_emit_at(buf, len, " C%d=%d(%d)", 5684 cpu, objects, slabs); 5685 } 5686 } 5687 #endif 5688 len += sysfs_emit_at(buf, len, "\n"); 5689 5690 return len; 5691 } 5692 SLAB_ATTR_RO(slabs_cpu_partial); 5693 5694 static ssize_t 
reclaim_account_show(struct kmem_cache *s, char *buf) 5695 { 5696 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 5697 } 5698 SLAB_ATTR_RO(reclaim_account); 5699 5700 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 5701 { 5702 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 5703 } 5704 SLAB_ATTR_RO(hwcache_align); 5705 5706 #ifdef CONFIG_ZONE_DMA 5707 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 5708 { 5709 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 5710 } 5711 SLAB_ATTR_RO(cache_dma); 5712 #endif 5713 5714 #ifdef CONFIG_HARDENED_USERCOPY 5715 static ssize_t usersize_show(struct kmem_cache *s, char *buf) 5716 { 5717 return sysfs_emit(buf, "%u\n", s->usersize); 5718 } 5719 SLAB_ATTR_RO(usersize); 5720 #endif 5721 5722 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 5723 { 5724 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); 5725 } 5726 SLAB_ATTR_RO(destroy_by_rcu); 5727 5728 #ifdef CONFIG_SLUB_DEBUG 5729 static ssize_t slabs_show(struct kmem_cache *s, char *buf) 5730 { 5731 return show_slab_objects(s, buf, SO_ALL); 5732 } 5733 SLAB_ATTR_RO(slabs); 5734 5735 static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 5736 { 5737 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); 5738 } 5739 SLAB_ATTR_RO(total_objects); 5740 5741 static ssize_t objects_show(struct kmem_cache *s, char *buf) 5742 { 5743 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); 5744 } 5745 SLAB_ATTR_RO(objects); 5746 5747 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) 5748 { 5749 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); 5750 } 5751 SLAB_ATTR_RO(sanity_checks); 5752 5753 static ssize_t trace_show(struct kmem_cache *s, char *buf) 5754 { 5755 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 5756 } 5757 SLAB_ATTR_RO(trace); 5758 5759 static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 
5760 { 5761 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); 5762 } 5763 5764 SLAB_ATTR_RO(red_zone); 5765 5766 static ssize_t poison_show(struct kmem_cache *s, char *buf) 5767 { 5768 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON)); 5769 } 5770 5771 SLAB_ATTR_RO(poison); 5772 5773 static ssize_t store_user_show(struct kmem_cache *s, char *buf) 5774 { 5775 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); 5776 } 5777 5778 SLAB_ATTR_RO(store_user); 5779 5780 static ssize_t validate_show(struct kmem_cache *s, char *buf) 5781 { 5782 return 0; 5783 } 5784 5785 static ssize_t validate_store(struct kmem_cache *s, 5786 const char *buf, size_t length) 5787 { 5788 int ret = -EINVAL; 5789 5790 if (buf[0] == '1' && kmem_cache_debug(s)) { 5791 ret = validate_slab_cache(s); 5792 if (ret >= 0) 5793 ret = length; 5794 } 5795 return ret; 5796 } 5797 SLAB_ATTR(validate); 5798 5799 #endif /* CONFIG_SLUB_DEBUG */ 5800 5801 #ifdef CONFIG_FAILSLAB 5802 static ssize_t failslab_show(struct kmem_cache *s, char *buf) 5803 { 5804 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 5805 } 5806 5807 static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 5808 size_t length) 5809 { 5810 if (s->refcount > 1) 5811 return -EINVAL; 5812 5813 if (buf[0] == '1') 5814 WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); 5815 else 5816 WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); 5817 5818 return length; 5819 } 5820 SLAB_ATTR(failslab); 5821 #endif 5822 5823 static ssize_t shrink_show(struct kmem_cache *s, char *buf) 5824 { 5825 return 0; 5826 } 5827 5828 static ssize_t shrink_store(struct kmem_cache *s, 5829 const char *buf, size_t length) 5830 { 5831 if (buf[0] == '1') 5832 kmem_cache_shrink(s); 5833 else 5834 return -EINVAL; 5835 return length; 5836 } 5837 SLAB_ATTR(shrink); 5838 5839 #ifdef CONFIG_NUMA 5840 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 5841 { 5842 return sysfs_emit(buf, "%u\n", 
s->remote_node_defrag_ratio / 10); 5843 } 5844 5845 static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 5846 const char *buf, size_t length) 5847 { 5848 unsigned int ratio; 5849 int err; 5850 5851 err = kstrtouint(buf, 10, &ratio); 5852 if (err) 5853 return err; 5854 if (ratio > 100) 5855 return -ERANGE; 5856 5857 s->remote_node_defrag_ratio = ratio * 10; 5858 5859 return length; 5860 } 5861 SLAB_ATTR(remote_node_defrag_ratio); 5862 #endif 5863 5864 #ifdef CONFIG_SLUB_STATS 5865 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) 5866 { 5867 unsigned long sum = 0; 5868 int cpu; 5869 int len = 0; 5870 int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); 5871 5872 if (!data) 5873 return -ENOMEM; 5874 5875 for_each_online_cpu(cpu) { 5876 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 5877 5878 data[cpu] = x; 5879 sum += x; 5880 } 5881 5882 len += sysfs_emit_at(buf, len, "%lu", sum); 5883 5884 #ifdef CONFIG_SMP 5885 for_each_online_cpu(cpu) { 5886 if (data[cpu]) 5887 len += sysfs_emit_at(buf, len, " C%d=%u", 5888 cpu, data[cpu]); 5889 } 5890 #endif 5891 kfree(data); 5892 len += sysfs_emit_at(buf, len, "\n"); 5893 5894 return len; 5895 } 5896 5897 static void clear_stat(struct kmem_cache *s, enum stat_item si) 5898 { 5899 int cpu; 5900 5901 for_each_online_cpu(cpu) 5902 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 5903 } 5904 5905 #define STAT_ATTR(si, text) \ 5906 static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 5907 { \ 5908 return show_stat(s, buf, si); \ 5909 } \ 5910 static ssize_t text##_store(struct kmem_cache *s, \ 5911 const char *buf, size_t length) \ 5912 { \ 5913 if (buf[0] != '0') \ 5914 return -EINVAL; \ 5915 clear_stat(s, si); \ 5916 return length; \ 5917 } \ 5918 SLAB_ATTR(text); \ 5919 5920 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 5921 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 5922 STAT_ATTR(FREE_FASTPATH, free_fastpath); 5923 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 5924 
STAT_ATTR(FREE_FROZEN, free_frozen); 5925 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 5926 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 5927 STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 5928 STAT_ATTR(ALLOC_SLAB, alloc_slab); 5929 STAT_ATTR(ALLOC_REFILL, alloc_refill); 5930 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 5931 STAT_ATTR(FREE_SLAB, free_slab); 5932 STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 5933 STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 5934 STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 5935 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 5936 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 5937 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 5938 STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 5939 STAT_ATTR(ORDER_FALLBACK, order_fallback); 5940 STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 5941 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 5942 STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 5943 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 5944 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 5945 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 5946 #endif /* CONFIG_SLUB_STATS */ 5947 5948 #ifdef CONFIG_KFENCE 5949 static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf) 5950 { 5951 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE)); 5952 } 5953 5954 static ssize_t skip_kfence_store(struct kmem_cache *s, 5955 const char *buf, size_t length) 5956 { 5957 int ret = length; 5958 5959 if (buf[0] == '0') 5960 s->flags &= ~SLAB_SKIP_KFENCE; 5961 else if (buf[0] == '1') 5962 s->flags |= SLAB_SKIP_KFENCE; 5963 else 5964 ret = -EINVAL; 5965 5966 return ret; 5967 } 5968 SLAB_ATTR(skip_kfence); 5969 #endif 5970 5971 static struct attribute *slab_attrs[] = { 5972 &slab_size_attr.attr, 5973 &object_size_attr.attr, 5974 &objs_per_slab_attr.attr, 5975 &order_attr.attr, 5976 &min_partial_attr.attr, 5977 &cpu_partial_attr.attr, 5978 &objects_partial_attr.attr, 5979 
&partial_attr.attr, 5980 &cpu_slabs_attr.attr, 5981 &ctor_attr.attr, 5982 &aliases_attr.attr, 5983 &align_attr.attr, 5984 &hwcache_align_attr.attr, 5985 &reclaim_account_attr.attr, 5986 &destroy_by_rcu_attr.attr, 5987 &shrink_attr.attr, 5988 &slabs_cpu_partial_attr.attr, 5989 #ifdef CONFIG_SLUB_DEBUG 5990 &total_objects_attr.attr, 5991 &objects_attr.attr, 5992 &slabs_attr.attr, 5993 &sanity_checks_attr.attr, 5994 &trace_attr.attr, 5995 &red_zone_attr.attr, 5996 &poison_attr.attr, 5997 &store_user_attr.attr, 5998 &validate_attr.attr, 5999 #endif 6000 #ifdef CONFIG_ZONE_DMA 6001 &cache_dma_attr.attr, 6002 #endif 6003 #ifdef CONFIG_NUMA 6004 &remote_node_defrag_ratio_attr.attr, 6005 #endif 6006 #ifdef CONFIG_SLUB_STATS 6007 &alloc_fastpath_attr.attr, 6008 &alloc_slowpath_attr.attr, 6009 &free_fastpath_attr.attr, 6010 &free_slowpath_attr.attr, 6011 &free_frozen_attr.attr, 6012 &free_add_partial_attr.attr, 6013 &free_remove_partial_attr.attr, 6014 &alloc_from_partial_attr.attr, 6015 &alloc_slab_attr.attr, 6016 &alloc_refill_attr.attr, 6017 &alloc_node_mismatch_attr.attr, 6018 &free_slab_attr.attr, 6019 &cpuslab_flush_attr.attr, 6020 &deactivate_full_attr.attr, 6021 &deactivate_empty_attr.attr, 6022 &deactivate_to_head_attr.attr, 6023 &deactivate_to_tail_attr.attr, 6024 &deactivate_remote_frees_attr.attr, 6025 &deactivate_bypass_attr.attr, 6026 &order_fallback_attr.attr, 6027 &cmpxchg_double_fail_attr.attr, 6028 &cmpxchg_double_cpu_fail_attr.attr, 6029 &cpu_partial_alloc_attr.attr, 6030 &cpu_partial_free_attr.attr, 6031 &cpu_partial_node_attr.attr, 6032 &cpu_partial_drain_attr.attr, 6033 #endif 6034 #ifdef CONFIG_FAILSLAB 6035 &failslab_attr.attr, 6036 #endif 6037 #ifdef CONFIG_HARDENED_USERCOPY 6038 &usersize_attr.attr, 6039 #endif 6040 #ifdef CONFIG_KFENCE 6041 &skip_kfence_attr.attr, 6042 #endif 6043 6044 NULL 6045 }; 6046 6047 static const struct attribute_group slab_attr_group = { 6048 .attrs = slab_attrs, 6049 }; 6050 6051 static ssize_t slab_attr_show(struct 
kobject *kobj, 6052 struct attribute *attr, 6053 char *buf) 6054 { 6055 struct slab_attribute *attribute; 6056 struct kmem_cache *s; 6057 6058 attribute = to_slab_attr(attr); 6059 s = to_slab(kobj); 6060 6061 if (!attribute->show) 6062 return -EIO; 6063 6064 return attribute->show(s, buf); 6065 } 6066 6067 static ssize_t slab_attr_store(struct kobject *kobj, 6068 struct attribute *attr, 6069 const char *buf, size_t len) 6070 { 6071 struct slab_attribute *attribute; 6072 struct kmem_cache *s; 6073 6074 attribute = to_slab_attr(attr); 6075 s = to_slab(kobj); 6076 6077 if (!attribute->store) 6078 return -EIO; 6079 6080 return attribute->store(s, buf, len); 6081 } 6082 6083 static void kmem_cache_release(struct kobject *k) 6084 { 6085 slab_kmem_cache_release(to_slab(k)); 6086 } 6087 6088 static const struct sysfs_ops slab_sysfs_ops = { 6089 .show = slab_attr_show, 6090 .store = slab_attr_store, 6091 }; 6092 6093 static const struct kobj_type slab_ktype = { 6094 .sysfs_ops = &slab_sysfs_ops, 6095 .release = kmem_cache_release, 6096 }; 6097 6098 static struct kset *slab_kset; 6099 6100 static inline struct kset *cache_kset(struct kmem_cache *s) 6101 { 6102 return slab_kset; 6103 } 6104 6105 #define ID_STR_LENGTH 32 6106 6107 /* Create a unique string id for a slab cache: 6108 * 6109 * Format :[flags-]size 6110 */ 6111 static char *create_unique_id(struct kmem_cache *s) 6112 { 6113 char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); 6114 char *p = name; 6115 6116 if (!name) 6117 return ERR_PTR(-ENOMEM); 6118 6119 *p++ = ':'; 6120 /* 6121 * First flags affecting slabcache operations. We will only 6122 * get here for aliasable slabs so we do not need to support 6123 * too many flags. The flags here must cover all flags that 6124 * are matched during merging to guarantee that the id is 6125 * unique. 
6126 */ 6127 if (s->flags & SLAB_CACHE_DMA) 6128 *p++ = 'd'; 6129 if (s->flags & SLAB_CACHE_DMA32) 6130 *p++ = 'D'; 6131 if (s->flags & SLAB_RECLAIM_ACCOUNT) 6132 *p++ = 'a'; 6133 if (s->flags & SLAB_CONSISTENCY_CHECKS) 6134 *p++ = 'F'; 6135 if (s->flags & SLAB_ACCOUNT) 6136 *p++ = 'A'; 6137 if (p != name + 1) 6138 *p++ = '-'; 6139 p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); 6140 6141 if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { 6142 kfree(name); 6143 return ERR_PTR(-EINVAL); 6144 } 6145 kmsan_unpoison_memory(name, p - name); 6146 return name; 6147 } 6148 6149 static int sysfs_slab_add(struct kmem_cache *s) 6150 { 6151 int err; 6152 const char *name; 6153 struct kset *kset = cache_kset(s); 6154 int unmergeable = slab_unmergeable(s); 6155 6156 if (!unmergeable && disable_higher_order_debug && 6157 (slub_debug & DEBUG_METADATA_FLAGS)) 6158 unmergeable = 1; 6159 6160 if (unmergeable) { 6161 /* 6162 * Slabcache can never be merged so we can use the name proper. 6163 * This is typically the case for debug situations. In that 6164 * case we can catch duplicate names easily. 6165 */ 6166 sysfs_remove_link(&slab_kset->kobj, s->name); 6167 name = s->name; 6168 } else { 6169 /* 6170 * Create a unique name for the slab as a target 6171 * for the symlinks. 
6172 */ 6173 name = create_unique_id(s); 6174 if (IS_ERR(name)) 6175 return PTR_ERR(name); 6176 } 6177 6178 s->kobj.kset = kset; 6179 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); 6180 if (err) 6181 goto out; 6182 6183 err = sysfs_create_group(&s->kobj, &slab_attr_group); 6184 if (err) 6185 goto out_del_kobj; 6186 6187 if (!unmergeable) { 6188 /* Setup first alias */ 6189 sysfs_slab_alias(s, s->name); 6190 } 6191 out: 6192 if (!unmergeable) 6193 kfree(name); 6194 return err; 6195 out_del_kobj: 6196 kobject_del(&s->kobj); 6197 goto out; 6198 } 6199 6200 void sysfs_slab_unlink(struct kmem_cache *s) 6201 { 6202 if (slab_state >= FULL) 6203 kobject_del(&s->kobj); 6204 } 6205 6206 void sysfs_slab_release(struct kmem_cache *s) 6207 { 6208 if (slab_state >= FULL) 6209 kobject_put(&s->kobj); 6210 } 6211 6212 /* 6213 * Need to buffer aliases during bootup until sysfs becomes 6214 * available lest we lose that information. 6215 */ 6216 struct saved_alias { 6217 struct kmem_cache *s; 6218 const char *name; 6219 struct saved_alias *next; 6220 }; 6221 6222 static struct saved_alias *alias_list; 6223 6224 static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 6225 { 6226 struct saved_alias *al; 6227 6228 if (slab_state == FULL) { 6229 /* 6230 * If we have a leftover link then remove it. 
6231 */ 6232 sysfs_remove_link(&slab_kset->kobj, name); 6233 return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); 6234 } 6235 6236 al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); 6237 if (!al) 6238 return -ENOMEM; 6239 6240 al->s = s; 6241 al->name = name; 6242 al->next = alias_list; 6243 alias_list = al; 6244 kmsan_unpoison_memory(al, sizeof(*al)); 6245 return 0; 6246 } 6247 6248 static int __init slab_sysfs_init(void) 6249 { 6250 struct kmem_cache *s; 6251 int err; 6252 6253 mutex_lock(&slab_mutex); 6254 6255 slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); 6256 if (!slab_kset) { 6257 mutex_unlock(&slab_mutex); 6258 pr_err("Cannot register slab subsystem.\n"); 6259 return -ENOMEM; 6260 } 6261 6262 slab_state = FULL; 6263 6264 list_for_each_entry(s, &slab_caches, list) { 6265 err = sysfs_slab_add(s); 6266 if (err) 6267 pr_err("SLUB: Unable to add boot slab %s to sysfs\n", 6268 s->name); 6269 } 6270 6271 while (alias_list) { 6272 struct saved_alias *al = alias_list; 6273 6274 alias_list = alias_list->next; 6275 err = sysfs_slab_alias(al->s, al->name); 6276 if (err) 6277 pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", 6278 al->name); 6279 kfree(al); 6280 } 6281 6282 mutex_unlock(&slab_mutex); 6283 return 0; 6284 } 6285 late_initcall(slab_sysfs_init); 6286 #endif /* SLAB_SUPPORTS_SYSFS */ 6287 6288 #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) 6289 static int slab_debugfs_show(struct seq_file *seq, void *v) 6290 { 6291 struct loc_track *t = seq->private; 6292 struct location *l; 6293 unsigned long idx; 6294 6295 idx = (unsigned long) t->idx; 6296 if (idx < t->count) { 6297 l = &t->loc[idx]; 6298 6299 seq_printf(seq, "%7ld ", l->count); 6300 6301 if (l->addr) 6302 seq_printf(seq, "%pS", (void *)l->addr); 6303 else 6304 seq_puts(seq, "<not-available>"); 6305 6306 if (l->waste) 6307 seq_printf(seq, " waste=%lu/%lu", 6308 l->count * l->waste, l->waste); 6309 6310 if (l->sum_time != l->min_time) { 6311 seq_printf(seq, " 
age=%ld/%llu/%ld", 6312 l->min_time, div_u64(l->sum_time, l->count), 6313 l->max_time); 6314 } else 6315 seq_printf(seq, " age=%ld", l->min_time); 6316 6317 if (l->min_pid != l->max_pid) 6318 seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid); 6319 else 6320 seq_printf(seq, " pid=%ld", 6321 l->min_pid); 6322 6323 if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus))) 6324 seq_printf(seq, " cpus=%*pbl", 6325 cpumask_pr_args(to_cpumask(l->cpus))); 6326 6327 if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) 6328 seq_printf(seq, " nodes=%*pbl", 6329 nodemask_pr_args(&l->nodes)); 6330 6331 #ifdef CONFIG_STACKDEPOT 6332 { 6333 depot_stack_handle_t handle; 6334 unsigned long *entries; 6335 unsigned int nr_entries, j; 6336 6337 handle = READ_ONCE(l->handle); 6338 if (handle) { 6339 nr_entries = stack_depot_fetch(handle, &entries); 6340 seq_puts(seq, "\n"); 6341 for (j = 0; j < nr_entries; j++) 6342 seq_printf(seq, " %pS\n", (void *)entries[j]); 6343 } 6344 } 6345 #endif 6346 seq_puts(seq, "\n"); 6347 } 6348 6349 if (!idx && !t->count) 6350 seq_puts(seq, "No data\n"); 6351 6352 return 0; 6353 } 6354 6355 static void slab_debugfs_stop(struct seq_file *seq, void *v) 6356 { 6357 } 6358 6359 static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) 6360 { 6361 struct loc_track *t = seq->private; 6362 6363 t->idx = ++(*ppos); 6364 if (*ppos <= t->count) 6365 return ppos; 6366 6367 return NULL; 6368 } 6369 6370 static int cmp_loc_by_count(const void *a, const void *b, const void *data) 6371 { 6372 struct location *loc1 = (struct location *)a; 6373 struct location *loc2 = (struct location *)b; 6374 6375 if (loc1->count > loc2->count) 6376 return -1; 6377 else 6378 return 1; 6379 } 6380 6381 static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) 6382 { 6383 struct loc_track *t = seq->private; 6384 6385 t->idx = *ppos; 6386 return ppos; 6387 } 6388 6389 static const struct seq_operations slab_debugfs_sops = { 6390 .start = 
slab_debugfs_start, 6391 .next = slab_debugfs_next, 6392 .stop = slab_debugfs_stop, 6393 .show = slab_debugfs_show, 6394 }; 6395 6396 static int slab_debug_trace_open(struct inode *inode, struct file *filep) 6397 { 6398 6399 struct kmem_cache_node *n; 6400 enum track_item alloc; 6401 int node; 6402 struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, 6403 sizeof(struct loc_track)); 6404 struct kmem_cache *s = file_inode(filep)->i_private; 6405 unsigned long *obj_map; 6406 6407 if (!t) 6408 return -ENOMEM; 6409 6410 obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); 6411 if (!obj_map) { 6412 seq_release_private(inode, filep); 6413 return -ENOMEM; 6414 } 6415 6416 if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) 6417 alloc = TRACK_ALLOC; 6418 else 6419 alloc = TRACK_FREE; 6420 6421 if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { 6422 bitmap_free(obj_map); 6423 seq_release_private(inode, filep); 6424 return -ENOMEM; 6425 } 6426 6427 for_each_kmem_cache_node(s, node, n) { 6428 unsigned long flags; 6429 struct slab *slab; 6430 6431 if (!node_nr_slabs(n)) 6432 continue; 6433 6434 spin_lock_irqsave(&n->list_lock, flags); 6435 list_for_each_entry(slab, &n->partial, slab_list) 6436 process_slab(t, s, slab, alloc, obj_map); 6437 list_for_each_entry(slab, &n->full, slab_list) 6438 process_slab(t, s, slab, alloc, obj_map); 6439 spin_unlock_irqrestore(&n->list_lock, flags); 6440 } 6441 6442 /* Sort locations by count */ 6443 sort_r(t->loc, t->count, sizeof(struct location), 6444 cmp_loc_by_count, NULL, NULL); 6445 6446 bitmap_free(obj_map); 6447 return 0; 6448 } 6449 6450 static int slab_debug_trace_release(struct inode *inode, struct file *file) 6451 { 6452 struct seq_file *seq = file->private_data; 6453 struct loc_track *t = seq->private; 6454 6455 free_loc_track(t); 6456 return seq_release_private(inode, file); 6457 } 6458 6459 static const struct file_operations slab_debugfs_fops = { 6460 .open = 
slab_debug_trace_open, 6461 .read = seq_read, 6462 .llseek = seq_lseek, 6463 .release = slab_debug_trace_release, 6464 }; 6465 6466 static void debugfs_slab_add(struct kmem_cache *s) 6467 { 6468 struct dentry *slab_cache_dir; 6469 6470 if (unlikely(!slab_debugfs_root)) 6471 return; 6472 6473 slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); 6474 6475 debugfs_create_file("alloc_traces", 0400, 6476 slab_cache_dir, s, &slab_debugfs_fops); 6477 6478 debugfs_create_file("free_traces", 0400, 6479 slab_cache_dir, s, &slab_debugfs_fops); 6480 } 6481 6482 void debugfs_slab_release(struct kmem_cache *s) 6483 { 6484 debugfs_lookup_and_remove(s->name, slab_debugfs_root); 6485 } 6486 6487 static int __init slab_debugfs_init(void) 6488 { 6489 struct kmem_cache *s; 6490 6491 slab_debugfs_root = debugfs_create_dir("slab", NULL); 6492 6493 list_for_each_entry(s, &slab_caches, list) 6494 if (s->flags & SLAB_STORE_USER) 6495 debugfs_slab_add(s); 6496 6497 return 0; 6498 6499 } 6500 __initcall(slab_debugfs_init); 6501 #endif 6502 /* 6503 * The /proc/slabinfo ABI 6504 */ 6505 #ifdef CONFIG_SLUB_DEBUG 6506 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 6507 { 6508 unsigned long nr_slabs = 0; 6509 unsigned long nr_objs = 0; 6510 unsigned long nr_free = 0; 6511 int node; 6512 struct kmem_cache_node *n; 6513 6514 for_each_kmem_cache_node(s, node, n) { 6515 nr_slabs += node_nr_slabs(n); 6516 nr_objs += node_nr_objs(n); 6517 nr_free += count_partial(n, count_free); 6518 } 6519 6520 sinfo->active_objs = nr_objs - nr_free; 6521 sinfo->num_objs = nr_objs; 6522 sinfo->active_slabs = nr_slabs; 6523 sinfo->num_slabs = nr_slabs; 6524 sinfo->objects_per_slab = oo_objects(s->oo); 6525 sinfo->cache_order = oo_order(s->oo); 6526 } 6527 6528 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) 6529 { 6530 } 6531 6532 ssize_t slabinfo_write(struct file *file, const char __user *buffer, 6533 size_t count, loff_t *ppos) 6534 { 6535 return -EIO; 6536 } 6537 
#endif /* CONFIG_SLUB_DEBUG */ 6538