// SPDX-License-Identifier: GPL-2.0
/*
 * Slab allocator functions that are independent of the allocator strategy
 *
 * (C) 2012 Christoph Lameter <cl@gentwo.org>
 */
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/interrupt.h>
#include <linux/memory.h>
#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/kfence.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/kmemleak.h>
#include <linux/kasan.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
#include <trace/events/rcu.h>

#include "../kernel/rcu/rcu.h"
#include "internal.h"
#include "slab.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>

enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;

/*
 * Set of flags that will prevent slab merging.
 * Any flag that adds per-object metadata should be included,
 * since slab merging can update s->inuse that affects the metadata layout.
 */
#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
		SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
		SLAB_OBJ_EXT_IN_OBJ)

#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 */
static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);

static int __init setup_slab_nomerge(char *str)
{
	slab_nomerge = true;
	return 1;
}

static int __init setup_slab_merge(char *str)
{
	slab_nomerge = false;
	return 1;
}

__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);

__setup("slab_nomerge", setup_slab_nomerge);
__setup("slab_merge", setup_slab_merge);

/*
 * Determine the size of a slab object
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);

#ifdef CONFIG_DEBUG_VM

static bool kmem_cache_is_duplicate_name(const char *name)
{
	struct kmem_cache *s;

	list_for_each_entry(s, &slab_caches, list) {
		if (!strcmp(s->name, name))
			return true;
	}

	return false;
}

static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
		return -EINVAL;
	}

	/* Duplicate names will confuse slabtop, et al */
	WARN(kmem_cache_is_duplicate_name(name),
	     "kmem_cache of name '%s' already exists\n", name);

	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	return 0;
}
#else
static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	return 0;
}
#endif

/*
 * Figure out what the alignment of the objects will be given a set of
 * flags, a user specified alignment and the size of the objects.
 */
static unsigned int calculate_alignment(slab_flags_t flags,
		unsigned int align, unsigned int size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned int ralign;

		ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	align = max(align, arch_slab_minalign());

	return ALIGN(align, sizeof(void *));
}
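
/*
 * Worked example (illustrative, not part of the original file): with
 * SLAB_HWCACHE_ALIGN, a 20-byte object and a 64-byte cache line, the loop
 * above halves ralign while the object still fits twice into it:
 * 64 -> 32 (20 <= 32), then it stops (20 > 16), so the object ends up
 * aligned to 32 bytes rather than to a full cache line, assuming the
 * requested alignment and arch_slab_minalign() are not larger.
 */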

/*
 * Find a mergeable slab cache
 */
int slab_unmergeable(struct kmem_cache *s)
{
	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
		return 1;

	if (s->ctor)
		return 1;

#ifdef CONFIG_HARDENED_USERCOPY
	if (s->usersize)
		return 1;
#endif

	/*
	 * We may have set a slab to be unmergeable during bootstrap.
	 */
	if (s->refcount < 0)
		return 1;

	return 0;
}

bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
{
	if (slab_nomerge)
		return true;

	if (args->ctor)
		return true;

	if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
		return true;

	if (flags & SLAB_NEVER_MERGE)
		return true;

	return false;
}

static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
		const char *name, struct kmem_cache_args *args)
{
	struct kmem_cache *s;
	unsigned int align;

	flags = kmem_cache_flags(flags, name);
	if (slab_args_unmergeable(args, flags))
		return NULL;

	size = ALIGN(size, sizeof(void *));
	align = calculate_alignment(flags, args->align, size);
	size = ALIGN(size, align);

	list_for_each_entry_reverse(s, &slab_caches, list) {
		if (slab_unmergeable(s))
			continue;

		if (size > s->size)
			continue;

		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;

		if (s->size - size >= sizeof(void *))
			continue;

		return s;
	}
	return NULL;
}

static struct kmem_cache *create_cache(const char *name,
				       unsigned int object_size,
				       struct kmem_cache_args *args,
				       slab_flags_t flags)
{
	struct kmem_cache *s;
	int err;

	/* If a custom freelist pointer is requested make sure it's sane. */
	err = -EINVAL;
	if (args->use_freeptr_offset &&
	    (args->freeptr_offset >= object_size ||
	     (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
		goto out;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out;
	err = do_kmem_cache_create(s, name, object_size, args, flags);
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	return s;

out_free_cache:
	kmem_cache_free(kmem_cache, s);
out:
	return ERR_PTR(err);
}

static struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
		   struct kmem_cache_args *args)
{
	struct kmem_cache *s;

	s = find_mergeable(size, flags, name, args);
	if (s) {
		if (sysfs_slab_alias(s, name))
			pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
			       name);

		s->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		s->object_size = max(s->object_size, size);
		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
	}

	return s;
}

/**
 * __kmem_cache_create_args - Create a kmem cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @object_size: The size of objects to be created in this cache.
 * @args: Additional arguments for the cache creation (see
 *        &struct kmem_cache_args).
 * @flags: See the descriptions of individual flags. The common ones are listed
 *         in the description below.
 *
 * Not to be called directly, use the kmem_cache_create() wrapper with the same
 * parameters.
 *
 * Commonly used @flags:
 *
 * &SLAB_ACCOUNT - Account allocations to memcg.
 *
 * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
 *
 * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
 *
 * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
 * by a grace period - see the full description before using.
 *
 * Context: Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
struct kmem_cache *__kmem_cache_create_args(const char *name,
					    unsigned int object_size,
					    struct kmem_cache_args *args,
					    slab_flags_t flags)
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * If no slab_debug was enabled globally, the static key is not yet
	 * enabled by setup_slub_debug(). Enable it if the cache is being
	 * created with any of the debugging flags passed explicitly.
	 * It's also possible that this is the first cache created with
	 * SLAB_STORE_USER and we should init stack_depot for it.
	 */
	if (flags & SLAB_DEBUG_FLAGS)
		static_branch_enable(&slub_debug_enabled);
	if (flags & SLAB_STORE_USER)
		stack_depot_init();
#else
	flags &= ~SLAB_DEBUG_FLAGS;
#endif

	/*
	 * Caches with specific capacity are special enough. It's simpler to
	 * make them unmergeable.
	 */
	if (args->sheaf_capacity)
		flags |= SLAB_NO_MERGE;

	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(name, object_size);
	if (err) {
		goto out_unlock;
	}

	if (flags & ~SLAB_FLAGS_PERMITTED) {
		err = -EINVAL;
		goto out_unlock;
	}

	/* Fail closed on bad usersize or useroffset values. */
	if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
	    WARN_ON(!args->usersize && args->useroffset) ||
	    WARN_ON(object_size < args->usersize ||
		    object_size - args->usersize < args->useroffset))
		args->usersize = args->useroffset = 0;

	s = __kmem_cache_alias(name, object_size, flags, args);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	args->align = calculate_alignment(flags, args->align, object_size);
	s = create_cache(cache_name, object_size, args, flags);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	if (err) {
		if (flags & SLAB_PANIC)
			panic("%s: Failed to create slab '%s'. Error %d\n",
				__func__, name, err);
		else {
			pr_warn("%s(%s) failed with error %d\n",
				__func__, name, err);
			dump_stack();
		}
		return NULL;
	}
	return s;
}
EXPORT_SYMBOL(__kmem_cache_create_args);
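
/*
 * Illustrative usage sketch (not part of the original file): callers go
 * through the kmem_cache_create() wrapper rather than calling the function
 * above directly. The structure "foo", its constructor and the field values
 * below are hypothetical, assuming a kernel where kmem_cache_create()
 * accepts a struct kmem_cache_args:
 *
 *	struct kmem_cache_args args = {
 *		.align	= __alignof__(struct foo),
 *		.ctor	= foo_ctor,
 *	};
 *	struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), &args,
 *				      SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
 *	if (!foo_cache)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_destroy(foo_cache);
 */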

static struct kmem_cache *kmem_buckets_cache __ro_after_init;

/**
 * kmem_buckets_create - Create a set of caches that handle dynamic sized
 *			 allocations via kmem_buckets_alloc()
 * @name: A prefix string which is used in /proc/slabinfo to identify this
 *	  cache. The individual caches will have their sizes as the suffix.
 * @flags: SLAB flags (see kmem_cache_create() for details).
 * @useroffset: Starting offset within an allocation that may be copied
 *		to/from userspace.
 * @usersize: How many bytes, starting at @useroffset, may be copied
 *	      to/from userspace.
 * @ctor: A constructor for the objects, run when new allocations are made.
 *
 * Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure. When
 * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
 * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
 * (i.e. callers only need to check for NULL on failure.)
 */
kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
				  unsigned int useroffset,
				  unsigned int usersize,
				  void (*ctor)(void *))
{
	unsigned long mask = 0;
	unsigned int idx;
	kmem_buckets *b;

	BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);

	/*
	 * When the separate buckets API is not built in, just return
	 * a non-NULL value for the kmem_buckets pointer, which will be
	 * unused when performing allocations.
	 */
	if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
		return ZERO_SIZE_PTR;

	if (WARN_ON(!kmem_buckets_cache))
		return NULL;

	b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
	if (WARN_ON(!b))
		return NULL;

	flags |= SLAB_NO_MERGE;

	for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
		char *short_size, *cache_name;
		unsigned int cache_useroffset, cache_usersize;
		unsigned int size, aligned_idx;

		if (!kmalloc_caches[KMALLOC_NORMAL][idx])
			continue;

		size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
		if (!size)
			continue;

		short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
		if (WARN_ON(!short_size))
			goto fail;

		if (useroffset >= size) {
			cache_useroffset = 0;
			cache_usersize = 0;
		} else {
			cache_useroffset = useroffset;
			cache_usersize = min(size - cache_useroffset, usersize);
		}

		aligned_idx = __kmalloc_index(size, false);
		if (!(*b)[aligned_idx]) {
			cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
			if (WARN_ON(!cache_name))
				goto fail;
			(*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
					0, flags, cache_useroffset,
					cache_usersize, ctor);
			kfree(cache_name);
			if (WARN_ON(!(*b)[aligned_idx]))
				goto fail;
			set_bit(aligned_idx, &mask);
		}
		if (idx != aligned_idx)
			(*b)[idx] = (*b)[aligned_idx];
	}

	return b;

fail:
	for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
		kmem_cache_destroy((*b)[idx]);
	kmem_cache_free(kmem_buckets_cache, b);

	return NULL;
}
EXPORT_SYMBOL(kmem_buckets_create);
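
/*
 * Illustrative usage sketch (not part of the original file): a subsystem
 * that wants its variable-sized allocations segregated from the shared
 * kmalloc caches creates its buckets once and then allocates from them.
 * The names "msg_buckets" and msg_len below are hypothetical:
 *
 *	static kmem_buckets *msg_buckets __ro_after_init;
 *
 *	msg_buckets = kmem_buckets_create("msg", SLAB_ACCOUNT, 0, 0, NULL);
 *	if (!msg_buckets)
 *		return -ENOMEM;
 *
 *	buf = kmem_buckets_alloc(msg_buckets, msg_len, GFP_KERNEL);
 *
 * When CONFIG_SLAB_BUCKETS=n this degrades to plain kmalloc() behind the
 * ZERO_SIZE_PTR returned above.
 */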

/*
 * For a given kmem_cache, kmem_cache_destroy() should only be called
 * once or there will be a use-after-free problem. The actual deletion
 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
 * protection. So they are now done without holding those locks.
 */
static void kmem_cache_release(struct kmem_cache *s)
{
	kfence_shutdown_cache(s);
	if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
		sysfs_slab_release(s);
	else
		slab_kmem_cache_release(s);
}

void slab_kmem_cache_release(struct kmem_cache *s)
{
	__kmem_cache_release(s);
	kfree_const(s->name);
	kmem_cache_free(kmem_cache, s);
}

void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	if (unlikely(!s) || !kasan_check_byte(s))
		return;

	/* in-flight kfree_rcu()'s may include objects from our cache */
	kvfree_rcu_barrier_on_cache(s);

	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
		/*
		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
		 * defer their freeing with call_rcu().
		 * Wait for such call_rcu() invocations here before actually
		 * destroying the cache.
		 *
		 * It doesn't matter that we haven't looked at the slab refcount
		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
		 * the refcount should be 1 here.
		 */
		rcu_barrier();
	}

	/* Wait for deferred work from kmalloc/kfree_nolock() */
	defer_free_barrier();

	cpus_read_lock();
	mutex_lock(&slab_mutex);

	s->refcount--;
	if (s->refcount) {
		mutex_unlock(&slab_mutex);
		cpus_read_unlock();
		return;
	}

	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	err = __kmem_cache_shutdown(s);
	if (!slab_in_kunit_test())
		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
		     __func__, s->name, (void *)_RET_IP_);

	list_del(&s->list);

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();

	if (slab_state >= FULL)
		sysfs_slab_unlink(s);
	debugfs_slab_release(s);

	if (err)
		return;

	if (s->flags & SLAB_TYPESAFE_BY_RCU)
		rcu_barrier();

	kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);

/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 *
 * Return: %0 if all slabs were released, non-zero otherwise
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	kasan_cache_shrink(cachep);

	return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);

bool slab_is_available(void)
{
	return slab_state >= UP;
}

#ifdef CONFIG_PRINTK
static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	if (__kfence_obj_info(kpp, object, slab))
		return;
	__kmem_obj_info(kpp, object, slab);
}

/**
 * kmem_dump_obj - Print available slab provenance information
 * @object: slab object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate. The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For a slab-cache object, the fact that it is a slab object is printed,
 * and, if available, the slab name, return address, and stack trace from
 * the allocation and last free path of that object.
 *
 * Return: %true if the pointer is to a not-yet-freed object from
 * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
 * is to an already-freed object, and %false otherwise.
 */
bool kmem_dump_obj(void *object)
{
	char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
	int i;
	struct slab *slab;
	unsigned long ptroffset;
	struct kmem_obj_info kp = { };

	/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
	if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
		return false;
	slab = virt_to_slab(object);
	if (!slab)
		return false;

	kmem_obj_info(&kp, object, slab);
	if (kp.kp_slab_cache)
		pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
	else
		pr_cont(" slab%s", cp);
	if (is_kfence_address(object))
		pr_cont(" (kfence)");
	if (kp.kp_objp)
		pr_cont(" start %px", kp.kp_objp);
	if (kp.kp_data_offset)
		pr_cont(" data offset %lu", kp.kp_data_offset);
	if (kp.kp_objp) {
		ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
		pr_cont(" pointer offset %lu", ptroffset);
	}
	if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
		pr_cont(" size %u", kp.kp_slab_cache->object_size);
	if (kp.kp_ret)
		pr_cont(" allocated at %pS\n", kp.kp_ret);
	else
		pr_cont("\n");
	for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
		if (!kp.kp_stack[i])
			break;
		pr_info(" %pS\n", kp.kp_stack[i]);
	}

	if (kp.kp_free_stack[0])
		pr_cont(" Free path:\n");

	for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
		if (!kp.kp_free_stack[i])
			break;
		pr_info(" %pS\n", kp.kp_free_stack[i]);
	}

	return true;
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
#endif

/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int err;
	unsigned int align = ARCH_KMALLOC_MINALIGN;
	struct kmem_cache_args kmem_args = {};

	/*
	 * kmalloc caches guarantee alignment of at least the largest
	 * power-of-two divisor of the size. For power-of-two sizes,
	 * it is the size itself.
	 */
	if (flags & SLAB_KMALLOC)
		align = max(align, 1U << (ffs(size) - 1));
	kmem_args.align = calculate_alignment(flags, align, size);

#ifdef CONFIG_HARDENED_USERCOPY
	kmem_args.useroffset = useroffset;
	kmem_args.usersize = usersize;
#endif

	err = do_kmem_cache_create(s, name, size, &kmem_args, flags);

	if (err)
		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
		      name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}

static struct kmem_cache *__init create_kmalloc_cache(const char *name,
						      unsigned int size,
						      slab_flags_t flags)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
	list_add(&s->list, &slab_caches);
	s->refcount = 1;
	return s;
}

kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
{ /* initialization for https://llvm.org/pr42570 */ };
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
unsigned long random_kmalloc_seed __ro_after_init;
EXPORT_SYMBOL(random_kmalloc_seed);
#endif

/*
 * Conversion table for small slabs sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
u8 kmalloc_size_index[24] __ro_after_init = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

size_t kmalloc_size_roundup(size_t size)
{
	if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
		/*
		 * The flags don't matter since size_index is common to all.
		 * Neither does the caller for just getting ->object_size.
		 */
		return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
	}

	/* Above the smaller buckets, size is a multiple of page size. */
	if (size && size <= KMALLOC_MAX_SIZE)
		return PAGE_SIZE << get_order(size);

	/*
	 * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
	 * and very large size - kmalloc() may fail.
	 */
	return size;

}
EXPORT_SYMBOL(kmalloc_size_roundup);
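
/*
 * Worked example (illustrative, not part of the original file): a request
 * for 100 bytes has size_index_elem = (100 - 1) / 8 = 12, and
 * kmalloc_size_index[12] == 7, i.e. the kmalloc-128 cache, so
 * kmalloc_size_roundup(100) reports 128. A 300-byte request is above the
 * table and uses fls()-based indexing, landing in kmalloc-512.
 */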

#ifdef CONFIG_ZONE_DMA
#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
#else
#define KMALLOC_DMA_NAME(sz)
#endif

#ifdef CONFIG_MEMCG
#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
#else
#define KMALLOC_CGROUP_NAME(sz)
#endif

#ifndef CONFIG_SLUB_TINY
#define KMALLOC_RCL_NAME(sz)	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
#else
#define KMALLOC_RCL_NAME(sz)
#endif

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
#define KMA_RAND_1(sz)	.name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
#define KMA_RAND_2(sz)	KMA_RAND_1(sz)	.name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
#define KMA_RAND_3(sz)	KMA_RAND_2(sz)	.name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
#define KMA_RAND_4(sz)	KMA_RAND_3(sz)	.name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
#define KMA_RAND_5(sz)	KMA_RAND_4(sz)	.name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
#define KMA_RAND_6(sz)	KMA_RAND_5(sz)	.name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
#define KMA_RAND_7(sz)	KMA_RAND_6(sz)	.name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
#define KMA_RAND_8(sz)	KMA_RAND_7(sz)	.name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
#define KMA_RAND_9(sz)	KMA_RAND_8(sz)	.name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
#define KMA_RAND_10(sz)	KMA_RAND_9(sz)	.name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
#define KMA_RAND_11(sz)	KMA_RAND_10(sz)	.name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
#define KMA_RAND_12(sz)	KMA_RAND_11(sz)	.name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
#define KMA_RAND_13(sz)	KMA_RAND_12(sz)	.name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
#define KMA_RAND_14(sz)	KMA_RAND_13(sz)	.name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
#define KMA_RAND_15(sz)	KMA_RAND_14(sz)	.name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
#else // CONFIG_RANDOM_KMALLOC_CACHES
#define KMALLOC_RANDOM_NAME(N, sz)
#endif

#define INIT_KMALLOC_INFO(__size, __short_size)	\
{	\
	.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size,	\
	KMALLOC_RCL_NAME(__short_size)	\
	KMALLOC_CGROUP_NAME(__short_size)	\
	KMALLOC_DMA_NAME(__short_size)	\
	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
	.size = __size,	\
}

/*
 * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
 * kmalloc-2M.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	INIT_KMALLOC_INFO(0, 0),
	INIT_KMALLOC_INFO(96, 96),
	INIT_KMALLOC_INFO(192, 192),
	INIT_KMALLOC_INFO(8, 8),
	INIT_KMALLOC_INFO(16, 16),
	INIT_KMALLOC_INFO(32, 32),
	INIT_KMALLOC_INFO(64, 64),
	INIT_KMALLOC_INFO(128, 128),
	INIT_KMALLOC_INFO(256, 256),
	INIT_KMALLOC_INFO(512, 512),
	INIT_KMALLOC_INFO(1024, 1k),
	INIT_KMALLOC_INFO(2048, 2k),
	INIT_KMALLOC_INFO(4096, 4k),
	INIT_KMALLOC_INFO(8192, 8k),
	INIT_KMALLOC_INFO(16384, 16k),
	INIT_KMALLOC_INFO(32768, 32k),
	INIT_KMALLOC_INFO(65536, 64k),
	INIT_KMALLOC_INFO(131072, 128k),
	INIT_KMALLOC_INFO(262144, 256k),
	INIT_KMALLOC_INFO(524288, 512k),
	INIT_KMALLOC_INFO(1048576, 1M),
	INIT_KMALLOC_INFO(2097152, 2M)
};

/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int i;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		     !is_power_of_2(KMALLOC_MIN_SIZE));

	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		unsigned int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(kmalloc_size_index))
			break;
		kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte sized cache is not used if the alignment
		 * is 64 byte.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 7;

	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			kmalloc_size_index[size_index_elem(i)] = 8;
	}
}

static unsigned int __kmalloc_minalign(void)
{
	unsigned int minalign = dma_get_cache_alignment();

	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
	    is_swiotlb_allocated())
		minalign = ARCH_KMALLOC_MINALIGN;

	return max(minalign, arch_slab_minalign());
}

static void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
{
	slab_flags_t flags = 0;
	unsigned int minalign = __kmalloc_minalign();
	unsigned int aligned_size = kmalloc_info[idx].size;
	int aligned_idx = idx;

	if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
		flags |= SLAB_RECLAIM_ACCOUNT;
	} else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
		if (mem_cgroup_kmem_disabled()) {
			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
			return;
		}
		flags |= SLAB_ACCOUNT;
	} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
		flags |= SLAB_CACHE_DMA;
	}

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
		flags |= SLAB_NO_MERGE;
#endif

	/*
	 * If CONFIG_MEMCG is enabled, disable cache merging for
	 * KMALLOC_NORMAL caches.
	 */
	if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
		flags |= SLAB_NO_MERGE;

	if (minalign > ARCH_KMALLOC_MINALIGN) {
		aligned_size = ALIGN(aligned_size, minalign);
		aligned_idx = __kmalloc_index(aligned_size, false);
	}

	if (!kmalloc_caches[type][aligned_idx])
		kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
					kmalloc_info[aligned_idx].name[type],
					aligned_size, flags);
	if (idx != aligned_idx)
		kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
}

/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(void)
{
	int i;
	enum kmalloc_cache_type type;

	/*
	 * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
	 */
	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
		/* Caches that are NOT of the two-to-the-power-of size. */
		if (KMALLOC_MIN_SIZE <= 32)
			new_kmalloc_cache(1, type);
		if (KMALLOC_MIN_SIZE <= 64)
			new_kmalloc_cache(2, type);

		/* Caches that are of the two-to-the-power-of size. */
		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
			new_kmalloc_cache(i, type);
	}
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
	random_kmalloc_seed = get_random_u64();
#endif

	/* Kmalloc array is now usable */
	slab_state = UP;

	if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
		kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
						       sizeof(kmem_buckets),
						       0, SLAB_NO_MERGE, NULL);
}

gfp_t kmalloc_fix_flags(gfp_t flags)
{
	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;

	flags &= ~GFP_SLAB_BUG_MASK;
	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
			invalid_mask, &invalid_mask, flags, &flags);
	dump_stack();

	return flags;
}

#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
static void freelist_randomize(unsigned int *list,
			       unsigned int count)
{
	unsigned int rand;
	unsigned int i;

	for (i = 0; i < count; i++)
		list[i] = i;

	/* Fisher-Yates shuffle */
	for (i = count - 1; i > 0; i--) {
		rand = get_random_u32_below(i + 1);
		swap(list[i], list[rand]);
	}
}

/* Create a random sequence per cache */
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
			    gfp_t gfp)
{

	if (count < 2 || cachep->random_seq)
		return 0;

	cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
	if (!cachep->random_seq)
		return -ENOMEM;

	freelist_randomize(cachep->random_seq, count);
	return 0;
}

/* Destroy the per-cache random freelist sequence */
void cache_random_seq_destroy(struct kmem_cache *cachep)
{
	kfree(cachep->random_seq);
	cachep->random_seq = NULL;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */

#ifdef CONFIG_SLUB_DEBUG
#define SLABINFO_RIGHTS (0400)

static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
	seq_puts(m, "slabinfo - version: 2.1\n");
	seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
	seq_putc(m, '\n');
}

static void *slab_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);
	return seq_list_start(&slab_caches, *pos);
}

static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &slab_caches, pos);
}

static void slab_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&slab_mutex);
}

static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
	struct slabinfo sinfo;

	memset(&sinfo, 0, sizeof(sinfo));
	get_slabinfo(s, &sinfo);

	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
		   s->name, sinfo.active_objs, sinfo.num_objs, s->size,
		   sinfo.objects_per_slab, (1 << sinfo.cache_order));

	seq_printf(m, " : tunables %4u %4u %4u",
		   sinfo.limit, sinfo.batchcount, sinfo.shared);
	seq_printf(m, " : slabdata %6lu %6lu %6lu",
		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
	seq_putc(m, '\n');
}
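
/*
 * Illustrative sample of the resulting /proc/slabinfo line (values are made
 * up; the field layout follows the seq_printf() format strings above):
 *
 *	kmalloc-128 1024 1024 128 32 1 : tunables 0 0 0 : slabdata 32 32 0
 */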

static int slab_show(struct seq_file *m, void *p)
{
	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);

	if (p == slab_caches.next)
		print_slabinfo_header(m);
	cache_show(s, m);
	return 0;
}

void dump_unreclaimable_slab(void)
{
	struct kmem_cache *s;
	struct slabinfo sinfo;

	/*
	 * Acquiring slab_mutex here is risky since we don't want to sleep
	 * in the OOM path. But without holding the mutex the list traversal
	 * risks a crash.
	 * Use mutex_trylock to protect the list traversal and dump nothing
	 * if the mutex cannot be acquired.
	 */
	if (!mutex_trylock(&slab_mutex)) {
		pr_warn("excessive unreclaimable slab but cannot dump stats\n");
		return;
	}

	pr_info("Unreclaimable slab info:\n");
	pr_info("Name Used Total\n");

	list_for_each_entry(s, &slab_caches, list) {
		if (s->flags & SLAB_RECLAIM_ACCOUNT)
			continue;

		get_slabinfo(s, &sinfo);

		if (sinfo.num_objs > 0)
			pr_info("%-17s %10luKB %10luKB\n", s->name,
				(sinfo.active_objs * s->size) / 1024,
				(sinfo.num_objs * s->size) / 1024);
	}
	mutex_unlock(&slab_mutex);
}

/*
 * slabinfo_op - iterator that generates /proc/slabinfo
 *
 * Output layout:
 * cache-name
 * num-active-objs
 * total-objs
 * object size
 * num-active-slabs
 * total-slabs
 * num-pages-per-slab
 * + further values on SMP and with statistics enabled
 */
static const struct seq_operations slabinfo_op = {
	.start = slab_start,
	.next = slab_next,
	.stop = slab_stop,
	.show = slab_show,
};

static int slabinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &slabinfo_op);
}

static const struct proc_ops slabinfo_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= slabinfo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};

static int __init slab_proc_init(void)
{
	proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
	return 0;
}
module_init(slab_proc_init);

#endif /* CONFIG_SLUB_DEBUG */

/**
 * kfree_sensitive - Clear sensitive information in memory before freeing
 * @p: object to free memory of
 *
 * The memory of the object @p points to is zeroed before it is freed.
 * If @p is %NULL, kfree_sensitive() does nothing.
 *
 * Note: this function zeroes the whole allocated buffer which can be a good
 * deal bigger than the requested buffer size passed to kmalloc(). So be
 * careful when using this function in performance sensitive code.
 */
void kfree_sensitive(const void *p)
{
	size_t ks;
	void *mem = (void *)p;

	ks = ksize(mem);
	if (ks) {
		kasan_unpoison_range(mem, ks);
		memzero_explicit(mem, ks);
	}
	kfree(mem);
}
EXPORT_SYMBOL(kfree_sensitive);
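
/*
 * Illustrative usage sketch (not part of the original file): key material is
 * wiped before the buffer is returned to the allocator. The names below
 * (key, key_len) are hypothetical:
 *
 *	u8 *key = kmalloc(key_len, GFP_KERNEL);
 *	...
 *	kfree_sensitive(key);	// zeroes the whole allocation, then frees it
 */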

#ifdef CONFIG_BPF_SYSCALL
#include <linux/btf.h>

__bpf_kfunc_start_defs();

__bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
{
	struct slab *slab;

	if (!virt_addr_valid((void *)(long)addr))
		return NULL;

	slab = virt_to_slab((void *)(long)addr);
	return slab ? slab->slab_cache : NULL;
}

__bpf_kfunc_end_defs();
#endif /* CONFIG_BPF_SYSCALL */

/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);

#ifndef CONFIG_KVFREE_RCU_BATCHED

void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
	if (head) {
		kasan_record_aux_stack(ptr);
		call_rcu(head, kvfree_rcu_cb);
		return;
	}

	// kvfree_rcu(one_arg) call.
	might_sleep();
	synchronize_rcu();
	kvfree(ptr);
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);

void __init kvfree_rcu_init(void)
{
}

#else /* CONFIG_KVFREE_RCU_BATCHED */

/*
 * This rcu parameter is runtime-read-only. It reflects
 * a minimum allowed number of objects which can be cached
 * per-CPU. Object size is equal to one page. This value
 * can be changed at boot time.
 */
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);

// A page shrinker can ask for pages to be freed to make them
// available for other parts of the system. This usually happens
// under low memory conditions, and in that case we should also
// defer page-cache filling for a short time period.
//
// The default value is 5 seconds, which is long enough to reduce
// interference with the shrinker while it asks other systems to
// drain their caches.
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);

static struct workqueue_struct *rcu_reclaim_wq;

/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2

/**
 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
 * @list: List node. All blocks are linked between each other
 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
 * @nr_records: Number of active pointers in the array
 * @records: Array of the kvfree_rcu() pointers
 */
struct kvfree_rcu_bulk_data {
	struct list_head list;
	struct rcu_gp_oldstate gp_snap;
	unsigned long nr_records;
	void *records[] __counted_by(nr_records);
};

/*
 * This macro defines how many entries the "records" array
 * will contain. It is based on the fact that the size of
 * kvfree_rcu_bulk_data structure becomes exactly one page.
 */
#define KVFREE_BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
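
/*
 * Rough worked example (illustrative; the exact header size depends on the
 * architecture and config): with 4 KiB pages and a header of roughly 40
 * bytes (list_head + rcu_gp_oldstate + nr_records), the flexible records[]
 * array holds on the order of (4096 - 40) / 8 = 507 pointers, so a single
 * page-sized block batches several hundred kvfree_rcu() requests.
 */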

/**
 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
 * @head_free: List of kfree_rcu() objects waiting for a grace period
 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
 * @krcp: Pointer to @kfree_rcu_cpu structure
 */

struct kfree_rcu_cpu_work {
	struct rcu_work rcu_work;
	struct rcu_head *head_free;
	struct rcu_gp_oldstate head_free_gp_snap;
	struct list_head bulk_head_free[FREE_N_CHANNELS];
	struct kfree_rcu_cpu *krcp;
};

/**
 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
 * @head: List of kfree_rcu() objects not yet waiting for a grace period
 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
 * @lock: Synchronize access to this structure
 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
 * @initialized: The @rcu_work fields have been initialized
 * @head_count: Number of objects in rcu_head singular list
 * @bulk_count: Number of objects in bulk-list
 * @bkvcache:
 *	A simple cache list that contains objects for reuse purpose.
 *	In order to save some per-cpu space the list is singular.
 *	Even though it is lockless an access has to be protected by the
 *	per-cpu lock.
 * @page_cache_work: A work to refill the cache when it is empty
 * @backoff_page_cache_fill: Delay cache refills
 * @work_in_progress: Indicates that page_cache_work is running
 * @hrtimer: A hrtimer for scheduling a page_cache_work
 * @nr_bkv_objs: number of allocated objects at @bkvcache.
 *
 * This is a per-CPU structure. The reason that it is not included in
 * the rcu_data structure is to permit this code to be extracted from
 * the RCU files. Such extraction could allow further optimization of
 * the interactions with the slab allocators.
 */
struct kfree_rcu_cpu {
	// Objects queued on a linked list
	// through their rcu_head structures.
	struct rcu_head *head;
	unsigned long head_gp_snap;
	atomic_t head_count;

	// Objects queued on a bulk-list.
	struct list_head bulk_head[FREE_N_CHANNELS];
	atomic_t bulk_count[FREE_N_CHANNELS];

	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
	raw_spinlock_t lock;
	struct delayed_work monitor_work;
	bool initialized;

	struct delayed_work page_cache_work;
	atomic_t backoff_page_cache_fill;
	atomic_t work_in_progress;
	struct hrtimer hrtimer;

	struct llist_head bkvcache;
	int nr_bkv_objs;
};

static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
};

static __always_inline void
debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
{
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
	int i;

	for (i = 0; i < bhead->nr_records; i++)
		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
#endif
}

static inline struct kfree_rcu_cpu *
krc_this_cpu_lock(unsigned long *flags)
{
	struct kfree_rcu_cpu *krcp;

	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
	krcp = this_cpu_ptr(&krc);
	raw_spin_lock(&krcp->lock);

	return krcp;
}

static inline void
krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}

static inline struct kvfree_rcu_bulk_data *
get_cached_bnode(struct kfree_rcu_cpu *krcp)
{
	if (!krcp->nr_bkv_objs)
		return NULL;

	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
	return (struct kvfree_rcu_bulk_data *)
		llist_del_first(&krcp->bkvcache);
}

static inline bool
put_cached_bnode(struct kfree_rcu_cpu *krcp,
		 struct kvfree_rcu_bulk_data *bnode)
{
	// Check the limit.
	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
		return false;

	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
	return true;
}

static int
drain_page_cache(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;
	struct llist_node *page_list, *pos, *n;
	int freed = 0;

	if (!rcu_min_cached_objs)
		return 0;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	page_list = llist_del_all(&krcp->bkvcache);
	WRITE_ONCE(krcp->nr_bkv_objs, 0);
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	llist_for_each_safe(pos, n, page_list) {
		free_page((unsigned long)pos);
		freed++;
	}

	return freed;
}

static void
kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
		struct kvfree_rcu_bulk_data *bnode, int idx)
{
	unsigned long flags;
	int i;

	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
		debug_rcu_bhead_unqueue(bnode);
		rcu_lock_acquire(&rcu_callback_map);
		if (idx == 0) { // kmalloc() / kfree().
			trace_rcu_invoke_kfree_bulk_callback(
				"slab", bnode->nr_records,
				bnode->records);

			kfree_bulk(bnode->nr_records, bnode->records);
		} else { // vmalloc() / vfree().
			for (i = 0; i < bnode->nr_records; i++) {
				trace_rcu_invoke_kvfree_callback(
					"slab", bnode->records[i], 0);

				vfree(bnode->records[i]);
			}
		}
		rcu_lock_release(&rcu_callback_map);
	}

	raw_spin_lock_irqsave(&krcp->lock, flags);
	if (put_cached_bnode(krcp, bnode))
		bnode = NULL;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	if (bnode)
		free_page((unsigned long) bnode);

	cond_resched_tasks_rcu_qs();
}

static void
kvfree_rcu_list(struct rcu_head *head)
{
	struct rcu_head *next;

	for (; head; head = next) {
		void *ptr = (void *) head->func;
		unsigned long offset = (void *) head - ptr;

		next = head->next;
		debug_rcu_head_unqueue((struct rcu_head *)ptr);
		rcu_lock_acquire(&rcu_callback_map);
		trace_rcu_invoke_kvfree_callback("slab", head, offset);

		kvfree(ptr);

		rcu_lock_release(&rcu_callback_map);
		cond_resched_tasks_rcu_qs();
	}
}

/*
 * This function is invoked in workqueue context after a grace period.
 * It frees all the objects queued on ->bulk_head_free or ->head_free.
 */
static void kfree_rcu_work(struct work_struct *work)
{
	unsigned long flags;
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct list_head bulk_head[FREE_N_CHANNELS];
	struct rcu_head *head;
	struct kfree_rcu_cpu *krcp;
	struct kfree_rcu_cpu_work *krwp;
	struct rcu_gp_oldstate head_gp_snap;
	int i;

	krwp = container_of(to_rcu_work(work),
			    struct kfree_rcu_cpu_work, rcu_work);
	krcp = krwp->krcp;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	// Channels 1 and 2.
	for (i = 0; i < FREE_N_CHANNELS; i++)
		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);

	// Channel 3.
	head = krwp->head_free;
	krwp->head_free = NULL;
	head_gp_snap = krwp->head_free_gp_snap;
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	// Handle the first two channels.
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		// Start from the tail page, so a GP is likely passed for it.
		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	/*
	 * This is used when the "bulk" path can not be used for the
	 * double-argument of kvfree_rcu(). This happens when the
	 * page-cache is empty, which means that objects are instead
	 * queued on a linked list through their rcu_head structures.
	 * This list is named "Channel 3".
	 */
	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
		kvfree_rcu_list(head);
}

static bool kfree_rcu_sheaf(void *obj)
{
	struct kmem_cache *s;
	struct slab *slab;

	if (is_vmalloc_addr(obj))
		return false;

	slab = virt_to_slab(obj);
	if (unlikely(!slab))
		return false;

	s = slab->slab_cache;
	if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
		return __kfree_rcu_sheaf(s, obj);

	return false;
}

static bool
need_offload_krc(struct kfree_rcu_cpu *krcp)
{
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		if (!list_empty(&krcp->bulk_head[i]))
			return true;

	return !!READ_ONCE(krcp->head);
}

static bool
need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
{
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		if (!list_empty(&krwp->bulk_head_free[i]))
			return true;

	return !!krwp->head_free;
}

static int krc_count(struct kfree_rcu_cpu *krcp)
{
	int sum = atomic_read(&krcp->head_count);
	int i;

	for (i = 0; i < FREE_N_CHANNELS; i++)
		sum += atomic_read(&krcp->bulk_count[i]);

	return sum;
}

static void
__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	long delay, delay_left;

	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
	if (delayed_work_pending(&krcp->monitor_work)) {
		delay_left = krcp->monitor_work.timer.expires - jiffies;
		if (delay < delay_left)
			mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
		return;
	}
	queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
}

static void
schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	__schedule_delayed_monitor_work(krcp);
	raw_spin_unlock_irqrestore(&krcp->lock, flags);
}

static void
kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
{
	struct list_head bulk_ready[FREE_N_CHANNELS];
	struct kvfree_rcu_bulk_data *bnode, *n;
	struct rcu_head *head_ready = NULL;
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&krcp->lock, flags);
	for (i = 0; i < FREE_N_CHANNELS; i++) {
		INIT_LIST_HEAD(&bulk_ready[i]);

		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
			if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
				break;

			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
			list_move(&bnode->list, &bulk_ready[i]);
		}
	}

	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
		head_ready = krcp->head;
		atomic_set(&krcp->head_count, 0);
		WRITE_ONCE(krcp->head, NULL);
	}
	raw_spin_unlock_irqrestore(&krcp->lock, flags);

	for (i = 0; i < FREE_N_CHANNELS; i++) {
		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
			kvfree_rcu_bulk(krcp, bnode, i);
	}

	if (head_ready)
		kvfree_rcu_list(head_ready);
}

/*
 * Return: %true if a work is queued, %false otherwise.
 */
static bool
kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
{
	unsigned long flags;
	bool queued = false;
	int i, j;

	raw_spin_lock_irqsave(&krcp->lock, flags);

	// Attempt to start a new batch.
	for (i = 0; i < KFREE_N_BATCHES; i++) {
		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);

		// Try to detach bulk_head or head and attach it, only when
		// all channels are free. If any channel is not free, krwp
		// still has on-going RCU work handling a previous batch.
		if (need_wait_for_krwp_work(krwp))
			continue;

		// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
		if (need_offload_krc(krcp)) {
			// Channel 1 corresponds to the SLAB-pointer bulk path.
			// Channel 2 corresponds to vmalloc-pointer bulk path.
			for (j = 0; j < FREE_N_CHANNELS; j++) {
				if (list_empty(&krwp->bulk_head_free[j])) {
					atomic_set(&krcp->bulk_count[j], 0);
					list_replace_init(&krcp->bulk_head[j],
						&krwp->bulk_head_free[j]);
				}
			}

			// Channel 3 corresponds to both SLAB and vmalloc
			// objects queued on the linked list.
			if (!krwp->head_free) {
				krwp->head_free = krcp->head;
				get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
				atomic_set(&krcp->head_count, 0);
				WRITE_ONCE(krcp->head, NULL);
			}

			// One work item covers one batch with its three "free
			// channels", so break the loop: this CPU is done and
			// queuing the RCU work below always succeeds here.
			queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
			WARN_ON_ONCE(!queued);
			break;
		}
	}

	raw_spin_unlock_irqrestore(&krcp->lock, flags);
	return queued;
}

/*
 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
 */
static void kfree_rcu_monitor(struct work_struct *work)
{
	struct kfree_rcu_cpu *krcp = container_of(work,
		struct kfree_rcu_cpu, monitor_work.work);

	// Drain ready for reclaim.
	kvfree_rcu_drain_ready(krcp);

	// Queue a batch for a rest.
	kvfree_rcu_queue_batch(krcp);

	// If there is nothing to detach, it means that our job is
	// successfully done here. In case of having at least one
	// of the channels that is still busy we should rearm the
	// work to repeat an attempt. Because previous batches are
	// still in progress.
	if (need_offload_krc(krcp))
		schedule_delayed_monitor_work(krcp);
}

static void fill_page_cache_func(struct work_struct *work)
{
	struct kvfree_rcu_bulk_data *bnode;
	struct kfree_rcu_cpu *krcp =
		container_of(work, struct kfree_rcu_cpu,
			page_cache_work.work);
	unsigned long flags;
	int nr_pages;
	bool pushed;
	int i;

	nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
		1 : rcu_min_cached_objs;

	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
		bnode = (struct kvfree_rcu_bulk_data *)
			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);

		if (!bnode)
			break;

		raw_spin_lock_irqsave(&krcp->lock, flags);
		pushed = put_cached_bnode(krcp, bnode);
		raw_spin_unlock_irqrestore(&krcp->lock, flags);

		if (!pushed) {
			free_page((unsigned long) bnode);
			break;
		}
	}

	atomic_set(&krcp->work_in_progress, 0);
	atomic_set(&krcp->backoff_page_cache_fill, 0);
}

// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
// state specified by flags. If can_alloc is true, the caller must
// be schedulable and not be holding any locks or mutexes that might be
// acquired by the memory allocator or anything that it might invoke.
// Returns true if ptr was successfully recorded, else the caller must
// use a fallback.
static inline bool
add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
	unsigned long *flags, void *ptr, bool can_alloc)
{
	struct kvfree_rcu_bulk_data *bnode;
	int idx;

	*krcp = krc_this_cpu_lock(flags);
	if (unlikely(!(*krcp)->initialized))
		return false;

	idx = !!is_vmalloc_addr(ptr);
	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
		struct kvfree_rcu_bulk_data, list);

	/* Check if a new block is required. */
	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
		bnode = get_cached_bnode(*krcp);
		if (!bnode && can_alloc) {
			krc_this_cpu_unlock(*krcp, *flags);

			// __GFP_NORETRY - allows a light-weight direct reclaim,
			// which is acceptable since it minimizes the chance of
			// hitting the fallback path. It also forbids invoking
			// the OOM killer, which is beneficial because we are
			// about to release memory soon anyway.
			//
			// __GFP_NOMEMALLOC - prevents consuming all the
			// memory reserves. Please note we have a fallback path.
			//
			// __GFP_NOWARN - the allocation is expected to fail
			// under low-memory or high memory-pressure scenarios,
			// so do not warn about it.
			bnode = (struct kvfree_rcu_bulk_data *)
				__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
		}

		if (!bnode)
			return false;

		// Initialize the new block and attach it.
		bnode->nr_records = 0;
		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
	}

	// Finally insert and update the GP for this page.
	bnode->nr_records++;
	bnode->records[bnode->nr_records - 1] = ptr;
	get_state_synchronize_rcu_full(&bnode->gp_snap);
	atomic_inc(&(*krcp)->bulk_count[idx]);

	return true;
}

static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
	struct kfree_rcu_cpu *krcp =
		container_of(t, struct kfree_rcu_cpu, hrtimer);

	queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
	return HRTIMER_NORESTART;
}

static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
	// If cache disabled, bail out.
	if (!rcu_min_cached_objs)
		return;

	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
	    !atomic_xchg(&krcp->work_in_progress, 1)) {
		if (atomic_read(&krcp->backoff_page_cache_fill)) {
			queue_delayed_work(rcu_reclaim_wq,
				&krcp->page_cache_work,
				msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
		} else {
			hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
				      HRTIMER_MODE_REL);
			hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
		}
	}
}

void __init kfree_rcu_scheduler_running(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		if (need_offload_krc(krcp))
			schedule_delayed_monitor_work(krcp);
	}
}

/*
 * Queue a request for lazy invocation of the appropriate free routine
 * after a grace period. Please note that three paths are maintained,
 * two for the common case using arrays of pointers and a third one that
 * is used only when the main paths cannot be used, for example, due to
 * memory pressure.
 *
 * Each kvfree_call_rcu() request is added to a batch. The batch will be
 * drained every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in
 * the batch will be freed in workqueue context. This allows us to batch
 * requests together to reduce the number of grace periods during heavy
 * kfree_rcu()/kvfree_rcu() load.
 */
void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
	unsigned long flags;
	struct kfree_rcu_cpu *krcp;
	bool success;

	/*
	 * Please note there is a limitation for the head-less
	 * variant, that is why there is a clear rule for such
	 * objects: it can be used from might_sleep() context
	 * only. For other places please embed an rcu_head to
	 * your data.
	 */
	if (!head)
		might_sleep();

	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
		return;

	// Queue the object but don't yet schedule the batch.
	if (debug_rcu_head_queue(ptr)) {
		// Probable double kfree_rcu(), just leak.
		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
			  __func__, head);

		// Mark as success and leave.
		return;
	}

	kasan_record_aux_stack(ptr);
	success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
	if (!success) {
		run_page_cache_worker(krcp);

		if (head == NULL)
			// Fall back to the inline path for a kvfree_rcu(one_arg) call.
			goto unlock_return;

		head->func = ptr;
		head->next = krcp->head;
		WRITE_ONCE(krcp->head, head);
		atomic_inc(&krcp->head_count);

		// Take a snapshot for this krcp.
		krcp->head_gp_snap = get_state_synchronize_rcu();
		success = true;
	}

	/*
	 * The kvfree_rcu() caller considers the pointer freed at this point
	 * and likely removes any references to it. Since the actual slab
	 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
	 * this object (no scanning or false-positive reporting).
	 */
	kmemleak_ignore(ptr);

	// Set timer to drain after KFREE_DRAIN_JIFFIES.
	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
		__schedule_delayed_monitor_work(krcp);

unlock_return:
	krc_this_cpu_unlock(krcp, flags);

	/*
	 * Inline kvfree() after synchronize_rcu(). We can do
	 * it from a might_sleep() context only, so the current
	 * CPU can pass the QS state.
	 */
	if (!success) {
		debug_rcu_head_unqueue((struct rcu_head *) ptr);
		synchronize_rcu();
		kvfree(ptr);
	}
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu);

static inline void __kvfree_rcu_barrier(void)
{
	struct kfree_rcu_cpu_work *krwp;
	struct kfree_rcu_cpu *krcp;
	bool queued;
	int i, cpu;

	/*
	 * First, detach objects and queue them as RCU batches on all CPUs.
	 * Then flush the queued works for each CPU.
	 *
	 * Please note: if there are outstanding batches for a particular
	 * CPU, those have to finish first, followed by queueing a new one.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * Check if this CPU has any objects which have been queued for a
		 * new GP completion. If not (nothing to detach), we are done with
		 * it. If any batch is pending/running for this "krcp", the per-CPU
		 * flush_rcu_work() below waits for its completion (see the last
		 * step).
		 */
		if (!need_offload_krc(krcp))
			continue;

		while (1) {
			/*
			 * Failing to queue a new RCU work means either:
			 * - batches for this CPU are still in flight, which should
			 *   be flushed first before repeating;
			 * - there are no objects to detach, because of concurrency.
			 */
			queued = kvfree_rcu_queue_batch(krcp);

			/*
			 * Bail out if there is no longer a need to offload this
			 * "krcp". As noted earlier, it can run concurrently.
			 */
			if (queued || !need_offload_krc(krcp))
				break;

			/* There are ongoing batches. */
			for (i = 0; i < KFREE_N_BATCHES; i++) {
				krwp = &(krcp->krw_arr[i]);
				flush_rcu_work(&krwp->rcu_work);
			}
		}
	}

	/*
	 * Now guarantee that all objects are flushed.
	 */
	for_each_possible_cpu(cpu) {
		krcp = per_cpu_ptr(&krc, cpu);

		/*
		 * The monitor work can drain ready-to-reclaim objects
		 * directly. Wait for its completion if it is running or pending.
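		 * (cancel_delayed_work_sync() waits for a running monitor_work
		 * to finish and cancels a pending one.)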
		 */
		cancel_delayed_work_sync(&krcp->monitor_work);

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			krwp = &(krcp->krw_arr[i]);
			flush_rcu_work(&krwp->rcu_work);
		}
	}
}

/**
 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
 *
 * Note that the single-argument form of kvfree_rcu() has a slow path that
 * triggers synchronize_rcu() followed by freeing the pointer, and this is
 * done before the function returns. Therefore, for any single-argument call
 * that will result in a kfree() to a cache that is to be destroyed during
 * module exit, it is the developer's responsibility to ensure that all such
 * calls have returned before the call to kmem_cache_destroy().
 */
void kvfree_rcu_barrier(void)
{
	flush_all_rcu_sheaves();
	__kvfree_rcu_barrier();
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);

/**
 * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
 * specific slab cache.
 * @s: slab cache to wait for
 *
 * See the description of kvfree_rcu_barrier() for details.
 */
void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
{
	if (cache_has_sheaves(s)) {
		flush_rcu_sheaves_on_cache(s);
		rcu_barrier();
	}

	/*
	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
	 * on a specific slab cache.
	 */
	__kvfree_rcu_barrier();
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);

static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	int cpu;
	unsigned long count = 0;

	/* Snapshot the count across all CPUs. */
	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		count += krc_count(krcp);
		count += READ_ONCE(krcp->nr_bkv_objs);
		atomic_set(&krcp->backoff_page_cache_fill, 1);
	}

	return count == 0 ? SHRINK_EMPTY : count;
}

static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	int cpu, freed = 0;

	for_each_possible_cpu(cpu) {
		int count;
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		count = krc_count(krcp);
		count += drain_page_cache(krcp);
		kfree_rcu_monitor(&krcp->monitor_work.work);

		sc->nr_to_scan -= count;
		freed += count;

		if (sc->nr_to_scan <= 0)
			break;
	}

	return freed == 0 ? SHRINK_STOP : freed;
}

void __init kvfree_rcu_init(void)
{
	int cpu;
	int i, j;
	struct shrinker *kfree_rcu_shrinker;

	rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
			WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	WARN_ON(!rcu_reclaim_wq);

	/* Clamp it to the [0:100] seconds interval. */
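	/*
	 * Out-of-range values of rcu_delay_page_cache_fill_msec are adjusted
	 * here and the new value is reported via the pr_info() below.
	 */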
	if (rcu_delay_page_cache_fill_msec < 0 ||
		rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {

		rcu_delay_page_cache_fill_msec =
			clamp(rcu_delay_page_cache_fill_msec, 0,
				(int) (100 * MSEC_PER_SEC));

		pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
			rcu_delay_page_cache_fill_msec);
	}

	for_each_possible_cpu(cpu) {
		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

		for (i = 0; i < KFREE_N_BATCHES; i++) {
			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
			krcp->krw_arr[i].krcp = krcp;

			for (j = 0; j < FREE_N_CHANNELS; j++)
				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
		}

		for (i = 0; i < FREE_N_CHANNELS; i++)
			INIT_LIST_HEAD(&krcp->bulk_head[i]);

		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
		krcp->initialized = true;
	}

	kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
	if (!kfree_rcu_shrinker) {
		pr_err("Failed to allocate kfree_rcu() shrinker!\n");
		return;
	}

	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;

	shrinker_register(kfree_rcu_shrinker);
}

#endif /* CONFIG_KVFREE_RCU_BATCHED */
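/*
 * Illustrative usage sketch (not derived from this file; the names foo,
 * foo_cache, foo_release() and foo_exit() are hypothetical): a module that
 * defers frees with kvfree_rcu() and destroys its backing cache on exit
 * should wait for all in-flight requests first, as described in the
 * kvfree_rcu_barrier() kernel-doc above:
 *
 *	struct foo {
 *		struct rcu_head rcu;
 *		int data;
 *	};
 *
 *	static void foo_release(struct foo *fp)
 *	{
 *		kvfree_rcu(fp, rcu);	// two-argument form, never blocks
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		kvfree_rcu_barrier();
 *		kmem_cache_destroy(foo_cache);
 *	}
 */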