1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org> 5 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org> 6 * Copyright (c) 2004-2006 Robert N. M. Watson 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice unmodified, this list of conditions, and the following 14 * disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 /* 32 * uma_core.c Implementation of the Universal Memory allocator 33 * 34 * This allocator is intended to replace the multitude of similar object caches 35 * in the standard FreeBSD kernel. The intent is to be flexible as well as 36 * efficient. A primary design goal is to return unused memory to the rest of 37 * the system. This will make the system as a whole more flexible due to the 38 * ability to move memory to subsystems which most need it instead of leaving 39 * pools of reserved memory unused. 40 * 41 * The basic ideas stem from similar slab/zone based allocators whose algorithms 42 * are well known. 43 * 44 */ 45 46 /* 47 * TODO: 48 * - Improve memory usage for large allocations 49 * - Investigate cache size adjustments 50 */ 51 52 #include <sys/cdefs.h> 53 #include "opt_ddb.h" 54 #include "opt_param.h" 55 #include "opt_vm.h" 56 57 #include <sys/param.h> 58 #include <sys/systm.h> 59 #include <sys/asan.h> 60 #include <sys/bitset.h> 61 #include <sys/domainset.h> 62 #include <sys/eventhandler.h> 63 #include <sys/kernel.h> 64 #include <sys/types.h> 65 #include <sys/limits.h> 66 #include <sys/queue.h> 67 #include <sys/malloc.h> 68 #include <sys/ktr.h> 69 #include <sys/lock.h> 70 #include <sys/msan.h> 71 #include <sys/mutex.h> 72 #include <sys/proc.h> 73 #include <sys/random.h> 74 #include <sys/rwlock.h> 75 #include <sys/sbuf.h> 76 #include <sys/sched.h> 77 #include <sys/sleepqueue.h> 78 #include <sys/smp.h> 79 #include <sys/smr.h> 80 #include <sys/sysctl.h> 81 #include <sys/taskqueue.h> 82 #include <sys/vmmeter.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_param.h> 86 #include <vm/vm_domainset.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_pageout.h> 89 #include <vm/vm_phys.h> 90 #include <vm/vm_pagequeue.h> 91 #include <vm/vm_map.h> 92 #include <vm/vm_kern.h> 93 #include <vm/vm_extern.h> 94 #include <vm/vm_dumpset.h> 95 #include <vm/uma.h> 96 #include <vm/uma_int.h> 97 #include <vm/uma_dbg.h> 98 99 #include <ddb/ddb.h> 100 101 #ifdef DEBUG_MEMGUARD 102 #include <vm/memguard.h> 103 #endif 104 105 #include <machine/md_var.h> 106 107 #ifdef INVARIANTS 108 #define UMA_ALWAYS_CTORDTOR 1 109 #else 110 #define UMA_ALWAYS_CTORDTOR 0 111 #endif 112 113 /* 114 * This is the zone and keg from which all zones are spawned. 115 */ 116 static uma_zone_t kegs; 117 static uma_zone_t zones; 118 119 /* 120 * On INVARIANTS builds, the slab contains a second bitset of the same size, 121 * "dbg_bits", which is laid out immediately after us_free. 122 */ 123 #ifdef INVARIANTS 124 #define SLAB_BITSETS 2 125 #else 126 #define SLAB_BITSETS 1 127 #endif 128 129 /* 130 * These are the two zones from which all offpage uma_slab_ts are allocated. 131 * 132 * One zone is for slab headers that can represent a larger number of items, 133 * making the slabs themselves more efficient, and the other zone is for 134 * headers that are smaller and represent fewer items, making the headers more 135 * efficient. 136 */ 137 #define SLABZONE_SIZE(setsize) \ 138 (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS) 139 #define SLABZONE0_SETSIZE (PAGE_SIZE / 16) 140 #define SLABZONE1_SETSIZE SLAB_MAX_SETSIZE 141 #define SLABZONE0_SIZE SLABZONE_SIZE(SLABZONE0_SETSIZE) 142 #define SLABZONE1_SIZE SLABZONE_SIZE(SLABZONE1_SETSIZE) 143 static uma_zone_t slabzones[2]; 144 145 /* 146 * The initial hash tables come out of this zone so they can be allocated 147 * prior to malloc coming up. 148 */ 149 static uma_zone_t hashzone; 150 151 /* The boot-time adjusted value for cache line alignment. */ 152 static unsigned int uma_cache_align_mask = 64 - 1; 153 154 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); 155 static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc"); 156 157 /* 158 * Are we allowed to allocate buckets? 159 */ 160 static int bucketdisable = 1; 161 162 /* Linked list of all kegs in the system */ 163 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); 164 165 /* Linked list of all cache-only zones in the system */ 166 static LIST_HEAD(,uma_zone) uma_cachezones = 167 LIST_HEAD_INITIALIZER(uma_cachezones); 168 169 /* 170 * Mutex for global lists: uma_kegs, uma_cachezones, and the per-keg list of 171 * zones. 172 */ 173 static struct rwlock_padalign __exclusive_cache_line uma_rwlock; 174 175 static struct sx uma_reclaim_lock; 176 177 /* 178 * First available virual address for boot time allocations. 179 */ 180 static vm_offset_t bootstart; 181 static vm_offset_t bootmem; 182 183 /* 184 * kmem soft limit, initialized by uma_set_limit(). Ensure that early 185 * allocations don't trigger a wakeup of the reclaim thread. 186 */ 187 unsigned long uma_kmem_limit = LONG_MAX; 188 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, 189 "UMA kernel memory soft limit"); 190 unsigned long uma_kmem_total; 191 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, 192 "UMA kernel memory usage"); 193 194 /* Is the VM done starting up? */ 195 static enum { 196 BOOT_COLD, 197 BOOT_KVA, 198 BOOT_PCPU, 199 BOOT_RUNNING, 200 BOOT_SHUTDOWN, 201 } booted = BOOT_COLD; 202 203 /* 204 * This is the handle used to schedule events that need to happen 205 * outside of the allocation fast path. 206 */ 207 static struct timeout_task uma_timeout_task; 208 #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ 209 210 /* 211 * This structure is passed as the zone ctor arg so that I don't have to create 212 * a special allocation function just for zones. 213 */ 214 struct uma_zctor_args { 215 const char *name; 216 size_t size; 217 uma_ctor ctor; 218 uma_dtor dtor; 219 uma_init uminit; 220 uma_fini fini; 221 uma_import import; 222 uma_release release; 223 void *arg; 224 uma_keg_t keg; 225 int align; 226 uint32_t flags; 227 }; 228 229 struct uma_kctor_args { 230 uma_zone_t zone; 231 size_t size; 232 uma_init uminit; 233 uma_fini fini; 234 int align; 235 uint32_t flags; 236 }; 237 238 struct uma_bucket_zone { 239 uma_zone_t ubz_zone; 240 const char *ubz_name; 241 int ubz_entries; /* Number of items it can hold. */ 242 int ubz_maxsize; /* Maximum allocation size per-item. */ 243 }; 244 245 /* 246 * Compute the actual number of bucket entries to pack them in power 247 * of two sizes for more efficient space utilization. 248 */ 249 #define BUCKET_SIZE(n) \ 250 (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) 251 252 #define BUCKET_MAX BUCKET_SIZE(256) 253 254 struct uma_bucket_zone bucket_zones[] = { 255 /* Literal bucket sizes. */ 256 { NULL, "2 Bucket", 2, 4096 }, 257 { NULL, "4 Bucket", 4, 3072 }, 258 { NULL, "8 Bucket", 8, 2048 }, 259 { NULL, "16 Bucket", 16, 1024 }, 260 /* Rounded down power of 2 sizes for efficiency. */ 261 { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, 262 { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, 263 { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, 264 { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, 265 { NULL, NULL, 0} 266 }; 267 268 /* 269 * Flags and enumerations to be passed to internal functions. 270 */ 271 enum zfreeskip { 272 SKIP_NONE = 0, 273 SKIP_CNT = 0x00000001, 274 SKIP_DTOR = 0x00010000, 275 SKIP_FINI = 0x00020000, 276 }; 277 278 /* Prototypes.. */ 279 280 void uma_startup1(vm_offset_t); 281 void uma_startup2(void); 282 283 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 284 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 285 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 286 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 287 static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 288 static void page_free(void *, vm_size_t, uint8_t); 289 static void pcpu_page_free(void *, vm_size_t, uint8_t); 290 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); 291 static void cache_drain(uma_zone_t); 292 static void bucket_drain(uma_zone_t, uma_bucket_t); 293 static void bucket_cache_reclaim(uma_zone_t zone, bool, int); 294 static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int); 295 static int keg_ctor(void *, int, void *, int); 296 static void keg_dtor(void *, int, void *); 297 static void keg_drain(uma_keg_t keg, int domain); 298 static int zone_ctor(void *, int, void *, int); 299 static void zone_dtor(void *, int, void *); 300 static inline void item_dtor(uma_zone_t zone, void *item, int size, 301 void *udata, enum zfreeskip skip); 302 static int zero_init(void *, int, int); 303 static void zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, 304 int itemdomain, bool ws); 305 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); 306 static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *); 307 static void zone_timeout(uma_zone_t zone, void *); 308 static int hash_alloc(struct uma_hash *, u_int); 309 static int hash_expand(struct uma_hash *, struct uma_hash *); 310 static void hash_free(struct uma_hash *hash); 311 static void uma_timeout(void *, int); 312 static void uma_shutdown(void); 313 static void *zone_alloc_item(uma_zone_t, void *, int, int); 314 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); 315 static int zone_alloc_limit(uma_zone_t zone, int count, int flags); 316 static void zone_free_limit(uma_zone_t zone, int count); 317 static void bucket_enable(void); 318 static void bucket_init(void); 319 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); 320 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); 321 static void bucket_zone_drain(int domain); 322 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); 323 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); 324 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); 325 static size_t slab_sizeof(int nitems); 326 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, 327 uma_fini fini, int align, uint32_t flags); 328 static int zone_import(void *, void **, int, int, int); 329 static void zone_release(void *, void **, int); 330 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); 331 static bool cache_free(uma_zone_t, uma_cache_t, void *, int); 332 333 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); 334 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); 335 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); 336 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); 337 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS); 338 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS); 339 static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS); 340 341 static uint64_t uma_zone_get_allocs(uma_zone_t zone); 342 343 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 344 "Memory allocation debugging"); 345 346 #ifdef INVARIANTS 347 static uint64_t uma_keg_get_allocs(uma_keg_t zone); 348 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg); 349 350 static bool uma_dbg_kskip(uma_keg_t keg, void *mem); 351 static bool uma_dbg_zskip(uma_zone_t zone, void *mem); 352 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); 353 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); 354 355 static u_int dbg_divisor = 1; 356 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor, 357 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0, 358 "Debug & thrash every this item in memory allocator"); 359 360 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER; 361 static counter_u64_t uma_skip_cnt = EARLY_COUNTER; 362 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD, 363 &uma_dbg_cnt, "memory items debugged"); 364 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD, 365 &uma_skip_cnt, "memory items skipped, not debugged"); 366 #endif 367 368 SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 369 "Universal Memory Allocator"); 370 371 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT, 372 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); 373 374 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT, 375 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); 376 377 static int zone_warnings = 1; 378 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0, 379 "Warn when UMA zones becomes full"); 380 381 static int multipage_slabs = 1; 382 TUNABLE_INT("vm.debug.uma_multipage_slabs", &multipage_slabs); 383 SYSCTL_INT(_vm_debug, OID_AUTO, uma_multipage_slabs, 384 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &multipage_slabs, 0, 385 "UMA may choose larger slab sizes for better efficiency"); 386 387 /* 388 * Select the slab zone for an offpage slab with the given maximum item count. 389 */ 390 static inline uma_zone_t 391 slabzone(int ipers) 392 { 393 394 return (slabzones[ipers > SLABZONE0_SETSIZE]); 395 } 396 397 /* 398 * This routine checks to see whether or not it's safe to enable buckets. 399 */ 400 static void 401 bucket_enable(void) 402 { 403 404 KASSERT(booted >= BOOT_KVA, ("Bucket enable before init")); 405 bucketdisable = vm_page_count_min(); 406 } 407 408 /* 409 * Initialize bucket_zones, the array of zones of buckets of various sizes. 410 * 411 * For each zone, calculate the memory required for each bucket, consisting 412 * of the header and an array of pointers. 413 */ 414 static void 415 bucket_init(void) 416 { 417 struct uma_bucket_zone *ubz; 418 int size; 419 420 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) { 421 size = roundup(sizeof(struct uma_bucket), sizeof(void *)); 422 size += sizeof(void *) * ubz->ubz_entries; 423 ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, 424 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 425 UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | 426 UMA_ZONE_FIRSTTOUCH); 427 } 428 } 429 430 /* 431 * Given a desired number of entries for a bucket, return the zone from which 432 * to allocate the bucket. 433 */ 434 static struct uma_bucket_zone * 435 bucket_zone_lookup(int entries) 436 { 437 struct uma_bucket_zone *ubz; 438 439 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) 440 if (ubz->ubz_entries >= entries) 441 return (ubz); 442 ubz--; 443 return (ubz); 444 } 445 446 static int 447 bucket_select(int size) 448 { 449 struct uma_bucket_zone *ubz; 450 451 ubz = &bucket_zones[0]; 452 if (size > ubz->ubz_maxsize) 453 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); 454 455 for (; ubz->ubz_entries != 0; ubz++) 456 if (ubz->ubz_maxsize < size) 457 break; 458 ubz--; 459 return (ubz->ubz_entries); 460 } 461 462 static uma_bucket_t 463 bucket_alloc(uma_zone_t zone, void *udata, int flags) 464 { 465 struct uma_bucket_zone *ubz; 466 uma_bucket_t bucket; 467 468 /* 469 * Don't allocate buckets early in boot. 470 */ 471 if (__predict_false(booted < BOOT_KVA)) 472 return (NULL); 473 474 /* 475 * To limit bucket recursion we store the original zone flags 476 * in a cookie passed via zalloc_arg/zfree_arg. This allows the 477 * NOVM flag to persist even through deep recursions. We also 478 * store ZFLAG_BUCKET once we have recursed attempting to allocate 479 * a bucket for a bucket zone so we do not allow infinite bucket 480 * recursion. This cookie will even persist to frees of unused 481 * buckets via the allocation path or bucket allocations in the 482 * free path. 483 */ 484 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 485 udata = (void *)(uintptr_t)zone->uz_flags; 486 else { 487 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) 488 return (NULL); 489 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); 490 } 491 if (((uintptr_t)udata & UMA_ZONE_VM) != 0) 492 flags |= M_NOVM; 493 ubz = bucket_zone_lookup(atomic_load_16(&zone->uz_bucket_size)); 494 if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) 495 ubz++; 496 bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); 497 if (bucket) { 498 #ifdef INVARIANTS 499 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); 500 #endif 501 bucket->ub_cnt = 0; 502 bucket->ub_entries = min(ubz->ubz_entries, 503 zone->uz_bucket_size_max); 504 bucket->ub_seq = SMR_SEQ_INVALID; 505 CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p", 506 zone->uz_name, zone, bucket); 507 } 508 509 return (bucket); 510 } 511 512 static void 513 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) 514 { 515 struct uma_bucket_zone *ubz; 516 517 if (bucket->ub_cnt != 0) 518 bucket_drain(zone, bucket); 519 520 KASSERT(bucket->ub_cnt == 0, 521 ("bucket_free: Freeing a non free bucket.")); 522 KASSERT(bucket->ub_seq == SMR_SEQ_INVALID, 523 ("bucket_free: Freeing an SMR bucket.")); 524 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 525 udata = (void *)(uintptr_t)zone->uz_flags; 526 ubz = bucket_zone_lookup(bucket->ub_entries); 527 uma_zfree_arg(ubz->ubz_zone, bucket, udata); 528 } 529 530 static void 531 bucket_zone_drain(int domain) 532 { 533 struct uma_bucket_zone *ubz; 534 535 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) 536 uma_zone_reclaim_domain(ubz->ubz_zone, UMA_RECLAIM_DRAIN, 537 domain); 538 } 539 540 #ifdef KASAN 541 _Static_assert(UMA_SMALLEST_UNIT % KASAN_SHADOW_SCALE == 0, 542 "Base UMA allocation size not a multiple of the KASAN scale factor"); 543 544 static void 545 kasan_mark_item_valid(uma_zone_t zone, void *item) 546 { 547 void *pcpu_item; 548 size_t sz, rsz; 549 int i; 550 551 if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) 552 return; 553 554 sz = zone->uz_size; 555 rsz = roundup2(sz, KASAN_SHADOW_SCALE); 556 if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { 557 kasan_mark(item, sz, rsz, KASAN_GENERIC_REDZONE); 558 } else { 559 pcpu_item = zpcpu_base_to_offset(item); 560 for (i = 0; i <= mp_maxid; i++) 561 kasan_mark(zpcpu_get_cpu(pcpu_item, i), sz, rsz, 562 KASAN_GENERIC_REDZONE); 563 } 564 } 565 566 static void 567 kasan_mark_item_invalid(uma_zone_t zone, void *item) 568 { 569 void *pcpu_item; 570 size_t sz; 571 int i; 572 573 if ((zone->uz_flags & UMA_ZONE_NOKASAN) != 0) 574 return; 575 576 sz = roundup2(zone->uz_size, KASAN_SHADOW_SCALE); 577 if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { 578 kasan_mark(item, 0, sz, KASAN_UMA_FREED); 579 } else { 580 pcpu_item = zpcpu_base_to_offset(item); 581 for (i = 0; i <= mp_maxid; i++) 582 kasan_mark(zpcpu_get_cpu(pcpu_item, i), 0, sz, 583 KASAN_UMA_FREED); 584 } 585 } 586 587 static void 588 kasan_mark_slab_valid(uma_keg_t keg, void *mem) 589 { 590 size_t sz; 591 592 if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { 593 sz = keg->uk_ppera * PAGE_SIZE; 594 kasan_mark(mem, sz, sz, 0); 595 } 596 } 597 598 static void 599 kasan_mark_slab_invalid(uma_keg_t keg, void *mem) 600 { 601 size_t sz; 602 603 if ((keg->uk_flags & UMA_ZONE_NOKASAN) == 0) { 604 if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) 605 sz = keg->uk_ppera * PAGE_SIZE; 606 else 607 sz = keg->uk_pgoff; 608 kasan_mark(mem, 0, sz, KASAN_UMA_FREED); 609 } 610 } 611 #else /* !KASAN */ 612 static void 613 kasan_mark_item_valid(uma_zone_t zone __unused, void *item __unused) 614 { 615 } 616 617 static void 618 kasan_mark_item_invalid(uma_zone_t zone __unused, void *item __unused) 619 { 620 } 621 622 static void 623 kasan_mark_slab_valid(uma_keg_t keg __unused, void *mem __unused) 624 { 625 } 626 627 static void 628 kasan_mark_slab_invalid(uma_keg_t keg __unused, void *mem __unused) 629 { 630 } 631 #endif /* KASAN */ 632 633 #ifdef KMSAN 634 static inline void 635 kmsan_mark_item_uninitialized(uma_zone_t zone, void *item) 636 { 637 void *pcpu_item; 638 size_t sz; 639 int i; 640 641 if ((zone->uz_flags & 642 (UMA_ZFLAG_CACHE | UMA_ZONE_SECONDARY | UMA_ZONE_MALLOC)) != 0) { 643 /* 644 * Cache zones should not be instrumented by default, as UMA 645 * does not have enough information to do so correctly. 646 * Consumers can mark items themselves if it makes sense to do 647 * so. 648 * 649 * Items from secondary zones are initialized by the parent 650 * zone and thus cannot safely be marked by UMA. 651 * 652 * malloc zones are handled directly by malloc(9) and friends, 653 * since they can provide more precise origin tracking. 654 */ 655 return; 656 } 657 if (zone->uz_keg->uk_init != NULL) { 658 /* 659 * By definition, initialized items cannot be marked. The 660 * best we can do is mark items from these zones after they 661 * are freed to the keg. 662 */ 663 return; 664 } 665 666 sz = zone->uz_size; 667 if ((zone->uz_flags & UMA_ZONE_PCPU) == 0) { 668 kmsan_orig(item, sz, KMSAN_TYPE_UMA, KMSAN_RET_ADDR); 669 kmsan_mark(item, sz, KMSAN_STATE_UNINIT); 670 } else { 671 pcpu_item = zpcpu_base_to_offset(item); 672 for (i = 0; i <= mp_maxid; i++) { 673 kmsan_orig(zpcpu_get_cpu(pcpu_item, i), sz, 674 KMSAN_TYPE_UMA, KMSAN_RET_ADDR); 675 kmsan_mark(zpcpu_get_cpu(pcpu_item, i), sz, 676 KMSAN_STATE_INITED); 677 } 678 } 679 } 680 #else /* !KMSAN */ 681 static inline void 682 kmsan_mark_item_uninitialized(uma_zone_t zone __unused, void *item __unused) 683 { 684 } 685 #endif /* KMSAN */ 686 687 /* 688 * Acquire the domain lock and record contention. 689 */ 690 static uma_zone_domain_t 691 zone_domain_lock(uma_zone_t zone, int domain) 692 { 693 uma_zone_domain_t zdom; 694 bool lockfail; 695 696 zdom = ZDOM_GET(zone, domain); 697 lockfail = false; 698 if (ZDOM_OWNED(zdom)) 699 lockfail = true; 700 ZDOM_LOCK(zdom); 701 /* This is unsynchronized. The counter does not need to be precise. */ 702 if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) 703 zone->uz_bucket_size++; 704 return (zdom); 705 } 706 707 /* 708 * Search for the domain with the least cached items and return it if it 709 * is out of balance with the preferred domain. 710 */ 711 static __noinline int 712 zone_domain_lowest(uma_zone_t zone, int pref) 713 { 714 long least, nitems, prefitems; 715 int domain; 716 int i; 717 718 prefitems = least = LONG_MAX; 719 domain = 0; 720 for (i = 0; i < vm_ndomains; i++) { 721 nitems = ZDOM_GET(zone, i)->uzd_nitems; 722 if (nitems < least) { 723 domain = i; 724 least = nitems; 725 } 726 if (domain == pref) 727 prefitems = nitems; 728 } 729 if (prefitems < least * 2) 730 return (pref); 731 732 return (domain); 733 } 734 735 /* 736 * Search for the domain with the most cached items and return it or the 737 * preferred domain if it has enough to proceed. 738 */ 739 static __noinline int 740 zone_domain_highest(uma_zone_t zone, int pref) 741 { 742 long most, nitems; 743 int domain; 744 int i; 745 746 if (ZDOM_GET(zone, pref)->uzd_nitems > BUCKET_MAX) 747 return (pref); 748 749 most = 0; 750 domain = 0; 751 for (i = 0; i < vm_ndomains; i++) { 752 nitems = ZDOM_GET(zone, i)->uzd_nitems; 753 if (nitems > most) { 754 domain = i; 755 most = nitems; 756 } 757 } 758 759 return (domain); 760 } 761 762 /* 763 * Set the maximum imax value. 764 */ 765 static void 766 zone_domain_imax_set(uma_zone_domain_t zdom, int nitems) 767 { 768 long old; 769 770 old = zdom->uzd_imax; 771 do { 772 if (old >= nitems) 773 return; 774 } while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0); 775 776 /* 777 * We are at new maximum, so do the last WSS update for the old 778 * bimin and prepare to measure next allocation batch. 779 */ 780 if (zdom->uzd_wss < old - zdom->uzd_bimin) 781 zdom->uzd_wss = old - zdom->uzd_bimin; 782 zdom->uzd_bimin = nitems; 783 } 784 785 /* 786 * Attempt to satisfy an allocation by retrieving a full bucket from one of the 787 * zone's caches. If a bucket is found the zone is not locked on return. 788 */ 789 static uma_bucket_t 790 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim) 791 { 792 uma_bucket_t bucket; 793 long cnt; 794 int i; 795 bool dtor = false; 796 797 ZDOM_LOCK_ASSERT(zdom); 798 799 if ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) == NULL) 800 return (NULL); 801 802 /* SMR Buckets can not be re-used until readers expire. */ 803 if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && 804 bucket->ub_seq != SMR_SEQ_INVALID) { 805 if (!smr_poll(zone->uz_smr, bucket->ub_seq, false)) 806 return (NULL); 807 bucket->ub_seq = SMR_SEQ_INVALID; 808 dtor = (zone->uz_dtor != NULL) || UMA_ALWAYS_CTORDTOR; 809 if (STAILQ_NEXT(bucket, ub_link) != NULL) 810 zdom->uzd_seq = STAILQ_NEXT(bucket, ub_link)->ub_seq; 811 } 812 STAILQ_REMOVE_HEAD(&zdom->uzd_buckets, ub_link); 813 814 KASSERT(zdom->uzd_nitems >= bucket->ub_cnt, 815 ("%s: item count underflow (%ld, %d)", 816 __func__, zdom->uzd_nitems, bucket->ub_cnt)); 817 KASSERT(bucket->ub_cnt > 0, 818 ("%s: empty bucket in bucket cache", __func__)); 819 zdom->uzd_nitems -= bucket->ub_cnt; 820 821 if (reclaim) { 822 /* 823 * Shift the bounds of the current WSS interval to avoid 824 * perturbing the estimates. 825 */ 826 cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt); 827 atomic_subtract_long(&zdom->uzd_imax, cnt); 828 zdom->uzd_bimin -= cnt; 829 zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt); 830 if (zdom->uzd_limin >= bucket->ub_cnt) { 831 zdom->uzd_limin -= bucket->ub_cnt; 832 } else { 833 zdom->uzd_limin = 0; 834 zdom->uzd_timin = 0; 835 } 836 } else if (zdom->uzd_bimin > zdom->uzd_nitems) { 837 zdom->uzd_bimin = zdom->uzd_nitems; 838 if (zdom->uzd_imin > zdom->uzd_nitems) 839 zdom->uzd_imin = zdom->uzd_nitems; 840 } 841 842 ZDOM_UNLOCK(zdom); 843 if (dtor) 844 for (i = 0; i < bucket->ub_cnt; i++) 845 item_dtor(zone, bucket->ub_bucket[i], zone->uz_size, 846 NULL, SKIP_NONE); 847 848 return (bucket); 849 } 850 851 /* 852 * Insert a full bucket into the specified cache. The "ws" parameter indicates 853 * whether the bucket's contents should be counted as part of the zone's working 854 * set. The bucket may be freed if it exceeds the bucket limit. 855 */ 856 static void 857 zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata, 858 const bool ws) 859 { 860 uma_zone_domain_t zdom; 861 862 /* We don't cache empty buckets. This can happen after a reclaim. */ 863 if (bucket->ub_cnt == 0) 864 goto out; 865 zdom = zone_domain_lock(zone, domain); 866 867 /* 868 * Conditionally set the maximum number of items. 869 */ 870 zdom->uzd_nitems += bucket->ub_cnt; 871 if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) { 872 if (ws) { 873 zone_domain_imax_set(zdom, zdom->uzd_nitems); 874 } else { 875 /* 876 * Shift the bounds of the current WSS interval to 877 * avoid perturbing the estimates. 878 */ 879 atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt); 880 zdom->uzd_imin += bucket->ub_cnt; 881 zdom->uzd_bimin += bucket->ub_cnt; 882 zdom->uzd_limin += bucket->ub_cnt; 883 } 884 if (STAILQ_EMPTY(&zdom->uzd_buckets)) 885 zdom->uzd_seq = bucket->ub_seq; 886 887 /* 888 * Try to promote reuse of recently used items. For items 889 * protected by SMR, try to defer reuse to minimize polling. 890 */ 891 if (bucket->ub_seq == SMR_SEQ_INVALID) 892 STAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); 893 else 894 STAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); 895 ZDOM_UNLOCK(zdom); 896 return; 897 } 898 zdom->uzd_nitems -= bucket->ub_cnt; 899 ZDOM_UNLOCK(zdom); 900 out: 901 bucket_free(zone, bucket, udata); 902 } 903 904 /* Pops an item out of a per-cpu cache bucket. */ 905 static inline void * 906 cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket) 907 { 908 void *item; 909 910 CRITICAL_ASSERT(curthread); 911 912 bucket->ucb_cnt--; 913 item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt]; 914 #ifdef INVARIANTS 915 bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL; 916 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); 917 #endif 918 cache->uc_allocs++; 919 920 return (item); 921 } 922 923 /* Pushes an item into a per-cpu cache bucket. */ 924 static inline void 925 cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item) 926 { 927 928 CRITICAL_ASSERT(curthread); 929 KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL, 930 ("uma_zfree: Freeing to non free bucket index.")); 931 932 bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item; 933 bucket->ucb_cnt++; 934 cache->uc_frees++; 935 } 936 937 /* 938 * Unload a UMA bucket from a per-cpu cache. 939 */ 940 static inline uma_bucket_t 941 cache_bucket_unload(uma_cache_bucket_t bucket) 942 { 943 uma_bucket_t b; 944 945 b = bucket->ucb_bucket; 946 if (b != NULL) { 947 MPASS(b->ub_entries == bucket->ucb_entries); 948 b->ub_cnt = bucket->ucb_cnt; 949 bucket->ucb_bucket = NULL; 950 bucket->ucb_entries = bucket->ucb_cnt = 0; 951 } 952 953 return (b); 954 } 955 956 static inline uma_bucket_t 957 cache_bucket_unload_alloc(uma_cache_t cache) 958 { 959 960 return (cache_bucket_unload(&cache->uc_allocbucket)); 961 } 962 963 static inline uma_bucket_t 964 cache_bucket_unload_free(uma_cache_t cache) 965 { 966 967 return (cache_bucket_unload(&cache->uc_freebucket)); 968 } 969 970 static inline uma_bucket_t 971 cache_bucket_unload_cross(uma_cache_t cache) 972 { 973 974 return (cache_bucket_unload(&cache->uc_crossbucket)); 975 } 976 977 /* 978 * Load a bucket into a per-cpu cache bucket. 979 */ 980 static inline void 981 cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b) 982 { 983 984 CRITICAL_ASSERT(curthread); 985 MPASS(bucket->ucb_bucket == NULL); 986 MPASS(b->ub_seq == SMR_SEQ_INVALID); 987 988 bucket->ucb_bucket = b; 989 bucket->ucb_cnt = b->ub_cnt; 990 bucket->ucb_entries = b->ub_entries; 991 } 992 993 static inline void 994 cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b) 995 { 996 997 cache_bucket_load(&cache->uc_allocbucket, b); 998 } 999 1000 static inline void 1001 cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b) 1002 { 1003 1004 cache_bucket_load(&cache->uc_freebucket, b); 1005 } 1006 1007 #ifdef NUMA 1008 static inline void 1009 cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b) 1010 { 1011 1012 cache_bucket_load(&cache->uc_crossbucket, b); 1013 } 1014 #endif 1015 1016 /* 1017 * Copy and preserve ucb_spare. 1018 */ 1019 static inline void 1020 cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2) 1021 { 1022 1023 b1->ucb_bucket = b2->ucb_bucket; 1024 b1->ucb_entries = b2->ucb_entries; 1025 b1->ucb_cnt = b2->ucb_cnt; 1026 } 1027 1028 /* 1029 * Swap two cache buckets. 1030 */ 1031 static inline void 1032 cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2) 1033 { 1034 struct uma_cache_bucket b3; 1035 1036 CRITICAL_ASSERT(curthread); 1037 1038 cache_bucket_copy(&b3, b1); 1039 cache_bucket_copy(b1, b2); 1040 cache_bucket_copy(b2, &b3); 1041 } 1042 1043 /* 1044 * Attempt to fetch a bucket from a zone on behalf of the current cpu cache. 1045 */ 1046 static uma_bucket_t 1047 cache_fetch_bucket(uma_zone_t zone, uma_cache_t cache, int domain) 1048 { 1049 uma_zone_domain_t zdom; 1050 uma_bucket_t bucket; 1051 smr_seq_t seq; 1052 1053 /* 1054 * Avoid the lock if possible. 1055 */ 1056 zdom = ZDOM_GET(zone, domain); 1057 if (zdom->uzd_nitems == 0) 1058 return (NULL); 1059 1060 if ((cache_uz_flags(cache) & UMA_ZONE_SMR) != 0 && 1061 (seq = atomic_load_32(&zdom->uzd_seq)) != SMR_SEQ_INVALID && 1062 !smr_poll(zone->uz_smr, seq, false)) 1063 return (NULL); 1064 1065 /* 1066 * Check the zone's cache of buckets. 1067 */ 1068 zdom = zone_domain_lock(zone, domain); 1069 if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) 1070 return (bucket); 1071 ZDOM_UNLOCK(zdom); 1072 1073 return (NULL); 1074 } 1075 1076 static void 1077 zone_log_warning(uma_zone_t zone) 1078 { 1079 static const struct timeval warninterval = { 300, 0 }; 1080 1081 if (!zone_warnings || zone->uz_warning == NULL) 1082 return; 1083 1084 if (ratecheck(&zone->uz_ratecheck, &warninterval)) 1085 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); 1086 } 1087 1088 static inline void 1089 zone_maxaction(uma_zone_t zone) 1090 { 1091 1092 if (zone->uz_maxaction.ta_func != NULL) 1093 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); 1094 } 1095 1096 /* 1097 * Routine called by timeout which is used to fire off some time interval 1098 * based calculations. (stats, hash size, etc.) 1099 * 1100 * Arguments: 1101 * arg Unused 1102 * 1103 * Returns: 1104 * Nothing 1105 */ 1106 static void 1107 uma_timeout(void *context __unused, int pending __unused) 1108 { 1109 bucket_enable(); 1110 zone_foreach(zone_timeout, NULL); 1111 1112 /* Reschedule this event */ 1113 taskqueue_enqueue_timeout(taskqueue_thread, &uma_timeout_task, 1114 UMA_TIMEOUT * hz); 1115 } 1116 1117 /* 1118 * Update the working set size estimates for the zone's bucket cache. 1119 * The constants chosen here are somewhat arbitrary. 1120 */ 1121 static void 1122 zone_domain_update_wss(uma_zone_domain_t zdom) 1123 { 1124 long m; 1125 1126 ZDOM_LOCK_ASSERT(zdom); 1127 MPASS(zdom->uzd_imax >= zdom->uzd_nitems); 1128 MPASS(zdom->uzd_nitems >= zdom->uzd_bimin); 1129 MPASS(zdom->uzd_bimin >= zdom->uzd_imin); 1130 1131 /* 1132 * Estimate WSS as modified moving average of biggest allocation 1133 * batches for each period over few minutes (UMA_TIMEOUT of 20s). 1134 */ 1135 zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4, 1136 zdom->uzd_imax - zdom->uzd_bimin); 1137 1138 /* 1139 * Estimate longtime minimum item count as a combination of recent 1140 * minimum item count, adjusted by WSS for safety, and the modified 1141 * moving average over the last several hours (UMA_TIMEOUT of 20s). 1142 * timin measures time since limin tried to go negative, that means 1143 * we were dangerously close to or got out of cache. 1144 */ 1145 m = zdom->uzd_imin - zdom->uzd_wss; 1146 if (m >= 0) { 1147 if (zdom->uzd_limin >= m) 1148 zdom->uzd_limin = m; 1149 else 1150 zdom->uzd_limin = (m + zdom->uzd_limin * 255) / 256; 1151 zdom->uzd_timin++; 1152 } else { 1153 zdom->uzd_limin = 0; 1154 zdom->uzd_timin = 0; 1155 } 1156 1157 /* To reduce period edge effects on WSS keep half of the imax. */ 1158 atomic_subtract_long(&zdom->uzd_imax, 1159 (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2); 1160 zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems; 1161 } 1162 1163 /* 1164 * Routine to perform timeout driven calculations. This expands the 1165 * hashes and does per cpu statistics aggregation. 1166 * 1167 * Returns nothing. 1168 */ 1169 static void 1170 zone_timeout(uma_zone_t zone, void *unused) 1171 { 1172 uma_keg_t keg; 1173 u_int slabs, pages; 1174 1175 if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) 1176 goto trim; 1177 1178 keg = zone->uz_keg; 1179 1180 /* 1181 * Hash zones are non-numa by definition so the first domain 1182 * is the only one present. 1183 */ 1184 KEG_LOCK(keg, 0); 1185 pages = keg->uk_domain[0].ud_pages; 1186 1187 /* 1188 * Expand the keg hash table. 1189 * 1190 * This is done if the number of slabs is larger than the hash size. 1191 * What I'm trying to do here is completely reduce collisions. This 1192 * may be a little aggressive. Should I allow for two collisions max? 1193 */ 1194 if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) { 1195 struct uma_hash newhash; 1196 struct uma_hash oldhash; 1197 int ret; 1198 1199 /* 1200 * This is so involved because allocating and freeing 1201 * while the keg lock is held will lead to deadlock. 1202 * I have to do everything in stages and check for 1203 * races. 1204 */ 1205 KEG_UNLOCK(keg, 0); 1206 ret = hash_alloc(&newhash, 1 << fls(slabs)); 1207 KEG_LOCK(keg, 0); 1208 if (ret) { 1209 if (hash_expand(&keg->uk_hash, &newhash)) { 1210 oldhash = keg->uk_hash; 1211 keg->uk_hash = newhash; 1212 } else 1213 oldhash = newhash; 1214 1215 KEG_UNLOCK(keg, 0); 1216 hash_free(&oldhash); 1217 goto trim; 1218 } 1219 } 1220 KEG_UNLOCK(keg, 0); 1221 1222 trim: 1223 /* Trim caches not used for a long time. */ 1224 if ((zone->uz_flags & (UMA_ZONE_UNMANAGED | UMA_ZONE_NOTRIM)) == 0) { 1225 for (int i = 0; i < vm_ndomains; i++) { 1226 if (bucket_cache_reclaim_domain(zone, false, false, i) && 1227 (zone->uz_flags & UMA_ZFLAG_CACHE) == 0) 1228 keg_drain(zone->uz_keg, i); 1229 } 1230 } 1231 } 1232 1233 /* 1234 * Allocate and zero fill the next sized hash table from the appropriate 1235 * backing store. 1236 * 1237 * Arguments: 1238 * hash A new hash structure with the old hash size in uh_hashsize 1239 * 1240 * Returns: 1241 * 1 on success and 0 on failure. 1242 */ 1243 static int 1244 hash_alloc(struct uma_hash *hash, u_int size) 1245 { 1246 size_t alloc; 1247 1248 KASSERT(powerof2(size), ("hash size must be power of 2")); 1249 if (size > UMA_HASH_SIZE_INIT) { 1250 hash->uh_hashsize = size; 1251 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; 1252 hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); 1253 } else { 1254 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; 1255 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, 1256 UMA_ANYDOMAIN, M_WAITOK); 1257 hash->uh_hashsize = UMA_HASH_SIZE_INIT; 1258 } 1259 if (hash->uh_slab_hash) { 1260 bzero(hash->uh_slab_hash, alloc); 1261 hash->uh_hashmask = hash->uh_hashsize - 1; 1262 return (1); 1263 } 1264 1265 return (0); 1266 } 1267 1268 /* 1269 * Expands the hash table for HASH zones. This is done from zone_timeout 1270 * to reduce collisions. This must not be done in the regular allocation 1271 * path, otherwise, we can recurse on the vm while allocating pages. 1272 * 1273 * Arguments: 1274 * oldhash The hash you want to expand 1275 * newhash The hash structure for the new table 1276 * 1277 * Returns: 1278 * Nothing 1279 * 1280 * Discussion: 1281 */ 1282 static int 1283 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) 1284 { 1285 uma_hash_slab_t slab; 1286 u_int hval; 1287 u_int idx; 1288 1289 if (!newhash->uh_slab_hash) 1290 return (0); 1291 1292 if (oldhash->uh_hashsize >= newhash->uh_hashsize) 1293 return (0); 1294 1295 /* 1296 * I need to investigate hash algorithms for resizing without a 1297 * full rehash. 1298 */ 1299 1300 for (idx = 0; idx < oldhash->uh_hashsize; idx++) 1301 while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { 1302 slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); 1303 LIST_REMOVE(slab, uhs_hlink); 1304 hval = UMA_HASH(newhash, slab->uhs_data); 1305 LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], 1306 slab, uhs_hlink); 1307 } 1308 1309 return (1); 1310 } 1311 1312 /* 1313 * Free the hash bucket to the appropriate backing store. 1314 * 1315 * Arguments: 1316 * slab_hash The hash bucket we're freeing 1317 * hashsize The number of entries in that hash bucket 1318 * 1319 * Returns: 1320 * Nothing 1321 */ 1322 static void 1323 hash_free(struct uma_hash *hash) 1324 { 1325 if (hash->uh_slab_hash == NULL) 1326 return; 1327 if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) 1328 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); 1329 else 1330 free(hash->uh_slab_hash, M_UMAHASH); 1331 } 1332 1333 /* 1334 * Frees all outstanding items in a bucket 1335 * 1336 * Arguments: 1337 * zone The zone to free to, must be unlocked. 1338 * bucket The free/alloc bucket with items. 1339 * 1340 * Returns: 1341 * Nothing 1342 */ 1343 static void 1344 bucket_drain(uma_zone_t zone, uma_bucket_t bucket) 1345 { 1346 int i; 1347 1348 if (bucket->ub_cnt == 0) 1349 return; 1350 1351 if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && 1352 bucket->ub_seq != SMR_SEQ_INVALID) { 1353 smr_wait(zone->uz_smr, bucket->ub_seq); 1354 bucket->ub_seq = SMR_SEQ_INVALID; 1355 for (i = 0; i < bucket->ub_cnt; i++) 1356 item_dtor(zone, bucket->ub_bucket[i], 1357 zone->uz_size, NULL, SKIP_NONE); 1358 } 1359 if (zone->uz_fini) 1360 for (i = 0; i < bucket->ub_cnt; i++) { 1361 kasan_mark_item_valid(zone, bucket->ub_bucket[i]); 1362 zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); 1363 kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); 1364 } 1365 zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); 1366 if (zone->uz_max_items > 0) 1367 zone_free_limit(zone, bucket->ub_cnt); 1368 #ifdef INVARIANTS 1369 bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt); 1370 #endif 1371 bucket->ub_cnt = 0; 1372 } 1373 1374 /* 1375 * Drains the per cpu caches for a zone. 1376 * 1377 * NOTE: This may only be called while the zone is being torn down, and not 1378 * during normal operation. This is necessary in order that we do not have 1379 * to migrate CPUs to drain the per-CPU caches. 1380 * 1381 * Arguments: 1382 * zone The zone to drain, must be unlocked. 1383 * 1384 * Returns: 1385 * Nothing 1386 */ 1387 static void 1388 cache_drain(uma_zone_t zone) 1389 { 1390 uma_cache_t cache; 1391 uma_bucket_t bucket; 1392 smr_seq_t seq; 1393 int cpu; 1394 1395 /* 1396 * XXX: It is safe to not lock the per-CPU caches, because we're 1397 * tearing down the zone anyway. I.e., there will be no further use 1398 * of the caches at this point. 1399 * 1400 * XXX: It would good to be able to assert that the zone is being 1401 * torn down to prevent improper use of cache_drain(). 1402 */ 1403 seq = SMR_SEQ_INVALID; 1404 if ((zone->uz_flags & UMA_ZONE_SMR) != 0) 1405 seq = smr_advance(zone->uz_smr); 1406 CPU_FOREACH(cpu) { 1407 cache = &zone->uz_cpu[cpu]; 1408 bucket = cache_bucket_unload_alloc(cache); 1409 if (bucket != NULL) 1410 bucket_free(zone, bucket, NULL); 1411 bucket = cache_bucket_unload_free(cache); 1412 if (bucket != NULL) { 1413 bucket->ub_seq = seq; 1414 bucket_free(zone, bucket, NULL); 1415 } 1416 bucket = cache_bucket_unload_cross(cache); 1417 if (bucket != NULL) { 1418 bucket->ub_seq = seq; 1419 bucket_free(zone, bucket, NULL); 1420 } 1421 } 1422 bucket_cache_reclaim(zone, true, UMA_ANYDOMAIN); 1423 } 1424 1425 static void 1426 cache_shrink(uma_zone_t zone, void *unused) 1427 { 1428 1429 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 1430 return; 1431 1432 ZONE_LOCK(zone); 1433 zone->uz_bucket_size = 1434 (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; 1435 ZONE_UNLOCK(zone); 1436 } 1437 1438 static void 1439 cache_drain_safe_cpu(uma_zone_t zone, void *unused) 1440 { 1441 uma_cache_t cache; 1442 uma_bucket_t b1, b2, b3; 1443 int domain; 1444 1445 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 1446 return; 1447 1448 b1 = b2 = b3 = NULL; 1449 critical_enter(); 1450 cache = &zone->uz_cpu[curcpu]; 1451 domain = PCPU_GET(domain); 1452 b1 = cache_bucket_unload_alloc(cache); 1453 1454 /* 1455 * Don't flush SMR zone buckets. This leaves the zone without a 1456 * bucket and forces every free to synchronize(). 1457 */ 1458 if ((zone->uz_flags & UMA_ZONE_SMR) == 0) { 1459 b2 = cache_bucket_unload_free(cache); 1460 b3 = cache_bucket_unload_cross(cache); 1461 } 1462 critical_exit(); 1463 1464 if (b1 != NULL) 1465 zone_free_bucket(zone, b1, NULL, domain, false); 1466 if (b2 != NULL) 1467 zone_free_bucket(zone, b2, NULL, domain, false); 1468 if (b3 != NULL) { 1469 /* Adjust the domain so it goes to zone_free_cross. */ 1470 domain = (domain + 1) % vm_ndomains; 1471 zone_free_bucket(zone, b3, NULL, domain, false); 1472 } 1473 } 1474 1475 /* 1476 * Safely drain per-CPU caches of a zone(s) to alloc bucket. 1477 * This is an expensive call because it needs to bind to all CPUs 1478 * one by one and enter a critical section on each of them in order 1479 * to safely access their cache buckets. 1480 * Zone lock must not be held on call this function. 1481 */ 1482 static void 1483 pcpu_cache_drain_safe(uma_zone_t zone) 1484 { 1485 int cpu; 1486 1487 /* 1488 * Polite bucket sizes shrinking was not enough, shrink aggressively. 1489 */ 1490 if (zone) 1491 cache_shrink(zone, NULL); 1492 else 1493 zone_foreach(cache_shrink, NULL); 1494 1495 CPU_FOREACH(cpu) { 1496 thread_lock(curthread); 1497 sched_bind(curthread, cpu); 1498 thread_unlock(curthread); 1499 1500 if (zone) 1501 cache_drain_safe_cpu(zone, NULL); 1502 else 1503 zone_foreach(cache_drain_safe_cpu, NULL); 1504 } 1505 thread_lock(curthread); 1506 sched_unbind(curthread); 1507 thread_unlock(curthread); 1508 } 1509 1510 /* 1511 * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller 1512 * requested a drain, otherwise the per-domain caches are trimmed to either 1513 * estimated working set size. 1514 */ 1515 static bool 1516 bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain) 1517 { 1518 uma_zone_domain_t zdom; 1519 uma_bucket_t bucket; 1520 long target; 1521 bool done = false; 1522 1523 /* 1524 * The cross bucket is partially filled and not part of 1525 * the item count. Reclaim it individually here. 1526 */ 1527 zdom = ZDOM_GET(zone, domain); 1528 if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) { 1529 ZONE_CROSS_LOCK(zone); 1530 bucket = zdom->uzd_cross; 1531 zdom->uzd_cross = NULL; 1532 ZONE_CROSS_UNLOCK(zone); 1533 if (bucket != NULL) 1534 bucket_free(zone, bucket, NULL); 1535 } 1536 1537 /* 1538 * If we were asked to drain the zone, we are done only once 1539 * this bucket cache is empty. If trim, we reclaim items in 1540 * excess of the zone's estimated working set size. Multiple 1541 * consecutive calls will shrink the WSS and so reclaim more. 1542 * If neither drain nor trim, then voluntarily reclaim 1/4 1543 * (to reduce first spike) of items not used for a long time. 1544 */ 1545 ZDOM_LOCK(zdom); 1546 zone_domain_update_wss(zdom); 1547 if (drain) 1548 target = 0; 1549 else if (trim) 1550 target = zdom->uzd_wss; 1551 else if (zdom->uzd_timin > 900 / UMA_TIMEOUT) 1552 target = zdom->uzd_nitems - zdom->uzd_limin / 4; 1553 else { 1554 ZDOM_UNLOCK(zdom); 1555 return (done); 1556 } 1557 while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL && 1558 zdom->uzd_nitems >= target + bucket->ub_cnt) { 1559 bucket = zone_fetch_bucket(zone, zdom, true); 1560 if (bucket == NULL) 1561 break; 1562 bucket_free(zone, bucket, NULL); 1563 done = true; 1564 ZDOM_LOCK(zdom); 1565 } 1566 ZDOM_UNLOCK(zdom); 1567 return (done); 1568 } 1569 1570 static void 1571 bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain) 1572 { 1573 int i; 1574 1575 /* 1576 * Shrink the zone bucket size to ensure that the per-CPU caches 1577 * don't grow too large. 1578 */ 1579 if (zone->uz_bucket_size > zone->uz_bucket_size_min) 1580 zone->uz_bucket_size--; 1581 1582 if (domain != UMA_ANYDOMAIN && 1583 (zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) { 1584 bucket_cache_reclaim_domain(zone, drain, true, domain); 1585 } else { 1586 for (i = 0; i < vm_ndomains; i++) 1587 bucket_cache_reclaim_domain(zone, drain, true, i); 1588 } 1589 } 1590 1591 static void 1592 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) 1593 { 1594 uint8_t *mem; 1595 size_t size; 1596 int i; 1597 uint8_t flags; 1598 1599 CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", 1600 keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); 1601 1602 mem = slab_data(slab, keg); 1603 size = PAGE_SIZE * keg->uk_ppera; 1604 1605 kasan_mark_slab_valid(keg, mem); 1606 if (keg->uk_fini != NULL) { 1607 for (i = start - 1; i > -1; i--) 1608 #ifdef INVARIANTS 1609 /* 1610 * trash_fini implies that dtor was trash_dtor. trash_fini 1611 * would check that memory hasn't been modified since free, 1612 * which executed trash_dtor. 1613 * That's why we need to run uma_dbg_kskip() check here, 1614 * albeit we don't make skip check for other init/fini 1615 * invocations. 1616 */ 1617 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || 1618 keg->uk_fini != trash_fini) 1619 #endif 1620 keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); 1621 } 1622 flags = slab->us_flags; 1623 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { 1624 zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), 1625 NULL, SKIP_NONE); 1626 } 1627 keg->uk_freef(mem, size, flags); 1628 uma_total_dec(size); 1629 } 1630 1631 static void 1632 keg_drain_domain(uma_keg_t keg, int domain) 1633 { 1634 struct slabhead freeslabs; 1635 uma_domain_t dom; 1636 uma_slab_t slab, tmp; 1637 uint32_t i, stofree, stokeep, partial; 1638 1639 dom = &keg->uk_domain[domain]; 1640 LIST_INIT(&freeslabs); 1641 1642 CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u", 1643 keg->uk_name, keg, domain, dom->ud_free_items); 1644 1645 KEG_LOCK(keg, domain); 1646 1647 /* 1648 * Are the free items in partially allocated slabs sufficient to meet 1649 * the reserve? If not, compute the number of fully free slabs that must 1650 * be kept. 1651 */ 1652 partial = dom->ud_free_items - dom->ud_free_slabs * keg->uk_ipers; 1653 if (partial < keg->uk_reserve) { 1654 stokeep = min(dom->ud_free_slabs, 1655 howmany(keg->uk_reserve - partial, keg->uk_ipers)); 1656 } else { 1657 stokeep = 0; 1658 } 1659 stofree = dom->ud_free_slabs - stokeep; 1660 1661 /* 1662 * Partition the free slabs into two sets: those that must be kept in 1663 * order to maintain the reserve, and those that may be released back to 1664 * the system. Since one set may be much larger than the other, 1665 * populate the smaller of the two sets and swap them if necessary. 1666 */ 1667 for (i = min(stofree, stokeep); i > 0; i--) { 1668 slab = LIST_FIRST(&dom->ud_free_slab); 1669 LIST_REMOVE(slab, us_link); 1670 LIST_INSERT_HEAD(&freeslabs, slab, us_link); 1671 } 1672 if (stofree > stokeep) 1673 LIST_SWAP(&freeslabs, &dom->ud_free_slab, uma_slab, us_link); 1674 1675 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) { 1676 LIST_FOREACH(slab, &freeslabs, us_link) 1677 UMA_HASH_REMOVE(&keg->uk_hash, slab); 1678 } 1679 dom->ud_free_items -= stofree * keg->uk_ipers; 1680 dom->ud_free_slabs -= stofree; 1681 dom->ud_pages -= stofree * keg->uk_ppera; 1682 KEG_UNLOCK(keg, domain); 1683 1684 LIST_FOREACH_SAFE(slab, &freeslabs, us_link, tmp) 1685 keg_free_slab(keg, slab, keg->uk_ipers); 1686 } 1687 1688 /* 1689 * Frees pages from a keg back to the system. This is done on demand from 1690 * the pageout daemon. 1691 * 1692 * Returns nothing. 1693 */ 1694 static void 1695 keg_drain(uma_keg_t keg, int domain) 1696 { 1697 int i; 1698 1699 if ((keg->uk_flags & UMA_ZONE_NOFREE) != 0) 1700 return; 1701 if (domain != UMA_ANYDOMAIN) { 1702 keg_drain_domain(keg, domain); 1703 } else { 1704 for (i = 0; i < vm_ndomains; i++) 1705 keg_drain_domain(keg, i); 1706 } 1707 } 1708 1709 static void 1710 zone_reclaim(uma_zone_t zone, int domain, int waitok, bool drain) 1711 { 1712 /* 1713 * Count active reclaim operations in order to interlock with 1714 * zone_dtor(), which removes the zone from global lists before 1715 * attempting to reclaim items itself. 1716 * 1717 * The zone may be destroyed while sleeping, so only zone_dtor() should 1718 * specify M_WAITOK. 1719 */ 1720 ZONE_LOCK(zone); 1721 if (waitok == M_WAITOK) { 1722 while (zone->uz_reclaimers > 0) 1723 msleep(zone, ZONE_LOCKPTR(zone), PVM, "zonedrain", 1); 1724 } 1725 zone->uz_reclaimers++; 1726 ZONE_UNLOCK(zone); 1727 bucket_cache_reclaim(zone, drain, domain); 1728 1729 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) 1730 keg_drain(zone->uz_keg, domain); 1731 ZONE_LOCK(zone); 1732 zone->uz_reclaimers--; 1733 if (zone->uz_reclaimers == 0) 1734 wakeup(zone); 1735 ZONE_UNLOCK(zone); 1736 } 1737 1738 /* 1739 * Allocate a new slab for a keg and inserts it into the partial slab list. 1740 * The keg should be unlocked on entry. If the allocation succeeds it will 1741 * be locked on return. 1742 * 1743 * Arguments: 1744 * flags Wait flags for the item initialization routine 1745 * aflags Wait flags for the slab allocation 1746 * 1747 * Returns: 1748 * The slab that was allocated or NULL if there is no memory and the 1749 * caller specified M_NOWAIT. 1750 */ 1751 static uma_slab_t 1752 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, 1753 int aflags) 1754 { 1755 uma_domain_t dom; 1756 uma_slab_t slab; 1757 unsigned long size; 1758 uint8_t *mem; 1759 uint8_t sflags; 1760 int i; 1761 1762 TSENTER(); 1763 1764 KASSERT(domain >= 0 && domain < vm_ndomains, 1765 ("keg_alloc_slab: domain %d out of range", domain)); 1766 1767 slab = NULL; 1768 mem = NULL; 1769 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { 1770 uma_hash_slab_t hslab; 1771 hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL, 1772 domain, aflags); 1773 if (hslab == NULL) 1774 goto fail; 1775 slab = &hslab->uhs_slab; 1776 } 1777 1778 /* 1779 * This reproduces the old vm_zone behavior of zero filling pages the 1780 * first time they are added to a zone. 1781 * 1782 * Malloced items are zeroed in uma_zalloc. 1783 */ 1784 1785 if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) 1786 aflags |= M_ZERO; 1787 else 1788 aflags &= ~M_ZERO; 1789 1790 if (keg->uk_flags & UMA_ZONE_NODUMP) 1791 aflags |= M_NODUMP; 1792 1793 if (keg->uk_flags & UMA_ZONE_NOFREE) 1794 aflags |= M_NEVERFREED; 1795 1796 /* zone is passed for legacy reasons. */ 1797 size = keg->uk_ppera * PAGE_SIZE; 1798 mem = keg->uk_allocf(zone, size, domain, &sflags, aflags); 1799 if (mem == NULL) { 1800 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) 1801 zone_free_item(slabzone(keg->uk_ipers), 1802 slab_tohashslab(slab), NULL, SKIP_NONE); 1803 goto fail; 1804 } 1805 uma_total_inc(size); 1806 1807 /* For HASH zones all pages go to the same uma_domain. */ 1808 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) 1809 domain = 0; 1810 1811 kmsan_mark(mem, size, 1812 (aflags & M_ZERO) != 0 ? KMSAN_STATE_INITED : KMSAN_STATE_UNINIT); 1813 1814 /* Point the slab into the allocated memory */ 1815 if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) 1816 slab = (uma_slab_t)(mem + keg->uk_pgoff); 1817 else 1818 slab_tohashslab(slab)->uhs_data = mem; 1819 1820 if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) 1821 for (i = 0; i < keg->uk_ppera; i++) 1822 vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), 1823 zone, slab); 1824 1825 slab->us_freecount = keg->uk_ipers; 1826 slab->us_flags = sflags; 1827 slab->us_domain = domain; 1828 1829 BIT_FILL(keg->uk_ipers, &slab->us_free); 1830 #ifdef INVARIANTS 1831 BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); 1832 #endif 1833 1834 if (keg->uk_init != NULL) { 1835 for (i = 0; i < keg->uk_ipers; i++) 1836 if (keg->uk_init(slab_item(slab, keg, i), 1837 keg->uk_size, flags) != 0) 1838 break; 1839 if (i != keg->uk_ipers) { 1840 keg_free_slab(keg, slab, i); 1841 goto fail; 1842 } 1843 } 1844 kasan_mark_slab_invalid(keg, mem); 1845 KEG_LOCK(keg, domain); 1846 1847 CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", 1848 slab, keg->uk_name, keg); 1849 1850 if (keg->uk_flags & UMA_ZFLAG_HASH) 1851 UMA_HASH_INSERT(&keg->uk_hash, slab, mem); 1852 1853 /* 1854 * If we got a slab here it's safe to mark it partially used 1855 * and return. We assume that the caller is going to remove 1856 * at least one item. 1857 */ 1858 dom = &keg->uk_domain[domain]; 1859 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 1860 dom->ud_pages += keg->uk_ppera; 1861 dom->ud_free_items += keg->uk_ipers; 1862 1863 TSEXIT(); 1864 return (slab); 1865 1866 fail: 1867 return (NULL); 1868 } 1869 1870 /* 1871 * This function is intended to be used early on in place of page_alloc(). It 1872 * performs contiguous physical memory allocations and uses a bump allocator for 1873 * KVA, so is usable before the kernel map is initialized. 1874 */ 1875 static void * 1876 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1877 int wait) 1878 { 1879 vm_paddr_t pa; 1880 vm_page_t m; 1881 int i, pages; 1882 1883 pages = howmany(bytes, PAGE_SIZE); 1884 KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); 1885 1886 *pflag = UMA_SLAB_BOOT; 1887 m = vm_page_alloc_noobj_contig_domain(domain, malloc2vm_flags(wait) | 1888 VM_ALLOC_WIRED, pages, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, 1889 VM_MEMATTR_DEFAULT); 1890 if (m == NULL) 1891 return (NULL); 1892 1893 pa = VM_PAGE_TO_PHYS(m); 1894 for (i = 0; i < pages; i++, pa += PAGE_SIZE) { 1895 #if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING 1896 if ((wait & M_NODUMP) == 0) 1897 dump_add_page(pa); 1898 #endif 1899 } 1900 1901 /* Allocate KVA and indirectly advance bootmem. */ 1902 return ((void *)pmap_map(&bootmem, m->phys_addr, 1903 m->phys_addr + (pages * PAGE_SIZE), VM_PROT_READ | VM_PROT_WRITE)); 1904 } 1905 1906 static void 1907 startup_free(void *mem, vm_size_t bytes) 1908 { 1909 vm_offset_t va; 1910 vm_page_t m; 1911 1912 va = (vm_offset_t)mem; 1913 m = PHYS_TO_VM_PAGE(pmap_kextract(va)); 1914 1915 /* 1916 * startup_alloc() returns direct-mapped slabs on some platforms. Avoid 1917 * unmapping ranges of the direct map. 1918 */ 1919 if (va >= bootstart && va + bytes <= bootmem) 1920 pmap_remove(kernel_pmap, va, va + bytes); 1921 for (; bytes != 0; bytes -= PAGE_SIZE, m++) { 1922 #if MINIDUMP_PAGE_TRACKING && MINIDUMP_STARTUP_PAGE_TRACKING 1923 dump_drop_page(VM_PAGE_TO_PHYS(m)); 1924 #endif 1925 vm_page_unwire_noq(m); 1926 vm_page_free(m); 1927 } 1928 } 1929 1930 /* 1931 * Allocates a number of pages from the system 1932 * 1933 * Arguments: 1934 * bytes The number of bytes requested 1935 * wait Shall we wait? 1936 * 1937 * Returns: 1938 * A pointer to the alloced memory or possibly 1939 * NULL if M_NOWAIT is set. 1940 */ 1941 static void * 1942 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1943 int wait) 1944 { 1945 void *p; /* Returned page */ 1946 1947 *pflag = UMA_SLAB_KERNEL; 1948 p = kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); 1949 1950 return (p); 1951 } 1952 1953 static void * 1954 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1955 int wait) 1956 { 1957 struct pglist alloctail; 1958 vm_offset_t addr, zkva; 1959 int cpu, flags; 1960 vm_page_t p, p_next; 1961 #ifdef NUMA 1962 struct pcpu *pc; 1963 #endif 1964 1965 MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); 1966 1967 TAILQ_INIT(&alloctail); 1968 flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | malloc2vm_flags(wait); 1969 *pflag = UMA_SLAB_KERNEL; 1970 for (cpu = 0; cpu <= mp_maxid; cpu++) { 1971 if (CPU_ABSENT(cpu)) { 1972 p = vm_page_alloc_noobj(flags); 1973 } else { 1974 #ifndef NUMA 1975 p = vm_page_alloc_noobj(flags); 1976 #else 1977 pc = pcpu_find(cpu); 1978 if (__predict_false(VM_DOMAIN_EMPTY(pc->pc_domain))) 1979 p = NULL; 1980 else 1981 p = vm_page_alloc_noobj_domain(pc->pc_domain, 1982 flags); 1983 if (__predict_false(p == NULL)) 1984 p = vm_page_alloc_noobj(flags); 1985 #endif 1986 } 1987 if (__predict_false(p == NULL)) 1988 goto fail; 1989 TAILQ_INSERT_TAIL(&alloctail, p, plinks.q); 1990 } 1991 if ((addr = kva_alloc(bytes)) == 0) 1992 goto fail; 1993 zkva = addr; 1994 TAILQ_FOREACH(p, &alloctail, plinks.q) { 1995 pmap_qenter(zkva, &p, 1); 1996 zkva += PAGE_SIZE; 1997 } 1998 return ((void*)addr); 1999 fail: 2000 TAILQ_FOREACH_SAFE(p, &alloctail, plinks.q, p_next) { 2001 vm_page_unwire_noq(p); 2002 vm_page_free(p); 2003 } 2004 return (NULL); 2005 } 2006 2007 /* 2008 * Allocates a number of pages not belonging to a VM object 2009 * 2010 * Arguments: 2011 * bytes The number of bytes requested 2012 * wait Shall we wait? 2013 * 2014 * Returns: 2015 * A pointer to the alloced memory or possibly 2016 * NULL if M_NOWAIT is set. 2017 */ 2018 static void * 2019 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 2020 int wait) 2021 { 2022 TAILQ_HEAD(, vm_page) alloctail; 2023 u_long npages; 2024 vm_offset_t retkva, zkva; 2025 vm_page_t p, p_next; 2026 uma_keg_t keg; 2027 int req; 2028 2029 TAILQ_INIT(&alloctail); 2030 keg = zone->uz_keg; 2031 req = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED; 2032 if ((wait & M_WAITOK) != 0) 2033 req |= VM_ALLOC_WAITOK; 2034 2035 npages = howmany(bytes, PAGE_SIZE); 2036 while (npages > 0) { 2037 p = vm_page_alloc_noobj_domain(domain, req); 2038 if (p != NULL) { 2039 TAILQ_INSERT_TAIL(&alloctail, p, plinks.q); 2040 npages--; 2041 continue; 2042 } 2043 /* 2044 * Page allocation failed, free intermediate pages and 2045 * exit. 2046 */ 2047 TAILQ_FOREACH_SAFE(p, &alloctail, plinks.q, p_next) { 2048 vm_page_unwire_noq(p); 2049 vm_page_free(p); 2050 } 2051 return (NULL); 2052 } 2053 *flags = UMA_SLAB_PRIV; 2054 zkva = keg->uk_kva + 2055 atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); 2056 retkva = zkva; 2057 TAILQ_FOREACH(p, &alloctail, plinks.q) { 2058 pmap_qenter(zkva, &p, 1); 2059 zkva += PAGE_SIZE; 2060 } 2061 2062 return ((void *)retkva); 2063 } 2064 2065 /* 2066 * Allocate physically contiguous pages. 2067 */ 2068 static void * 2069 contig_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 2070 int wait) 2071 { 2072 2073 *pflag = UMA_SLAB_KERNEL; 2074 return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), 2075 bytes, wait, 0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); 2076 } 2077 2078 #if defined(UMA_USE_DMAP) && !defined(UMA_MD_SMALL_ALLOC) 2079 void * 2080 uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 2081 int wait) 2082 { 2083 vm_page_t m; 2084 vm_paddr_t pa; 2085 void *va; 2086 2087 *flags = UMA_SLAB_PRIV; 2088 m = vm_page_alloc_noobj_domain(domain, 2089 malloc2vm_flags(wait) | VM_ALLOC_WIRED); 2090 if (m == NULL) 2091 return (NULL); 2092 pa = m->phys_addr; 2093 if ((wait & M_NODUMP) == 0) 2094 dump_add_page(pa); 2095 va = (void *)PHYS_TO_DMAP(pa); 2096 return (va); 2097 } 2098 #endif 2099 2100 /* 2101 * Frees a number of pages to the system 2102 * 2103 * Arguments: 2104 * mem A pointer to the memory to be freed 2105 * size The size of the memory being freed 2106 * flags The original p->us_flags field 2107 * 2108 * Returns: 2109 * Nothing 2110 */ 2111 static void 2112 page_free(void *mem, vm_size_t size, uint8_t flags) 2113 { 2114 2115 if ((flags & UMA_SLAB_BOOT) != 0) { 2116 startup_free(mem, size); 2117 return; 2118 } 2119 2120 KASSERT((flags & UMA_SLAB_KERNEL) != 0, 2121 ("UMA: page_free used with invalid flags %x", flags)); 2122 2123 kmem_free(mem, size); 2124 } 2125 2126 /* 2127 * Frees pcpu zone allocations 2128 * 2129 * Arguments: 2130 * mem A pointer to the memory to be freed 2131 * size The size of the memory being freed 2132 * flags The original p->us_flags field 2133 * 2134 * Returns: 2135 * Nothing 2136 */ 2137 static void 2138 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) 2139 { 2140 vm_offset_t sva, curva; 2141 vm_paddr_t paddr; 2142 vm_page_t m; 2143 2144 MPASS(size == (mp_maxid+1)*PAGE_SIZE); 2145 2146 if ((flags & UMA_SLAB_BOOT) != 0) { 2147 startup_free(mem, size); 2148 return; 2149 } 2150 2151 sva = (vm_offset_t)mem; 2152 for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { 2153 paddr = pmap_kextract(curva); 2154 m = PHYS_TO_VM_PAGE(paddr); 2155 vm_page_unwire_noq(m); 2156 vm_page_free(m); 2157 } 2158 pmap_qremove(sva, size >> PAGE_SHIFT); 2159 kva_free(sva, size); 2160 } 2161 2162 #if defined(UMA_USE_DMAP) && !defined(UMA_MD_SMALL_ALLOC) 2163 void 2164 uma_small_free(void *mem, vm_size_t size, uint8_t flags) 2165 { 2166 vm_page_t m; 2167 vm_paddr_t pa; 2168 2169 pa = DMAP_TO_PHYS((vm_offset_t)mem); 2170 dump_drop_page(pa); 2171 m = PHYS_TO_VM_PAGE(pa); 2172 vm_page_unwire_noq(m); 2173 vm_page_free(m); 2174 } 2175 #endif 2176 2177 /* 2178 * Zero fill initializer 2179 * 2180 * Arguments/Returns follow uma_init specifications 2181 */ 2182 static int 2183 zero_init(void *mem, int size, int flags) 2184 { 2185 bzero(mem, size); 2186 return (0); 2187 } 2188 2189 #ifdef INVARIANTS 2190 static struct noslabbits * 2191 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) 2192 { 2193 2194 return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); 2195 } 2196 #endif 2197 2198 /* 2199 * Actual size of embedded struct slab (!OFFPAGE). 2200 */ 2201 static size_t 2202 slab_sizeof(int nitems) 2203 { 2204 size_t s; 2205 2206 s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; 2207 return (roundup(s, UMA_ALIGN_PTR + 1)); 2208 } 2209 2210 #define UMA_FIXPT_SHIFT 31 2211 #define UMA_FRAC_FIXPT(n, d) \ 2212 ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d))) 2213 #define UMA_FIXPT_PCT(f) \ 2214 ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT)) 2215 #define UMA_PCT_FIXPT(pct) UMA_FRAC_FIXPT((pct), 100) 2216 #define UMA_MIN_EFF UMA_PCT_FIXPT(100 - UMA_MAX_WASTE) 2217 2218 /* 2219 * Compute the number of items that will fit in a slab. If hdr is true, the 2220 * item count may be limited to provide space in the slab for an inline slab 2221 * header. Otherwise, all slab space will be provided for item storage. 2222 */ 2223 static u_int 2224 slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr) 2225 { 2226 u_int ipers; 2227 u_int padpi; 2228 2229 /* The padding between items is not needed after the last item. */ 2230 padpi = rsize - size; 2231 2232 if (hdr) { 2233 /* 2234 * Start with the maximum item count and remove items until 2235 * the slab header first alongside the allocatable memory. 2236 */ 2237 for (ipers = MIN(SLAB_MAX_SETSIZE, 2238 (slabsize + padpi - slab_sizeof(1)) / rsize); 2239 ipers > 0 && 2240 ipers * rsize - padpi + slab_sizeof(ipers) > slabsize; 2241 ipers--) 2242 continue; 2243 } else { 2244 ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE); 2245 } 2246 2247 return (ipers); 2248 } 2249 2250 struct keg_layout_result { 2251 u_int format; 2252 u_int slabsize; 2253 u_int ipers; 2254 u_int eff; 2255 }; 2256 2257 static void 2258 keg_layout_one(uma_keg_t keg, u_int rsize, u_int slabsize, u_int fmt, 2259 struct keg_layout_result *kl) 2260 { 2261 u_int total; 2262 2263 kl->format = fmt; 2264 kl->slabsize = slabsize; 2265 2266 /* Handle INTERNAL as inline with an extra page. */ 2267 if ((fmt & UMA_ZFLAG_INTERNAL) != 0) { 2268 kl->format &= ~UMA_ZFLAG_INTERNAL; 2269 kl->slabsize += PAGE_SIZE; 2270 } 2271 2272 kl->ipers = slab_ipers_hdr(keg->uk_size, rsize, kl->slabsize, 2273 (fmt & UMA_ZFLAG_OFFPAGE) == 0); 2274 2275 /* Account for memory used by an offpage slab header. */ 2276 total = kl->slabsize; 2277 if ((fmt & UMA_ZFLAG_OFFPAGE) != 0) 2278 total += slabzone(kl->ipers)->uz_keg->uk_rsize; 2279 2280 kl->eff = UMA_FRAC_FIXPT(kl->ipers * rsize, total); 2281 } 2282 2283 /* 2284 * Determine the format of a uma keg. This determines where the slab header 2285 * will be placed (inline or offpage) and calculates ipers, rsize, and ppera. 2286 * 2287 * Arguments 2288 * keg The zone we should initialize 2289 * 2290 * Returns 2291 * Nothing 2292 */ 2293 static void 2294 keg_layout(uma_keg_t keg) 2295 { 2296 struct keg_layout_result kl = {}, kl_tmp; 2297 u_int fmts[2]; 2298 u_int alignsize; 2299 u_int nfmt; 2300 u_int pages; 2301 u_int rsize; 2302 u_int slabsize; 2303 u_int i, j; 2304 2305 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || 2306 (keg->uk_size <= UMA_PCPU_ALLOC_SIZE && 2307 (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0), 2308 ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b", 2309 __func__, keg->uk_name, keg->uk_size, keg->uk_flags, 2310 PRINT_UMA_ZFLAGS)); 2311 KASSERT((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) == 0 || 2312 (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0, 2313 ("%s: incompatible flags 0x%b", __func__, keg->uk_flags, 2314 PRINT_UMA_ZFLAGS)); 2315 2316 alignsize = keg->uk_align + 1; 2317 #ifdef KASAN 2318 /* 2319 * ASAN requires that each allocation be aligned to the shadow map 2320 * scale factor. 2321 */ 2322 if (alignsize < KASAN_SHADOW_SCALE) 2323 alignsize = KASAN_SHADOW_SCALE; 2324 #endif 2325 2326 /* 2327 * Calculate the size of each allocation (rsize) according to 2328 * alignment. If the requested size is smaller than we have 2329 * allocation bits for we round it up. 2330 */ 2331 rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT); 2332 rsize = roundup2(rsize, alignsize); 2333 2334 if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) { 2335 /* 2336 * We want one item to start on every align boundary in a page. 2337 * To do this we will span pages. We will also extend the item 2338 * by the size of align if it is an even multiple of align. 2339 * Otherwise, it would fall on the same boundary every time. 2340 */ 2341 if ((rsize & alignsize) == 0) 2342 rsize += alignsize; 2343 slabsize = rsize * (PAGE_SIZE / alignsize); 2344 slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE); 2345 slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE); 2346 slabsize = round_page(slabsize); 2347 } else { 2348 /* 2349 * Start with a slab size of as many pages as it takes to 2350 * represent a single item. We will try to fit as many 2351 * additional items into the slab as possible. 2352 */ 2353 slabsize = round_page(keg->uk_size); 2354 } 2355 2356 /* Build a list of all of the available formats for this keg. */ 2357 nfmt = 0; 2358 2359 /* Evaluate an inline slab layout. */ 2360 if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0) 2361 fmts[nfmt++] = 0; 2362 2363 /* TODO: vm_page-embedded slab. */ 2364 2365 /* 2366 * We can't do OFFPAGE if we're internal or if we've been 2367 * asked to not go to the VM for buckets. If we do this we 2368 * may end up going to the VM for slabs which we do not want 2369 * to do if we're UMA_ZONE_VM, which clearly forbids it. 2370 * In those cases, evaluate a pseudo-format called INTERNAL 2371 * which has an inline slab header and one extra page to 2372 * guarantee that it fits. 2373 * 2374 * Otherwise, see if using an OFFPAGE slab will improve our 2375 * efficiency. 2376 */ 2377 if ((keg->uk_flags & (UMA_ZFLAG_INTERNAL | UMA_ZONE_VM)) != 0) 2378 fmts[nfmt++] = UMA_ZFLAG_INTERNAL; 2379 else 2380 fmts[nfmt++] = UMA_ZFLAG_OFFPAGE; 2381 2382 /* 2383 * Choose a slab size and format which satisfy the minimum efficiency. 2384 * Prefer the smallest slab size that meets the constraints. 2385 * 2386 * Start with a minimum slab size, to accommodate CACHESPREAD. Then, 2387 * for small items (up to PAGE_SIZE), the iteration increment is one 2388 * page; and for large items, the increment is one item. 2389 */ 2390 i = (slabsize + rsize - keg->uk_size) / MAX(PAGE_SIZE, rsize); 2391 KASSERT(i >= 1, ("keg %s(%p) flags=0x%b slabsize=%u, rsize=%u, i=%u", 2392 keg->uk_name, keg, keg->uk_flags, PRINT_UMA_ZFLAGS, slabsize, 2393 rsize, i)); 2394 for ( ; ; i++) { 2395 slabsize = (rsize <= PAGE_SIZE) ? ptoa(i) : 2396 round_page(rsize * (i - 1) + keg->uk_size); 2397 2398 for (j = 0; j < nfmt; j++) { 2399 /* Only if we have no viable format yet. */ 2400 if ((fmts[j] & UMA_ZFLAG_INTERNAL) != 0 && 2401 kl.ipers > 0) 2402 continue; 2403 2404 keg_layout_one(keg, rsize, slabsize, fmts[j], &kl_tmp); 2405 if (kl_tmp.eff <= kl.eff) 2406 continue; 2407 2408 kl = kl_tmp; 2409 2410 CTR6(KTR_UMA, "keg %s layout: format %#x " 2411 "(ipers %u * rsize %u) / slabsize %#x = %u%% eff", 2412 keg->uk_name, kl.format, kl.ipers, rsize, 2413 kl.slabsize, UMA_FIXPT_PCT(kl.eff)); 2414 2415 /* Stop when we reach the minimum efficiency. */ 2416 if (kl.eff >= UMA_MIN_EFF) 2417 break; 2418 } 2419 2420 if (kl.eff >= UMA_MIN_EFF || !multipage_slabs || 2421 slabsize >= SLAB_MAX_SETSIZE * rsize || 2422 (keg->uk_flags & (UMA_ZONE_PCPU | UMA_ZONE_CONTIG)) != 0) 2423 break; 2424 } 2425 2426 pages = atop(kl.slabsize); 2427 if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) 2428 pages *= mp_maxid + 1; 2429 2430 keg->uk_rsize = rsize; 2431 keg->uk_ipers = kl.ipers; 2432 keg->uk_ppera = pages; 2433 keg->uk_flags |= kl.format; 2434 2435 /* 2436 * How do we find the slab header if it is offpage or if not all item 2437 * start addresses are in the same page? We could solve the latter 2438 * case with vaddr alignment, but we don't. 2439 */ 2440 if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0 || 2441 (keg->uk_ipers - 1) * rsize >= PAGE_SIZE) { 2442 if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0) 2443 keg->uk_flags |= UMA_ZFLAG_HASH; 2444 else 2445 keg->uk_flags |= UMA_ZFLAG_VTOSLAB; 2446 } 2447 2448 CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u", 2449 __func__, keg->uk_name, keg->uk_flags, rsize, keg->uk_ipers, 2450 pages); 2451 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, 2452 ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__, 2453 keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize, 2454 keg->uk_ipers, pages)); 2455 } 2456 2457 /* 2458 * Keg header ctor. This initializes all fields, locks, etc. And inserts 2459 * the keg onto the global keg list. 2460 * 2461 * Arguments/Returns follow uma_ctor specifications 2462 * udata Actually uma_kctor_args 2463 */ 2464 static int 2465 keg_ctor(void *mem, int size, void *udata, int flags) 2466 { 2467 struct uma_kctor_args *arg = udata; 2468 uma_keg_t keg = mem; 2469 uma_zone_t zone; 2470 int i; 2471 2472 bzero(keg, size); 2473 keg->uk_size = arg->size; 2474 keg->uk_init = arg->uminit; 2475 keg->uk_fini = arg->fini; 2476 keg->uk_align = arg->align; 2477 keg->uk_reserve = 0; 2478 keg->uk_flags = arg->flags; 2479 2480 /* 2481 * We use a global round-robin policy by default. Zones with 2482 * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which 2483 * case the iterator is never run. 2484 */ 2485 keg->uk_dr.dr_policy = DOMAINSET_RR(); 2486 keg->uk_dr.dr_iter = 0; 2487 2488 /* 2489 * The primary zone is passed to us at keg-creation time. 2490 */ 2491 zone = arg->zone; 2492 keg->uk_name = zone->uz_name; 2493 2494 if (arg->flags & UMA_ZONE_ZINIT) 2495 keg->uk_init = zero_init; 2496 2497 if (arg->flags & UMA_ZONE_MALLOC) 2498 keg->uk_flags |= UMA_ZFLAG_VTOSLAB; 2499 2500 #ifndef SMP 2501 keg->uk_flags &= ~UMA_ZONE_PCPU; 2502 #endif 2503 2504 keg_layout(keg); 2505 2506 /* 2507 * Use a first-touch NUMA policy for kegs that pmap_extract() will 2508 * work on. Use round-robin for everything else. 2509 * 2510 * Zones may override the default by specifying either. 2511 */ 2512 #ifdef NUMA 2513 if ((keg->uk_flags & 2514 (UMA_ZONE_ROUNDROBIN | UMA_ZFLAG_CACHE | UMA_ZONE_NOTPAGE)) == 0) 2515 keg->uk_flags |= UMA_ZONE_FIRSTTOUCH; 2516 else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0) 2517 keg->uk_flags |= UMA_ZONE_ROUNDROBIN; 2518 #endif 2519 2520 /* 2521 * If we haven't booted yet we need allocations to go through the 2522 * startup cache until the vm is ready. 2523 */ 2524 #ifdef UMA_USE_DMAP 2525 if (keg->uk_ppera == 1) 2526 keg->uk_allocf = uma_small_alloc; 2527 else 2528 #endif 2529 if (booted < BOOT_KVA) 2530 keg->uk_allocf = startup_alloc; 2531 else if (keg->uk_flags & UMA_ZONE_PCPU) 2532 keg->uk_allocf = pcpu_page_alloc; 2533 else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) 2534 keg->uk_allocf = contig_alloc; 2535 else 2536 keg->uk_allocf = page_alloc; 2537 #ifdef UMA_USE_DMAP 2538 if (keg->uk_ppera == 1) 2539 keg->uk_freef = uma_small_free; 2540 else 2541 #endif 2542 if (keg->uk_flags & UMA_ZONE_PCPU) 2543 keg->uk_freef = pcpu_page_free; 2544 else 2545 keg->uk_freef = page_free; 2546 2547 /* 2548 * Initialize keg's locks. 2549 */ 2550 for (i = 0; i < vm_ndomains; i++) 2551 KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS)); 2552 2553 /* 2554 * If we're putting the slab header in the actual page we need to 2555 * figure out where in each page it goes. See slab_sizeof 2556 * definition. 2557 */ 2558 if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) { 2559 size_t shsize; 2560 2561 shsize = slab_sizeof(keg->uk_ipers); 2562 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; 2563 /* 2564 * The only way the following is possible is if with our 2565 * UMA_ALIGN_PTR adjustments we are now bigger than 2566 * UMA_SLAB_SIZE. I haven't checked whether this is 2567 * mathematically possible for all cases, so we make 2568 * sure here anyway. 2569 */ 2570 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, 2571 ("zone %s ipers %d rsize %d size %d slab won't fit", 2572 zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); 2573 } 2574 2575 if (keg->uk_flags & UMA_ZFLAG_HASH) 2576 hash_alloc(&keg->uk_hash, 0); 2577 2578 CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone); 2579 2580 LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); 2581 2582 rw_wlock(&uma_rwlock); 2583 LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); 2584 rw_wunlock(&uma_rwlock); 2585 return (0); 2586 } 2587 2588 static void 2589 zone_kva_available(uma_zone_t zone, void *unused) 2590 { 2591 uma_keg_t keg; 2592 2593 if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) 2594 return; 2595 KEG_GET(zone, keg); 2596 2597 if (keg->uk_allocf == startup_alloc) { 2598 /* Switch to the real allocator. */ 2599 if (keg->uk_flags & UMA_ZONE_PCPU) 2600 keg->uk_allocf = pcpu_page_alloc; 2601 else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && 2602 keg->uk_ppera > 1) 2603 keg->uk_allocf = contig_alloc; 2604 else 2605 keg->uk_allocf = page_alloc; 2606 } 2607 } 2608 2609 static void 2610 zone_alloc_counters(uma_zone_t zone, void *unused) 2611 { 2612 2613 zone->uz_allocs = counter_u64_alloc(M_WAITOK); 2614 zone->uz_frees = counter_u64_alloc(M_WAITOK); 2615 zone->uz_fails = counter_u64_alloc(M_WAITOK); 2616 zone->uz_xdomain = counter_u64_alloc(M_WAITOK); 2617 } 2618 2619 static void 2620 zone_alloc_sysctl(uma_zone_t zone, void *unused) 2621 { 2622 uma_zone_domain_t zdom; 2623 uma_domain_t dom; 2624 uma_keg_t keg; 2625 struct sysctl_oid *oid, *domainoid; 2626 int domains, i, cnt; 2627 static const char *nokeg = "cache zone"; 2628 char *c; 2629 2630 /* 2631 * Make a sysctl safe copy of the zone name by removing 2632 * any special characters and handling dups by appending 2633 * an index. 2634 */ 2635 if (zone->uz_namecnt != 0) { 2636 /* Count the number of decimal digits and '_' separator. */ 2637 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) 2638 cnt /= 10; 2639 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, 2640 M_UMA, M_WAITOK); 2641 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, 2642 zone->uz_namecnt); 2643 } else 2644 zone->uz_ctlname = strdup(zone->uz_name, M_UMA); 2645 for (c = zone->uz_ctlname; *c != '\0'; c++) 2646 if (strchr("./\\ -", *c) != NULL) 2647 *c = '_'; 2648 2649 /* 2650 * Basic parameters at the root. 2651 */ 2652 zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), 2653 OID_AUTO, zone->uz_ctlname, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2654 oid = zone->uz_oid; 2655 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2656 "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); 2657 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2658 "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, 2659 zone, 0, sysctl_handle_uma_zone_flags, "A", 2660 "Allocator configuration flags"); 2661 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2662 "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, 2663 "Desired per-cpu cache size"); 2664 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2665 "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, 2666 "Maximum allowed per-cpu cache size"); 2667 2668 /* 2669 * keg if present. 2670 */ 2671 if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) 2672 domains = vm_ndomains; 2673 else 2674 domains = 1; 2675 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2676 "keg", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2677 keg = zone->uz_keg; 2678 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { 2679 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2680 "name", CTLFLAG_RD, keg->uk_name, "Keg name"); 2681 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2682 "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, 2683 "Real object size with alignment"); 2684 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2685 "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, 2686 "pages per-slab allocation"); 2687 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2688 "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, 2689 "items available per-slab"); 2690 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2691 "align", CTLFLAG_RD, &keg->uk_align, 0, 2692 "item alignment mask"); 2693 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2694 "reserve", CTLFLAG_RD, &keg->uk_reserve, 0, 2695 "number of reserved items"); 2696 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2697 "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 2698 keg, 0, sysctl_handle_uma_slab_efficiency, "I", 2699 "Slab utilization (100 - internal fragmentation %)"); 2700 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid), 2701 OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2702 for (i = 0; i < domains; i++) { 2703 dom = &keg->uk_domain[i]; 2704 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), 2705 OID_AUTO, VM_DOMAIN(i)->vmd_name, 2706 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2707 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2708 "pages", CTLFLAG_RD, &dom->ud_pages, 0, 2709 "Total pages currently allocated from VM"); 2710 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2711 "free_items", CTLFLAG_RD, &dom->ud_free_items, 0, 2712 "Items free in the slab layer"); 2713 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2714 "free_slabs", CTLFLAG_RD, &dom->ud_free_slabs, 0, 2715 "Unused slabs"); 2716 } 2717 } else 2718 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2719 "name", CTLFLAG_RD, nokeg, "Keg name"); 2720 2721 /* 2722 * Information about zone limits. 2723 */ 2724 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2725 "limit", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2726 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2727 "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2728 zone, 0, sysctl_handle_uma_zone_items, "QU", 2729 "Current number of allocated items if limit is set"); 2730 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2731 "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, 2732 "Maximum number of allocated and cached items"); 2733 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2734 "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, 2735 "Number of threads sleeping at limit"); 2736 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2737 "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, 2738 "Total zone limit sleeps"); 2739 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2740 "bucket_max", CTLFLAG_RD, &zone->uz_bucket_max, 0, 2741 "Maximum number of items in each domain's bucket cache"); 2742 2743 /* 2744 * Per-domain zone information. 2745 */ 2746 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), 2747 OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2748 for (i = 0; i < domains; i++) { 2749 zdom = ZDOM_GET(zone, i); 2750 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), 2751 OID_AUTO, VM_DOMAIN(i)->vmd_name, 2752 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2753 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2754 "nitems", CTLFLAG_RD, &zdom->uzd_nitems, 2755 "number of items in this domain"); 2756 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2757 "imax", CTLFLAG_RD, &zdom->uzd_imax, 2758 "maximum item count in this period"); 2759 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2760 "imin", CTLFLAG_RD, &zdom->uzd_imin, 2761 "minimum item count in this period"); 2762 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2763 "bimin", CTLFLAG_RD, &zdom->uzd_bimin, 2764 "Minimum item count in this batch"); 2765 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2766 "wss", CTLFLAG_RD, &zdom->uzd_wss, 2767 "Working set size"); 2768 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2769 "limin", CTLFLAG_RD, &zdom->uzd_limin, 2770 "Long time minimum item count"); 2771 SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2772 "timin", CTLFLAG_RD, &zdom->uzd_timin, 0, 2773 "Time since zero long time minimum item count"); 2774 } 2775 2776 /* 2777 * General statistics. 2778 */ 2779 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2780 "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); 2781 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2782 "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 2783 zone, 1, sysctl_handle_uma_zone_cur, "I", 2784 "Current number of allocated items"); 2785 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2786 "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2787 zone, 0, sysctl_handle_uma_zone_allocs, "QU", 2788 "Total allocation calls"); 2789 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2790 "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2791 zone, 0, sysctl_handle_uma_zone_frees, "QU", 2792 "Total free calls"); 2793 SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2794 "fails", CTLFLAG_RD, &zone->uz_fails, 2795 "Number of allocation failures"); 2796 SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2797 "xdomain", CTLFLAG_RD, &zone->uz_xdomain, 2798 "Free calls from the wrong domain"); 2799 } 2800 2801 struct uma_zone_count { 2802 const char *name; 2803 int count; 2804 }; 2805 2806 static void 2807 zone_count(uma_zone_t zone, void *arg) 2808 { 2809 struct uma_zone_count *cnt; 2810 2811 cnt = arg; 2812 /* 2813 * Some zones are rapidly created with identical names and 2814 * destroyed out of order. This can lead to gaps in the count. 2815 * Use one greater than the maximum observed for this name. 2816 */ 2817 if (strcmp(zone->uz_name, cnt->name) == 0) 2818 cnt->count = MAX(cnt->count, 2819 zone->uz_namecnt + 1); 2820 } 2821 2822 static void 2823 zone_update_caches(uma_zone_t zone) 2824 { 2825 int i; 2826 2827 for (i = 0; i <= mp_maxid; i++) { 2828 cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size); 2829 cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags); 2830 } 2831 } 2832 2833 /* 2834 * Zone header ctor. This initializes all fields, locks, etc. 2835 * 2836 * Arguments/Returns follow uma_ctor specifications 2837 * udata Actually uma_zctor_args 2838 */ 2839 static int 2840 zone_ctor(void *mem, int size, void *udata, int flags) 2841 { 2842 struct uma_zone_count cnt; 2843 struct uma_zctor_args *arg = udata; 2844 uma_zone_domain_t zdom; 2845 uma_zone_t zone = mem; 2846 uma_zone_t z; 2847 uma_keg_t keg; 2848 int i; 2849 2850 bzero(zone, size); 2851 zone->uz_name = arg->name; 2852 zone->uz_ctor = arg->ctor; 2853 zone->uz_dtor = arg->dtor; 2854 zone->uz_init = NULL; 2855 zone->uz_fini = NULL; 2856 zone->uz_sleeps = 0; 2857 zone->uz_bucket_size = 0; 2858 zone->uz_bucket_size_min = 0; 2859 zone->uz_bucket_size_max = BUCKET_MAX; 2860 zone->uz_flags = (arg->flags & UMA_ZONE_SMR); 2861 zone->uz_warning = NULL; 2862 /* The domain structures follow the cpu structures. */ 2863 zone->uz_bucket_max = ULONG_MAX; 2864 timevalclear(&zone->uz_ratecheck); 2865 2866 /* Count the number of duplicate names. */ 2867 cnt.name = arg->name; 2868 cnt.count = 0; 2869 zone_foreach(zone_count, &cnt); 2870 zone->uz_namecnt = cnt.count; 2871 ZONE_CROSS_LOCK_INIT(zone); 2872 2873 for (i = 0; i < vm_ndomains; i++) { 2874 zdom = ZDOM_GET(zone, i); 2875 ZDOM_LOCK_INIT(zone, zdom, (arg->flags & UMA_ZONE_MTXCLASS)); 2876 STAILQ_INIT(&zdom->uzd_buckets); 2877 } 2878 2879 #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) 2880 if (arg->uminit == trash_init && arg->fini == trash_fini) 2881 zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR; 2882 #elif defined(KASAN) 2883 if ((arg->flags & (UMA_ZONE_NOFREE | UMA_ZFLAG_CACHE)) != 0) 2884 arg->flags |= UMA_ZONE_NOKASAN; 2885 #endif 2886 2887 /* 2888 * This is a pure cache zone, no kegs. 2889 */ 2890 if (arg->import) { 2891 KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0, 2892 ("zone_ctor: Import specified for non-cache zone.")); 2893 zone->uz_flags = arg->flags; 2894 zone->uz_size = arg->size; 2895 zone->uz_import = arg->import; 2896 zone->uz_release = arg->release; 2897 zone->uz_arg = arg->arg; 2898 #ifdef NUMA 2899 /* 2900 * Cache zones are round-robin unless a policy is 2901 * specified because they may have incompatible 2902 * constraints. 2903 */ 2904 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0) 2905 zone->uz_flags |= UMA_ZONE_ROUNDROBIN; 2906 #endif 2907 rw_wlock(&uma_rwlock); 2908 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); 2909 rw_wunlock(&uma_rwlock); 2910 goto out; 2911 } 2912 2913 /* 2914 * Use the regular zone/keg/slab allocator. 2915 */ 2916 zone->uz_import = zone_import; 2917 zone->uz_release = zone_release; 2918 zone->uz_arg = zone; 2919 keg = arg->keg; 2920 2921 if (arg->flags & UMA_ZONE_SECONDARY) { 2922 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, 2923 ("Secondary zone requested UMA_ZFLAG_INTERNAL")); 2924 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); 2925 zone->uz_init = arg->uminit; 2926 zone->uz_fini = arg->fini; 2927 zone->uz_flags |= UMA_ZONE_SECONDARY; 2928 rw_wlock(&uma_rwlock); 2929 ZONE_LOCK(zone); 2930 LIST_FOREACH(z, &keg->uk_zones, uz_link) { 2931 if (LIST_NEXT(z, uz_link) == NULL) { 2932 LIST_INSERT_AFTER(z, zone, uz_link); 2933 break; 2934 } 2935 } 2936 ZONE_UNLOCK(zone); 2937 rw_wunlock(&uma_rwlock); 2938 } else if (keg == NULL) { 2939 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, 2940 arg->align, arg->flags)) == NULL) 2941 return (ENOMEM); 2942 } else { 2943 struct uma_kctor_args karg; 2944 int error; 2945 2946 /* We should only be here from uma_startup() */ 2947 karg.size = arg->size; 2948 karg.uminit = arg->uminit; 2949 karg.fini = arg->fini; 2950 karg.align = arg->align; 2951 karg.flags = (arg->flags & ~UMA_ZONE_SMR); 2952 karg.zone = zone; 2953 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, 2954 flags); 2955 if (error) 2956 return (error); 2957 } 2958 2959 /* Inherit properties from the keg. */ 2960 zone->uz_keg = keg; 2961 zone->uz_size = keg->uk_size; 2962 zone->uz_flags |= (keg->uk_flags & 2963 (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); 2964 2965 out: 2966 if (booted >= BOOT_PCPU) { 2967 zone_alloc_counters(zone, NULL); 2968 if (booted >= BOOT_RUNNING) 2969 zone_alloc_sysctl(zone, NULL); 2970 } else { 2971 zone->uz_allocs = EARLY_COUNTER; 2972 zone->uz_frees = EARLY_COUNTER; 2973 zone->uz_fails = EARLY_COUNTER; 2974 } 2975 2976 /* Caller requests a private SMR context. */ 2977 if ((zone->uz_flags & UMA_ZONE_SMR) != 0) 2978 zone->uz_smr = smr_create(zone->uz_name, 0, 0); 2979 2980 KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != 2981 (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), 2982 ("Invalid zone flag combination")); 2983 if (arg->flags & UMA_ZFLAG_INTERNAL) 2984 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 2985 if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) 2986 zone->uz_bucket_size = BUCKET_MAX; 2987 else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) 2988 zone->uz_bucket_size = 0; 2989 else 2990 zone->uz_bucket_size = bucket_select(zone->uz_size); 2991 zone->uz_bucket_size_min = zone->uz_bucket_size; 2992 if (zone->uz_dtor != NULL || zone->uz_ctor != NULL) 2993 zone->uz_flags |= UMA_ZFLAG_CTORDTOR; 2994 zone_update_caches(zone); 2995 2996 return (0); 2997 } 2998 2999 /* 3000 * Keg header dtor. This frees all data, destroys locks, frees the hash 3001 * table and removes the keg from the global list. 3002 * 3003 * Arguments/Returns follow uma_dtor specifications 3004 * udata unused 3005 */ 3006 static void 3007 keg_dtor(void *arg, int size, void *udata) 3008 { 3009 uma_keg_t keg; 3010 uint32_t free, pages; 3011 int i; 3012 3013 keg = (uma_keg_t)arg; 3014 free = pages = 0; 3015 for (i = 0; i < vm_ndomains; i++) { 3016 free += keg->uk_domain[i].ud_free_items; 3017 pages += keg->uk_domain[i].ud_pages; 3018 KEG_LOCK_FINI(keg, i); 3019 } 3020 if (pages != 0) 3021 printf("Freed UMA keg (%s) was not empty (%u items). " 3022 " Lost %u pages of memory.\n", 3023 keg->uk_name ? keg->uk_name : "", 3024 pages / keg->uk_ppera * keg->uk_ipers - free, pages); 3025 3026 hash_free(&keg->uk_hash); 3027 } 3028 3029 /* 3030 * Zone header dtor. 3031 * 3032 * Arguments/Returns follow uma_dtor specifications 3033 * udata unused 3034 */ 3035 static void 3036 zone_dtor(void *arg, int size, void *udata) 3037 { 3038 uma_zone_t zone; 3039 uma_keg_t keg; 3040 int i; 3041 3042 zone = (uma_zone_t)arg; 3043 3044 sysctl_remove_oid(zone->uz_oid, 1, 1); 3045 3046 if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) 3047 cache_drain(zone); 3048 3049 rw_wlock(&uma_rwlock); 3050 LIST_REMOVE(zone, uz_link); 3051 rw_wunlock(&uma_rwlock); 3052 if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { 3053 keg = zone->uz_keg; 3054 keg->uk_reserve = 0; 3055 } 3056 zone_reclaim(zone, UMA_ANYDOMAIN, M_WAITOK, true); 3057 3058 /* 3059 * We only destroy kegs from non secondary/non cache zones. 3060 */ 3061 if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { 3062 keg = zone->uz_keg; 3063 rw_wlock(&uma_rwlock); 3064 LIST_REMOVE(keg, uk_link); 3065 rw_wunlock(&uma_rwlock); 3066 zone_free_item(kegs, keg, NULL, SKIP_NONE); 3067 } 3068 counter_u64_free(zone->uz_allocs); 3069 counter_u64_free(zone->uz_frees); 3070 counter_u64_free(zone->uz_fails); 3071 counter_u64_free(zone->uz_xdomain); 3072 free(zone->uz_ctlname, M_UMA); 3073 for (i = 0; i < vm_ndomains; i++) 3074 ZDOM_LOCK_FINI(ZDOM_GET(zone, i)); 3075 ZONE_CROSS_LOCK_FINI(zone); 3076 } 3077 3078 static void 3079 zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *arg), void *arg) 3080 { 3081 uma_keg_t keg; 3082 uma_zone_t zone; 3083 3084 LIST_FOREACH(keg, &uma_kegs, uk_link) { 3085 LIST_FOREACH(zone, &keg->uk_zones, uz_link) 3086 zfunc(zone, arg); 3087 } 3088 LIST_FOREACH(zone, &uma_cachezones, uz_link) 3089 zfunc(zone, arg); 3090 } 3091 3092 /* 3093 * Traverses every zone in the system and calls a callback 3094 * 3095 * Arguments: 3096 * zfunc A pointer to a function which accepts a zone 3097 * as an argument. 3098 * 3099 * Returns: 3100 * Nothing 3101 */ 3102 static void 3103 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) 3104 { 3105 3106 rw_rlock(&uma_rwlock); 3107 zone_foreach_unlocked(zfunc, arg); 3108 rw_runlock(&uma_rwlock); 3109 } 3110 3111 /* 3112 * Initialize the kernel memory allocator. This is done after pages can be 3113 * allocated but before general KVA is available. 3114 */ 3115 void 3116 uma_startup1(vm_offset_t virtual_avail) 3117 { 3118 struct uma_zctor_args args; 3119 size_t ksize, zsize, size; 3120 uma_keg_t primarykeg; 3121 uintptr_t m; 3122 int domain; 3123 uint8_t pflag; 3124 3125 bootstart = bootmem = virtual_avail; 3126 3127 rw_init(&uma_rwlock, "UMA lock"); 3128 sx_init(&uma_reclaim_lock, "umareclaim"); 3129 3130 ksize = sizeof(struct uma_keg) + 3131 (sizeof(struct uma_domain) * vm_ndomains); 3132 ksize = roundup(ksize, UMA_SUPER_ALIGN); 3133 zsize = sizeof(struct uma_zone) + 3134 (sizeof(struct uma_cache) * (mp_maxid + 1)) + 3135 (sizeof(struct uma_zone_domain) * vm_ndomains); 3136 zsize = roundup(zsize, UMA_SUPER_ALIGN); 3137 3138 /* Allocate the zone of zones, zone of kegs, and zone of zones keg. */ 3139 size = (zsize * 2) + ksize; 3140 for (domain = 0; domain < vm_ndomains; domain++) { 3141 m = (uintptr_t)startup_alloc(NULL, size, domain, &pflag, 3142 M_NOWAIT | M_ZERO); 3143 if (m != 0) 3144 break; 3145 } 3146 zones = (uma_zone_t)m; 3147 m += zsize; 3148 kegs = (uma_zone_t)m; 3149 m += zsize; 3150 primarykeg = (uma_keg_t)m; 3151 3152 /* "manually" create the initial zone */ 3153 memset(&args, 0, sizeof(args)); 3154 args.name = "UMA Kegs"; 3155 args.size = ksize; 3156 args.ctor = keg_ctor; 3157 args.dtor = keg_dtor; 3158 args.uminit = zero_init; 3159 args.fini = NULL; 3160 args.keg = primarykeg; 3161 args.align = UMA_SUPER_ALIGN - 1; 3162 args.flags = UMA_ZFLAG_INTERNAL; 3163 zone_ctor(kegs, zsize, &args, M_WAITOK); 3164 3165 args.name = "UMA Zones"; 3166 args.size = zsize; 3167 args.ctor = zone_ctor; 3168 args.dtor = zone_dtor; 3169 args.uminit = zero_init; 3170 args.fini = NULL; 3171 args.keg = NULL; 3172 args.align = UMA_SUPER_ALIGN - 1; 3173 args.flags = UMA_ZFLAG_INTERNAL; 3174 zone_ctor(zones, zsize, &args, M_WAITOK); 3175 3176 /* Now make zones for slab headers */ 3177 slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE, 3178 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 3179 slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE, 3180 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 3181 3182 hashzone = uma_zcreate("UMA Hash", 3183 sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, 3184 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 3185 3186 bucket_init(); 3187 smr_init(); 3188 } 3189 3190 #ifndef UMA_USE_DMAP 3191 extern void vm_radix_reserve_kva(void); 3192 #endif 3193 3194 /* 3195 * Advertise the availability of normal kva allocations and switch to 3196 * the default back-end allocator. Marks the KVA we consumed on startup 3197 * as used in the map. 3198 */ 3199 void 3200 uma_startup2(void) 3201 { 3202 3203 if (bootstart != bootmem) { 3204 vm_map_lock(kernel_map); 3205 (void)vm_map_insert(kernel_map, NULL, 0, bootstart, bootmem, 3206 VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); 3207 vm_map_unlock(kernel_map); 3208 } 3209 3210 #ifndef UMA_USE_DMAP 3211 /* Set up radix zone to use noobj_alloc. */ 3212 vm_radix_reserve_kva(); 3213 #endif 3214 3215 booted = BOOT_KVA; 3216 zone_foreach_unlocked(zone_kva_available, NULL); 3217 bucket_enable(); 3218 } 3219 3220 /* 3221 * Allocate counters as early as possible so that boot-time allocations are 3222 * accounted more precisely. 3223 */ 3224 static void 3225 uma_startup_pcpu(void *arg __unused) 3226 { 3227 3228 zone_foreach_unlocked(zone_alloc_counters, NULL); 3229 booted = BOOT_PCPU; 3230 } 3231 SYSINIT(uma_startup_pcpu, SI_SUB_COUNTER, SI_ORDER_ANY, uma_startup_pcpu, NULL); 3232 3233 /* 3234 * Finish our initialization steps. 3235 */ 3236 static void 3237 uma_startup3(void *arg __unused) 3238 { 3239 3240 #ifdef INVARIANTS 3241 TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); 3242 uma_dbg_cnt = counter_u64_alloc(M_WAITOK); 3243 uma_skip_cnt = counter_u64_alloc(M_WAITOK); 3244 #endif 3245 zone_foreach_unlocked(zone_alloc_sysctl, NULL); 3246 booted = BOOT_RUNNING; 3247 3248 EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, 3249 EVENTHANDLER_PRI_FIRST); 3250 } 3251 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); 3252 3253 static void 3254 uma_startup4(void *arg __unused) 3255 { 3256 TIMEOUT_TASK_INIT(taskqueue_thread, &uma_timeout_task, 0, uma_timeout, 3257 NULL); 3258 taskqueue_enqueue_timeout(taskqueue_thread, &uma_timeout_task, 3259 UMA_TIMEOUT * hz); 3260 } 3261 SYSINIT(uma_startup4, SI_SUB_TASKQ, SI_ORDER_ANY, uma_startup4, NULL); 3262 3263 static void 3264 uma_shutdown(void) 3265 { 3266 3267 booted = BOOT_SHUTDOWN; 3268 } 3269 3270 static uma_keg_t 3271 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, 3272 int align, uint32_t flags) 3273 { 3274 struct uma_kctor_args args; 3275 3276 args.size = size; 3277 args.uminit = uminit; 3278 args.fini = fini; 3279 args.align = align; 3280 args.flags = flags; 3281 args.zone = zone; 3282 return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); 3283 } 3284 3285 3286 static void 3287 check_align_mask(unsigned int mask) 3288 { 3289 3290 KASSERT(powerof2(mask + 1), 3291 ("UMA: %s: Not the mask of a power of 2 (%#x)", __func__, mask)); 3292 /* 3293 * Make sure the stored align mask doesn't have its highest bit set, 3294 * which would cause implementation-defined behavior when passing it as 3295 * the 'align' argument of uma_zcreate(). Such very large alignments do 3296 * not make sense anyway. 3297 */ 3298 KASSERT(mask <= INT_MAX, 3299 ("UMA: %s: Mask too big (%#x)", __func__, mask)); 3300 } 3301 3302 /* Public functions */ 3303 /* See uma.h */ 3304 void 3305 uma_set_cache_align_mask(unsigned int mask) 3306 { 3307 3308 check_align_mask(mask); 3309 uma_cache_align_mask = mask; 3310 } 3311 3312 /* Returns the alignment mask to use to request cache alignment. */ 3313 unsigned int 3314 uma_get_cache_align_mask(void) 3315 { 3316 return (uma_cache_align_mask); 3317 } 3318 3319 /* See uma.h */ 3320 uma_zone_t 3321 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, 3322 uma_init uminit, uma_fini fini, int align, uint32_t flags) 3323 3324 { 3325 struct uma_zctor_args args; 3326 uma_zone_t res; 3327 3328 check_align_mask(align); 3329 3330 /* This stuff is essential for the zone ctor */ 3331 memset(&args, 0, sizeof(args)); 3332 args.name = name; 3333 args.size = size; 3334 args.ctor = ctor; 3335 args.dtor = dtor; 3336 args.uminit = uminit; 3337 args.fini = fini; 3338 #if defined(INVARIANTS) && !defined(KASAN) && !defined(KMSAN) 3339 /* 3340 * Inject procedures which check for memory use after free if we are 3341 * allowed to scramble the memory while it is not allocated. This 3342 * requires that: UMA is actually able to access the memory, no init 3343 * or fini procedures, no dependency on the initial value of the 3344 * memory, and no (legitimate) use of the memory after free. Note, 3345 * the ctor and dtor do not need to be empty. 3346 */ 3347 if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH | 3348 UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) { 3349 args.uminit = trash_init; 3350 args.fini = trash_fini; 3351 } 3352 #endif 3353 args.align = align; 3354 args.flags = flags; 3355 args.keg = NULL; 3356 3357 sx_xlock(&uma_reclaim_lock); 3358 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 3359 sx_xunlock(&uma_reclaim_lock); 3360 3361 return (res); 3362 } 3363 3364 /* See uma.h */ 3365 uma_zone_t 3366 uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor, 3367 uma_init zinit, uma_fini zfini, uma_zone_t primary) 3368 { 3369 struct uma_zctor_args args; 3370 uma_keg_t keg; 3371 uma_zone_t res; 3372 3373 keg = primary->uz_keg; 3374 memset(&args, 0, sizeof(args)); 3375 args.name = name; 3376 args.size = keg->uk_size; 3377 args.ctor = ctor; 3378 args.dtor = dtor; 3379 args.uminit = zinit; 3380 args.fini = zfini; 3381 args.align = keg->uk_align; 3382 args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; 3383 args.keg = keg; 3384 3385 sx_xlock(&uma_reclaim_lock); 3386 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 3387 sx_xunlock(&uma_reclaim_lock); 3388 3389 return (res); 3390 } 3391 3392 /* See uma.h */ 3393 uma_zone_t 3394 uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor, 3395 uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease, 3396 void *arg, int flags) 3397 { 3398 struct uma_zctor_args args; 3399 3400 memset(&args, 0, sizeof(args)); 3401 args.name = name; 3402 args.size = size; 3403 args.ctor = ctor; 3404 args.dtor = dtor; 3405 args.uminit = zinit; 3406 args.fini = zfini; 3407 args.import = zimport; 3408 args.release = zrelease; 3409 args.arg = arg; 3410 args.align = 0; 3411 args.flags = flags | UMA_ZFLAG_CACHE; 3412 3413 return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); 3414 } 3415 3416 /* See uma.h */ 3417 void 3418 uma_zdestroy(uma_zone_t zone) 3419 { 3420 3421 /* 3422 * Large slabs are expensive to reclaim, so don't bother doing 3423 * unnecessary work if we're shutting down. 3424 */ 3425 if (booted == BOOT_SHUTDOWN && 3426 zone->uz_fini == NULL && zone->uz_release == zone_release) 3427 return; 3428 sx_xlock(&uma_reclaim_lock); 3429 zone_free_item(zones, zone, NULL, SKIP_NONE); 3430 sx_xunlock(&uma_reclaim_lock); 3431 } 3432 3433 void 3434 uma_zwait(uma_zone_t zone) 3435 { 3436 3437 if ((zone->uz_flags & UMA_ZONE_SMR) != 0) 3438 uma_zfree_smr(zone, uma_zalloc_smr(zone, M_WAITOK)); 3439 else if ((zone->uz_flags & UMA_ZONE_PCPU) != 0) 3440 uma_zfree_pcpu(zone, uma_zalloc_pcpu(zone, M_WAITOK)); 3441 else 3442 uma_zfree(zone, uma_zalloc(zone, M_WAITOK)); 3443 } 3444 3445 void * 3446 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) 3447 { 3448 void *item, *pcpu_item; 3449 #ifdef SMP 3450 int i; 3451 3452 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 3453 #endif 3454 item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); 3455 if (item == NULL) 3456 return (NULL); 3457 pcpu_item = zpcpu_base_to_offset(item); 3458 if (flags & M_ZERO) { 3459 #ifdef SMP 3460 for (i = 0; i <= mp_maxid; i++) 3461 bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size); 3462 #else 3463 bzero(item, zone->uz_size); 3464 #endif 3465 } 3466 return (pcpu_item); 3467 } 3468 3469 /* 3470 * A stub while both regular and pcpu cases are identical. 3471 */ 3472 void 3473 uma_zfree_pcpu_arg(uma_zone_t zone, void *pcpu_item, void *udata) 3474 { 3475 void *item; 3476 3477 #ifdef SMP 3478 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 3479 #endif 3480 3481 /* uma_zfree_pcu_*(..., NULL) does nothing, to match free(9). */ 3482 if (pcpu_item == NULL) 3483 return; 3484 3485 item = zpcpu_offset_to_base(pcpu_item); 3486 uma_zfree_arg(zone, item, udata); 3487 } 3488 3489 static inline void * 3490 item_ctor(uma_zone_t zone, int uz_flags, int size, void *udata, int flags, 3491 void *item) 3492 { 3493 #ifdef INVARIANTS 3494 bool skipdbg; 3495 #endif 3496 3497 kasan_mark_item_valid(zone, item); 3498 kmsan_mark_item_uninitialized(zone, item); 3499 3500 #ifdef INVARIANTS 3501 skipdbg = uma_dbg_zskip(zone, item); 3502 if (!skipdbg && (uz_flags & UMA_ZFLAG_TRASH) != 0 && 3503 zone->uz_ctor != trash_ctor) 3504 trash_ctor(item, size, zone, flags); 3505 #endif 3506 3507 /* Check flags before loading ctor pointer. */ 3508 if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0) && 3509 __predict_false(zone->uz_ctor != NULL) && 3510 zone->uz_ctor(item, size, udata, flags) != 0) { 3511 counter_u64_add(zone->uz_fails, 1); 3512 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); 3513 return (NULL); 3514 } 3515 #ifdef INVARIANTS 3516 if (!skipdbg) 3517 uma_dbg_alloc(zone, NULL, item); 3518 #endif 3519 if (__predict_false(flags & M_ZERO)) 3520 return (memset(item, 0, size)); 3521 3522 return (item); 3523 } 3524 3525 static inline void 3526 item_dtor(uma_zone_t zone, void *item, int size, void *udata, 3527 enum zfreeskip skip) 3528 { 3529 #ifdef INVARIANTS 3530 bool skipdbg; 3531 3532 skipdbg = uma_dbg_zskip(zone, item); 3533 if (skip == SKIP_NONE && !skipdbg) { 3534 if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) 3535 uma_dbg_free(zone, udata, item); 3536 else 3537 uma_dbg_free(zone, NULL, item); 3538 } 3539 #endif 3540 if (__predict_true(skip < SKIP_DTOR)) { 3541 if (zone->uz_dtor != NULL) 3542 zone->uz_dtor(item, size, udata); 3543 #ifdef INVARIANTS 3544 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 3545 zone->uz_dtor != trash_dtor) 3546 trash_dtor(item, size, zone); 3547 #endif 3548 } 3549 kasan_mark_item_invalid(zone, item); 3550 } 3551 3552 #ifdef NUMA 3553 static int 3554 item_domain(void *item) 3555 { 3556 int domain; 3557 3558 domain = vm_phys_domain(vtophys(item)); 3559 KASSERT(domain >= 0 && domain < vm_ndomains, 3560 ("%s: unknown domain for item %p", __func__, item)); 3561 return (domain); 3562 } 3563 #endif 3564 3565 #if defined(INVARIANTS) || defined(DEBUG_MEMGUARD) || defined(WITNESS) 3566 #if defined(INVARIANTS) && (defined(DDB) || defined(STACK)) 3567 #include <sys/stack.h> 3568 #endif 3569 #define UMA_ZALLOC_DEBUG 3570 static int 3571 uma_zalloc_debug(uma_zone_t zone, void **itemp, void *udata, int flags) 3572 { 3573 int error; 3574 3575 error = 0; 3576 #ifdef WITNESS 3577 if (flags & M_WAITOK) { 3578 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 3579 "uma_zalloc_debug: zone \"%s\"", zone->uz_name); 3580 } 3581 #endif 3582 3583 #ifdef INVARIANTS 3584 KASSERT((flags & M_EXEC) == 0, 3585 ("uma_zalloc_debug: called with M_EXEC")); 3586 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3587 ("uma_zalloc_debug: called within spinlock or critical section")); 3588 KASSERT((zone->uz_flags & UMA_ZONE_PCPU) == 0 || (flags & M_ZERO) == 0, 3589 ("uma_zalloc_debug: allocating from a pcpu zone with M_ZERO")); 3590 3591 _Static_assert(M_NOWAIT != 0 && M_WAITOK != 0, 3592 "M_NOWAIT and M_WAITOK must be non-zero for this assertion:"); 3593 #if 0 3594 /* 3595 * Give the #elif clause time to find problems, then remove it 3596 * and enable this. (Remove <sys/stack.h> above, too.) 3597 */ 3598 KASSERT((flags & (M_NOWAIT|M_WAITOK)) == M_NOWAIT || 3599 (flags & (M_NOWAIT|M_WAITOK)) == M_WAITOK, 3600 ("uma_zalloc_debug: must pass one of M_NOWAIT or M_WAITOK")); 3601 #elif defined(DDB) || defined(STACK) 3602 if (__predict_false((flags & (M_NOWAIT|M_WAITOK)) != M_NOWAIT && 3603 (flags & (M_NOWAIT|M_WAITOK)) != M_WAITOK)) { 3604 static int stack_count; 3605 struct stack st; 3606 3607 if (stack_count < 10) { 3608 ++stack_count; 3609 printf("uma_zalloc* called with bad WAIT flags:\n"); 3610 stack_save(&st); 3611 stack_print(&st); 3612 } 3613 } 3614 #endif 3615 #endif 3616 3617 #ifdef DEBUG_MEMGUARD 3618 if ((zone->uz_flags & (UMA_ZONE_SMR | UMA_ZFLAG_CACHE)) == 0 && 3619 memguard_cmp_zone(zone)) { 3620 void *item; 3621 item = memguard_alloc(zone->uz_size, flags); 3622 if (item != NULL) { 3623 error = EJUSTRETURN; 3624 if (zone->uz_init != NULL && 3625 zone->uz_init(item, zone->uz_size, flags) != 0) { 3626 *itemp = NULL; 3627 return (error); 3628 } 3629 if (zone->uz_ctor != NULL && 3630 zone->uz_ctor(item, zone->uz_size, udata, 3631 flags) != 0) { 3632 counter_u64_add(zone->uz_fails, 1); 3633 if (zone->uz_fini != NULL) 3634 zone->uz_fini(item, zone->uz_size); 3635 *itemp = NULL; 3636 return (error); 3637 } 3638 *itemp = item; 3639 return (error); 3640 } 3641 /* This is unfortunate but should not be fatal. */ 3642 } 3643 #endif 3644 return (error); 3645 } 3646 3647 static int 3648 uma_zfree_debug(uma_zone_t zone, void *item, void *udata) 3649 { 3650 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3651 ("uma_zfree_debug: called with spinlock or critical section held")); 3652 3653 #ifdef DEBUG_MEMGUARD 3654 if ((zone->uz_flags & (UMA_ZONE_SMR | UMA_ZFLAG_CACHE)) == 0 && 3655 is_memguard_addr(item)) { 3656 if (zone->uz_dtor != NULL) 3657 zone->uz_dtor(item, zone->uz_size, udata); 3658 if (zone->uz_fini != NULL) 3659 zone->uz_fini(item, zone->uz_size); 3660 memguard_free(item); 3661 return (EJUSTRETURN); 3662 } 3663 #endif 3664 return (0); 3665 } 3666 #endif 3667 3668 static inline void * 3669 cache_alloc_item(uma_zone_t zone, uma_cache_t cache, uma_cache_bucket_t bucket, 3670 void *udata, int flags) 3671 { 3672 void *item; 3673 int size, uz_flags; 3674 3675 item = cache_bucket_pop(cache, bucket); 3676 size = cache_uz_size(cache); 3677 uz_flags = cache_uz_flags(cache); 3678 critical_exit(); 3679 return (item_ctor(zone, uz_flags, size, udata, flags, item)); 3680 } 3681 3682 static __noinline void * 3683 cache_alloc_retry(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) 3684 { 3685 uma_cache_bucket_t bucket; 3686 int domain; 3687 3688 while (cache_alloc(zone, cache, udata, flags)) { 3689 cache = &zone->uz_cpu[curcpu]; 3690 bucket = &cache->uc_allocbucket; 3691 if (__predict_false(bucket->ucb_cnt == 0)) 3692 continue; 3693 return (cache_alloc_item(zone, cache, bucket, udata, flags)); 3694 } 3695 critical_exit(); 3696 3697 /* 3698 * We can not get a bucket so try to return a single item. 3699 */ 3700 if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) 3701 domain = PCPU_GET(domain); 3702 else 3703 domain = UMA_ANYDOMAIN; 3704 return (zone_alloc_item(zone, udata, domain, flags)); 3705 } 3706 3707 /* See uma.h */ 3708 void * 3709 uma_zalloc_smr(uma_zone_t zone, int flags) 3710 { 3711 uma_cache_bucket_t bucket; 3712 uma_cache_t cache; 3713 3714 CTR3(KTR_UMA, "uma_zalloc_smr zone %s(%p) flags %d", zone->uz_name, 3715 zone, flags); 3716 3717 #ifdef UMA_ZALLOC_DEBUG 3718 void *item; 3719 3720 KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, 3721 ("uma_zalloc_arg: called with non-SMR zone.")); 3722 if (uma_zalloc_debug(zone, &item, NULL, flags) == EJUSTRETURN) 3723 return (item); 3724 #endif 3725 3726 critical_enter(); 3727 cache = &zone->uz_cpu[curcpu]; 3728 bucket = &cache->uc_allocbucket; 3729 if (__predict_false(bucket->ucb_cnt == 0)) 3730 return (cache_alloc_retry(zone, cache, NULL, flags)); 3731 return (cache_alloc_item(zone, cache, bucket, NULL, flags)); 3732 } 3733 3734 /* See uma.h */ 3735 void * 3736 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) 3737 { 3738 uma_cache_bucket_t bucket; 3739 uma_cache_t cache; 3740 3741 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3742 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3743 3744 /* This is the fast path allocation */ 3745 CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name, 3746 zone, flags); 3747 3748 #ifdef UMA_ZALLOC_DEBUG 3749 void *item; 3750 3751 KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, 3752 ("uma_zalloc_arg: called with SMR zone.")); 3753 if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) 3754 return (item); 3755 #endif 3756 3757 /* 3758 * If possible, allocate from the per-CPU cache. There are two 3759 * requirements for safe access to the per-CPU cache: (1) the thread 3760 * accessing the cache must not be preempted or yield during access, 3761 * and (2) the thread must not migrate CPUs without switching which 3762 * cache it accesses. We rely on a critical section to prevent 3763 * preemption and migration. We release the critical section in 3764 * order to acquire the zone mutex if we are unable to allocate from 3765 * the current cache; when we re-acquire the critical section, we 3766 * must detect and handle migration if it has occurred. 3767 */ 3768 critical_enter(); 3769 cache = &zone->uz_cpu[curcpu]; 3770 bucket = &cache->uc_allocbucket; 3771 if (__predict_false(bucket->ucb_cnt == 0)) 3772 return (cache_alloc_retry(zone, cache, udata, flags)); 3773 return (cache_alloc_item(zone, cache, bucket, udata, flags)); 3774 } 3775 3776 /* 3777 * Replenish an alloc bucket and possibly restore an old one. Called in 3778 * a critical section. Returns in a critical section. 3779 * 3780 * A false return value indicates an allocation failure. 3781 * A true return value indicates success and the caller should retry. 3782 */ 3783 static __noinline bool 3784 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) 3785 { 3786 uma_bucket_t bucket; 3787 int curdomain, domain; 3788 bool new; 3789 3790 CRITICAL_ASSERT(curthread); 3791 3792 /* 3793 * If we have run out of items in our alloc bucket see 3794 * if we can switch with the free bucket. 3795 * 3796 * SMR Zones can't re-use the free bucket until the sequence has 3797 * expired. 3798 */ 3799 if ((cache_uz_flags(cache) & UMA_ZONE_SMR) == 0 && 3800 cache->uc_freebucket.ucb_cnt != 0) { 3801 cache_bucket_swap(&cache->uc_freebucket, 3802 &cache->uc_allocbucket); 3803 return (true); 3804 } 3805 3806 /* 3807 * Discard any empty allocation bucket while we hold no locks. 3808 */ 3809 bucket = cache_bucket_unload_alloc(cache); 3810 critical_exit(); 3811 3812 if (bucket != NULL) { 3813 KASSERT(bucket->ub_cnt == 0, 3814 ("cache_alloc: Entered with non-empty alloc bucket.")); 3815 bucket_free(zone, bucket, udata); 3816 } 3817 3818 /* 3819 * Attempt to retrieve the item from the per-CPU cache has failed, so 3820 * we must go back to the zone. This requires the zdom lock, so we 3821 * must drop the critical section, then re-acquire it when we go back 3822 * to the cache. Since the critical section is released, we may be 3823 * preempted or migrate. As such, make sure not to maintain any 3824 * thread-local state specific to the cache from prior to releasing 3825 * the critical section. 3826 */ 3827 domain = PCPU_GET(domain); 3828 if ((cache_uz_flags(cache) & UMA_ZONE_ROUNDROBIN) != 0 || 3829 VM_DOMAIN_EMPTY(domain)) 3830 domain = zone_domain_highest(zone, domain); 3831 bucket = cache_fetch_bucket(zone, cache, domain); 3832 if (bucket == NULL && zone->uz_bucket_size != 0 && !bucketdisable) { 3833 bucket = zone_alloc_bucket(zone, udata, domain, flags); 3834 new = true; 3835 } else { 3836 new = false; 3837 } 3838 3839 CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", 3840 zone->uz_name, zone, bucket); 3841 if (bucket == NULL) { 3842 critical_enter(); 3843 return (false); 3844 } 3845 3846 /* 3847 * See if we lost the race or were migrated. Cache the 3848 * initialized bucket to make this less likely or claim 3849 * the memory directly. 3850 */ 3851 critical_enter(); 3852 cache = &zone->uz_cpu[curcpu]; 3853 if (cache->uc_allocbucket.ucb_bucket == NULL && 3854 ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) == 0 || 3855 (curdomain = PCPU_GET(domain)) == domain || 3856 VM_DOMAIN_EMPTY(curdomain))) { 3857 if (new) 3858 atomic_add_long(&ZDOM_GET(zone, domain)->uzd_imax, 3859 bucket->ub_cnt); 3860 cache_bucket_load_alloc(cache, bucket); 3861 return (true); 3862 } 3863 3864 /* 3865 * We lost the race, release this bucket and start over. 3866 */ 3867 critical_exit(); 3868 zone_put_bucket(zone, domain, bucket, udata, !new); 3869 critical_enter(); 3870 3871 return (true); 3872 } 3873 3874 void * 3875 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) 3876 { 3877 #ifdef NUMA 3878 uma_bucket_t bucket; 3879 uma_zone_domain_t zdom; 3880 void *item; 3881 #endif 3882 3883 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3884 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3885 3886 /* This is the fast path allocation */ 3887 CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d", 3888 zone->uz_name, zone, domain, flags); 3889 3890 KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, 3891 ("uma_zalloc_domain: called with SMR zone.")); 3892 #ifdef NUMA 3893 KASSERT((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0, 3894 ("uma_zalloc_domain: called with non-FIRSTTOUCH zone.")); 3895 3896 if (vm_ndomains == 1) 3897 return (uma_zalloc_arg(zone, udata, flags)); 3898 3899 #ifdef UMA_ZALLOC_DEBUG 3900 if (uma_zalloc_debug(zone, &item, udata, flags) == EJUSTRETURN) 3901 return (item); 3902 #endif 3903 3904 /* 3905 * Try to allocate from the bucket cache before falling back to the keg. 3906 * We could try harder and attempt to allocate from per-CPU caches or 3907 * the per-domain cross-domain buckets, but the complexity is probably 3908 * not worth it. It is more important that frees of previous 3909 * cross-domain allocations do not blow up the cache. 3910 */ 3911 zdom = zone_domain_lock(zone, domain); 3912 if ((bucket = zone_fetch_bucket(zone, zdom, false)) != NULL) { 3913 item = bucket->ub_bucket[bucket->ub_cnt - 1]; 3914 #ifdef INVARIANTS 3915 bucket->ub_bucket[bucket->ub_cnt - 1] = NULL; 3916 #endif 3917 bucket->ub_cnt--; 3918 zone_put_bucket(zone, domain, bucket, udata, true); 3919 item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, 3920 flags, item); 3921 if (item != NULL) { 3922 KASSERT(item_domain(item) == domain, 3923 ("%s: bucket cache item %p from wrong domain", 3924 __func__, item)); 3925 counter_u64_add(zone->uz_allocs, 1); 3926 } 3927 return (item); 3928 } 3929 ZDOM_UNLOCK(zdom); 3930 return (zone_alloc_item(zone, udata, domain, flags)); 3931 #else 3932 return (uma_zalloc_arg(zone, udata, flags)); 3933 #endif 3934 } 3935 3936 /* 3937 * Find a slab with some space. Prefer slabs that are partially used over those 3938 * that are totally full. This helps to reduce fragmentation. 3939 * 3940 * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check 3941 * only 'domain'. 3942 */ 3943 static uma_slab_t 3944 keg_first_slab(uma_keg_t keg, int domain, bool rr) 3945 { 3946 uma_domain_t dom; 3947 uma_slab_t slab; 3948 int start; 3949 3950 KASSERT(domain >= 0 && domain < vm_ndomains, 3951 ("keg_first_slab: domain %d out of range", domain)); 3952 KEG_LOCK_ASSERT(keg, domain); 3953 3954 slab = NULL; 3955 start = domain; 3956 do { 3957 dom = &keg->uk_domain[domain]; 3958 if ((slab = LIST_FIRST(&dom->ud_part_slab)) != NULL) 3959 return (slab); 3960 if ((slab = LIST_FIRST(&dom->ud_free_slab)) != NULL) { 3961 LIST_REMOVE(slab, us_link); 3962 dom->ud_free_slabs--; 3963 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3964 return (slab); 3965 } 3966 if (rr) 3967 domain = (domain + 1) % vm_ndomains; 3968 } while (domain != start); 3969 3970 return (NULL); 3971 } 3972 3973 /* 3974 * Fetch an existing slab from a free or partial list. Returns with the 3975 * keg domain lock held if a slab was found or unlocked if not. 3976 */ 3977 static uma_slab_t 3978 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) 3979 { 3980 uma_slab_t slab; 3981 uint32_t reserve; 3982 3983 /* HASH has a single free list. */ 3984 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) 3985 domain = 0; 3986 3987 KEG_LOCK(keg, domain); 3988 reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; 3989 if (keg->uk_domain[domain].ud_free_items <= reserve || 3990 (slab = keg_first_slab(keg, domain, rr)) == NULL) { 3991 KEG_UNLOCK(keg, domain); 3992 return (NULL); 3993 } 3994 return (slab); 3995 } 3996 3997 static uma_slab_t 3998 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) 3999 { 4000 struct vm_domainset_iter di; 4001 uma_slab_t slab; 4002 int aflags, domain; 4003 bool rr; 4004 4005 KASSERT((flags & (M_WAITOK | M_NOVM)) != (M_WAITOK | M_NOVM), 4006 ("%s: invalid flags %#x", __func__, flags)); 4007 4008 restart: 4009 /* 4010 * Use the keg's policy if upper layers haven't already specified a 4011 * domain (as happens with first-touch zones). 4012 * 4013 * To avoid races we run the iterator with the keg lock held, but that 4014 * means that we cannot allow the vm_domainset layer to sleep. Thus, 4015 * clear M_WAITOK and handle low memory conditions locally. 4016 */ 4017 rr = rdomain == UMA_ANYDOMAIN; 4018 if (rr) { 4019 aflags = (flags & ~M_WAITOK) | M_NOWAIT; 4020 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 4021 &aflags); 4022 } else { 4023 aflags = flags; 4024 domain = rdomain; 4025 } 4026 4027 for (;;) { 4028 slab = keg_fetch_free_slab(keg, domain, rr, flags); 4029 if (slab != NULL) 4030 return (slab); 4031 4032 /* 4033 * M_NOVM is used to break the recursion that can otherwise 4034 * occur if low-level memory management routines use UMA. 4035 */ 4036 if ((flags & M_NOVM) == 0) { 4037 slab = keg_alloc_slab(keg, zone, domain, flags, aflags); 4038 if (slab != NULL) 4039 return (slab); 4040 } 4041 4042 if (!rr) { 4043 if ((flags & M_USE_RESERVE) != 0) { 4044 /* 4045 * Drain reserves from other domains before 4046 * giving up or sleeping. It may be useful to 4047 * support per-domain reserves eventually. 4048 */ 4049 rdomain = UMA_ANYDOMAIN; 4050 goto restart; 4051 } 4052 if ((flags & M_WAITOK) == 0) 4053 break; 4054 vm_wait_domain(domain); 4055 } else if (vm_domainset_iter_policy(&di, &domain) != 0) { 4056 if ((flags & M_WAITOK) != 0) { 4057 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); 4058 goto restart; 4059 } 4060 break; 4061 } 4062 } 4063 4064 /* 4065 * We might not have been able to get a slab but another cpu 4066 * could have while we were unlocked. Check again before we 4067 * fail. 4068 */ 4069 if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) 4070 return (slab); 4071 4072 return (NULL); 4073 } 4074 4075 static void * 4076 slab_alloc_item(uma_keg_t keg, uma_slab_t slab) 4077 { 4078 uma_domain_t dom; 4079 void *item; 4080 int freei; 4081 4082 KEG_LOCK_ASSERT(keg, slab->us_domain); 4083 4084 dom = &keg->uk_domain[slab->us_domain]; 4085 freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; 4086 BIT_CLR(keg->uk_ipers, freei, &slab->us_free); 4087 item = slab_item(slab, keg, freei); 4088 slab->us_freecount--; 4089 dom->ud_free_items--; 4090 4091 /* 4092 * Move this slab to the full list. It must be on the partial list, so 4093 * we do not need to update the free slab count. In particular, 4094 * keg_fetch_slab() always returns slabs on the partial list. 4095 */ 4096 if (slab->us_freecount == 0) { 4097 LIST_REMOVE(slab, us_link); 4098 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); 4099 } 4100 4101 return (item); 4102 } 4103 4104 static int 4105 zone_import(void *arg, void **bucket, int max, int domain, int flags) 4106 { 4107 uma_domain_t dom; 4108 uma_zone_t zone; 4109 uma_slab_t slab; 4110 uma_keg_t keg; 4111 #ifdef NUMA 4112 int stripe; 4113 #endif 4114 int i; 4115 4116 zone = arg; 4117 slab = NULL; 4118 keg = zone->uz_keg; 4119 /* Try to keep the buckets totally full */ 4120 for (i = 0; i < max; ) { 4121 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) 4122 break; 4123 #ifdef NUMA 4124 stripe = howmany(max, vm_ndomains); 4125 #endif 4126 dom = &keg->uk_domain[slab->us_domain]; 4127 do { 4128 bucket[i++] = slab_alloc_item(keg, slab); 4129 if (keg->uk_reserve > 0 && 4130 dom->ud_free_items <= keg->uk_reserve) { 4131 /* 4132 * Avoid depleting the reserve after a 4133 * successful item allocation, even if 4134 * M_USE_RESERVE is specified. 4135 */ 4136 KEG_UNLOCK(keg, slab->us_domain); 4137 goto out; 4138 } 4139 #ifdef NUMA 4140 /* 4141 * If the zone is striped we pick a new slab for every 4142 * N allocations. Eliminating this conditional will 4143 * instead pick a new domain for each bucket rather 4144 * than stripe within each bucket. The current option 4145 * produces more fragmentation and requires more cpu 4146 * time but yields better distribution. 4147 */ 4148 if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 && 4149 vm_ndomains > 1 && --stripe == 0) 4150 break; 4151 #endif 4152 } while (slab->us_freecount != 0 && i < max); 4153 KEG_UNLOCK(keg, slab->us_domain); 4154 4155 /* Don't block if we allocated any successfully. */ 4156 flags &= ~M_WAITOK; 4157 flags |= M_NOWAIT; 4158 } 4159 out: 4160 return i; 4161 } 4162 4163 static int 4164 zone_alloc_limit_hard(uma_zone_t zone, int count, int flags) 4165 { 4166 uint64_t old, new, total, max; 4167 4168 /* 4169 * The hard case. We're going to sleep because there were existing 4170 * sleepers or because we ran out of items. This routine enforces 4171 * fairness by keeping fifo order. 4172 * 4173 * First release our ill gotten gains and make some noise. 4174 */ 4175 for (;;) { 4176 zone_free_limit(zone, count); 4177 zone_log_warning(zone); 4178 zone_maxaction(zone); 4179 if (flags & M_NOWAIT) 4180 return (0); 4181 4182 /* 4183 * We need to allocate an item or set ourself as a sleeper 4184 * while the sleepq lock is held to avoid wakeup races. This 4185 * is essentially a home rolled semaphore. 4186 */ 4187 sleepq_lock(&zone->uz_max_items); 4188 old = zone->uz_items; 4189 do { 4190 MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX); 4191 /* Cache the max since we will evaluate twice. */ 4192 max = zone->uz_max_items; 4193 if (UZ_ITEMS_SLEEPERS(old) != 0 || 4194 UZ_ITEMS_COUNT(old) >= max) 4195 new = old + UZ_ITEMS_SLEEPER; 4196 else 4197 new = old + MIN(count, max - old); 4198 } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0); 4199 4200 /* We may have successfully allocated under the sleepq lock. */ 4201 if (UZ_ITEMS_SLEEPERS(new) == 0) { 4202 sleepq_release(&zone->uz_max_items); 4203 return (new - old); 4204 } 4205 4206 /* 4207 * This is in a different cacheline from uz_items so that we 4208 * don't constantly invalidate the fastpath cacheline when we 4209 * adjust item counts. This could be limited to toggling on 4210 * transitions. 4211 */ 4212 atomic_add_32(&zone->uz_sleepers, 1); 4213 atomic_add_64(&zone->uz_sleeps, 1); 4214 4215 /* 4216 * We have added ourselves as a sleeper. The sleepq lock 4217 * protects us from wakeup races. Sleep now and then retry. 4218 */ 4219 sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0); 4220 sleepq_wait(&zone->uz_max_items, PVM); 4221 4222 /* 4223 * After wakeup, remove ourselves as a sleeper and try 4224 * again. We no longer have the sleepq lock for protection. 4225 * 4226 * Subract ourselves as a sleeper while attempting to add 4227 * our count. 4228 */ 4229 atomic_subtract_32(&zone->uz_sleepers, 1); 4230 old = atomic_fetchadd_64(&zone->uz_items, 4231 -(UZ_ITEMS_SLEEPER - count)); 4232 /* We're no longer a sleeper. */ 4233 old -= UZ_ITEMS_SLEEPER; 4234 4235 /* 4236 * If we're still at the limit, restart. Notably do not 4237 * block on other sleepers. Cache the max value to protect 4238 * against changes via sysctl. 4239 */ 4240 total = UZ_ITEMS_COUNT(old); 4241 max = zone->uz_max_items; 4242 if (total >= max) 4243 continue; 4244 /* Truncate if necessary, otherwise wake other sleepers. */ 4245 if (total + count > max) { 4246 zone_free_limit(zone, total + count - max); 4247 count = max - total; 4248 } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0) 4249 wakeup_one(&zone->uz_max_items); 4250 4251 return (count); 4252 } 4253 } 4254 4255 /* 4256 * Allocate 'count' items from our max_items limit. Returns the number 4257 * available. If M_NOWAIT is not specified it will sleep until at least 4258 * one item can be allocated. 4259 */ 4260 static int 4261 zone_alloc_limit(uma_zone_t zone, int count, int flags) 4262 { 4263 uint64_t old; 4264 uint64_t max; 4265 4266 max = zone->uz_max_items; 4267 MPASS(max > 0); 4268 4269 /* 4270 * We expect normal allocations to succeed with a simple 4271 * fetchadd. 4272 */ 4273 old = atomic_fetchadd_64(&zone->uz_items, count); 4274 if (__predict_true(old + count <= max)) 4275 return (count); 4276 4277 /* 4278 * If we had some items and no sleepers just return the 4279 * truncated value. We have to release the excess space 4280 * though because that may wake sleepers who weren't woken 4281 * because we were temporarily over the limit. 4282 */ 4283 if (old < max) { 4284 zone_free_limit(zone, (old + count) - max); 4285 return (max - old); 4286 } 4287 return (zone_alloc_limit_hard(zone, count, flags)); 4288 } 4289 4290 /* 4291 * Free a number of items back to the limit. 4292 */ 4293 static void 4294 zone_free_limit(uma_zone_t zone, int count) 4295 { 4296 uint64_t old; 4297 4298 MPASS(count > 0); 4299 4300 /* 4301 * In the common case we either have no sleepers or 4302 * are still over the limit and can just return. 4303 */ 4304 old = atomic_fetchadd_64(&zone->uz_items, -count); 4305 if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 || 4306 UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items)) 4307 return; 4308 4309 /* 4310 * Moderate the rate of wakeups. Sleepers will continue 4311 * to generate wakeups if necessary. 4312 */ 4313 wakeup_one(&zone->uz_max_items); 4314 } 4315 4316 static uma_bucket_t 4317 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) 4318 { 4319 uma_bucket_t bucket; 4320 int error, maxbucket, cnt; 4321 4322 CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name, 4323 zone, domain); 4324 4325 /* Avoid allocs targeting empty domains. */ 4326 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 4327 domain = UMA_ANYDOMAIN; 4328 else if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) 4329 domain = UMA_ANYDOMAIN; 4330 4331 if (zone->uz_max_items > 0) 4332 maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size, 4333 M_NOWAIT); 4334 else 4335 maxbucket = zone->uz_bucket_size; 4336 if (maxbucket == 0) 4337 return (NULL); 4338 4339 /* Don't wait for buckets, preserve caller's NOVM setting. */ 4340 bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); 4341 if (bucket == NULL) { 4342 cnt = 0; 4343 goto out; 4344 } 4345 4346 bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, 4347 MIN(maxbucket, bucket->ub_entries), domain, flags); 4348 4349 /* 4350 * Initialize the memory if necessary. 4351 */ 4352 if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { 4353 int i; 4354 4355 for (i = 0; i < bucket->ub_cnt; i++) { 4356 kasan_mark_item_valid(zone, bucket->ub_bucket[i]); 4357 error = zone->uz_init(bucket->ub_bucket[i], 4358 zone->uz_size, flags); 4359 kasan_mark_item_invalid(zone, bucket->ub_bucket[i]); 4360 if (error != 0) 4361 break; 4362 } 4363 4364 /* 4365 * If we couldn't initialize the whole bucket, put the 4366 * rest back onto the freelist. 4367 */ 4368 if (i != bucket->ub_cnt) { 4369 zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], 4370 bucket->ub_cnt - i); 4371 #ifdef INVARIANTS 4372 bzero(&bucket->ub_bucket[i], 4373 sizeof(void *) * (bucket->ub_cnt - i)); 4374 #endif 4375 bucket->ub_cnt = i; 4376 } 4377 } 4378 4379 cnt = bucket->ub_cnt; 4380 if (bucket->ub_cnt == 0) { 4381 bucket_free(zone, bucket, udata); 4382 counter_u64_add(zone->uz_fails, 1); 4383 bucket = NULL; 4384 } 4385 out: 4386 if (zone->uz_max_items > 0 && cnt < maxbucket) 4387 zone_free_limit(zone, maxbucket - cnt); 4388 4389 return (bucket); 4390 } 4391 4392 /* 4393 * Allocates a single item from a zone. 4394 * 4395 * Arguments 4396 * zone The zone to alloc for. 4397 * udata The data to be passed to the constructor. 4398 * domain The domain to allocate from or UMA_ANYDOMAIN. 4399 * flags M_WAITOK, M_NOWAIT, M_ZERO. 4400 * 4401 * Returns 4402 * NULL if there is no memory and M_NOWAIT is set 4403 * An item if successful 4404 */ 4405 4406 static void * 4407 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) 4408 { 4409 void *item; 4410 4411 if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0) { 4412 counter_u64_add(zone->uz_fails, 1); 4413 return (NULL); 4414 } 4415 4416 /* Avoid allocs targeting empty domains. */ 4417 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 4418 domain = UMA_ANYDOMAIN; 4419 4420 if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) 4421 goto fail_cnt; 4422 4423 /* 4424 * We have to call both the zone's init (not the keg's init) 4425 * and the zone's ctor. This is because the item is going from 4426 * a keg slab directly to the user, and the user is expecting it 4427 * to be both zone-init'd as well as zone-ctor'd. 4428 */ 4429 if (zone->uz_init != NULL) { 4430 int error; 4431 4432 kasan_mark_item_valid(zone, item); 4433 error = zone->uz_init(item, zone->uz_size, flags); 4434 kasan_mark_item_invalid(zone, item); 4435 if (error != 0) { 4436 zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); 4437 goto fail_cnt; 4438 } 4439 } 4440 item = item_ctor(zone, zone->uz_flags, zone->uz_size, udata, flags, 4441 item); 4442 if (item == NULL) 4443 goto fail; 4444 4445 counter_u64_add(zone->uz_allocs, 1); 4446 CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, 4447 zone->uz_name, zone); 4448 4449 return (item); 4450 4451 fail_cnt: 4452 counter_u64_add(zone->uz_fails, 1); 4453 fail: 4454 if (zone->uz_max_items > 0) 4455 zone_free_limit(zone, 1); 4456 CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", 4457 zone->uz_name, zone); 4458 4459 return (NULL); 4460 } 4461 4462 /* See uma.h */ 4463 void 4464 uma_zfree_smr(uma_zone_t zone, void *item) 4465 { 4466 uma_cache_t cache; 4467 uma_cache_bucket_t bucket; 4468 int itemdomain; 4469 #ifdef NUMA 4470 int uz_flags; 4471 #endif 4472 4473 CTR3(KTR_UMA, "uma_zfree_smr zone %s(%p) item %p", 4474 zone->uz_name, zone, item); 4475 4476 #ifdef UMA_ZALLOC_DEBUG 4477 KASSERT((zone->uz_flags & UMA_ZONE_SMR) != 0, 4478 ("uma_zfree_smr: called with non-SMR zone.")); 4479 KASSERT(item != NULL, ("uma_zfree_smr: Called with NULL pointer.")); 4480 SMR_ASSERT_NOT_ENTERED(zone->uz_smr); 4481 if (uma_zfree_debug(zone, item, NULL) == EJUSTRETURN) 4482 return; 4483 #endif 4484 cache = &zone->uz_cpu[curcpu]; 4485 itemdomain = 0; 4486 #ifdef NUMA 4487 uz_flags = cache_uz_flags(cache); 4488 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) 4489 itemdomain = item_domain(item); 4490 #endif 4491 critical_enter(); 4492 do { 4493 cache = &zone->uz_cpu[curcpu]; 4494 /* SMR Zones must free to the free bucket. */ 4495 bucket = &cache->uc_freebucket; 4496 #ifdef NUMA 4497 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && 4498 PCPU_GET(domain) != itemdomain) { 4499 bucket = &cache->uc_crossbucket; 4500 } 4501 #endif 4502 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { 4503 cache_bucket_push(cache, bucket, item); 4504 critical_exit(); 4505 return; 4506 } 4507 } while (cache_free(zone, cache, NULL, itemdomain)); 4508 critical_exit(); 4509 4510 /* 4511 * If nothing else caught this, we'll just do an internal free. 4512 */ 4513 zone_free_item(zone, item, NULL, SKIP_NONE); 4514 } 4515 4516 /* See uma.h */ 4517 void 4518 uma_zfree_arg(uma_zone_t zone, void *item, void *udata) 4519 { 4520 uma_cache_t cache; 4521 uma_cache_bucket_t bucket; 4522 int itemdomain, uz_flags; 4523 4524 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 4525 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 4526 4527 CTR3(KTR_UMA, "uma_zfree_arg zone %s(%p) item %p", 4528 zone->uz_name, zone, item); 4529 4530 #ifdef UMA_ZALLOC_DEBUG 4531 KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, 4532 ("uma_zfree_arg: called with SMR zone.")); 4533 if (uma_zfree_debug(zone, item, udata) == EJUSTRETURN) 4534 return; 4535 #endif 4536 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 4537 if (item == NULL) 4538 return; 4539 4540 /* 4541 * We are accessing the per-cpu cache without a critical section to 4542 * fetch size and flags. This is acceptable, if we are preempted we 4543 * will simply read another cpu's line. 4544 */ 4545 cache = &zone->uz_cpu[curcpu]; 4546 uz_flags = cache_uz_flags(cache); 4547 if (UMA_ALWAYS_CTORDTOR || 4548 __predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0)) 4549 item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE); 4550 4551 /* 4552 * The race here is acceptable. If we miss it we'll just have to wait 4553 * a little longer for the limits to be reset. 4554 */ 4555 if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) { 4556 if (atomic_load_32(&zone->uz_sleepers) > 0) 4557 goto zfree_item; 4558 } 4559 4560 /* 4561 * If possible, free to the per-CPU cache. There are two 4562 * requirements for safe access to the per-CPU cache: (1) the thread 4563 * accessing the cache must not be preempted or yield during access, 4564 * and (2) the thread must not migrate CPUs without switching which 4565 * cache it accesses. We rely on a critical section to prevent 4566 * preemption and migration. We release the critical section in 4567 * order to acquire the zone mutex if we are unable to free to the 4568 * current cache; when we re-acquire the critical section, we must 4569 * detect and handle migration if it has occurred. 4570 */ 4571 itemdomain = 0; 4572 #ifdef NUMA 4573 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) 4574 itemdomain = item_domain(item); 4575 #endif 4576 critical_enter(); 4577 do { 4578 cache = &zone->uz_cpu[curcpu]; 4579 /* 4580 * Try to free into the allocbucket first to give LIFO 4581 * ordering for cache-hot datastructures. Spill over 4582 * into the freebucket if necessary. Alloc will swap 4583 * them if one runs dry. 4584 */ 4585 bucket = &cache->uc_allocbucket; 4586 #ifdef NUMA 4587 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && 4588 PCPU_GET(domain) != itemdomain) { 4589 bucket = &cache->uc_crossbucket; 4590 } else 4591 #endif 4592 if (bucket->ucb_cnt == bucket->ucb_entries && 4593 cache->uc_freebucket.ucb_cnt < 4594 cache->uc_freebucket.ucb_entries) 4595 cache_bucket_swap(&cache->uc_freebucket, 4596 &cache->uc_allocbucket); 4597 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { 4598 cache_bucket_push(cache, bucket, item); 4599 critical_exit(); 4600 return; 4601 } 4602 } while (cache_free(zone, cache, udata, itemdomain)); 4603 critical_exit(); 4604 4605 /* 4606 * If nothing else caught this, we'll just do an internal free. 4607 */ 4608 zfree_item: 4609 zone_free_item(zone, item, udata, SKIP_DTOR); 4610 } 4611 4612 #ifdef NUMA 4613 /* 4614 * sort crossdomain free buckets to domain correct buckets and cache 4615 * them. 4616 */ 4617 static void 4618 zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata) 4619 { 4620 struct uma_bucketlist emptybuckets, fullbuckets; 4621 uma_zone_domain_t zdom; 4622 uma_bucket_t b; 4623 smr_seq_t seq; 4624 void *item; 4625 int domain; 4626 4627 CTR3(KTR_UMA, 4628 "uma_zfree: zone %s(%p) draining cross bucket %p", 4629 zone->uz_name, zone, bucket); 4630 4631 /* 4632 * It is possible for buckets to arrive here out of order so we fetch 4633 * the current smr seq rather than accepting the bucket's. 4634 */ 4635 seq = SMR_SEQ_INVALID; 4636 if ((zone->uz_flags & UMA_ZONE_SMR) != 0) 4637 seq = smr_advance(zone->uz_smr); 4638 4639 /* 4640 * To avoid having ndomain * ndomain buckets for sorting we have a 4641 * lock on the current crossfree bucket. A full matrix with 4642 * per-domain locking could be used if necessary. 4643 */ 4644 STAILQ_INIT(&emptybuckets); 4645 STAILQ_INIT(&fullbuckets); 4646 ZONE_CROSS_LOCK(zone); 4647 for (; bucket->ub_cnt > 0; bucket->ub_cnt--) { 4648 item = bucket->ub_bucket[bucket->ub_cnt - 1]; 4649 domain = item_domain(item); 4650 zdom = ZDOM_GET(zone, domain); 4651 if (zdom->uzd_cross == NULL) { 4652 if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { 4653 STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); 4654 zdom->uzd_cross = b; 4655 } else { 4656 /* 4657 * Avoid allocating a bucket with the cross lock 4658 * held, since allocation can trigger a 4659 * cross-domain free and bucket zones may 4660 * allocate from each other. 4661 */ 4662 ZONE_CROSS_UNLOCK(zone); 4663 b = bucket_alloc(zone, udata, M_NOWAIT); 4664 if (b == NULL) 4665 goto out; 4666 ZONE_CROSS_LOCK(zone); 4667 if (zdom->uzd_cross != NULL) { 4668 STAILQ_INSERT_HEAD(&emptybuckets, b, 4669 ub_link); 4670 } else { 4671 zdom->uzd_cross = b; 4672 } 4673 } 4674 } 4675 b = zdom->uzd_cross; 4676 b->ub_bucket[b->ub_cnt++] = item; 4677 b->ub_seq = seq; 4678 if (b->ub_cnt == b->ub_entries) { 4679 STAILQ_INSERT_HEAD(&fullbuckets, b, ub_link); 4680 if ((b = STAILQ_FIRST(&emptybuckets)) != NULL) 4681 STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); 4682 zdom->uzd_cross = b; 4683 } 4684 } 4685 ZONE_CROSS_UNLOCK(zone); 4686 out: 4687 if (bucket->ub_cnt == 0) 4688 bucket->ub_seq = SMR_SEQ_INVALID; 4689 bucket_free(zone, bucket, udata); 4690 4691 while ((b = STAILQ_FIRST(&emptybuckets)) != NULL) { 4692 STAILQ_REMOVE_HEAD(&emptybuckets, ub_link); 4693 bucket_free(zone, b, udata); 4694 } 4695 while ((b = STAILQ_FIRST(&fullbuckets)) != NULL) { 4696 STAILQ_REMOVE_HEAD(&fullbuckets, ub_link); 4697 domain = item_domain(b->ub_bucket[0]); 4698 zone_put_bucket(zone, domain, b, udata, true); 4699 } 4700 } 4701 #endif 4702 4703 static void 4704 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, 4705 int itemdomain, bool ws) 4706 { 4707 4708 #ifdef NUMA 4709 /* 4710 * Buckets coming from the wrong domain will be entirely for the 4711 * only other domain on two domain systems. In this case we can 4712 * simply cache them. Otherwise we need to sort them back to 4713 * correct domains. 4714 */ 4715 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && 4716 vm_ndomains > 2 && PCPU_GET(domain) != itemdomain) { 4717 zone_free_cross(zone, bucket, udata); 4718 return; 4719 } 4720 #endif 4721 4722 /* 4723 * Attempt to save the bucket in the zone's domain bucket cache. 4724 */ 4725 CTR3(KTR_UMA, 4726 "uma_zfree: zone %s(%p) putting bucket %p on free list", 4727 zone->uz_name, zone, bucket); 4728 /* ub_cnt is pointing to the last free item */ 4729 if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0) 4730 itemdomain = zone_domain_lowest(zone, itemdomain); 4731 zone_put_bucket(zone, itemdomain, bucket, udata, ws); 4732 } 4733 4734 /* 4735 * Populate a free or cross bucket for the current cpu cache. Free any 4736 * existing full bucket either to the zone cache or back to the slab layer. 4737 * 4738 * Enters and returns in a critical section. false return indicates that 4739 * we can not satisfy this free in the cache layer. true indicates that 4740 * the caller should retry. 4741 */ 4742 static __noinline bool 4743 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, int itemdomain) 4744 { 4745 uma_cache_bucket_t cbucket; 4746 uma_bucket_t newbucket, bucket; 4747 4748 CRITICAL_ASSERT(curthread); 4749 4750 if (zone->uz_bucket_size == 0) 4751 return false; 4752 4753 cache = &zone->uz_cpu[curcpu]; 4754 newbucket = NULL; 4755 4756 /* 4757 * FIRSTTOUCH domains need to free to the correct zdom. When 4758 * enabled this is the zdom of the item. The bucket is the 4759 * cross bucket if the current domain and itemdomain do not match. 4760 */ 4761 cbucket = &cache->uc_freebucket; 4762 #ifdef NUMA 4763 if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { 4764 if (PCPU_GET(domain) != itemdomain) { 4765 cbucket = &cache->uc_crossbucket; 4766 if (cbucket->ucb_cnt != 0) 4767 counter_u64_add(zone->uz_xdomain, 4768 cbucket->ucb_cnt); 4769 } 4770 } 4771 #endif 4772 bucket = cache_bucket_unload(cbucket); 4773 KASSERT(bucket == NULL || bucket->ub_cnt == bucket->ub_entries, 4774 ("cache_free: Entered with non-full free bucket.")); 4775 4776 /* We are no longer associated with this CPU. */ 4777 critical_exit(); 4778 4779 /* 4780 * Don't let SMR zones operate without a free bucket. Force 4781 * a synchronize and re-use this one. We will only degrade 4782 * to a synchronize every bucket_size items rather than every 4783 * item if we fail to allocate a bucket. 4784 */ 4785 if ((zone->uz_flags & UMA_ZONE_SMR) != 0) { 4786 if (bucket != NULL) 4787 bucket->ub_seq = smr_advance(zone->uz_smr); 4788 newbucket = bucket_alloc(zone, udata, M_NOWAIT); 4789 if (newbucket == NULL && bucket != NULL) { 4790 bucket_drain(zone, bucket); 4791 newbucket = bucket; 4792 bucket = NULL; 4793 } 4794 } else if (!bucketdisable) 4795 newbucket = bucket_alloc(zone, udata, M_NOWAIT); 4796 4797 if (bucket != NULL) 4798 zone_free_bucket(zone, bucket, udata, itemdomain, true); 4799 4800 critical_enter(); 4801 if ((bucket = newbucket) == NULL) 4802 return (false); 4803 cache = &zone->uz_cpu[curcpu]; 4804 #ifdef NUMA 4805 /* 4806 * Check to see if we should be populating the cross bucket. If it 4807 * is already populated we will fall through and attempt to populate 4808 * the free bucket. 4809 */ 4810 if ((cache_uz_flags(cache) & UMA_ZONE_FIRSTTOUCH) != 0) { 4811 if (PCPU_GET(domain) != itemdomain && 4812 cache->uc_crossbucket.ucb_bucket == NULL) { 4813 cache_bucket_load_cross(cache, bucket); 4814 return (true); 4815 } 4816 } 4817 #endif 4818 /* 4819 * We may have lost the race to fill the bucket or switched CPUs. 4820 */ 4821 if (cache->uc_freebucket.ucb_bucket != NULL) { 4822 critical_exit(); 4823 bucket_free(zone, bucket, udata); 4824 critical_enter(); 4825 } else 4826 cache_bucket_load_free(cache, bucket); 4827 4828 return (true); 4829 } 4830 4831 static void 4832 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) 4833 { 4834 uma_keg_t keg; 4835 uma_domain_t dom; 4836 int freei; 4837 4838 keg = zone->uz_keg; 4839 KEG_LOCK_ASSERT(keg, slab->us_domain); 4840 4841 /* Do we need to remove from any lists? */ 4842 dom = &keg->uk_domain[slab->us_domain]; 4843 if (slab->us_freecount + 1 == keg->uk_ipers) { 4844 LIST_REMOVE(slab, us_link); 4845 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); 4846 dom->ud_free_slabs++; 4847 } else if (slab->us_freecount == 0) { 4848 LIST_REMOVE(slab, us_link); 4849 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 4850 } 4851 4852 /* Slab management. */ 4853 freei = slab_item_index(slab, keg, item); 4854 BIT_SET(keg->uk_ipers, freei, &slab->us_free); 4855 slab->us_freecount++; 4856 4857 /* Keg statistics. */ 4858 dom->ud_free_items++; 4859 } 4860 4861 static void 4862 zone_release(void *arg, void **bucket, int cnt) 4863 { 4864 struct mtx *lock; 4865 uma_zone_t zone; 4866 uma_slab_t slab; 4867 uma_keg_t keg; 4868 uint8_t *mem; 4869 void *item; 4870 int i; 4871 4872 zone = arg; 4873 keg = zone->uz_keg; 4874 lock = NULL; 4875 if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0)) 4876 lock = KEG_LOCK(keg, 0); 4877 for (i = 0; i < cnt; i++) { 4878 item = bucket[i]; 4879 if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) { 4880 slab = vtoslab((vm_offset_t)item); 4881 } else { 4882 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 4883 if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0) 4884 slab = hash_sfind(&keg->uk_hash, mem); 4885 else 4886 slab = (uma_slab_t)(mem + keg->uk_pgoff); 4887 } 4888 if (lock != KEG_LOCKPTR(keg, slab->us_domain)) { 4889 if (lock != NULL) 4890 mtx_unlock(lock); 4891 lock = KEG_LOCK(keg, slab->us_domain); 4892 } 4893 slab_free_item(zone, slab, item); 4894 } 4895 if (lock != NULL) 4896 mtx_unlock(lock); 4897 } 4898 4899 /* 4900 * Frees a single item to any zone. 4901 * 4902 * Arguments: 4903 * zone The zone to free to 4904 * item The item we're freeing 4905 * udata User supplied data for the dtor 4906 * skip Skip dtors and finis 4907 */ 4908 static __noinline void 4909 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 4910 { 4911 4912 /* 4913 * If a free is sent directly to an SMR zone we have to 4914 * synchronize immediately because the item can instantly 4915 * be reallocated. This should only happen in degenerate 4916 * cases when no memory is available for per-cpu caches. 4917 */ 4918 if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_NONE) 4919 smr_synchronize(zone->uz_smr); 4920 4921 item_dtor(zone, item, zone->uz_size, udata, skip); 4922 4923 if (skip < SKIP_FINI && zone->uz_fini) { 4924 kasan_mark_item_valid(zone, item); 4925 zone->uz_fini(item, zone->uz_size); 4926 kasan_mark_item_invalid(zone, item); 4927 } 4928 4929 zone->uz_release(zone->uz_arg, &item, 1); 4930 4931 if (skip & SKIP_CNT) 4932 return; 4933 4934 counter_u64_add(zone->uz_frees, 1); 4935 4936 if (zone->uz_max_items > 0) 4937 zone_free_limit(zone, 1); 4938 } 4939 4940 /* See uma.h */ 4941 int 4942 uma_zone_set_max(uma_zone_t zone, int nitems) 4943 { 4944 4945 /* 4946 * If the limit is small, we may need to constrain the maximum per-CPU 4947 * cache size, or disable caching entirely. 4948 */ 4949 uma_zone_set_maxcache(zone, nitems); 4950 4951 /* 4952 * XXX This can misbehave if the zone has any allocations with 4953 * no limit and a limit is imposed. There is currently no 4954 * way to clear a limit. 4955 */ 4956 ZONE_LOCK(zone); 4957 if (zone->uz_max_items == 0) 4958 ZONE_ASSERT_COLD(zone); 4959 zone->uz_max_items = nitems; 4960 zone->uz_flags |= UMA_ZFLAG_LIMIT; 4961 zone_update_caches(zone); 4962 /* We may need to wake waiters. */ 4963 wakeup(&zone->uz_max_items); 4964 ZONE_UNLOCK(zone); 4965 4966 return (nitems); 4967 } 4968 4969 /* See uma.h */ 4970 void 4971 uma_zone_set_maxcache(uma_zone_t zone, int nitems) 4972 { 4973 int bpcpu, bpdom, bsize, nb; 4974 4975 ZONE_LOCK(zone); 4976 4977 /* 4978 * Compute a lower bound on the number of items that may be cached in 4979 * the zone. Each CPU gets at least two buckets, and for cross-domain 4980 * frees we use an additional bucket per CPU and per domain. Select the 4981 * largest bucket size that does not exceed half of the requested limit, 4982 * with the left over space given to the full bucket cache. 4983 */ 4984 bpdom = 0; 4985 bpcpu = 2; 4986 #ifdef NUMA 4987 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && vm_ndomains > 1) { 4988 bpcpu++; 4989 bpdom++; 4990 } 4991 #endif 4992 nb = bpcpu * mp_ncpus + bpdom * vm_ndomains; 4993 bsize = nitems / nb / 2; 4994 if (bsize > BUCKET_MAX) 4995 bsize = BUCKET_MAX; 4996 else if (bsize == 0 && nitems / nb > 0) 4997 bsize = 1; 4998 zone->uz_bucket_size_max = zone->uz_bucket_size = bsize; 4999 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 5000 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 5001 zone->uz_bucket_max = nitems - nb * bsize; 5002 ZONE_UNLOCK(zone); 5003 } 5004 5005 /* See uma.h */ 5006 int 5007 uma_zone_get_max(uma_zone_t zone) 5008 { 5009 int nitems; 5010 5011 nitems = atomic_load_64(&zone->uz_max_items); 5012 5013 return (nitems); 5014 } 5015 5016 /* See uma.h */ 5017 void 5018 uma_zone_set_warning(uma_zone_t zone, const char *warning) 5019 { 5020 5021 ZONE_ASSERT_COLD(zone); 5022 zone->uz_warning = warning; 5023 } 5024 5025 /* See uma.h */ 5026 void 5027 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) 5028 { 5029 5030 ZONE_ASSERT_COLD(zone); 5031 TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); 5032 } 5033 5034 /* See uma.h */ 5035 int 5036 uma_zone_get_cur(uma_zone_t zone) 5037 { 5038 int64_t nitems; 5039 u_int i; 5040 5041 nitems = 0; 5042 if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER) 5043 nitems = counter_u64_fetch(zone->uz_allocs) - 5044 counter_u64_fetch(zone->uz_frees); 5045 CPU_FOREACH(i) 5046 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) - 5047 atomic_load_64(&zone->uz_cpu[i].uc_frees); 5048 5049 return (nitems < 0 ? 0 : nitems); 5050 } 5051 5052 static uint64_t 5053 uma_zone_get_allocs(uma_zone_t zone) 5054 { 5055 uint64_t nitems; 5056 u_int i; 5057 5058 nitems = 0; 5059 if (zone->uz_allocs != EARLY_COUNTER) 5060 nitems = counter_u64_fetch(zone->uz_allocs); 5061 CPU_FOREACH(i) 5062 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs); 5063 5064 return (nitems); 5065 } 5066 5067 static uint64_t 5068 uma_zone_get_frees(uma_zone_t zone) 5069 { 5070 uint64_t nitems; 5071 u_int i; 5072 5073 nitems = 0; 5074 if (zone->uz_frees != EARLY_COUNTER) 5075 nitems = counter_u64_fetch(zone->uz_frees); 5076 CPU_FOREACH(i) 5077 nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees); 5078 5079 return (nitems); 5080 } 5081 5082 #ifdef INVARIANTS 5083 /* Used only for KEG_ASSERT_COLD(). */ 5084 static uint64_t 5085 uma_keg_get_allocs(uma_keg_t keg) 5086 { 5087 uma_zone_t z; 5088 uint64_t nitems; 5089 5090 nitems = 0; 5091 LIST_FOREACH(z, &keg->uk_zones, uz_link) 5092 nitems += uma_zone_get_allocs(z); 5093 5094 return (nitems); 5095 } 5096 #endif 5097 5098 /* See uma.h */ 5099 void 5100 uma_zone_set_init(uma_zone_t zone, uma_init uminit) 5101 { 5102 uma_keg_t keg; 5103 5104 KEG_GET(zone, keg); 5105 KEG_ASSERT_COLD(keg); 5106 keg->uk_init = uminit; 5107 } 5108 5109 /* See uma.h */ 5110 void 5111 uma_zone_set_fini(uma_zone_t zone, uma_fini fini) 5112 { 5113 uma_keg_t keg; 5114 5115 KEG_GET(zone, keg); 5116 KEG_ASSERT_COLD(keg); 5117 keg->uk_fini = fini; 5118 } 5119 5120 /* See uma.h */ 5121 void 5122 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) 5123 { 5124 5125 ZONE_ASSERT_COLD(zone); 5126 zone->uz_init = zinit; 5127 } 5128 5129 /* See uma.h */ 5130 void 5131 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) 5132 { 5133 5134 ZONE_ASSERT_COLD(zone); 5135 zone->uz_fini = zfini; 5136 } 5137 5138 /* See uma.h */ 5139 void 5140 uma_zone_set_freef(uma_zone_t zone, uma_free freef) 5141 { 5142 uma_keg_t keg; 5143 5144 KEG_GET(zone, keg); 5145 KEG_ASSERT_COLD(keg); 5146 keg->uk_freef = freef; 5147 } 5148 5149 /* See uma.h */ 5150 void 5151 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) 5152 { 5153 uma_keg_t keg; 5154 5155 KEG_GET(zone, keg); 5156 KEG_ASSERT_COLD(keg); 5157 keg->uk_allocf = allocf; 5158 } 5159 5160 /* See uma.h */ 5161 void 5162 uma_zone_set_smr(uma_zone_t zone, smr_t smr) 5163 { 5164 5165 ZONE_ASSERT_COLD(zone); 5166 5167 KASSERT(smr != NULL, ("Got NULL smr")); 5168 KASSERT((zone->uz_flags & UMA_ZONE_SMR) == 0, 5169 ("zone %p (%s) already uses SMR", zone, zone->uz_name)); 5170 zone->uz_flags |= UMA_ZONE_SMR; 5171 zone->uz_smr = smr; 5172 zone_update_caches(zone); 5173 } 5174 5175 smr_t 5176 uma_zone_get_smr(uma_zone_t zone) 5177 { 5178 5179 return (zone->uz_smr); 5180 } 5181 5182 /* See uma.h */ 5183 void 5184 uma_zone_reserve(uma_zone_t zone, int items) 5185 { 5186 uma_keg_t keg; 5187 5188 KEG_GET(zone, keg); 5189 KEG_ASSERT_COLD(keg); 5190 keg->uk_reserve = items; 5191 } 5192 5193 /* See uma.h */ 5194 int 5195 uma_zone_reserve_kva(uma_zone_t zone, int count) 5196 { 5197 uma_keg_t keg; 5198 vm_offset_t kva; 5199 u_int pages; 5200 5201 KEG_GET(zone, keg); 5202 KEG_ASSERT_COLD(keg); 5203 ZONE_ASSERT_COLD(zone); 5204 5205 pages = howmany(count, keg->uk_ipers) * keg->uk_ppera; 5206 5207 #ifdef UMA_USE_DMAP 5208 if (keg->uk_ppera > 1) { 5209 #else 5210 if (1) { 5211 #endif 5212 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); 5213 if (kva == 0) 5214 return (0); 5215 } else 5216 kva = 0; 5217 5218 MPASS(keg->uk_kva == 0); 5219 keg->uk_kva = kva; 5220 keg->uk_offset = 0; 5221 zone->uz_max_items = pages * keg->uk_ipers; 5222 #ifdef UMA_USE_DMAP 5223 keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc; 5224 #else 5225 keg->uk_allocf = noobj_alloc; 5226 #endif 5227 keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; 5228 zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; 5229 zone_update_caches(zone); 5230 5231 return (1); 5232 } 5233 5234 /* See uma.h */ 5235 void 5236 uma_prealloc(uma_zone_t zone, int items) 5237 { 5238 struct vm_domainset_iter di; 5239 uma_domain_t dom; 5240 uma_slab_t slab; 5241 uma_keg_t keg; 5242 int aflags, domain, slabs; 5243 5244 KEG_GET(zone, keg); 5245 slabs = howmany(items, keg->uk_ipers); 5246 while (slabs-- > 0) { 5247 aflags = M_NOWAIT; 5248 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 5249 &aflags); 5250 for (;;) { 5251 slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, 5252 aflags); 5253 if (slab != NULL) { 5254 dom = &keg->uk_domain[slab->us_domain]; 5255 /* 5256 * keg_alloc_slab() always returns a slab on the 5257 * partial list. 5258 */ 5259 LIST_REMOVE(slab, us_link); 5260 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, 5261 us_link); 5262 dom->ud_free_slabs++; 5263 KEG_UNLOCK(keg, slab->us_domain); 5264 break; 5265 } 5266 if (vm_domainset_iter_policy(&di, &domain) != 0) 5267 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0); 5268 } 5269 } 5270 } 5271 5272 /* 5273 * Returns a snapshot of memory consumption in bytes. 5274 */ 5275 size_t 5276 uma_zone_memory(uma_zone_t zone) 5277 { 5278 size_t sz; 5279 int i; 5280 5281 sz = 0; 5282 if (zone->uz_flags & UMA_ZFLAG_CACHE) { 5283 for (i = 0; i < vm_ndomains; i++) 5284 sz += ZDOM_GET(zone, i)->uzd_nitems; 5285 return (sz * zone->uz_size); 5286 } 5287 for (i = 0; i < vm_ndomains; i++) 5288 sz += zone->uz_keg->uk_domain[i].ud_pages; 5289 5290 return (sz * PAGE_SIZE); 5291 } 5292 5293 struct uma_reclaim_args { 5294 int domain; 5295 int req; 5296 }; 5297 5298 static void 5299 uma_reclaim_domain_cb(uma_zone_t zone, void *arg) 5300 { 5301 struct uma_reclaim_args *args; 5302 5303 args = arg; 5304 if ((zone->uz_flags & UMA_ZONE_UNMANAGED) != 0) 5305 return; 5306 if ((args->req == UMA_RECLAIM_TRIM) && 5307 (zone->uz_flags & UMA_ZONE_NOTRIM) !=0) 5308 return; 5309 5310 uma_zone_reclaim_domain(zone, args->req, args->domain); 5311 } 5312 5313 /* See uma.h */ 5314 void 5315 uma_reclaim(int req) 5316 { 5317 uma_reclaim_domain(req, UMA_ANYDOMAIN); 5318 } 5319 5320 void 5321 uma_reclaim_domain(int req, int domain) 5322 { 5323 struct uma_reclaim_args args; 5324 5325 bucket_enable(); 5326 5327 args.domain = domain; 5328 args.req = req; 5329 5330 sx_slock(&uma_reclaim_lock); 5331 switch (req) { 5332 case UMA_RECLAIM_TRIM: 5333 case UMA_RECLAIM_DRAIN: 5334 zone_foreach(uma_reclaim_domain_cb, &args); 5335 break; 5336 case UMA_RECLAIM_DRAIN_CPU: 5337 /* 5338 * Reclaim globally visible free items from all zones, then drain 5339 * per-CPU buckets, then reclaim items freed while draining. 5340 * This approach minimizes expensive context switching needed to 5341 * drain each zone's per-CPU buckets. 5342 */ 5343 args.req = UMA_RECLAIM_DRAIN; 5344 zone_foreach(uma_reclaim_domain_cb, &args); 5345 pcpu_cache_drain_safe(NULL); 5346 zone_foreach(uma_reclaim_domain_cb, &args); 5347 break; 5348 default: 5349 panic("unhandled reclamation request %d", req); 5350 } 5351 5352 /* 5353 * Some slabs may have been freed but this zone will be visited early 5354 * we visit again so that we can free pages that are empty once other 5355 * zones are drained. We have to do the same for buckets. 5356 */ 5357 uma_zone_reclaim_domain(slabzones[0], UMA_RECLAIM_DRAIN, domain); 5358 uma_zone_reclaim_domain(slabzones[1], UMA_RECLAIM_DRAIN, domain); 5359 bucket_zone_drain(domain); 5360 sx_sunlock(&uma_reclaim_lock); 5361 } 5362 5363 static volatile int uma_reclaim_needed; 5364 5365 void 5366 uma_reclaim_wakeup(void) 5367 { 5368 5369 if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) 5370 wakeup(uma_reclaim); 5371 } 5372 5373 void 5374 uma_reclaim_worker(void *arg __unused) 5375 { 5376 5377 for (;;) { 5378 sx_xlock(&uma_reclaim_lock); 5379 while (atomic_load_int(&uma_reclaim_needed) == 0) 5380 sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", 5381 hz); 5382 sx_xunlock(&uma_reclaim_lock); 5383 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); 5384 uma_reclaim(UMA_RECLAIM_DRAIN_CPU); 5385 atomic_store_int(&uma_reclaim_needed, 0); 5386 /* Don't fire more than once per-second. */ 5387 pause("umarclslp", hz); 5388 } 5389 } 5390 5391 /* See uma.h */ 5392 void 5393 uma_zone_reclaim(uma_zone_t zone, int req) 5394 { 5395 uma_zone_reclaim_domain(zone, req, UMA_ANYDOMAIN); 5396 } 5397 5398 void 5399 uma_zone_reclaim_domain(uma_zone_t zone, int req, int domain) 5400 { 5401 switch (req) { 5402 case UMA_RECLAIM_TRIM: 5403 zone_reclaim(zone, domain, M_NOWAIT, false); 5404 break; 5405 case UMA_RECLAIM_DRAIN: 5406 zone_reclaim(zone, domain, M_NOWAIT, true); 5407 break; 5408 case UMA_RECLAIM_DRAIN_CPU: 5409 pcpu_cache_drain_safe(zone); 5410 zone_reclaim(zone, domain, M_NOWAIT, true); 5411 break; 5412 default: 5413 panic("unhandled reclamation request %d", req); 5414 } 5415 } 5416 5417 /* See uma.h */ 5418 int 5419 uma_zone_exhausted(uma_zone_t zone) 5420 { 5421 5422 return (atomic_load_32(&zone->uz_sleepers) > 0); 5423 } 5424 5425 unsigned long 5426 uma_limit(void) 5427 { 5428 5429 return (uma_kmem_limit); 5430 } 5431 5432 void 5433 uma_set_limit(unsigned long limit) 5434 { 5435 5436 uma_kmem_limit = limit; 5437 } 5438 5439 unsigned long 5440 uma_size(void) 5441 { 5442 5443 return (atomic_load_long(&uma_kmem_total)); 5444 } 5445 5446 long 5447 uma_avail(void) 5448 { 5449 5450 return (uma_kmem_limit - uma_size()); 5451 } 5452 5453 #ifdef DDB 5454 /* 5455 * Generate statistics across both the zone and its per-cpu cache's. Return 5456 * desired statistics if the pointer is non-NULL for that statistic. 5457 * 5458 * Note: does not update the zone statistics, as it can't safely clear the 5459 * per-CPU cache statistic. 5460 * 5461 */ 5462 static void 5463 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, 5464 uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) 5465 { 5466 uma_cache_t cache; 5467 uint64_t allocs, frees, sleeps, xdomain; 5468 int cachefree, cpu; 5469 5470 allocs = frees = sleeps = xdomain = 0; 5471 cachefree = 0; 5472 CPU_FOREACH(cpu) { 5473 cache = &z->uz_cpu[cpu]; 5474 cachefree += cache->uc_allocbucket.ucb_cnt; 5475 cachefree += cache->uc_freebucket.ucb_cnt; 5476 xdomain += cache->uc_crossbucket.ucb_cnt; 5477 cachefree += cache->uc_crossbucket.ucb_cnt; 5478 allocs += cache->uc_allocs; 5479 frees += cache->uc_frees; 5480 } 5481 allocs += counter_u64_fetch(z->uz_allocs); 5482 frees += counter_u64_fetch(z->uz_frees); 5483 xdomain += counter_u64_fetch(z->uz_xdomain); 5484 sleeps += z->uz_sleeps; 5485 if (cachefreep != NULL) 5486 *cachefreep = cachefree; 5487 if (allocsp != NULL) 5488 *allocsp = allocs; 5489 if (freesp != NULL) 5490 *freesp = frees; 5491 if (sleepsp != NULL) 5492 *sleepsp = sleeps; 5493 if (xdomainp != NULL) 5494 *xdomainp = xdomain; 5495 } 5496 #endif /* DDB */ 5497 5498 static int 5499 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) 5500 { 5501 uma_keg_t kz; 5502 uma_zone_t z; 5503 int count; 5504 5505 count = 0; 5506 rw_rlock(&uma_rwlock); 5507 LIST_FOREACH(kz, &uma_kegs, uk_link) { 5508 LIST_FOREACH(z, &kz->uk_zones, uz_link) 5509 count++; 5510 } 5511 LIST_FOREACH(z, &uma_cachezones, uz_link) 5512 count++; 5513 5514 rw_runlock(&uma_rwlock); 5515 return (sysctl_handle_int(oidp, &count, 0, req)); 5516 } 5517 5518 static void 5519 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, 5520 struct uma_percpu_stat *ups, bool internal) 5521 { 5522 uma_zone_domain_t zdom; 5523 uma_cache_t cache; 5524 int i; 5525 5526 for (i = 0; i < vm_ndomains; i++) { 5527 zdom = ZDOM_GET(z, i); 5528 uth->uth_zone_free += zdom->uzd_nitems; 5529 } 5530 uth->uth_allocs = counter_u64_fetch(z->uz_allocs); 5531 uth->uth_frees = counter_u64_fetch(z->uz_frees); 5532 uth->uth_fails = counter_u64_fetch(z->uz_fails); 5533 uth->uth_xdomain = counter_u64_fetch(z->uz_xdomain); 5534 uth->uth_sleeps = z->uz_sleeps; 5535 5536 for (i = 0; i < mp_maxid + 1; i++) { 5537 bzero(&ups[i], sizeof(*ups)); 5538 if (internal || CPU_ABSENT(i)) 5539 continue; 5540 cache = &z->uz_cpu[i]; 5541 ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt; 5542 ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt; 5543 ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt; 5544 ups[i].ups_allocs = cache->uc_allocs; 5545 ups[i].ups_frees = cache->uc_frees; 5546 } 5547 } 5548 5549 static int 5550 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) 5551 { 5552 struct uma_stream_header ush; 5553 struct uma_type_header uth; 5554 struct uma_percpu_stat *ups; 5555 struct sbuf sbuf; 5556 uma_keg_t kz; 5557 uma_zone_t z; 5558 uint64_t items; 5559 uint32_t kfree, pages; 5560 int count, error, i; 5561 5562 error = sysctl_wire_old_buffer(req, 0); 5563 if (error != 0) 5564 return (error); 5565 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 5566 sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); 5567 ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); 5568 5569 count = 0; 5570 rw_rlock(&uma_rwlock); 5571 LIST_FOREACH(kz, &uma_kegs, uk_link) { 5572 LIST_FOREACH(z, &kz->uk_zones, uz_link) 5573 count++; 5574 } 5575 5576 LIST_FOREACH(z, &uma_cachezones, uz_link) 5577 count++; 5578 5579 /* 5580 * Insert stream header. 5581 */ 5582 bzero(&ush, sizeof(ush)); 5583 ush.ush_version = UMA_STREAM_VERSION; 5584 ush.ush_maxcpus = (mp_maxid + 1); 5585 ush.ush_count = count; 5586 (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); 5587 5588 LIST_FOREACH(kz, &uma_kegs, uk_link) { 5589 kfree = pages = 0; 5590 for (i = 0; i < vm_ndomains; i++) { 5591 kfree += kz->uk_domain[i].ud_free_items; 5592 pages += kz->uk_domain[i].ud_pages; 5593 } 5594 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 5595 bzero(&uth, sizeof(uth)); 5596 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 5597 uth.uth_align = kz->uk_align; 5598 uth.uth_size = kz->uk_size; 5599 uth.uth_rsize = kz->uk_rsize; 5600 if (z->uz_max_items > 0) { 5601 items = UZ_ITEMS_COUNT(z->uz_items); 5602 uth.uth_pages = (items / kz->uk_ipers) * 5603 kz->uk_ppera; 5604 } else 5605 uth.uth_pages = pages; 5606 uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * 5607 kz->uk_ppera; 5608 uth.uth_limit = z->uz_max_items; 5609 uth.uth_keg_free = kfree; 5610 5611 /* 5612 * A zone is secondary is it is not the first entry 5613 * on the keg's zone list. 5614 */ 5615 if ((z->uz_flags & UMA_ZONE_SECONDARY) && 5616 (LIST_FIRST(&kz->uk_zones) != z)) 5617 uth.uth_zone_flags = UTH_ZONE_SECONDARY; 5618 uma_vm_zone_stats(&uth, z, &sbuf, ups, 5619 kz->uk_flags & UMA_ZFLAG_INTERNAL); 5620 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); 5621 for (i = 0; i < mp_maxid + 1; i++) 5622 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); 5623 } 5624 } 5625 LIST_FOREACH(z, &uma_cachezones, uz_link) { 5626 bzero(&uth, sizeof(uth)); 5627 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 5628 uth.uth_size = z->uz_size; 5629 uma_vm_zone_stats(&uth, z, &sbuf, ups, false); 5630 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); 5631 for (i = 0; i < mp_maxid + 1; i++) 5632 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); 5633 } 5634 5635 rw_runlock(&uma_rwlock); 5636 error = sbuf_finish(&sbuf); 5637 sbuf_delete(&sbuf); 5638 free(ups, M_TEMP); 5639 return (error); 5640 } 5641 5642 int 5643 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) 5644 { 5645 uma_zone_t zone = *(uma_zone_t *)arg1; 5646 int error, max; 5647 5648 max = uma_zone_get_max(zone); 5649 error = sysctl_handle_int(oidp, &max, 0, req); 5650 if (error || !req->newptr) 5651 return (error); 5652 5653 uma_zone_set_max(zone, max); 5654 5655 return (0); 5656 } 5657 5658 int 5659 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) 5660 { 5661 uma_zone_t zone; 5662 int cur; 5663 5664 /* 5665 * Some callers want to add sysctls for global zones that 5666 * may not yet exist so they pass a pointer to a pointer. 5667 */ 5668 if (arg2 == 0) 5669 zone = *(uma_zone_t *)arg1; 5670 else 5671 zone = arg1; 5672 cur = uma_zone_get_cur(zone); 5673 return (sysctl_handle_int(oidp, &cur, 0, req)); 5674 } 5675 5676 static int 5677 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) 5678 { 5679 uma_zone_t zone = arg1; 5680 uint64_t cur; 5681 5682 cur = uma_zone_get_allocs(zone); 5683 return (sysctl_handle_64(oidp, &cur, 0, req)); 5684 } 5685 5686 static int 5687 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) 5688 { 5689 uma_zone_t zone = arg1; 5690 uint64_t cur; 5691 5692 cur = uma_zone_get_frees(zone); 5693 return (sysctl_handle_64(oidp, &cur, 0, req)); 5694 } 5695 5696 static int 5697 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) 5698 { 5699 struct sbuf sbuf; 5700 uma_zone_t zone = arg1; 5701 int error; 5702 5703 sbuf_new_for_sysctl(&sbuf, NULL, 0, req); 5704 if (zone->uz_flags != 0) 5705 sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); 5706 else 5707 sbuf_printf(&sbuf, "0"); 5708 error = sbuf_finish(&sbuf); 5709 sbuf_delete(&sbuf); 5710 5711 return (error); 5712 } 5713 5714 static int 5715 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) 5716 { 5717 uma_keg_t keg = arg1; 5718 int avail, effpct, total; 5719 5720 total = keg->uk_ppera * PAGE_SIZE; 5721 if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) 5722 total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize; 5723 /* 5724 * We consider the client's requested size and alignment here, not the 5725 * real size determination uk_rsize, because we also adjust the real 5726 * size for internal implementation reasons (max bitset size). 5727 */ 5728 avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); 5729 if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) 5730 avail *= mp_maxid + 1; 5731 effpct = 100 * avail / total; 5732 return (sysctl_handle_int(oidp, &effpct, 0, req)); 5733 } 5734 5735 static int 5736 sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS) 5737 { 5738 uma_zone_t zone = arg1; 5739 uint64_t cur; 5740 5741 cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items)); 5742 return (sysctl_handle_64(oidp, &cur, 0, req)); 5743 } 5744 5745 #ifdef INVARIANTS 5746 static uma_slab_t 5747 uma_dbg_getslab(uma_zone_t zone, void *item) 5748 { 5749 uma_slab_t slab; 5750 uma_keg_t keg; 5751 uint8_t *mem; 5752 5753 /* 5754 * It is safe to return the slab here even though the 5755 * zone is unlocked because the item's allocation state 5756 * essentially holds a reference. 5757 */ 5758 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 5759 if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) 5760 return (NULL); 5761 if (zone->uz_flags & UMA_ZFLAG_VTOSLAB) 5762 return (vtoslab((vm_offset_t)mem)); 5763 keg = zone->uz_keg; 5764 if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0) 5765 return ((uma_slab_t)(mem + keg->uk_pgoff)); 5766 KEG_LOCK(keg, 0); 5767 slab = hash_sfind(&keg->uk_hash, mem); 5768 KEG_UNLOCK(keg, 0); 5769 5770 return (slab); 5771 } 5772 5773 static bool 5774 uma_dbg_zskip(uma_zone_t zone, void *mem) 5775 { 5776 5777 if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) 5778 return (true); 5779 5780 return (uma_dbg_kskip(zone->uz_keg, mem)); 5781 } 5782 5783 static bool 5784 uma_dbg_kskip(uma_keg_t keg, void *mem) 5785 { 5786 uintptr_t idx; 5787 5788 if (dbg_divisor == 0) 5789 return (true); 5790 5791 if (dbg_divisor == 1) 5792 return (false); 5793 5794 idx = (uintptr_t)mem >> PAGE_SHIFT; 5795 if (keg->uk_ipers > 1) { 5796 idx *= keg->uk_ipers; 5797 idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; 5798 } 5799 5800 if ((idx / dbg_divisor) * dbg_divisor != idx) { 5801 counter_u64_add(uma_skip_cnt, 1); 5802 return (true); 5803 } 5804 counter_u64_add(uma_dbg_cnt, 1); 5805 5806 return (false); 5807 } 5808 5809 /* 5810 * Set up the slab's freei data such that uma_dbg_free can function. 5811 * 5812 */ 5813 static void 5814 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) 5815 { 5816 uma_keg_t keg; 5817 int freei; 5818 5819 if (slab == NULL) { 5820 slab = uma_dbg_getslab(zone, item); 5821 if (slab == NULL) 5822 panic("uma: item %p did not belong to zone %s", 5823 item, zone->uz_name); 5824 } 5825 keg = zone->uz_keg; 5826 freei = slab_item_index(slab, keg, item); 5827 5828 if (BIT_TEST_SET_ATOMIC(keg->uk_ipers, freei, 5829 slab_dbg_bits(slab, keg))) 5830 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)", 5831 item, zone, zone->uz_name, slab, freei); 5832 } 5833 5834 /* 5835 * Verifies freed addresses. Checks for alignment, valid slab membership 5836 * and duplicate frees. 5837 * 5838 */ 5839 static void 5840 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) 5841 { 5842 uma_keg_t keg; 5843 int freei; 5844 5845 if (slab == NULL) { 5846 slab = uma_dbg_getslab(zone, item); 5847 if (slab == NULL) 5848 panic("uma: Freed item %p did not belong to zone %s", 5849 item, zone->uz_name); 5850 } 5851 keg = zone->uz_keg; 5852 freei = slab_item_index(slab, keg, item); 5853 5854 if (freei >= keg->uk_ipers) 5855 panic("Invalid free of %p from zone %p(%s) slab %p(%d)", 5856 item, zone, zone->uz_name, slab, freei); 5857 5858 if (slab_item(slab, keg, freei) != item) 5859 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)", 5860 item, zone, zone->uz_name, slab, freei); 5861 5862 if (!BIT_TEST_CLR_ATOMIC(keg->uk_ipers, freei, 5863 slab_dbg_bits(slab, keg))) 5864 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)", 5865 item, zone, zone->uz_name, slab, freei); 5866 } 5867 #endif /* INVARIANTS */ 5868 5869 #ifdef DDB 5870 static int64_t 5871 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, 5872 uint64_t *sleeps, long *cachefree, uint64_t *xdomain) 5873 { 5874 uint64_t frees; 5875 int i; 5876 5877 if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { 5878 *allocs = counter_u64_fetch(z->uz_allocs); 5879 frees = counter_u64_fetch(z->uz_frees); 5880 *sleeps = z->uz_sleeps; 5881 *cachefree = 0; 5882 *xdomain = 0; 5883 } else 5884 uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, 5885 xdomain); 5886 for (i = 0; i < vm_ndomains; i++) { 5887 *cachefree += ZDOM_GET(z, i)->uzd_nitems; 5888 if (!((z->uz_flags & UMA_ZONE_SECONDARY) && 5889 (LIST_FIRST(&kz->uk_zones) != z))) 5890 *cachefree += kz->uk_domain[i].ud_free_items; 5891 } 5892 *used = *allocs - frees; 5893 return (((int64_t)*used + *cachefree) * kz->uk_size); 5894 } 5895 5896 DB_SHOW_COMMAND_FLAGS(uma, db_show_uma, DB_CMD_MEMSAFE) 5897 { 5898 const char *fmt_hdr, *fmt_entry; 5899 uma_keg_t kz; 5900 uma_zone_t z; 5901 uint64_t allocs, used, sleeps, xdomain; 5902 long cachefree; 5903 /* variables for sorting */ 5904 uma_keg_t cur_keg; 5905 uma_zone_t cur_zone, last_zone; 5906 int64_t cur_size, last_size, size; 5907 int ties; 5908 5909 /* /i option produces machine-parseable CSV output */ 5910 if (modif[0] == 'i') { 5911 fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; 5912 fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; 5913 } else { 5914 fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; 5915 fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; 5916 } 5917 5918 db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", 5919 "Sleeps", "Bucket", "Total Mem", "XFree"); 5920 5921 /* Sort the zones with largest size first. */ 5922 last_zone = NULL; 5923 last_size = INT64_MAX; 5924 for (;;) { 5925 cur_zone = NULL; 5926 cur_size = -1; 5927 ties = 0; 5928 LIST_FOREACH(kz, &uma_kegs, uk_link) { 5929 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 5930 /* 5931 * In the case of size ties, print out zones 5932 * in the order they are encountered. That is, 5933 * when we encounter the most recently output 5934 * zone, we have already printed all preceding 5935 * ties, and we must print all following ties. 5936 */ 5937 if (z == last_zone) { 5938 ties = 1; 5939 continue; 5940 } 5941 size = get_uma_stats(kz, z, &allocs, &used, 5942 &sleeps, &cachefree, &xdomain); 5943 if (size > cur_size && size < last_size + ties) 5944 { 5945 cur_size = size; 5946 cur_zone = z; 5947 cur_keg = kz; 5948 } 5949 } 5950 } 5951 if (cur_zone == NULL) 5952 break; 5953 5954 size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, 5955 &sleeps, &cachefree, &xdomain); 5956 db_printf(fmt_entry, cur_zone->uz_name, 5957 (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, 5958 (uintmax_t)allocs, (uintmax_t)sleeps, 5959 (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, 5960 xdomain); 5961 5962 if (db_pager_quit) 5963 return; 5964 last_zone = cur_zone; 5965 last_size = cur_size; 5966 } 5967 } 5968 5969 DB_SHOW_COMMAND_FLAGS(umacache, db_show_umacache, DB_CMD_MEMSAFE) 5970 { 5971 uma_zone_t z; 5972 uint64_t allocs, frees; 5973 long cachefree; 5974 int i; 5975 5976 db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", 5977 "Requests", "Bucket"); 5978 LIST_FOREACH(z, &uma_cachezones, uz_link) { 5979 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); 5980 for (i = 0; i < vm_ndomains; i++) 5981 cachefree += ZDOM_GET(z, i)->uzd_nitems; 5982 db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", 5983 z->uz_name, (uintmax_t)z->uz_size, 5984 (intmax_t)(allocs - frees), cachefree, 5985 (uintmax_t)allocs, z->uz_bucket_size); 5986 if (db_pager_quit) 5987 return; 5988 } 5989 } 5990 #endif /* DDB */ 5991