/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uma_core.c  Implementation of the Universal Memory allocator
 *
 * This allocator is intended to replace the multitude of similar object caches
 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
 * efficient.  A primary design goal is to return unused memory to the rest of
 * the system.  This will make the system as a whole more flexible due to the
 * ability to move memory to subsystems which most need it instead of leaving
 * pools of reserved memory unused.
 *
 * The basic ideas stem from similar slab/zone based allocators whose algorithms
 * are well known.
 *
 */

/*
 * TODO:
 *	- Improve memory usage for large allocations
 *	- Investigate cache size adjustments
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_param.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/domainset.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/limits.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_domainset.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>

#include <ddb/ddb.h>

#ifdef DEBUG_MEMGUARD
#include <vm/memguard.h>
#endif

/*
 * This is the zone and keg from which all zones are spawned.
 */
static uma_zone_t kegs;
static uma_zone_t zones;

/*
 * These are the two zones from which all offpage uma_slab_ts are allocated.
 *
 * One zone is for slab headers that can represent a larger number of items,
 * making the slabs themselves more efficient, and the other zone is for
 * headers that are smaller and represent fewer items, making the headers more
 * efficient.
 */
#define	SLABZONE_SIZE(setsize)						\
    (sizeof(struct uma_hash_slab) + BITSET_SIZE(setsize) * SLAB_BITSETS)
#define	SLABZONE0_SETSIZE	(PAGE_SIZE / 16)
#define	SLABZONE1_SETSIZE	SLAB_MAX_SETSIZE
#define	SLABZONE0_SIZE	SLABZONE_SIZE(SLABZONE0_SETSIZE)
#define	SLABZONE1_SIZE	SLABZONE_SIZE(SLABZONE1_SETSIZE)
static uma_zone_t slabzones[2];

/*
 * The initial hash tables come out of this zone so they can be allocated
 * prior to malloc coming up.
 */
static uma_zone_t hashzone;

/* The boot-time adjusted value for cache line alignment. */
int uma_align_cache = 64 - 1;

static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");

/*
 * Are we allowed to allocate buckets?
 */
static int bucketdisable = 1;

/* Linked list of all kegs in the system */
static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);

/* Linked list of all cache-only zones in the system */
static LIST_HEAD(,uma_zone) uma_cachezones =
    LIST_HEAD_INITIALIZER(uma_cachezones);

/* This RW lock protects the keg list */
static struct rwlock_padalign __exclusive_cache_line uma_rwlock;

/*
 * Pointer and counter for the pool of pages that is preallocated at
 * startup to bootstrap UMA.
 */
static char *bootmem;
static int boot_pages;

static struct sx uma_reclaim_lock;

/*
 * kmem soft limit, initialized by uma_set_limit().  Ensure that early
 * allocations don't trigger a wakeup of the reclaim thread.
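 * The limit therefore starts out at LONG_MAX just below, so uma_kmem_total
 * cannot reach it until uma_set_limit() installs a real value.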
165 */ 166 unsigned long uma_kmem_limit = LONG_MAX; 167 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, 168 "UMA kernel memory soft limit"); 169 unsigned long uma_kmem_total; 170 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, 171 "UMA kernel memory usage"); 172 173 /* Is the VM done starting up? */ 174 static enum { 175 BOOT_COLD, 176 BOOT_STRAPPED, 177 BOOT_PAGEALLOC, 178 BOOT_BUCKETS, 179 BOOT_RUNNING, 180 BOOT_SHUTDOWN, 181 } booted = BOOT_COLD; 182 183 /* 184 * This is the handle used to schedule events that need to happen 185 * outside of the allocation fast path. 186 */ 187 static struct callout uma_callout; 188 #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ 189 190 /* 191 * This structure is passed as the zone ctor arg so that I don't have to create 192 * a special allocation function just for zones. 193 */ 194 struct uma_zctor_args { 195 const char *name; 196 size_t size; 197 uma_ctor ctor; 198 uma_dtor dtor; 199 uma_init uminit; 200 uma_fini fini; 201 uma_import import; 202 uma_release release; 203 void *arg; 204 uma_keg_t keg; 205 int align; 206 uint32_t flags; 207 }; 208 209 struct uma_kctor_args { 210 uma_zone_t zone; 211 size_t size; 212 uma_init uminit; 213 uma_fini fini; 214 int align; 215 uint32_t flags; 216 }; 217 218 struct uma_bucket_zone { 219 uma_zone_t ubz_zone; 220 char *ubz_name; 221 int ubz_entries; /* Number of items it can hold. */ 222 int ubz_maxsize; /* Maximum allocation size per-item. */ 223 }; 224 225 /* 226 * Compute the actual number of bucket entries to pack them in power 227 * of two sizes for more efficient space utilization. 228 */ 229 #define BUCKET_SIZE(n) \ 230 (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) 231 232 #define BUCKET_MAX BUCKET_SIZE(256) 233 #define BUCKET_MIN BUCKET_SIZE(4) 234 235 struct uma_bucket_zone bucket_zones[] = { 236 { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 }, 237 { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 }, 238 { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 }, 239 { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 }, 240 { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 }, 241 { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, 242 { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, 243 { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, 244 { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, 245 { NULL, NULL, 0} 246 }; 247 248 /* 249 * Flags and enumerations to be passed to internal functions. 250 */ 251 enum zfreeskip { 252 SKIP_NONE = 0, 253 SKIP_CNT = 0x00000001, 254 SKIP_DTOR = 0x00010000, 255 SKIP_FINI = 0x00020000, 256 }; 257 258 /* Prototypes.. 
*/ 259 260 int uma_startup_count(int); 261 void uma_startup(void *, int); 262 void uma_startup1(void); 263 void uma_startup2(void); 264 265 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 266 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 267 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 268 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 269 static void page_free(void *, vm_size_t, uint8_t); 270 static void pcpu_page_free(void *, vm_size_t, uint8_t); 271 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); 272 static void cache_drain(uma_zone_t); 273 static void bucket_drain(uma_zone_t, uma_bucket_t); 274 static void bucket_cache_reclaim(uma_zone_t zone, bool); 275 static int keg_ctor(void *, int, void *, int); 276 static void keg_dtor(void *, int, void *); 277 static int zone_ctor(void *, int, void *, int); 278 static void zone_dtor(void *, int, void *); 279 static int zero_init(void *, int, int); 280 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); 281 static void zone_timeout(uma_zone_t zone, void *); 282 static int hash_alloc(struct uma_hash *, u_int); 283 static int hash_expand(struct uma_hash *, struct uma_hash *); 284 static void hash_free(struct uma_hash *hash); 285 static void uma_timeout(void *); 286 static void uma_startup3(void); 287 static void uma_shutdown(void); 288 static void *zone_alloc_item(uma_zone_t, void *, int, int); 289 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); 290 static int zone_alloc_limit(uma_zone_t zone, int count, int flags); 291 static void zone_free_limit(uma_zone_t zone, int count); 292 static void bucket_enable(void); 293 static void bucket_init(void); 294 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); 295 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); 296 static void bucket_zone_drain(void); 297 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); 298 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); 299 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); 300 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, 301 uma_fini fini, int align, uint32_t flags); 302 static int zone_import(void *, void **, int, int, int); 303 static void zone_release(void *, void **, int); 304 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); 305 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int); 306 307 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); 308 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); 309 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); 310 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); 311 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS); 312 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS); 313 static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS); 314 315 static uint64_t uma_zone_get_allocs(uma_zone_t zone); 316 317 #ifdef INVARIANTS 318 static uint64_t uma_keg_get_allocs(uma_keg_t zone); 319 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg); 320 321 static bool uma_dbg_kskip(uma_keg_t keg, void *mem); 322 static bool uma_dbg_zskip(uma_zone_t zone, void *mem); 323 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); 324 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); 325 326 static SYSCTL_NODE(_vm, OID_AUTO, 
    debug, CTLFLAG_RD, 0,
    "Memory allocation debugging");

static u_int dbg_divisor = 1;
SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
    "Debug & thrash every Nth item in the memory allocator");

static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
    &uma_dbg_cnt, "memory items debugged");
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
    &uma_skip_cnt, "memory items skipped, not debugged");
#endif

SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);

SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW, 0, "Universal Memory Allocator");

SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_INT,
    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");

SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLTYPE_STRUCT,
    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");

static int zone_warnings = 1;
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
    "Warn when UMA zones become full");

/*
 * Select the slab zone for an offpage slab with the given maximum item count.
 */
static inline uma_zone_t
slabzone(int ipers)
{

	return (slabzones[ipers > SLABZONE0_SETSIZE]);
}

/*
 * This routine checks to see whether or not it's safe to enable buckets.
 */
static void
bucket_enable(void)
{

	KASSERT(booted >= BOOT_BUCKETS, ("Bucket enable before init"));
	bucketdisable = vm_page_count_min();
}

/*
 * Initialize bucket_zones, the array of zones of buckets of various sizes.
 *
 * For each zone, calculate the memory required for each bucket, consisting
 * of the header and an array of pointers.
 */
static void
bucket_init(void)
{
	struct uma_bucket_zone *ubz;
	int size;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
		size += sizeof(void *) * ubz->ubz_entries;
		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET |
		    UMA_ZONE_FIRSTTOUCH);
	}
}

/*
 * Given a desired number of entries for a bucket, return the zone from which
 * to allocate the bucket.
 */
static struct uma_bucket_zone *
bucket_zone_lookup(int entries)
{
	struct uma_bucket_zone *ubz;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_entries >= entries)
			return (ubz);
	ubz--;
	return (ubz);
}

static struct uma_bucket_zone *
bucket_zone_max(uma_zone_t zone, int nitems)
{
	struct uma_bucket_zone *ubz;
	int bpcpu;

	bpcpu = 2;
	if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0)
		/* Count the cross-domain bucket.
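		 * (Illustrative sizing, assuming 8 CPUs: bpcpu becomes 3 here,
		 * so for a zone limited to 1024 items the loop below rejects
		 * the "64 Bucket" zone, since ~64 * 3 * 8 = 1536 cached items
		 * could exceed the limit, and returns the preceding
		 * "32 Bucket" zone, which caches at most ~32 * 3 * 8 = 768.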
*/ 424 bpcpu++; 425 426 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) 427 if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems) 428 break; 429 if (ubz == &bucket_zones[0]) 430 ubz = NULL; 431 else 432 ubz--; 433 return (ubz); 434 } 435 436 static int 437 bucket_select(int size) 438 { 439 struct uma_bucket_zone *ubz; 440 441 ubz = &bucket_zones[0]; 442 if (size > ubz->ubz_maxsize) 443 return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1); 444 445 for (; ubz->ubz_entries != 0; ubz++) 446 if (ubz->ubz_maxsize < size) 447 break; 448 ubz--; 449 return (ubz->ubz_entries); 450 } 451 452 static uma_bucket_t 453 bucket_alloc(uma_zone_t zone, void *udata, int flags) 454 { 455 struct uma_bucket_zone *ubz; 456 uma_bucket_t bucket; 457 458 /* 459 * This is to stop us from allocating per cpu buckets while we're 460 * running out of vm.boot_pages. Otherwise, we would exhaust the 461 * boot pages. This also prevents us from allocating buckets in 462 * low memory situations. 463 */ 464 if (bucketdisable) 465 return (NULL); 466 /* 467 * To limit bucket recursion we store the original zone flags 468 * in a cookie passed via zalloc_arg/zfree_arg. This allows the 469 * NOVM flag to persist even through deep recursions. We also 470 * store ZFLAG_BUCKET once we have recursed attempting to allocate 471 * a bucket for a bucket zone so we do not allow infinite bucket 472 * recursion. This cookie will even persist to frees of unused 473 * buckets via the allocation path or bucket allocations in the 474 * free path. 475 */ 476 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 477 udata = (void *)(uintptr_t)zone->uz_flags; 478 else { 479 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) 480 return (NULL); 481 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); 482 } 483 if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY) 484 flags |= M_NOVM; 485 ubz = bucket_zone_lookup(zone->uz_bucket_size); 486 if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) 487 ubz++; 488 bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); 489 if (bucket) { 490 #ifdef INVARIANTS 491 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); 492 #endif 493 bucket->ub_cnt = 0; 494 bucket->ub_entries = ubz->ubz_entries; 495 } 496 497 return (bucket); 498 } 499 500 static void 501 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) 502 { 503 struct uma_bucket_zone *ubz; 504 505 KASSERT(bucket->ub_cnt == 0, 506 ("bucket_free: Freeing a non free bucket.")); 507 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 508 udata = (void *)(uintptr_t)zone->uz_flags; 509 ubz = bucket_zone_lookup(bucket->ub_entries); 510 uma_zfree_arg(ubz->ubz_zone, bucket, udata); 511 } 512 513 static void 514 bucket_zone_drain(void) 515 { 516 struct uma_bucket_zone *ubz; 517 518 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) 519 uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN); 520 } 521 522 /* 523 * Attempt to satisfy an allocation by retrieving a full bucket from one of the 524 * zone's caches. 
525 */ 526 static uma_bucket_t 527 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom) 528 { 529 uma_bucket_t bucket; 530 531 ZONE_LOCK_ASSERT(zone); 532 533 if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) { 534 MPASS(zdom->uzd_nitems >= bucket->ub_cnt); 535 TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link); 536 zdom->uzd_nitems -= bucket->ub_cnt; 537 if (zdom->uzd_imin > zdom->uzd_nitems) 538 zdom->uzd_imin = zdom->uzd_nitems; 539 zone->uz_bkt_count -= bucket->ub_cnt; 540 } 541 return (bucket); 542 } 543 544 /* 545 * Insert a full bucket into the specified cache. The "ws" parameter indicates 546 * whether the bucket's contents should be counted as part of the zone's working 547 * set. 548 */ 549 static void 550 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket, 551 const bool ws) 552 { 553 554 ZONE_LOCK_ASSERT(zone); 555 KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max, 556 ("%s: zone %p overflow", __func__, zone)); 557 558 if (ws) 559 TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); 560 else 561 TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); 562 zdom->uzd_nitems += bucket->ub_cnt; 563 if (ws && zdom->uzd_imax < zdom->uzd_nitems) 564 zdom->uzd_imax = zdom->uzd_nitems; 565 zone->uz_bkt_count += bucket->ub_cnt; 566 } 567 568 /* Pops an item out of a per-cpu cache bucket. */ 569 static inline void * 570 cache_bucket_pop(uma_cache_t cache, uma_cache_bucket_t bucket) 571 { 572 void *item; 573 574 CRITICAL_ASSERT(curthread); 575 576 bucket->ucb_cnt--; 577 item = bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt]; 578 #ifdef INVARIANTS 579 bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = NULL; 580 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); 581 #endif 582 cache->uc_allocs++; 583 584 return (item); 585 } 586 587 /* Pushes an item into a per-cpu cache bucket. */ 588 static inline void 589 cache_bucket_push(uma_cache_t cache, uma_cache_bucket_t bucket, void *item) 590 { 591 592 CRITICAL_ASSERT(curthread); 593 KASSERT(bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] == NULL, 594 ("uma_zfree: Freeing to non free bucket index.")); 595 596 bucket->ucb_bucket->ub_bucket[bucket->ucb_cnt] = item; 597 bucket->ucb_cnt++; 598 cache->uc_frees++; 599 } 600 601 /* 602 * Unload a UMA bucket from a per-cpu cache. 603 */ 604 static inline uma_bucket_t 605 cache_bucket_unload(uma_cache_bucket_t bucket) 606 { 607 uma_bucket_t b; 608 609 b = bucket->ucb_bucket; 610 if (b != NULL) { 611 MPASS(b->ub_entries == bucket->ucb_entries); 612 b->ub_cnt = bucket->ucb_cnt; 613 bucket->ucb_bucket = NULL; 614 bucket->ucb_entries = bucket->ucb_cnt = 0; 615 } 616 617 return (b); 618 } 619 620 static inline uma_bucket_t 621 cache_bucket_unload_alloc(uma_cache_t cache) 622 { 623 624 return (cache_bucket_unload(&cache->uc_allocbucket)); 625 } 626 627 static inline uma_bucket_t 628 cache_bucket_unload_free(uma_cache_t cache) 629 { 630 631 return (cache_bucket_unload(&cache->uc_freebucket)); 632 } 633 634 static inline uma_bucket_t 635 cache_bucket_unload_cross(uma_cache_t cache) 636 { 637 638 return (cache_bucket_unload(&cache->uc_crossbucket)); 639 } 640 641 /* 642 * Load a bucket into a per-cpu cache bucket. 
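 * The bucket's item count and capacity are mirrored into the per-CPU cache
 * slot; the pop/push fast paths then only update the cached count, and
 * ub_cnt is written back when the bucket is unloaded again.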
643 */ 644 static inline void 645 cache_bucket_load(uma_cache_bucket_t bucket, uma_bucket_t b) 646 { 647 648 CRITICAL_ASSERT(curthread); 649 MPASS(bucket->ucb_bucket == NULL); 650 651 bucket->ucb_bucket = b; 652 bucket->ucb_cnt = b->ub_cnt; 653 bucket->ucb_entries = b->ub_entries; 654 } 655 656 static inline void 657 cache_bucket_load_alloc(uma_cache_t cache, uma_bucket_t b) 658 { 659 660 cache_bucket_load(&cache->uc_allocbucket, b); 661 } 662 663 static inline void 664 cache_bucket_load_free(uma_cache_t cache, uma_bucket_t b) 665 { 666 667 cache_bucket_load(&cache->uc_freebucket, b); 668 } 669 670 #ifdef NUMA 671 static inline void 672 cache_bucket_load_cross(uma_cache_t cache, uma_bucket_t b) 673 { 674 675 cache_bucket_load(&cache->uc_crossbucket, b); 676 } 677 #endif 678 679 /* 680 * Copy and preserve ucb_spare. 681 */ 682 static inline void 683 cache_bucket_copy(uma_cache_bucket_t b1, uma_cache_bucket_t b2) 684 { 685 686 b1->ucb_bucket = b2->ucb_bucket; 687 b1->ucb_entries = b2->ucb_entries; 688 b1->ucb_cnt = b2->ucb_cnt; 689 } 690 691 /* 692 * Swap two cache buckets. 693 */ 694 static inline void 695 cache_bucket_swap(uma_cache_bucket_t b1, uma_cache_bucket_t b2) 696 { 697 struct uma_cache_bucket b3; 698 699 CRITICAL_ASSERT(curthread); 700 701 cache_bucket_copy(&b3, b1); 702 cache_bucket_copy(b1, b2); 703 cache_bucket_copy(b2, &b3); 704 } 705 706 static void 707 zone_log_warning(uma_zone_t zone) 708 { 709 static const struct timeval warninterval = { 300, 0 }; 710 711 if (!zone_warnings || zone->uz_warning == NULL) 712 return; 713 714 if (ratecheck(&zone->uz_ratecheck, &warninterval)) 715 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); 716 } 717 718 static inline void 719 zone_maxaction(uma_zone_t zone) 720 { 721 722 if (zone->uz_maxaction.ta_func != NULL) 723 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); 724 } 725 726 /* 727 * Routine called by timeout which is used to fire off some time interval 728 * based calculations. (stats, hash size, etc.) 729 * 730 * Arguments: 731 * arg Unused 732 * 733 * Returns: 734 * Nothing 735 */ 736 static void 737 uma_timeout(void *unused) 738 { 739 bucket_enable(); 740 zone_foreach(zone_timeout, NULL); 741 742 /* Reschedule this event */ 743 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 744 } 745 746 /* 747 * Update the working set size estimate for the zone's bucket cache. 748 * The constants chosen here are somewhat arbitrary. With an update period of 749 * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the 750 * last 100s. 751 */ 752 static void 753 zone_domain_update_wss(uma_zone_domain_t zdom) 754 { 755 long wss; 756 757 MPASS(zdom->uzd_imax >= zdom->uzd_imin); 758 wss = zdom->uzd_imax - zdom->uzd_imin; 759 zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems; 760 zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5; 761 } 762 763 /* 764 * Routine to perform timeout driven calculations. This expands the 765 * hashes and does per cpu statistics aggregation. 766 * 767 * Returns nothing. 768 */ 769 static void 770 zone_timeout(uma_zone_t zone, void *unused) 771 { 772 uma_keg_t keg; 773 u_int slabs, pages; 774 775 if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) 776 goto update_wss; 777 778 keg = zone->uz_keg; 779 780 /* 781 * Hash zones are non-numa by definition so the first domain 782 * is the only one present. 783 */ 784 KEG_LOCK(keg, 0); 785 pages = keg->uk_domain[0].ud_pages; 786 787 /* 788 * Expand the keg hash table. 
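	 * For example, with 600 slabs the call below requests
	 * 1 << fls(600) = 1024 entries, i.e. the next power of two strictly
	 * above the current slab count.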
789 * 790 * This is done if the number of slabs is larger than the hash size. 791 * What I'm trying to do here is completely reduce collisions. This 792 * may be a little aggressive. Should I allow for two collisions max? 793 */ 794 if ((slabs = pages / keg->uk_ppera) > keg->uk_hash.uh_hashsize) { 795 struct uma_hash newhash; 796 struct uma_hash oldhash; 797 int ret; 798 799 /* 800 * This is so involved because allocating and freeing 801 * while the keg lock is held will lead to deadlock. 802 * I have to do everything in stages and check for 803 * races. 804 */ 805 KEG_UNLOCK(keg, 0); 806 ret = hash_alloc(&newhash, 1 << fls(slabs)); 807 KEG_LOCK(keg, 0); 808 if (ret) { 809 if (hash_expand(&keg->uk_hash, &newhash)) { 810 oldhash = keg->uk_hash; 811 keg->uk_hash = newhash; 812 } else 813 oldhash = newhash; 814 815 KEG_UNLOCK(keg, 0); 816 hash_free(&oldhash); 817 goto update_wss; 818 } 819 } 820 KEG_UNLOCK(keg, 0); 821 822 update_wss: 823 ZONE_LOCK(zone); 824 for (int i = 0; i < vm_ndomains; i++) 825 zone_domain_update_wss(&zone->uz_domain[i]); 826 ZONE_UNLOCK(zone); 827 } 828 829 /* 830 * Allocate and zero fill the next sized hash table from the appropriate 831 * backing store. 832 * 833 * Arguments: 834 * hash A new hash structure with the old hash size in uh_hashsize 835 * 836 * Returns: 837 * 1 on success and 0 on failure. 838 */ 839 static int 840 hash_alloc(struct uma_hash *hash, u_int size) 841 { 842 size_t alloc; 843 844 KASSERT(powerof2(size), ("hash size must be power of 2")); 845 if (size > UMA_HASH_SIZE_INIT) { 846 hash->uh_hashsize = size; 847 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; 848 hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); 849 } else { 850 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; 851 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, 852 UMA_ANYDOMAIN, M_WAITOK); 853 hash->uh_hashsize = UMA_HASH_SIZE_INIT; 854 } 855 if (hash->uh_slab_hash) { 856 bzero(hash->uh_slab_hash, alloc); 857 hash->uh_hashmask = hash->uh_hashsize - 1; 858 return (1); 859 } 860 861 return (0); 862 } 863 864 /* 865 * Expands the hash table for HASH zones. This is done from zone_timeout 866 * to reduce collisions. This must not be done in the regular allocation 867 * path, otherwise, we can recurse on the vm while allocating pages. 868 * 869 * Arguments: 870 * oldhash The hash you want to expand 871 * newhash The hash structure for the new table 872 * 873 * Returns: 874 * Nothing 875 * 876 * Discussion: 877 */ 878 static int 879 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) 880 { 881 uma_hash_slab_t slab; 882 u_int hval; 883 u_int idx; 884 885 if (!newhash->uh_slab_hash) 886 return (0); 887 888 if (oldhash->uh_hashsize >= newhash->uh_hashsize) 889 return (0); 890 891 /* 892 * I need to investigate hash algorithms for resizing without a 893 * full rehash. 894 */ 895 896 for (idx = 0; idx < oldhash->uh_hashsize; idx++) 897 while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { 898 slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); 899 LIST_REMOVE(slab, uhs_hlink); 900 hval = UMA_HASH(newhash, slab->uhs_data); 901 LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], 902 slab, uhs_hlink); 903 } 904 905 return (1); 906 } 907 908 /* 909 * Free the hash bucket to the appropriate backing store. 
 *
 * Arguments:
 *	hash  The hash structure whose bucket storage is being freed
 *
 * Returns:
 *	Nothing
 */
static void
hash_free(struct uma_hash *hash)
{
	if (hash->uh_slab_hash == NULL)
		return;
	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
	else
		free(hash->uh_slab_hash, M_UMAHASH);
}

/*
 * Frees all outstanding items in a bucket
 *
 * Arguments:
 *	zone   The zone to free to, must be unlocked.
 *	bucket The free/alloc bucket with items.
 *
 * Returns:
 *	Nothing
 */
static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
	int i;

	if (bucket == NULL || bucket->ub_cnt == 0)
		return;

	if (zone->uz_fini)
		for (i = 0; i < bucket->ub_cnt; i++)
			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
	if (zone->uz_max_items > 0)
		zone_free_limit(zone, bucket->ub_cnt);
	bucket->ub_cnt = 0;
}

/*
 * Drains the per cpu caches for a zone.
 *
 * NOTE: This may only be called while the zone is being torn down, and not
 * during normal operation.  This is necessary in order that we do not have
 * to migrate CPUs to drain the per-CPU caches.
 *
 * Arguments:
 *	zone     The zone to drain, must be unlocked.
 *
 * Returns:
 *	Nothing
 */
static void
cache_drain(uma_zone_t zone)
{
	uma_cache_t cache;
	uma_bucket_t bucket;
	int cpu;

	/*
	 * XXX: It is safe to not lock the per-CPU caches, because we're
	 * tearing down the zone anyway.  I.e., there will be no further use
	 * of the caches at this point.
	 *
	 * XXX: It would be good to be able to assert that the zone is being
	 * torn down to prevent improper use of cache_drain().
984 */ 985 CPU_FOREACH(cpu) { 986 cache = &zone->uz_cpu[cpu]; 987 bucket = cache_bucket_unload_alloc(cache); 988 if (bucket != NULL) { 989 bucket_drain(zone, bucket); 990 bucket_free(zone, bucket, NULL); 991 } 992 bucket = cache_bucket_unload_free(cache); 993 if (bucket != NULL) { 994 bucket_drain(zone, bucket); 995 bucket_free(zone, bucket, NULL); 996 } 997 bucket = cache_bucket_unload_cross(cache); 998 if (bucket != NULL) { 999 bucket_drain(zone, bucket); 1000 bucket_free(zone, bucket, NULL); 1001 } 1002 } 1003 bucket_cache_reclaim(zone, true); 1004 } 1005 1006 static void 1007 cache_shrink(uma_zone_t zone, void *unused) 1008 { 1009 1010 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 1011 return; 1012 1013 ZONE_LOCK(zone); 1014 zone->uz_bucket_size = 1015 (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; 1016 ZONE_UNLOCK(zone); 1017 } 1018 1019 static void 1020 cache_drain_safe_cpu(uma_zone_t zone, void *unused) 1021 { 1022 uma_cache_t cache; 1023 uma_bucket_t b1, b2, b3; 1024 int domain; 1025 1026 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 1027 return; 1028 1029 b1 = b2 = b3 = NULL; 1030 ZONE_LOCK(zone); 1031 critical_enter(); 1032 if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) 1033 domain = PCPU_GET(domain); 1034 else 1035 domain = 0; 1036 cache = &zone->uz_cpu[curcpu]; 1037 b1 = cache_bucket_unload_alloc(cache); 1038 if (b1 != NULL && b1->ub_cnt != 0) { 1039 zone_put_bucket(zone, &zone->uz_domain[domain], b1, false); 1040 b1 = NULL; 1041 } 1042 b2 = cache_bucket_unload_free(cache); 1043 if (b2 != NULL && b2->ub_cnt != 0) { 1044 zone_put_bucket(zone, &zone->uz_domain[domain], b2, false); 1045 b2 = NULL; 1046 } 1047 b3 = cache_bucket_unload_cross(cache); 1048 critical_exit(); 1049 ZONE_UNLOCK(zone); 1050 if (b1) 1051 bucket_free(zone, b1, NULL); 1052 if (b2) 1053 bucket_free(zone, b2, NULL); 1054 if (b3) { 1055 bucket_drain(zone, b3); 1056 bucket_free(zone, b3, NULL); 1057 } 1058 } 1059 1060 /* 1061 * Safely drain per-CPU caches of a zone(s) to alloc bucket. 1062 * This is an expensive call because it needs to bind to all CPUs 1063 * one by one and enter a critical section on each of them in order 1064 * to safely access their cache buckets. 1065 * Zone lock must not be held on call this function. 1066 */ 1067 static void 1068 pcpu_cache_drain_safe(uma_zone_t zone) 1069 { 1070 int cpu; 1071 1072 /* 1073 * Polite bucket sizes shrinking was not enough, shrink aggressively. 1074 */ 1075 if (zone) 1076 cache_shrink(zone, NULL); 1077 else 1078 zone_foreach(cache_shrink, NULL); 1079 1080 CPU_FOREACH(cpu) { 1081 thread_lock(curthread); 1082 sched_bind(curthread, cpu); 1083 thread_unlock(curthread); 1084 1085 if (zone) 1086 cache_drain_safe_cpu(zone, NULL); 1087 else 1088 zone_foreach(cache_drain_safe_cpu, NULL); 1089 } 1090 thread_lock(curthread); 1091 sched_unbind(curthread); 1092 thread_unlock(curthread); 1093 } 1094 1095 /* 1096 * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller 1097 * requested a drain, otherwise the per-domain caches are trimmed to either 1098 * estimated working set size. 1099 */ 1100 static void 1101 bucket_cache_reclaim(uma_zone_t zone, bool drain) 1102 { 1103 uma_zone_domain_t zdom; 1104 uma_bucket_t bucket; 1105 long target, tofree; 1106 int i; 1107 1108 for (i = 0; i < vm_ndomains; i++) { 1109 /* 1110 * The cross bucket is partially filled and not part of 1111 * the item count. Reclaim it individually here. 
1112 */ 1113 zdom = &zone->uz_domain[i]; 1114 ZONE_CROSS_LOCK(zone); 1115 bucket = zdom->uzd_cross; 1116 zdom->uzd_cross = NULL; 1117 ZONE_CROSS_UNLOCK(zone); 1118 if (bucket != NULL) { 1119 bucket_drain(zone, bucket); 1120 bucket_free(zone, bucket, NULL); 1121 } 1122 1123 /* 1124 * Shrink the zone bucket size to ensure that the per-CPU caches 1125 * don't grow too large. 1126 */ 1127 ZONE_LOCK(zone); 1128 if (i == 0 && zone->uz_bucket_size > zone->uz_bucket_size_min) 1129 zone->uz_bucket_size--; 1130 1131 /* 1132 * If we were asked to drain the zone, we are done only once 1133 * this bucket cache is empty. Otherwise, we reclaim items in 1134 * excess of the zone's estimated working set size. If the 1135 * difference nitems - imin is larger than the WSS estimate, 1136 * then the estimate will grow at the end of this interval and 1137 * we ignore the historical average. 1138 */ 1139 target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems - 1140 zdom->uzd_imin); 1141 while (zdom->uzd_nitems > target) { 1142 bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist); 1143 if (bucket == NULL) 1144 break; 1145 tofree = bucket->ub_cnt; 1146 TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link); 1147 zdom->uzd_nitems -= tofree; 1148 1149 /* 1150 * Shift the bounds of the current WSS interval to avoid 1151 * perturbing the estimate. 1152 */ 1153 zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree); 1154 zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree); 1155 1156 ZONE_UNLOCK(zone); 1157 bucket_drain(zone, bucket); 1158 bucket_free(zone, bucket, NULL); 1159 ZONE_LOCK(zone); 1160 } 1161 ZONE_UNLOCK(zone); 1162 } 1163 } 1164 1165 static void 1166 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) 1167 { 1168 uint8_t *mem; 1169 int i; 1170 uint8_t flags; 1171 1172 CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", 1173 keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); 1174 1175 mem = slab_data(slab, keg); 1176 flags = slab->us_flags; 1177 i = start; 1178 if (keg->uk_fini != NULL) { 1179 for (i--; i > -1; i--) 1180 #ifdef INVARIANTS 1181 /* 1182 * trash_fini implies that dtor was trash_dtor. trash_fini 1183 * would check that memory hasn't been modified since free, 1184 * which executed trash_dtor. 1185 * That's why we need to run uma_dbg_kskip() check here, 1186 * albeit we don't make skip check for other init/fini 1187 * invocations. 1188 */ 1189 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || 1190 keg->uk_fini != trash_fini) 1191 #endif 1192 keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); 1193 } 1194 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) 1195 zone_free_item(slabzone(keg->uk_ipers), slab_tohashslab(slab), 1196 NULL, SKIP_NONE); 1197 keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); 1198 uma_total_dec(PAGE_SIZE * keg->uk_ppera); 1199 } 1200 1201 /* 1202 * Frees pages from a keg back to the system. This is done on demand from 1203 * the pageout daemon. 1204 * 1205 * Returns nothing. 
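 *
 * Only slabs sitting on the per-domain free-slab lists are released; slabs
 * carved from the static boot pages (UMA_SLAB_BOOT) and kegs created with
 * UMA_ZONE_NOFREE are left untouched.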
1206 */ 1207 static void 1208 keg_drain(uma_keg_t keg) 1209 { 1210 struct slabhead freeslabs = { 0 }; 1211 uma_domain_t dom; 1212 uma_slab_t slab, tmp; 1213 int i, n; 1214 1215 /* 1216 * We don't want to take pages from statically allocated kegs at this 1217 * time 1218 */ 1219 if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) 1220 return; 1221 1222 for (i = 0; i < vm_ndomains; i++) { 1223 CTR4(KTR_UMA, "keg_drain %s(%p) domain %d free items: %u", 1224 keg->uk_name, keg, i, dom->ud_free); 1225 n = 0; 1226 dom = &keg->uk_domain[i]; 1227 KEG_LOCK(keg, i); 1228 LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { 1229 /* We have nowhere to free these to. */ 1230 if (slab->us_flags & UMA_SLAB_BOOT) 1231 continue; 1232 if (keg->uk_flags & UMA_ZFLAG_HASH) 1233 UMA_HASH_REMOVE(&keg->uk_hash, slab); 1234 n++; 1235 LIST_REMOVE(slab, us_link); 1236 LIST_INSERT_HEAD(&freeslabs, slab, us_link); 1237 } 1238 dom->ud_pages -= n * keg->uk_ppera; 1239 dom->ud_free -= n * keg->uk_ipers; 1240 KEG_UNLOCK(keg, i); 1241 } 1242 1243 while ((slab = LIST_FIRST(&freeslabs)) != NULL) { 1244 LIST_REMOVE(slab, us_link); 1245 keg_free_slab(keg, slab, keg->uk_ipers); 1246 } 1247 } 1248 1249 static void 1250 zone_reclaim(uma_zone_t zone, int waitok, bool drain) 1251 { 1252 1253 /* 1254 * Set draining to interlock with zone_dtor() so we can release our 1255 * locks as we go. Only dtor() should do a WAITOK call since it 1256 * is the only call that knows the structure will still be available 1257 * when it wakes up. 1258 */ 1259 ZONE_LOCK(zone); 1260 while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) { 1261 if (waitok == M_NOWAIT) 1262 goto out; 1263 msleep(zone, &zone->uz_lock, PVM, "zonedrain", 1); 1264 } 1265 zone->uz_flags |= UMA_ZFLAG_RECLAIMING; 1266 ZONE_UNLOCK(zone); 1267 bucket_cache_reclaim(zone, drain); 1268 1269 /* 1270 * The DRAINING flag protects us from being freed while 1271 * we're running. Normally the uma_rwlock would protect us but we 1272 * must be able to release and acquire the right lock for each keg. 1273 */ 1274 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) 1275 keg_drain(zone->uz_keg); 1276 ZONE_LOCK(zone); 1277 zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING; 1278 wakeup(zone); 1279 out: 1280 ZONE_UNLOCK(zone); 1281 } 1282 1283 static void 1284 zone_drain(uma_zone_t zone, void *unused) 1285 { 1286 1287 zone_reclaim(zone, M_NOWAIT, true); 1288 } 1289 1290 static void 1291 zone_trim(uma_zone_t zone, void *unused) 1292 { 1293 1294 zone_reclaim(zone, M_NOWAIT, false); 1295 } 1296 1297 /* 1298 * Allocate a new slab for a keg and inserts it into the partial slab list. 1299 * The keg should be unlocked on entry. If the allocation succeeds it will 1300 * be locked on return. 1301 * 1302 * Arguments: 1303 * flags Wait flags for the item initialization routine 1304 * aflags Wait flags for the slab allocation 1305 * 1306 * Returns: 1307 * The slab that was allocated or NULL if there is no memory and the 1308 * caller specified M_NOWAIT. 
1309 */ 1310 static uma_slab_t 1311 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, 1312 int aflags) 1313 { 1314 uma_domain_t dom; 1315 uma_alloc allocf; 1316 uma_slab_t slab; 1317 unsigned long size; 1318 uint8_t *mem; 1319 uint8_t sflags; 1320 int i; 1321 1322 KASSERT(domain >= 0 && domain < vm_ndomains, 1323 ("keg_alloc_slab: domain %d out of range", domain)); 1324 1325 allocf = keg->uk_allocf; 1326 slab = NULL; 1327 mem = NULL; 1328 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) { 1329 uma_hash_slab_t hslab; 1330 hslab = zone_alloc_item(slabzone(keg->uk_ipers), NULL, 1331 domain, aflags); 1332 if (hslab == NULL) 1333 goto fail; 1334 slab = &hslab->uhs_slab; 1335 } 1336 1337 /* 1338 * This reproduces the old vm_zone behavior of zero filling pages the 1339 * first time they are added to a zone. 1340 * 1341 * Malloced items are zeroed in uma_zalloc. 1342 */ 1343 1344 if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) 1345 aflags |= M_ZERO; 1346 else 1347 aflags &= ~M_ZERO; 1348 1349 if (keg->uk_flags & UMA_ZONE_NODUMP) 1350 aflags |= M_NODUMP; 1351 1352 /* zone is passed for legacy reasons. */ 1353 size = keg->uk_ppera * PAGE_SIZE; 1354 mem = allocf(zone, size, domain, &sflags, aflags); 1355 if (mem == NULL) { 1356 if (keg->uk_flags & UMA_ZFLAG_OFFPAGE) 1357 zone_free_item(slabzone(keg->uk_ipers), 1358 slab_tohashslab(slab), NULL, SKIP_NONE); 1359 goto fail; 1360 } 1361 uma_total_inc(size); 1362 1363 /* For HASH zones all pages go to the same uma_domain. */ 1364 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) 1365 domain = 0; 1366 1367 /* Point the slab into the allocated memory */ 1368 if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) 1369 slab = (uma_slab_t )(mem + keg->uk_pgoff); 1370 else 1371 slab_tohashslab(slab)->uhs_data = mem; 1372 1373 if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) 1374 for (i = 0; i < keg->uk_ppera; i++) 1375 vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), 1376 zone, slab); 1377 1378 slab->us_freecount = keg->uk_ipers; 1379 slab->us_flags = sflags; 1380 slab->us_domain = domain; 1381 1382 BIT_FILL(keg->uk_ipers, &slab->us_free); 1383 #ifdef INVARIANTS 1384 BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); 1385 #endif 1386 1387 if (keg->uk_init != NULL) { 1388 for (i = 0; i < keg->uk_ipers; i++) 1389 if (keg->uk_init(slab_item(slab, keg, i), 1390 keg->uk_size, flags) != 0) 1391 break; 1392 if (i != keg->uk_ipers) { 1393 keg_free_slab(keg, slab, i); 1394 goto fail; 1395 } 1396 } 1397 KEG_LOCK(keg, domain); 1398 1399 CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", 1400 slab, keg->uk_name, keg); 1401 1402 if (keg->uk_flags & UMA_ZFLAG_HASH) 1403 UMA_HASH_INSERT(&keg->uk_hash, slab, mem); 1404 1405 /* 1406 * If we got a slab here it's safe to mark it partially used 1407 * and return. We assume that the caller is going to remove 1408 * at least one item. 1409 */ 1410 dom = &keg->uk_domain[domain]; 1411 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 1412 dom->ud_pages += keg->uk_ppera; 1413 dom->ud_free += keg->uk_ipers; 1414 1415 return (slab); 1416 1417 fail: 1418 return (NULL); 1419 } 1420 1421 /* 1422 * This function is intended to be used early on in place of page_alloc() so 1423 * that we may use the boot time page cache to satisfy allocations before 1424 * the VM is ready. 
1425 */ 1426 static void * 1427 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1428 int wait) 1429 { 1430 uma_keg_t keg; 1431 void *mem; 1432 int pages; 1433 1434 keg = zone->uz_keg; 1435 /* 1436 * If we are in BOOT_BUCKETS or higher, than switch to real 1437 * allocator. Zones with page sized slabs switch at BOOT_PAGEALLOC. 1438 */ 1439 switch (booted) { 1440 case BOOT_COLD: 1441 case BOOT_STRAPPED: 1442 break; 1443 case BOOT_PAGEALLOC: 1444 if (keg->uk_ppera > 1) 1445 break; 1446 default: 1447 #ifdef UMA_MD_SMALL_ALLOC 1448 keg->uk_allocf = (keg->uk_ppera > 1) ? 1449 page_alloc : uma_small_alloc; 1450 #else 1451 keg->uk_allocf = page_alloc; 1452 #endif 1453 return keg->uk_allocf(zone, bytes, domain, pflag, wait); 1454 } 1455 1456 /* 1457 * Check our small startup cache to see if it has pages remaining. 1458 */ 1459 pages = howmany(bytes, PAGE_SIZE); 1460 KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); 1461 if (pages > boot_pages) 1462 panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name); 1463 #ifdef DIAGNOSTIC 1464 printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name, 1465 boot_pages); 1466 #endif 1467 mem = bootmem; 1468 boot_pages -= pages; 1469 bootmem += pages * PAGE_SIZE; 1470 *pflag = UMA_SLAB_BOOT; 1471 1472 return (mem); 1473 } 1474 1475 /* 1476 * Allocates a number of pages from the system 1477 * 1478 * Arguments: 1479 * bytes The number of bytes requested 1480 * wait Shall we wait? 1481 * 1482 * Returns: 1483 * A pointer to the alloced memory or possibly 1484 * NULL if M_NOWAIT is set. 1485 */ 1486 static void * 1487 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1488 int wait) 1489 { 1490 void *p; /* Returned page */ 1491 1492 *pflag = UMA_SLAB_KERNEL; 1493 p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); 1494 1495 return (p); 1496 } 1497 1498 static void * 1499 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1500 int wait) 1501 { 1502 struct pglist alloctail; 1503 vm_offset_t addr, zkva; 1504 int cpu, flags; 1505 vm_page_t p, p_next; 1506 #ifdef NUMA 1507 struct pcpu *pc; 1508 #endif 1509 1510 MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); 1511 1512 TAILQ_INIT(&alloctail); 1513 flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1514 malloc2vm_flags(wait); 1515 *pflag = UMA_SLAB_KERNEL; 1516 for (cpu = 0; cpu <= mp_maxid; cpu++) { 1517 if (CPU_ABSENT(cpu)) { 1518 p = vm_page_alloc(NULL, 0, flags); 1519 } else { 1520 #ifndef NUMA 1521 p = vm_page_alloc(NULL, 0, flags); 1522 #else 1523 pc = pcpu_find(cpu); 1524 p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags); 1525 if (__predict_false(p == NULL)) 1526 p = vm_page_alloc(NULL, 0, flags); 1527 #endif 1528 } 1529 if (__predict_false(p == NULL)) 1530 goto fail; 1531 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1532 } 1533 if ((addr = kva_alloc(bytes)) == 0) 1534 goto fail; 1535 zkva = addr; 1536 TAILQ_FOREACH(p, &alloctail, listq) { 1537 pmap_qenter(zkva, &p, 1); 1538 zkva += PAGE_SIZE; 1539 } 1540 return ((void*)addr); 1541 fail: 1542 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1543 vm_page_unwire_noq(p); 1544 vm_page_free(p); 1545 } 1546 return (NULL); 1547 } 1548 1549 /* 1550 * Allocates a number of pages from within an object 1551 * 1552 * Arguments: 1553 * bytes The number of bytes requested 1554 * wait Shall we wait? 1555 * 1556 * Returns: 1557 * A pointer to the alloced memory or possibly 1558 * NULL if M_NOWAIT is set. 
1559 */ 1560 static void * 1561 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 1562 int wait) 1563 { 1564 TAILQ_HEAD(, vm_page) alloctail; 1565 u_long npages; 1566 vm_offset_t retkva, zkva; 1567 vm_page_t p, p_next; 1568 uma_keg_t keg; 1569 1570 TAILQ_INIT(&alloctail); 1571 keg = zone->uz_keg; 1572 1573 npages = howmany(bytes, PAGE_SIZE); 1574 while (npages > 0) { 1575 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | 1576 VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1577 ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : 1578 VM_ALLOC_NOWAIT)); 1579 if (p != NULL) { 1580 /* 1581 * Since the page does not belong to an object, its 1582 * listq is unused. 1583 */ 1584 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1585 npages--; 1586 continue; 1587 } 1588 /* 1589 * Page allocation failed, free intermediate pages and 1590 * exit. 1591 */ 1592 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1593 vm_page_unwire_noq(p); 1594 vm_page_free(p); 1595 } 1596 return (NULL); 1597 } 1598 *flags = UMA_SLAB_PRIV; 1599 zkva = keg->uk_kva + 1600 atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); 1601 retkva = zkva; 1602 TAILQ_FOREACH(p, &alloctail, listq) { 1603 pmap_qenter(zkva, &p, 1); 1604 zkva += PAGE_SIZE; 1605 } 1606 1607 return ((void *)retkva); 1608 } 1609 1610 /* 1611 * Frees a number of pages to the system 1612 * 1613 * Arguments: 1614 * mem A pointer to the memory to be freed 1615 * size The size of the memory being freed 1616 * flags The original p->us_flags field 1617 * 1618 * Returns: 1619 * Nothing 1620 */ 1621 static void 1622 page_free(void *mem, vm_size_t size, uint8_t flags) 1623 { 1624 1625 if ((flags & UMA_SLAB_KERNEL) == 0) 1626 panic("UMA: page_free used with invalid flags %x", flags); 1627 1628 kmem_free((vm_offset_t)mem, size); 1629 } 1630 1631 /* 1632 * Frees pcpu zone allocations 1633 * 1634 * Arguments: 1635 * mem A pointer to the memory to be freed 1636 * size The size of the memory being freed 1637 * flags The original p->us_flags field 1638 * 1639 * Returns: 1640 * Nothing 1641 */ 1642 static void 1643 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) 1644 { 1645 vm_offset_t sva, curva; 1646 vm_paddr_t paddr; 1647 vm_page_t m; 1648 1649 MPASS(size == (mp_maxid+1)*PAGE_SIZE); 1650 sva = (vm_offset_t)mem; 1651 for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { 1652 paddr = pmap_kextract(curva); 1653 m = PHYS_TO_VM_PAGE(paddr); 1654 vm_page_unwire_noq(m); 1655 vm_page_free(m); 1656 } 1657 pmap_qremove(sva, size >> PAGE_SHIFT); 1658 kva_free(sva, size); 1659 } 1660 1661 1662 /* 1663 * Zero fill initializer 1664 * 1665 * Arguments/Returns follow uma_init specifications 1666 */ 1667 static int 1668 zero_init(void *mem, int size, int flags) 1669 { 1670 bzero(mem, size); 1671 return (0); 1672 } 1673 1674 #ifdef INVARIANTS 1675 struct noslabbits * 1676 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) 1677 { 1678 1679 return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); 1680 } 1681 #endif 1682 1683 /* 1684 * Actual size of embedded struct slab (!OFFPAGE). 1685 */ 1686 size_t 1687 slab_sizeof(int nitems) 1688 { 1689 size_t s; 1690 1691 s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; 1692 return (roundup(s, UMA_ALIGN_PTR + 1)); 1693 } 1694 1695 /* 1696 * Size of memory for embedded slabs (!OFFPAGE). 
1697 */ 1698 size_t 1699 slab_space(int nitems) 1700 { 1701 return (UMA_SLAB_SIZE - slab_sizeof(nitems)); 1702 } 1703 1704 #define UMA_FIXPT_SHIFT 31 1705 #define UMA_FRAC_FIXPT(n, d) \ 1706 ((uint32_t)(((uint64_t)(n) << UMA_FIXPT_SHIFT) / (d))) 1707 #define UMA_FIXPT_PCT(f) \ 1708 ((u_int)(((uint64_t)100 * (f)) >> UMA_FIXPT_SHIFT)) 1709 #define UMA_PCT_FIXPT(pct) UMA_FRAC_FIXPT((pct), 100) 1710 #define UMA_MIN_EFF UMA_PCT_FIXPT(100 - UMA_MAX_WASTE) 1711 1712 /* 1713 * Compute the number of items that will fit in a slab. If hdr is true, the 1714 * item count may be limited to provide space in the slab for an inline slab 1715 * header. Otherwise, all slab space will be provided for item storage. 1716 */ 1717 static u_int 1718 slab_ipers_hdr(u_int size, u_int rsize, u_int slabsize, bool hdr) 1719 { 1720 u_int ipers; 1721 u_int padpi; 1722 1723 /* The padding between items is not needed after the last item. */ 1724 padpi = rsize - size; 1725 1726 if (hdr) { 1727 /* 1728 * Start with the maximum item count and remove items until 1729 * the slab header first alongside the allocatable memory. 1730 */ 1731 for (ipers = MIN(SLAB_MAX_SETSIZE, 1732 (slabsize + padpi - slab_sizeof(1)) / rsize); 1733 ipers > 0 && 1734 ipers * rsize - padpi + slab_sizeof(ipers) > slabsize; 1735 ipers--) 1736 continue; 1737 } else { 1738 ipers = MIN((slabsize + padpi) / rsize, SLAB_MAX_SETSIZE); 1739 } 1740 1741 return (ipers); 1742 } 1743 1744 /* 1745 * Compute the number of items that will fit in a slab for a startup zone. 1746 */ 1747 int 1748 slab_ipers(size_t size, int align) 1749 { 1750 int rsize; 1751 1752 rsize = roundup(size, align + 1); /* Assume no CACHESPREAD */ 1753 return (slab_ipers_hdr(size, rsize, UMA_SLAB_SIZE, true)); 1754 } 1755 1756 /* 1757 * Determine the format of a uma keg. This determines where the slab header 1758 * will be placed (inline or offpage) and calculates ipers, rsize, and ppera. 1759 * 1760 * Arguments 1761 * keg The zone we should initialize 1762 * 1763 * Returns 1764 * Nothing 1765 */ 1766 static void 1767 keg_layout(uma_keg_t keg) 1768 { 1769 u_int alignsize; 1770 u_int eff; 1771 u_int eff_offpage; 1772 u_int format; 1773 u_int ipers; 1774 u_int ipers_offpage; 1775 u_int pages; 1776 u_int rsize; 1777 u_int slabsize; 1778 1779 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || 1780 (keg->uk_size <= UMA_PCPU_ALLOC_SIZE && 1781 (keg->uk_flags & UMA_ZONE_CACHESPREAD) == 0), 1782 ("%s: cannot configure for PCPU: keg=%s, size=%u, flags=0x%b", 1783 __func__, keg->uk_name, keg->uk_size, keg->uk_flags, 1784 PRINT_UMA_ZFLAGS)); 1785 KASSERT((keg->uk_flags & 1786 (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY)) == 0 || 1787 (keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0, 1788 ("%s: incompatible flags 0x%b", __func__, keg->uk_flags, 1789 PRINT_UMA_ZFLAGS)); 1790 1791 alignsize = keg->uk_align + 1; 1792 format = 0; 1793 ipers = 0; 1794 1795 /* 1796 * Calculate the size of each allocation (rsize) according to 1797 * alignment. If the requested size is smaller than we have 1798 * allocation bits for we round it up. 1799 */ 1800 rsize = MAX(keg->uk_size, UMA_SMALLEST_UNIT); 1801 rsize = roundup2(rsize, alignsize); 1802 1803 if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) { 1804 slabsize = UMA_PCPU_ALLOC_SIZE; 1805 pages = mp_maxid + 1; 1806 } else if ((keg->uk_flags & UMA_ZONE_CACHESPREAD) != 0) { 1807 /* 1808 * We want one item to start on every align boundary in a page. 1809 * To do this we will span pages. 
We will also extend the item 1810 * by the size of align if it is an even multiple of align. 1811 * Otherwise, it would fall on the same boundary every time. 1812 */ 1813 if ((rsize & alignsize) == 0) 1814 rsize += alignsize; 1815 slabsize = rsize * (PAGE_SIZE / alignsize); 1816 slabsize = MIN(slabsize, rsize * SLAB_MAX_SETSIZE); 1817 slabsize = MIN(slabsize, UMA_CACHESPREAD_MAX_SIZE); 1818 pages = howmany(slabsize, PAGE_SIZE); 1819 slabsize = ptoa(pages); 1820 } else { 1821 /* 1822 * Choose a slab size of as many pages as it takes to represent 1823 * a single item. We will then try to fit as many additional 1824 * items into the slab as possible. At some point, we may want 1825 * to increase the slab size for awkward item sizes in order to 1826 * increase efficiency. 1827 */ 1828 pages = howmany(keg->uk_size, PAGE_SIZE); 1829 slabsize = ptoa(pages); 1830 } 1831 1832 /* Evaluate an inline slab layout. */ 1833 if ((keg->uk_flags & (UMA_ZONE_NOTOUCH | UMA_ZONE_PCPU)) == 0) 1834 ipers = slab_ipers_hdr(keg->uk_size, rsize, slabsize, true); 1835 1836 /* TODO: vm_page-embedded slab. */ 1837 1838 /* 1839 * We can't do OFFPAGE if we're internal or if we've been 1840 * asked to not go to the VM for buckets. If we do this we 1841 * may end up going to the VM for slabs which we do not 1842 * want to do if we're UMA_ZFLAG_CACHEONLY as a result 1843 * of UMA_ZONE_VM, which clearly forbids it. 1844 */ 1845 if ((keg->uk_flags & 1846 (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY)) != 0) { 1847 if (ipers == 0) { 1848 /* We need an extra page for the slab header. */ 1849 pages++; 1850 slabsize = ptoa(pages); 1851 ipers = slab_ipers_hdr(keg->uk_size, rsize, slabsize, 1852 true); 1853 } 1854 goto out; 1855 } 1856 1857 /* 1858 * See if using an OFFPAGE slab will improve our efficiency. 1859 * Only do this if we are below our efficiency threshold. 1860 * 1861 * XXX We could try growing slabsize to limit max waste as well. 1862 * Historically this was not done because the VM could not 1863 * efficiently handle contiguous allocations. 1864 */ 1865 eff = UMA_FRAC_FIXPT(ipers * rsize, slabsize); 1866 ipers_offpage = slab_ipers_hdr(keg->uk_size, rsize, slabsize, false); 1867 eff_offpage = UMA_FRAC_FIXPT(ipers_offpage * rsize, 1868 slabsize + slabzone(ipers_offpage)->uz_keg->uk_rsize); 1869 if (ipers == 0 || (eff < UMA_MIN_EFF && eff < eff_offpage)) { 1870 CTR5(KTR_UMA, "UMA decided we need offpage slab headers for " 1871 "keg: %s(%p), minimum efficiency allowed = %u%%, " 1872 "old efficiency = %u%%, offpage efficiency = %u%%", 1873 keg->uk_name, keg, UMA_FIXPT_PCT(UMA_MIN_EFF), 1874 UMA_FIXPT_PCT(eff), UMA_FIXPT_PCT(eff_offpage)); 1875 format = UMA_ZFLAG_OFFPAGE; 1876 ipers = ipers_offpage; 1877 } 1878 1879 out: 1880 /* 1881 * How do we find the slab header if it is offpage or if not all item 1882 * start addresses are in the same page? We could solve the latter 1883 * case with vaddr alignment, but we don't. 
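	 * Instead, such kegs look slabs up out of band: UMA_ZONE_NOTPAGE kegs
	 * hash slabs by their data address (UMA_ZFLAG_HASH), while all others
	 * store a slab pointer in the backing vm_page (UMA_ZFLAG_VTOSLAB), as
	 * selected just below.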
1884 */ 1885 if ((format & UMA_ZFLAG_OFFPAGE) != 0 || 1886 (ipers - 1) * rsize >= PAGE_SIZE) { 1887 if ((keg->uk_flags & UMA_ZONE_NOTPAGE) != 0) 1888 format |= UMA_ZFLAG_HASH; 1889 else 1890 format |= UMA_ZFLAG_VTOSLAB; 1891 } 1892 keg->uk_ipers = ipers; 1893 keg->uk_rsize = rsize; 1894 keg->uk_flags |= format; 1895 keg->uk_ppera = pages; 1896 CTR6(KTR_UMA, "%s: keg=%s, flags=%#x, rsize=%u, ipers=%u, ppera=%u", 1897 __func__, keg->uk_name, keg->uk_flags, rsize, ipers, pages); 1898 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, 1899 ("%s: keg=%s, flags=0x%b, rsize=%u, ipers=%u, ppera=%u", __func__, 1900 keg->uk_name, keg->uk_flags, PRINT_UMA_ZFLAGS, rsize, ipers, 1901 pages)); 1902 } 1903 1904 /* 1905 * Keg header ctor. This initializes all fields, locks, etc. And inserts 1906 * the keg onto the global keg list. 1907 * 1908 * Arguments/Returns follow uma_ctor specifications 1909 * udata Actually uma_kctor_args 1910 */ 1911 static int 1912 keg_ctor(void *mem, int size, void *udata, int flags) 1913 { 1914 struct uma_kctor_args *arg = udata; 1915 uma_keg_t keg = mem; 1916 uma_zone_t zone; 1917 int i; 1918 1919 bzero(keg, size); 1920 keg->uk_size = arg->size; 1921 keg->uk_init = arg->uminit; 1922 keg->uk_fini = arg->fini; 1923 keg->uk_align = arg->align; 1924 keg->uk_reserve = 0; 1925 keg->uk_flags = arg->flags; 1926 1927 /* 1928 * We use a global round-robin policy by default. Zones with 1929 * UMA_ZONE_FIRSTTOUCH set will use first-touch instead, in which 1930 * case the iterator is never run. 1931 */ 1932 keg->uk_dr.dr_policy = DOMAINSET_RR(); 1933 keg->uk_dr.dr_iter = 0; 1934 1935 /* 1936 * The master zone is passed to us at keg-creation time. 1937 */ 1938 zone = arg->zone; 1939 keg->uk_name = zone->uz_name; 1940 1941 if (arg->flags & UMA_ZONE_VM) 1942 keg->uk_flags |= UMA_ZFLAG_CACHEONLY; 1943 1944 if (arg->flags & UMA_ZONE_ZINIT) 1945 keg->uk_init = zero_init; 1946 1947 if (arg->flags & UMA_ZONE_MALLOC) 1948 keg->uk_flags |= UMA_ZFLAG_VTOSLAB; 1949 1950 #ifndef SMP 1951 keg->uk_flags &= ~UMA_ZONE_PCPU; 1952 #endif 1953 1954 keg_layout(keg); 1955 1956 /* 1957 * Use a first-touch NUMA policy for all kegs that pmap_extract() 1958 * will work on with the exception of critical VM structures 1959 * necessary for paging. 1960 * 1961 * Zones may override the default by specifying either. 1962 */ 1963 #ifdef NUMA 1964 if ((keg->uk_flags & 1965 (UMA_ZFLAG_HASH | UMA_ZONE_VM | UMA_ZONE_ROUNDROBIN)) == 0) 1966 keg->uk_flags |= UMA_ZONE_FIRSTTOUCH; 1967 else if ((keg->uk_flags & UMA_ZONE_FIRSTTOUCH) == 0) 1968 keg->uk_flags |= UMA_ZONE_ROUNDROBIN; 1969 #endif 1970 1971 /* 1972 * If we haven't booted yet we need allocations to go through the 1973 * startup cache until the vm is ready. 1974 */ 1975 if (booted < BOOT_PAGEALLOC) 1976 keg->uk_allocf = startup_alloc; 1977 #ifdef UMA_MD_SMALL_ALLOC 1978 else if (keg->uk_ppera == 1) 1979 keg->uk_allocf = uma_small_alloc; 1980 #endif 1981 else if (keg->uk_flags & UMA_ZONE_PCPU) 1982 keg->uk_allocf = pcpu_page_alloc; 1983 else 1984 keg->uk_allocf = page_alloc; 1985 #ifdef UMA_MD_SMALL_ALLOC 1986 if (keg->uk_ppera == 1) 1987 keg->uk_freef = uma_small_free; 1988 else 1989 #endif 1990 if (keg->uk_flags & UMA_ZONE_PCPU) 1991 keg->uk_freef = pcpu_page_free; 1992 else 1993 keg->uk_freef = page_free; 1994 1995 /* 1996 * Initialize keg's locks. 
1997 */ 1998 for (i = 0; i < vm_ndomains; i++) 1999 KEG_LOCK_INIT(keg, i, (arg->flags & UMA_ZONE_MTXCLASS)); 2000 2001 /* 2002 * If we're putting the slab header in the actual page we need to 2003 * figure out where in each page it goes. See slab_sizeof 2004 * definition. 2005 */ 2006 if (!(keg->uk_flags & UMA_ZFLAG_OFFPAGE)) { 2007 size_t shsize; 2008 2009 shsize = slab_sizeof(keg->uk_ipers); 2010 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; 2011 /* 2012 * The only way the following is possible is if with our 2013 * UMA_ALIGN_PTR adjustments we are now bigger than 2014 * UMA_SLAB_SIZE. I haven't checked whether this is 2015 * mathematically possible for all cases, so we make 2016 * sure here anyway. 2017 */ 2018 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, 2019 ("zone %s ipers %d rsize %d size %d slab won't fit", 2020 zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); 2021 } 2022 2023 if (keg->uk_flags & UMA_ZFLAG_HASH) 2024 hash_alloc(&keg->uk_hash, 0); 2025 2026 CTR3(KTR_UMA, "keg_ctor %p zone %s(%p)", keg, zone->uz_name, zone); 2027 2028 LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); 2029 2030 rw_wlock(&uma_rwlock); 2031 LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); 2032 rw_wunlock(&uma_rwlock); 2033 return (0); 2034 } 2035 2036 static void 2037 zone_alloc_counters(uma_zone_t zone, void *unused) 2038 { 2039 2040 zone->uz_allocs = counter_u64_alloc(M_WAITOK); 2041 zone->uz_frees = counter_u64_alloc(M_WAITOK); 2042 zone->uz_fails = counter_u64_alloc(M_WAITOK); 2043 } 2044 2045 static void 2046 zone_alloc_sysctl(uma_zone_t zone, void *unused) 2047 { 2048 uma_zone_domain_t zdom; 2049 uma_domain_t dom; 2050 uma_keg_t keg; 2051 struct sysctl_oid *oid, *domainoid; 2052 int domains, i, cnt; 2053 static const char *nokeg = "cache zone"; 2054 char *c; 2055 2056 /* 2057 * Make a sysctl safe copy of the zone name by removing 2058 * any special characters and handling dups by appending 2059 * an index. 2060 */ 2061 if (zone->uz_namecnt != 0) { 2062 /* Count the number of decimal digits and '_' separator. */ 2063 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) 2064 cnt /= 10; 2065 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, 2066 M_UMA, M_WAITOK); 2067 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, 2068 zone->uz_namecnt); 2069 } else 2070 zone->uz_ctlname = strdup(zone->uz_name, M_UMA); 2071 for (c = zone->uz_ctlname; *c != '\0'; c++) 2072 if (strchr("./\\ -", *c) != NULL) 2073 *c = '_'; 2074 2075 /* 2076 * Basic parameters at the root. 2077 */ 2078 zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), 2079 OID_AUTO, zone->uz_ctlname, CTLFLAG_RD, NULL, ""); 2080 oid = zone->uz_oid; 2081 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2082 "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); 2083 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2084 "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, 2085 zone, 0, sysctl_handle_uma_zone_flags, "A", 2086 "Allocator configuration flags"); 2087 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2088 "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, 2089 "Desired per-cpu cache size"); 2090 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2091 "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, 2092 "Maximum allowed per-cpu cache size"); 2093 2094 /* 2095 * keg if present. 
2096 */ 2097 if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0) 2098 domains = vm_ndomains; 2099 else 2100 domains = 1; 2101 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2102 "keg", CTLFLAG_RD, NULL, ""); 2103 keg = zone->uz_keg; 2104 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { 2105 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2106 "name", CTLFLAG_RD, keg->uk_name, "Keg name"); 2107 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2108 "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, 2109 "Real object size with alignment"); 2110 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2111 "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, 2112 "pages per-slab allocation"); 2113 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2114 "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, 2115 "items available per-slab"); 2116 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2117 "align", CTLFLAG_RD, &keg->uk_align, 0, 2118 "item alignment mask"); 2119 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2120 "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 2121 keg, 0, sysctl_handle_uma_slab_efficiency, "I", 2122 "Slab utilization (100 - internal fragmentation %)"); 2123 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(oid), 2124 OID_AUTO, "domain", CTLFLAG_RD, NULL, ""); 2125 for (i = 0; i < domains; i++) { 2126 dom = &keg->uk_domain[i]; 2127 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), 2128 OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD, 2129 NULL, ""); 2130 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2131 "pages", CTLFLAG_RD, &dom->ud_pages, 0, 2132 "Total pages currently allocated from VM"); 2133 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2134 "free", CTLFLAG_RD, &dom->ud_free, 0, 2135 "items free in the slab layer"); 2136 } 2137 } else 2138 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2139 "name", CTLFLAG_RD, nokeg, "Keg name"); 2140 2141 /* 2142 * Information about zone limits. 2143 */ 2144 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2145 "limit", CTLFLAG_RD, NULL, ""); 2146 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2147 "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2148 zone, 0, sysctl_handle_uma_zone_items, "QU", 2149 "current number of allocated items if limit is set"); 2150 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2151 "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, 2152 "Maximum number of cached items"); 2153 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2154 "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, 2155 "Number of threads sleeping at limit"); 2156 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2157 "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, 2158 "Total zone limit sleeps"); 2159 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2160 "bucket_max", CTLFLAG_RD, &zone->uz_bkt_max, 0, 2161 "Maximum number of items in the bucket cache"); 2162 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2163 "bucket_cnt", CTLFLAG_RD, &zone->uz_bkt_count, 0, 2164 "Number of items in the bucket cache"); 2165 2166 /* 2167 * Per-domain zone information. 
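 *
 * uzd_imax and uzd_imin track the largest and smallest uzd_nitems seen
 * during the current measurement interval, and their difference is
 * exported as wss, the estimate of this domain's bucket working set
 * that reclaim tries to preserve when trimming.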
2168 */ 2169 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), 2170 OID_AUTO, "domain", CTLFLAG_RD, NULL, ""); 2171 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0) 2172 domains = 1; 2173 for (i = 0; i < domains; i++) { 2174 zdom = &zone->uz_domain[i]; 2175 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), 2176 OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD, NULL, ""); 2177 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2178 "nitems", CTLFLAG_RD, &zdom->uzd_nitems, 2179 "number of items in this domain"); 2180 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2181 "imax", CTLFLAG_RD, &zdom->uzd_imax, 2182 "maximum item count in this period"); 2183 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2184 "imin", CTLFLAG_RD, &zdom->uzd_imin, 2185 "minimum item count in this period"); 2186 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2187 "wss", CTLFLAG_RD, &zdom->uzd_wss, 2188 "Working set size"); 2189 } 2190 2191 /* 2192 * General statistics. 2193 */ 2194 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2195 "stats", CTLFLAG_RD, NULL, ""); 2196 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2197 "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 2198 zone, 1, sysctl_handle_uma_zone_cur, "I", 2199 "Current number of allocated items"); 2200 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2201 "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2202 zone, 0, sysctl_handle_uma_zone_allocs, "QU", 2203 "Total allocation calls"); 2204 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2205 "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2206 zone, 0, sysctl_handle_uma_zone_frees, "QU", 2207 "Total free calls"); 2208 SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2209 "fails", CTLFLAG_RD, &zone->uz_fails, 2210 "Number of allocation failures"); 2211 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2212 "xdomain", CTLFLAG_RD, &zone->uz_xdomain, 0, 2213 "Free calls from the wrong domain"); 2214 } 2215 2216 struct uma_zone_count { 2217 const char *name; 2218 int count; 2219 }; 2220 2221 static void 2222 zone_count(uma_zone_t zone, void *arg) 2223 { 2224 struct uma_zone_count *cnt; 2225 2226 cnt = arg; 2227 /* 2228 * Some zones are rapidly created with identical names and 2229 * destroyed out of order. This can lead to gaps in the count. 2230 * Use one greater than the maximum observed for this name. 2231 */ 2232 if (strcmp(zone->uz_name, cnt->name) == 0) 2233 cnt->count = MAX(cnt->count, 2234 zone->uz_namecnt + 1); 2235 } 2236 2237 static void 2238 zone_update_caches(uma_zone_t zone) 2239 { 2240 int i; 2241 2242 for (i = 0; i <= mp_maxid; i++) { 2243 cache_set_uz_size(&zone->uz_cpu[i], zone->uz_size); 2244 cache_set_uz_flags(&zone->uz_cpu[i], zone->uz_flags); 2245 } 2246 } 2247 2248 /* 2249 * Zone header ctor. This initializes all fields, locks, etc. 
2250 * 2251 * Arguments/Returns follow uma_ctor specifications 2252 * udata Actually uma_zctor_args 2253 */ 2254 static int 2255 zone_ctor(void *mem, int size, void *udata, int flags) 2256 { 2257 struct uma_zone_count cnt; 2258 struct uma_zctor_args *arg = udata; 2259 uma_zone_t zone = mem; 2260 uma_zone_t z; 2261 uma_keg_t keg; 2262 int i; 2263 2264 bzero(zone, size); 2265 zone->uz_name = arg->name; 2266 zone->uz_ctor = arg->ctor; 2267 zone->uz_dtor = arg->dtor; 2268 zone->uz_init = NULL; 2269 zone->uz_fini = NULL; 2270 zone->uz_sleeps = 0; 2271 zone->uz_xdomain = 0; 2272 zone->uz_bucket_size = 0; 2273 zone->uz_bucket_size_min = 0; 2274 zone->uz_bucket_size_max = BUCKET_MAX; 2275 zone->uz_flags = 0; 2276 zone->uz_warning = NULL; 2277 /* The domain structures follow the cpu structures. */ 2278 zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; 2279 zone->uz_bkt_max = ULONG_MAX; 2280 timevalclear(&zone->uz_ratecheck); 2281 2282 /* Count the number of duplicate names. */ 2283 cnt.name = arg->name; 2284 cnt.count = 0; 2285 zone_foreach(zone_count, &cnt); 2286 zone->uz_namecnt = cnt.count; 2287 ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); 2288 ZONE_CROSS_LOCK_INIT(zone); 2289 2290 for (i = 0; i < vm_ndomains; i++) 2291 TAILQ_INIT(&zone->uz_domain[i].uzd_buckets); 2292 2293 #ifdef INVARIANTS 2294 if (arg->uminit == trash_init && arg->fini == trash_fini) 2295 zone->uz_flags |= UMA_ZFLAG_TRASH | UMA_ZFLAG_CTORDTOR; 2296 #endif 2297 2298 /* 2299 * This is a pure cache zone, no kegs. 2300 */ 2301 if (arg->import) { 2302 KASSERT((arg->flags & UMA_ZFLAG_CACHE) != 0, 2303 ("zone_ctor: Import specified for non-cache zone.")); 2304 if (arg->flags & UMA_ZONE_VM) 2305 arg->flags |= UMA_ZFLAG_CACHEONLY; 2306 zone->uz_flags = arg->flags; 2307 zone->uz_size = arg->size; 2308 zone->uz_import = arg->import; 2309 zone->uz_release = arg->release; 2310 zone->uz_arg = arg->arg; 2311 rw_wlock(&uma_rwlock); 2312 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); 2313 rw_wunlock(&uma_rwlock); 2314 goto out; 2315 } 2316 2317 /* 2318 * Use the regular zone/keg/slab allocator. 2319 */ 2320 zone->uz_import = zone_import; 2321 zone->uz_release = zone_release; 2322 zone->uz_arg = zone; 2323 keg = arg->keg; 2324 2325 if (arg->flags & UMA_ZONE_SECONDARY) { 2326 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, 2327 ("Secondary zone requested UMA_ZFLAG_INTERNAL")); 2328 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); 2329 zone->uz_init = arg->uminit; 2330 zone->uz_fini = arg->fini; 2331 zone->uz_flags |= UMA_ZONE_SECONDARY; 2332 rw_wlock(&uma_rwlock); 2333 ZONE_LOCK(zone); 2334 LIST_FOREACH(z, &keg->uk_zones, uz_link) { 2335 if (LIST_NEXT(z, uz_link) == NULL) { 2336 LIST_INSERT_AFTER(z, zone, uz_link); 2337 break; 2338 } 2339 } 2340 ZONE_UNLOCK(zone); 2341 rw_wunlock(&uma_rwlock); 2342 } else if (keg == NULL) { 2343 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, 2344 arg->align, arg->flags)) == NULL) 2345 return (ENOMEM); 2346 } else { 2347 struct uma_kctor_args karg; 2348 int error; 2349 2350 /* We should only be here from uma_startup() */ 2351 karg.size = arg->size; 2352 karg.uminit = arg->uminit; 2353 karg.fini = arg->fini; 2354 karg.align = arg->align; 2355 karg.flags = arg->flags; 2356 karg.zone = zone; 2357 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, 2358 flags); 2359 if (error) 2360 return (error); 2361 } 2362 2363 /* Inherit properties from the keg. 
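 * The zone adopts the keg's item size along with the UMA_ZONE_INHERIT
 * and UMA_ZFLAG_INHERIT flag sets, so secondary zones see the same
 * slab layout as the master zone.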
*/ 2364 zone->uz_keg = keg; 2365 zone->uz_size = keg->uk_size; 2366 zone->uz_flags |= (keg->uk_flags & 2367 (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); 2368 2369 out: 2370 if (__predict_true(booted >= BOOT_RUNNING)) { 2371 zone_alloc_counters(zone, NULL); 2372 zone_alloc_sysctl(zone, NULL); 2373 } else { 2374 zone->uz_allocs = EARLY_COUNTER; 2375 zone->uz_frees = EARLY_COUNTER; 2376 zone->uz_fails = EARLY_COUNTER; 2377 } 2378 2379 KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != 2380 (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), 2381 ("Invalid zone flag combination")); 2382 if (arg->flags & UMA_ZFLAG_INTERNAL) 2383 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 2384 if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) 2385 zone->uz_bucket_size = BUCKET_MAX; 2386 else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) 2387 zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN; 2388 else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) 2389 zone->uz_bucket_size = 0; 2390 else 2391 zone->uz_bucket_size = bucket_select(zone->uz_size); 2392 zone->uz_bucket_size_min = zone->uz_bucket_size; 2393 if (zone->uz_dtor != NULL || zone->uz_ctor != NULL) 2394 zone->uz_flags |= UMA_ZFLAG_CTORDTOR; 2395 zone_update_caches(zone); 2396 2397 return (0); 2398 } 2399 2400 /* 2401 * Keg header dtor. This frees all data, destroys locks, frees the hash 2402 * table and removes the keg from the global list. 2403 * 2404 * Arguments/Returns follow uma_dtor specifications 2405 * udata unused 2406 */ 2407 static void 2408 keg_dtor(void *arg, int size, void *udata) 2409 { 2410 uma_keg_t keg; 2411 uint32_t free, pages; 2412 int i; 2413 2414 keg = (uma_keg_t)arg; 2415 free = pages = 0; 2416 for (i = 0; i < vm_ndomains; i++) { 2417 free += keg->uk_domain[i].ud_free; 2418 pages += keg->uk_domain[i].ud_pages; 2419 KEG_LOCK_FINI(keg, i); 2420 } 2421 if (free != 0) 2422 printf("Freed UMA keg (%s) was not empty (%u items). " 2423 " Lost %u pages of memory.\n", 2424 keg->uk_name ? keg->uk_name : "", 2425 free, pages); 2426 2427 hash_free(&keg->uk_hash); 2428 } 2429 2430 /* 2431 * Zone header dtor. 2432 * 2433 * Arguments/Returns follow uma_dtor specifications 2434 * udata unused 2435 */ 2436 static void 2437 zone_dtor(void *arg, int size, void *udata) 2438 { 2439 uma_zone_t zone; 2440 uma_keg_t keg; 2441 2442 zone = (uma_zone_t)arg; 2443 2444 sysctl_remove_oid(zone->uz_oid, 1, 1); 2445 2446 if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) 2447 cache_drain(zone); 2448 2449 rw_wlock(&uma_rwlock); 2450 LIST_REMOVE(zone, uz_link); 2451 rw_wunlock(&uma_rwlock); 2452 /* 2453 * XXX there are some races here where 2454 * the zone can be drained but zone lock 2455 * released and then refilled before we 2456 * remove it... we dont care for now 2457 */ 2458 zone_reclaim(zone, M_WAITOK, true); 2459 /* 2460 * We only destroy kegs from non secondary/non cache zones. 2461 */ 2462 if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { 2463 keg = zone->uz_keg; 2464 rw_wlock(&uma_rwlock); 2465 LIST_REMOVE(keg, uk_link); 2466 rw_wunlock(&uma_rwlock); 2467 zone_free_item(kegs, keg, NULL, SKIP_NONE); 2468 } 2469 counter_u64_free(zone->uz_allocs); 2470 counter_u64_free(zone->uz_frees); 2471 counter_u64_free(zone->uz_fails); 2472 free(zone->uz_ctlname, M_UMA); 2473 ZONE_LOCK_FINI(zone); 2474 ZONE_CROSS_LOCK_FINI(zone); 2475 } 2476 2477 /* 2478 * Traverses every zone in the system and calls a callback 2479 * 2480 * Arguments: 2481 * zfunc A pointer to a function which accepts a zone 2482 * as an argument. 
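 * arg An opaque pointer passed through to each zfunc call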
2483 * 2484 * Returns: 2485 * Nothing 2486 */ 2487 static void 2488 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) 2489 { 2490 uma_keg_t keg; 2491 uma_zone_t zone; 2492 2493 /* 2494 * Before BOOT_RUNNING we are guaranteed to be single 2495 * threaded, so locking isn't needed. Startup functions 2496 * are allowed to use M_WAITOK. 2497 */ 2498 if (__predict_true(booted >= BOOT_RUNNING)) 2499 rw_rlock(&uma_rwlock); 2500 LIST_FOREACH(keg, &uma_kegs, uk_link) { 2501 LIST_FOREACH(zone, &keg->uk_zones, uz_link) 2502 zfunc(zone, arg); 2503 } 2504 LIST_FOREACH(zone, &uma_cachezones, uz_link) 2505 zfunc(zone, arg); 2506 if (__predict_true(booted >= BOOT_RUNNING)) 2507 rw_runlock(&uma_rwlock); 2508 } 2509 2510 /* 2511 * Count how many pages do we need to bootstrap. VM supplies 2512 * its need in early zones in the argument, we add up our zones, 2513 * which consist of the UMA Slabs, UMA Hash and 9 Bucket zones. The 2514 * zone of zones and zone of kegs are accounted separately. 2515 */ 2516 #define UMA_BOOT_ZONES 12 2517 static int zsize, ksize; 2518 int 2519 uma_startup_count(int vm_zones) 2520 { 2521 int zones, pages; 2522 u_int zppera, zipers; 2523 u_int kppera, kipers; 2524 size_t space, size; 2525 2526 ksize = sizeof(struct uma_keg) + 2527 (sizeof(struct uma_domain) * vm_ndomains); 2528 ksize = roundup(ksize, UMA_SUPER_ALIGN); 2529 zsize = sizeof(struct uma_zone) + 2530 (sizeof(struct uma_cache) * (mp_maxid + 1)) + 2531 (sizeof(struct uma_zone_domain) * vm_ndomains); 2532 zsize = roundup(zsize, UMA_SUPER_ALIGN); 2533 2534 /* 2535 * Memory for the zone of kegs and its keg, and for zone 2536 * of zones. Allocated directly in uma_startup(). 2537 */ 2538 pages = howmany(zsize * 2 + ksize, PAGE_SIZE); 2539 2540 #ifdef UMA_MD_SMALL_ALLOC 2541 zones = UMA_BOOT_ZONES; 2542 #else 2543 zones = UMA_BOOT_ZONES + vm_zones; 2544 vm_zones = 0; 2545 #endif 2546 size = slab_sizeof(SLAB_MAX_SETSIZE); 2547 space = slab_space(SLAB_MAX_SETSIZE); 2548 2549 /* Memory for the rest of startup zones, UMA and VM, ... */ 2550 if (zsize > space) { 2551 /* See keg_large_init(). */ 2552 zppera = howmany(zsize + slab_sizeof(1), PAGE_SIZE); 2553 zipers = 1; 2554 zones += vm_zones; 2555 } else { 2556 zppera = 1; 2557 zipers = space / zsize; 2558 } 2559 pages += howmany(zones, zipers) * zppera; 2560 2561 /* ... and their kegs. Note that zone of zones allocates a keg! */ 2562 if (ksize > space) { 2563 /* See keg_large_init(). */ 2564 kppera = howmany(ksize + slab_sizeof(1), PAGE_SIZE); 2565 kipers = 1; 2566 } else { 2567 kppera = 1; 2568 kipers = space / ksize; 2569 } 2570 pages += howmany(zones + 1, kipers) * kppera; 2571 2572 /* 2573 * Allocate an additional slab for zones and kegs on NUMA 2574 * systems. The round-robin allocation policy will populate at 2575 * least one slab per-domain. 2576 */ 2577 pages += (vm_ndomains - 1) * (zppera + kppera); 2578 2579 return (pages); 2580 } 2581 2582 void 2583 uma_startup(void *mem, int npages) 2584 { 2585 struct uma_zctor_args args; 2586 uma_keg_t masterkeg; 2587 uintptr_t m; 2588 2589 #ifdef DIAGNOSTIC 2590 printf("Entering %s with %d boot pages configured\n", __func__, npages); 2591 #endif 2592 2593 rw_init(&uma_rwlock, "UMA lock"); 2594 2595 /* Use bootpages memory for the zone of zones and zone of kegs. 
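 * The carve-out below takes, in order, the zone of zones, the zone of
 * kegs and the master keg, rounds up to a page boundary, and leaves the
 * remainder to startup_alloc() via bootmem and boot_pages.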
*/ 2596 m = (uintptr_t)mem; 2597 zones = (uma_zone_t)m; 2598 m += zsize; 2599 kegs = (uma_zone_t)m; 2600 m += zsize; 2601 masterkeg = (uma_keg_t)m; 2602 m += ksize; 2603 m = roundup(m, PAGE_SIZE); 2604 npages -= (m - (uintptr_t)mem) / PAGE_SIZE; 2605 mem = (void *)m; 2606 2607 /* "manually" create the initial zone */ 2608 memset(&args, 0, sizeof(args)); 2609 args.name = "UMA Kegs"; 2610 args.size = ksize; 2611 args.ctor = keg_ctor; 2612 args.dtor = keg_dtor; 2613 args.uminit = zero_init; 2614 args.fini = NULL; 2615 args.keg = masterkeg; 2616 args.align = UMA_SUPER_ALIGN - 1; 2617 args.flags = UMA_ZFLAG_INTERNAL; 2618 zone_ctor(kegs, zsize, &args, M_WAITOK); 2619 2620 bootmem = mem; 2621 boot_pages = npages; 2622 2623 args.name = "UMA Zones"; 2624 args.size = zsize; 2625 args.ctor = zone_ctor; 2626 args.dtor = zone_dtor; 2627 args.uminit = zero_init; 2628 args.fini = NULL; 2629 args.keg = NULL; 2630 args.align = UMA_SUPER_ALIGN - 1; 2631 args.flags = UMA_ZFLAG_INTERNAL; 2632 zone_ctor(zones, zsize, &args, M_WAITOK); 2633 2634 /* Now make zones for slab headers */ 2635 slabzones[0] = uma_zcreate("UMA Slabs 0", SLABZONE0_SIZE, 2636 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2637 slabzones[1] = uma_zcreate("UMA Slabs 1", SLABZONE1_SIZE, 2638 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2639 2640 hashzone = uma_zcreate("UMA Hash", 2641 sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, 2642 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2643 2644 booted = BOOT_STRAPPED; 2645 } 2646 2647 void 2648 uma_startup1(void) 2649 { 2650 2651 #ifdef DIAGNOSTIC 2652 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2653 #endif 2654 booted = BOOT_PAGEALLOC; 2655 } 2656 2657 void 2658 uma_startup2(void) 2659 { 2660 2661 #ifdef DIAGNOSTIC 2662 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2663 #endif 2664 sx_init(&uma_reclaim_lock, "umareclaim"); 2665 bucket_init(); 2666 booted = BOOT_BUCKETS; 2667 bucket_enable(); 2668 } 2669 2670 static void 2671 uma_startup3(void) 2672 { 2673 2674 #ifdef INVARIANTS 2675 TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); 2676 uma_dbg_cnt = counter_u64_alloc(M_WAITOK); 2677 uma_skip_cnt = counter_u64_alloc(M_WAITOK); 2678 #endif 2679 zone_foreach(zone_alloc_counters, NULL); 2680 zone_foreach(zone_alloc_sysctl, NULL); 2681 callout_init(&uma_callout, 1); 2682 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 2683 booted = BOOT_RUNNING; 2684 2685 EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL, 2686 EVENTHANDLER_PRI_FIRST); 2687 } 2688 2689 static void 2690 uma_shutdown(void) 2691 { 2692 2693 booted = BOOT_SHUTDOWN; 2694 } 2695 2696 static uma_keg_t 2697 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, 2698 int align, uint32_t flags) 2699 { 2700 struct uma_kctor_args args; 2701 2702 args.size = size; 2703 args.uminit = uminit; 2704 args.fini = fini; 2705 args.align = (align == UMA_ALIGN_CACHE) ? 
uma_align_cache : align; 2706 args.flags = flags; 2707 args.zone = zone; 2708 return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); 2709 } 2710 2711 /* Public functions */ 2712 /* See uma.h */ 2713 void 2714 uma_set_align(int align) 2715 { 2716 2717 if (align != UMA_ALIGN_CACHE) 2718 uma_align_cache = align; 2719 } 2720 2721 /* See uma.h */ 2722 uma_zone_t 2723 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, 2724 uma_init uminit, uma_fini fini, int align, uint32_t flags) 2725 2726 { 2727 struct uma_zctor_args args; 2728 uma_zone_t res; 2729 bool locked; 2730 2731 KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", 2732 align, name)); 2733 2734 /* This stuff is essential for the zone ctor */ 2735 memset(&args, 0, sizeof(args)); 2736 args.name = name; 2737 args.size = size; 2738 args.ctor = ctor; 2739 args.dtor = dtor; 2740 args.uminit = uminit; 2741 args.fini = fini; 2742 #ifdef INVARIANTS 2743 /* 2744 * Inject procedures which check for memory use after free if we are 2745 * allowed to scramble the memory while it is not allocated. This 2746 * requires that: UMA is actually able to access the memory, no init 2747 * or fini procedures, no dependency on the initial value of the 2748 * memory, and no (legitimate) use of the memory after free. Note, 2749 * the ctor and dtor do not need to be empty. 2750 */ 2751 if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOTOUCH | 2752 UMA_ZONE_NOFREE))) && uminit == NULL && fini == NULL) { 2753 args.uminit = trash_init; 2754 args.fini = trash_fini; 2755 } 2756 #endif 2757 args.align = align; 2758 args.flags = flags; 2759 args.keg = NULL; 2760 2761 if (booted < BOOT_BUCKETS) { 2762 locked = false; 2763 } else { 2764 sx_slock(&uma_reclaim_lock); 2765 locked = true; 2766 } 2767 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2768 if (locked) 2769 sx_sunlock(&uma_reclaim_lock); 2770 return (res); 2771 } 2772 2773 /* See uma.h */ 2774 uma_zone_t 2775 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, 2776 uma_init zinit, uma_fini zfini, uma_zone_t master) 2777 { 2778 struct uma_zctor_args args; 2779 uma_keg_t keg; 2780 uma_zone_t res; 2781 bool locked; 2782 2783 keg = master->uz_keg; 2784 memset(&args, 0, sizeof(args)); 2785 args.name = name; 2786 args.size = keg->uk_size; 2787 args.ctor = ctor; 2788 args.dtor = dtor; 2789 args.uminit = zinit; 2790 args.fini = zfini; 2791 args.align = keg->uk_align; 2792 args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; 2793 args.keg = keg; 2794 2795 if (booted < BOOT_BUCKETS) { 2796 locked = false; 2797 } else { 2798 sx_slock(&uma_reclaim_lock); 2799 locked = true; 2800 } 2801 /* XXX Attaches only one keg of potentially many. 
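 * The new zone is linked at the tail of the master keg's uk_zones list
 * by zone_ctor(), so both zones draw items from the same slabs while
 * applying their own ctor/dtor and init/fini hooks.
 *
 * Illustrative use (hypothetical names, not part of this file):
 *
 *	foo_zone = uma_zsecond_create("foo", foo_ctor, foo_dtor,
 *	    NULL, NULL, raw_zone);
 *
 * creates a second view of raw_zone's keg whose items are additionally
 * run through foo_ctor and foo_dtor on allocation and free.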
*/ 2802 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2803 if (locked) 2804 sx_sunlock(&uma_reclaim_lock); 2805 return (res); 2806 } 2807 2808 /* See uma.h */ 2809 uma_zone_t 2810 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, 2811 uma_init zinit, uma_fini zfini, uma_import zimport, 2812 uma_release zrelease, void *arg, int flags) 2813 { 2814 struct uma_zctor_args args; 2815 2816 memset(&args, 0, sizeof(args)); 2817 args.name = name; 2818 args.size = size; 2819 args.ctor = ctor; 2820 args.dtor = dtor; 2821 args.uminit = zinit; 2822 args.fini = zfini; 2823 args.import = zimport; 2824 args.release = zrelease; 2825 args.arg = arg; 2826 args.align = 0; 2827 args.flags = flags | UMA_ZFLAG_CACHE; 2828 2829 return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); 2830 } 2831 2832 /* See uma.h */ 2833 void 2834 uma_zdestroy(uma_zone_t zone) 2835 { 2836 2837 /* 2838 * Large slabs are expensive to reclaim, so don't bother doing 2839 * unnecessary work if we're shutting down. 2840 */ 2841 if (booted == BOOT_SHUTDOWN && 2842 zone->uz_fini == NULL && zone->uz_release == zone_release) 2843 return; 2844 sx_slock(&uma_reclaim_lock); 2845 zone_free_item(zones, zone, NULL, SKIP_NONE); 2846 sx_sunlock(&uma_reclaim_lock); 2847 } 2848 2849 void 2850 uma_zwait(uma_zone_t zone) 2851 { 2852 void *item; 2853 2854 item = uma_zalloc_arg(zone, NULL, M_WAITOK); 2855 uma_zfree(zone, item); 2856 } 2857 2858 void * 2859 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) 2860 { 2861 void *item; 2862 #ifdef SMP 2863 int i; 2864 2865 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2866 #endif 2867 item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); 2868 if (item != NULL && (flags & M_ZERO)) { 2869 #ifdef SMP 2870 for (i = 0; i <= mp_maxid; i++) 2871 bzero(zpcpu_get_cpu(item, i), zone->uz_size); 2872 #else 2873 bzero(item, zone->uz_size); 2874 #endif 2875 } 2876 return (item); 2877 } 2878 2879 /* 2880 * A stub while both regular and pcpu cases are identical. 
2881 */ 2882 void 2883 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata) 2884 { 2885 2886 #ifdef SMP 2887 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2888 #endif 2889 uma_zfree_arg(zone, item, udata); 2890 } 2891 2892 #ifdef INVARIANTS 2893 #define UMA_ALWAYS_CTORDTOR 1 2894 #else 2895 #define UMA_ALWAYS_CTORDTOR 0 2896 #endif 2897 2898 static void * 2899 item_ctor(uma_zone_t zone, int size, void *udata, int flags, void *item) 2900 { 2901 #ifdef INVARIANTS 2902 bool skipdbg; 2903 2904 skipdbg = uma_dbg_zskip(zone, item); 2905 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 2906 zone->uz_ctor != trash_ctor) 2907 trash_ctor(item, size, udata, flags); 2908 #endif 2909 if (__predict_false(zone->uz_ctor != NULL) && 2910 zone->uz_ctor(item, size, udata, flags) != 0) { 2911 counter_u64_add(zone->uz_fails, 1); 2912 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); 2913 return (NULL); 2914 } 2915 #ifdef INVARIANTS 2916 if (!skipdbg) 2917 uma_dbg_alloc(zone, NULL, item); 2918 #endif 2919 if (flags & M_ZERO) 2920 bzero(item, size); 2921 2922 return (item); 2923 } 2924 2925 static inline void 2926 item_dtor(uma_zone_t zone, void *item, int size, void *udata, 2927 enum zfreeskip skip) 2928 { 2929 #ifdef INVARIANTS 2930 bool skipdbg; 2931 2932 skipdbg = uma_dbg_zskip(zone, item); 2933 if (skip == SKIP_NONE && !skipdbg) { 2934 if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) 2935 uma_dbg_free(zone, udata, item); 2936 else 2937 uma_dbg_free(zone, NULL, item); 2938 } 2939 #endif 2940 if (__predict_true(skip < SKIP_DTOR)) { 2941 if (zone->uz_dtor != NULL) 2942 zone->uz_dtor(item, size, udata); 2943 #ifdef INVARIANTS 2944 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 2945 zone->uz_dtor != trash_dtor) 2946 trash_dtor(item, size, udata); 2947 #endif 2948 } 2949 } 2950 2951 /* See uma.h */ 2952 void * 2953 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) 2954 { 2955 uma_cache_bucket_t bucket; 2956 uma_cache_t cache; 2957 void *item; 2958 int domain, size, uz_flags; 2959 2960 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2961 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2962 2963 /* This is the fast path allocation */ 2964 CTR3(KTR_UMA, "uma_zalloc_arg zone %s(%p) flags %d", zone->uz_name, 2965 zone, flags); 2966 2967 #ifdef WITNESS 2968 if (flags & M_WAITOK) { 2969 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2970 "uma_zalloc_arg: zone \"%s\"", zone->uz_name); 2971 } 2972 #endif 2973 2974 #ifdef INVARIANTS 2975 KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC")); 2976 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2977 ("uma_zalloc_arg: called with spinlock or critical section held")); 2978 if (zone->uz_flags & UMA_ZONE_PCPU) 2979 KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone " 2980 "with M_ZERO passed")); 2981 #endif 2982 2983 #ifdef DEBUG_MEMGUARD 2984 if (memguard_cmp_zone(zone)) { 2985 item = memguard_alloc(zone->uz_size, flags); 2986 if (item != NULL) { 2987 if (zone->uz_init != NULL && 2988 zone->uz_init(item, zone->uz_size, flags) != 0) 2989 return (NULL); 2990 if (zone->uz_ctor != NULL && 2991 zone->uz_ctor(item, zone->uz_size, udata, 2992 flags) != 0) { 2993 counter_u64_add(zone->uz_fails, 1); 2994 zone->uz_fini(item, zone->uz_size); 2995 return (NULL); 2996 } 2997 return (item); 2998 } 2999 /* This is unfortunate but should not be fatal. */ 3000 } 3001 #endif 3002 /* 3003 * If possible, allocate from the per-CPU cache. 
There are two
3004 * requirements for safe access to the per-CPU cache: (1) the thread
3005 * accessing the cache must not be preempted or yield during access,
3006 * and (2) the thread must not migrate CPUs without switching which
3007 * cache it accesses. We rely on a critical section to prevent
3008 * preemption and migration. We release the critical section in
3009 * order to acquire the zone mutex if we are unable to allocate from
3010 * the current cache; when we re-acquire the critical section, we
3011 * must detect and handle migration if it has occurred.
3012 */
3013 critical_enter();
3014 do {
3015 cache = &zone->uz_cpu[curcpu];
3016 bucket = &cache->uc_allocbucket;
3017 size = cache_uz_size(cache);
3018 uz_flags = cache_uz_flags(cache);
3019 if (__predict_true(bucket->ucb_cnt != 0)) {
3020 item = cache_bucket_pop(cache, bucket);
3021 critical_exit();
3022 if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
3023 UMA_ALWAYS_CTORDTOR))
3024 return (item_ctor(zone, size, udata, flags, item));
3025 if (flags & M_ZERO)
3026 bzero(item, size);
3027 return (item);
3028 }
3029 } while (cache_alloc(zone, cache, udata, flags));
3030 critical_exit();
3031 
3032 /*
3033 * We cannot get a bucket so try to return a single item.
3034 */
3035 if (uz_flags & UMA_ZONE_FIRSTTOUCH)
3036 domain = PCPU_GET(domain);
3037 else
3038 domain = UMA_ANYDOMAIN;
3039 return (zone_alloc_item(zone, udata, domain, flags));
3040 }
3041 
3042 /*
3043 * Replenish an alloc bucket and possibly restore an old one. Called in
3044 * a critical section. Returns in a critical section.
3045 *
3046 * A false return value indicates an allocation failure.
3047 * A true return value indicates success and the caller should retry.
3048 */
3049 static __noinline bool
3050 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
3051 {
3052 uma_zone_domain_t zdom;
3053 uma_bucket_t bucket;
3054 int domain;
3055 bool lockfail;
3056 
3057 CRITICAL_ASSERT(curthread);
3058 
3059 /*
3060 * If we have run out of items in our alloc bucket, see
3061 * if we can switch with the free bucket.
3062 */
3063 if (cache->uc_freebucket.ucb_cnt != 0) {
3064 cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket);
3065 return (true);
3066 }
3067 
3068 /*
3069 * Discard any empty allocation bucket while we hold no locks.
3070 */
3071 bucket = cache_bucket_unload_alloc(cache);
3072 critical_exit();
3073 if (bucket != NULL)
3074 bucket_free(zone, bucket, udata);
3075 
3076 /* Short-circuit for zones without buckets and low memory. */
3077 if (zone->uz_bucket_size == 0 || bucketdisable) {
3078 critical_enter();
3079 return (false);
3080 }
3081 
3082 /*
3083 * The attempt to retrieve an item from the per-CPU cache has failed, so
3084 * we must go back to the zone. This requires the zone lock, so we
3085 * must drop the critical section, then re-acquire it when we go back
3086 * to the cache. Since the critical section is released, we may be
3087 * preempted or migrate. As such, make sure not to maintain any
3088 * thread-local state specific to the cache from prior to releasing
3089 * the critical section.
3090 */
3091 lockfail = false;
3092 if (ZONE_TRYLOCK(zone) == 0) {
3093 /* Record contention to size the buckets. */
3094 ZONE_LOCK(zone);
3095 lockfail = true;
3096 }
3097 
3098 /* See if we lost the race to fill the cache.
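 * While the critical section was dropped to take the zone lock, another
 * thread may have refilled this CPU's alloc bucket, or we may have
 * migrated to a CPU whose cache is already populated; in either case we
 * simply retry the fast path.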
*/ 3099 critical_enter(); 3100 cache = &zone->uz_cpu[curcpu]; 3101 if (cache->uc_allocbucket.ucb_bucket != NULL) { 3102 ZONE_UNLOCK(zone); 3103 return (true); 3104 } 3105 3106 /* 3107 * Check the zone's cache of buckets. 3108 */ 3109 if (zone->uz_flags & UMA_ZONE_FIRSTTOUCH) { 3110 domain = PCPU_GET(domain); 3111 zdom = &zone->uz_domain[domain]; 3112 } else { 3113 domain = UMA_ANYDOMAIN; 3114 zdom = &zone->uz_domain[0]; 3115 } 3116 3117 if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) { 3118 ZONE_UNLOCK(zone); 3119 KASSERT(bucket->ub_cnt != 0, 3120 ("uma_zalloc_arg: Returning an empty bucket.")); 3121 cache_bucket_load_alloc(cache, bucket); 3122 return (true); 3123 } 3124 /* We are no longer associated with this CPU. */ 3125 critical_exit(); 3126 3127 /* 3128 * We bump the uz count when the cache size is insufficient to 3129 * handle the working set. 3130 */ 3131 if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) 3132 zone->uz_bucket_size++; 3133 ZONE_UNLOCK(zone); 3134 3135 /* 3136 * Fill a bucket and attempt to use it as the alloc bucket. 3137 */ 3138 bucket = zone_alloc_bucket(zone, udata, domain, flags); 3139 CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", 3140 zone->uz_name, zone, bucket); 3141 if (bucket == NULL) { 3142 critical_enter(); 3143 return (false); 3144 } 3145 3146 /* 3147 * See if we lost the race or were migrated. Cache the 3148 * initialized bucket to make this less likely or claim 3149 * the memory directly. 3150 */ 3151 ZONE_LOCK(zone); 3152 critical_enter(); 3153 cache = &zone->uz_cpu[curcpu]; 3154 if (cache->uc_allocbucket.ucb_bucket == NULL && 3155 ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) == 0 || 3156 domain == PCPU_GET(domain))) { 3157 cache_bucket_load_alloc(cache, bucket); 3158 zdom->uzd_imax += bucket->ub_cnt; 3159 } else if (zone->uz_bkt_count >= zone->uz_bkt_max) { 3160 critical_exit(); 3161 ZONE_UNLOCK(zone); 3162 bucket_drain(zone, bucket); 3163 bucket_free(zone, bucket, udata); 3164 critical_enter(); 3165 return (true); 3166 } else 3167 zone_put_bucket(zone, zdom, bucket, false); 3168 ZONE_UNLOCK(zone); 3169 return (true); 3170 } 3171 3172 void * 3173 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) 3174 { 3175 3176 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3177 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3178 3179 /* This is the fast path allocation */ 3180 CTR4(KTR_UMA, "uma_zalloc_domain zone %s(%p) domain %d flags %d", 3181 zone->uz_name, zone, domain, flags); 3182 3183 if (flags & M_WAITOK) { 3184 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 3185 "uma_zalloc_domain: zone \"%s\"", zone->uz_name); 3186 } 3187 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3188 ("uma_zalloc_domain: called with spinlock or critical section held")); 3189 3190 return (zone_alloc_item(zone, udata, domain, flags)); 3191 } 3192 3193 /* 3194 * Find a slab with some space. Prefer slabs that are partially used over those 3195 * that are totally full. This helps to reduce fragmentation. 3196 * 3197 * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check 3198 * only 'domain'. 
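 *
 * With 'rr' the search wraps modulo vm_ndomains; e.g. starting at
 * domain 2 on a four-domain system the domains are visited in the
 * order 2, 3, 0, 1.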
3199 */ 3200 static uma_slab_t 3201 keg_first_slab(uma_keg_t keg, int domain, bool rr) 3202 { 3203 uma_domain_t dom; 3204 uma_slab_t slab; 3205 int start; 3206 3207 KASSERT(domain >= 0 && domain < vm_ndomains, 3208 ("keg_first_slab: domain %d out of range", domain)); 3209 KEG_LOCK_ASSERT(keg, domain); 3210 3211 slab = NULL; 3212 start = domain; 3213 do { 3214 dom = &keg->uk_domain[domain]; 3215 if (!LIST_EMPTY(&dom->ud_part_slab)) 3216 return (LIST_FIRST(&dom->ud_part_slab)); 3217 if (!LIST_EMPTY(&dom->ud_free_slab)) { 3218 slab = LIST_FIRST(&dom->ud_free_slab); 3219 LIST_REMOVE(slab, us_link); 3220 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3221 return (slab); 3222 } 3223 if (rr) 3224 domain = (domain + 1) % vm_ndomains; 3225 } while (domain != start); 3226 3227 return (NULL); 3228 } 3229 3230 /* 3231 * Fetch an existing slab from a free or partial list. Returns with the 3232 * keg domain lock held if a slab was found or unlocked if not. 3233 */ 3234 static uma_slab_t 3235 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) 3236 { 3237 uma_slab_t slab; 3238 uint32_t reserve; 3239 3240 /* HASH has a single free list. */ 3241 if ((keg->uk_flags & UMA_ZFLAG_HASH) != 0) 3242 domain = 0; 3243 3244 KEG_LOCK(keg, domain); 3245 reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; 3246 if (keg->uk_domain[domain].ud_free <= reserve || 3247 (slab = keg_first_slab(keg, domain, rr)) == NULL) { 3248 KEG_UNLOCK(keg, domain); 3249 return (NULL); 3250 } 3251 return (slab); 3252 } 3253 3254 static uma_slab_t 3255 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) 3256 { 3257 struct vm_domainset_iter di; 3258 uma_slab_t slab; 3259 int aflags, domain; 3260 bool rr; 3261 3262 restart: 3263 /* 3264 * Use the keg's policy if upper layers haven't already specified a 3265 * domain (as happens with first-touch zones). 3266 * 3267 * To avoid races we run the iterator with the keg lock held, but that 3268 * means that we cannot allow the vm_domainset layer to sleep. Thus, 3269 * clear M_WAITOK and handle low memory conditions locally. 3270 */ 3271 rr = rdomain == UMA_ANYDOMAIN; 3272 if (rr) { 3273 aflags = (flags & ~M_WAITOK) | M_NOWAIT; 3274 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 3275 &aflags); 3276 } else { 3277 aflags = flags; 3278 domain = rdomain; 3279 } 3280 3281 for (;;) { 3282 slab = keg_fetch_free_slab(keg, domain, rr, flags); 3283 if (slab != NULL) 3284 return (slab); 3285 3286 /* 3287 * M_NOVM means don't ask at all! 3288 */ 3289 if (flags & M_NOVM) 3290 break; 3291 3292 slab = keg_alloc_slab(keg, zone, domain, flags, aflags); 3293 if (slab != NULL) 3294 return (slab); 3295 if (!rr && (flags & M_WAITOK) == 0) 3296 break; 3297 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { 3298 if ((flags & M_WAITOK) != 0) { 3299 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); 3300 goto restart; 3301 } 3302 break; 3303 } 3304 } 3305 3306 /* 3307 * We might not have been able to get a slab but another cpu 3308 * could have while we were unlocked. Check again before we 3309 * fail. 
3310 */ 3311 if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) 3312 return (slab); 3313 3314 return (NULL); 3315 } 3316 3317 static void * 3318 slab_alloc_item(uma_keg_t keg, uma_slab_t slab) 3319 { 3320 uma_domain_t dom; 3321 void *item; 3322 int freei; 3323 3324 KEG_LOCK_ASSERT(keg, slab->us_domain); 3325 3326 dom = &keg->uk_domain[slab->us_domain]; 3327 freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; 3328 BIT_CLR(keg->uk_ipers, freei, &slab->us_free); 3329 item = slab_item(slab, keg, freei); 3330 slab->us_freecount--; 3331 dom->ud_free--; 3332 3333 /* Move this slab to the full list */ 3334 if (slab->us_freecount == 0) { 3335 LIST_REMOVE(slab, us_link); 3336 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); 3337 } 3338 3339 return (item); 3340 } 3341 3342 static int 3343 zone_import(void *arg, void **bucket, int max, int domain, int flags) 3344 { 3345 uma_domain_t dom; 3346 uma_zone_t zone; 3347 uma_slab_t slab; 3348 uma_keg_t keg; 3349 #ifdef NUMA 3350 int stripe; 3351 #endif 3352 int i; 3353 3354 zone = arg; 3355 slab = NULL; 3356 keg = zone->uz_keg; 3357 /* Try to keep the buckets totally full */ 3358 for (i = 0; i < max; ) { 3359 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) 3360 break; 3361 #ifdef NUMA 3362 stripe = howmany(max, vm_ndomains); 3363 #endif 3364 dom = &keg->uk_domain[slab->us_domain]; 3365 while (slab->us_freecount && i < max) { 3366 bucket[i++] = slab_alloc_item(keg, slab); 3367 if (dom->ud_free <= keg->uk_reserve) 3368 break; 3369 #ifdef NUMA 3370 /* 3371 * If the zone is striped we pick a new slab for every 3372 * N allocations. Eliminating this conditional will 3373 * instead pick a new domain for each bucket rather 3374 * than stripe within each bucket. The current option 3375 * produces more fragmentation and requires more cpu 3376 * time but yields better distribution. 3377 */ 3378 if ((zone->uz_flags & UMA_ZONE_ROUNDROBIN) != 0 && 3379 vm_ndomains > 1 && --stripe == 0) 3380 break; 3381 #endif 3382 } 3383 KEG_UNLOCK(keg, slab->us_domain); 3384 /* Don't block if we allocated any successfully. */ 3385 flags &= ~M_WAITOK; 3386 flags |= M_NOWAIT; 3387 } 3388 3389 return i; 3390 } 3391 3392 static int 3393 zone_alloc_limit_hard(uma_zone_t zone, int count, int flags) 3394 { 3395 uint64_t old, new, total, max; 3396 3397 /* 3398 * The hard case. We're going to sleep because there were existing 3399 * sleepers or because we ran out of items. This routine enforces 3400 * fairness by keeping fifo order. 3401 * 3402 * First release our ill gotten gains and make some noise. 3403 */ 3404 for (;;) { 3405 zone_free_limit(zone, count); 3406 zone_log_warning(zone); 3407 zone_maxaction(zone); 3408 if (flags & M_NOWAIT) 3409 return (0); 3410 3411 /* 3412 * We need to allocate an item or set ourself as a sleeper 3413 * while the sleepq lock is held to avoid wakeup races. This 3414 * is essentially a home rolled semaphore. 3415 */ 3416 sleepq_lock(&zone->uz_max_items); 3417 old = zone->uz_items; 3418 do { 3419 MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX); 3420 /* Cache the max since we will evaluate twice. */ 3421 max = zone->uz_max_items; 3422 if (UZ_ITEMS_SLEEPERS(old) != 0 || 3423 UZ_ITEMS_COUNT(old) >= max) 3424 new = old + UZ_ITEMS_SLEEPER; 3425 else 3426 new = old + MIN(count, max - old); 3427 } while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0); 3428 3429 /* We may have successfully allocated under the sleepq lock. 
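 * That is, the fcmpset loop above added our count rather than
 * registering us as a sleeper, and new - old is the number of items we
 * were granted.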
*/ 3430 if (UZ_ITEMS_SLEEPERS(new) == 0) { 3431 sleepq_release(&zone->uz_max_items); 3432 return (new - old); 3433 } 3434 3435 /* 3436 * This is in a different cacheline from uz_items so that we 3437 * don't constantly invalidate the fastpath cacheline when we 3438 * adjust item counts. This could be limited to toggling on 3439 * transitions. 3440 */ 3441 atomic_add_32(&zone->uz_sleepers, 1); 3442 atomic_add_64(&zone->uz_sleeps, 1); 3443 3444 /* 3445 * We have added ourselves as a sleeper. The sleepq lock 3446 * protects us from wakeup races. Sleep now and then retry. 3447 */ 3448 sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0); 3449 sleepq_wait(&zone->uz_max_items, PVM); 3450 3451 /* 3452 * After wakeup, remove ourselves as a sleeper and try 3453 * again. We no longer have the sleepq lock for protection. 3454 * 3455 * Subract ourselves as a sleeper while attempting to add 3456 * our count. 3457 */ 3458 atomic_subtract_32(&zone->uz_sleepers, 1); 3459 old = atomic_fetchadd_64(&zone->uz_items, 3460 -(UZ_ITEMS_SLEEPER - count)); 3461 /* We're no longer a sleeper. */ 3462 old -= UZ_ITEMS_SLEEPER; 3463 3464 /* 3465 * If we're still at the limit, restart. Notably do not 3466 * block on other sleepers. Cache the max value to protect 3467 * against changes via sysctl. 3468 */ 3469 total = UZ_ITEMS_COUNT(old); 3470 max = zone->uz_max_items; 3471 if (total >= max) 3472 continue; 3473 /* Truncate if necessary, otherwise wake other sleepers. */ 3474 if (total + count > max) { 3475 zone_free_limit(zone, total + count - max); 3476 count = max - total; 3477 } else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0) 3478 wakeup_one(&zone->uz_max_items); 3479 3480 return (count); 3481 } 3482 } 3483 3484 /* 3485 * Allocate 'count' items from our max_items limit. Returns the number 3486 * available. If M_NOWAIT is not specified it will sleep until at least 3487 * one item can be allocated. 3488 */ 3489 static int 3490 zone_alloc_limit(uma_zone_t zone, int count, int flags) 3491 { 3492 uint64_t old; 3493 uint64_t max; 3494 3495 max = zone->uz_max_items; 3496 MPASS(max > 0); 3497 3498 /* 3499 * We expect normal allocations to succeed with a simple 3500 * fetchadd. 3501 */ 3502 old = atomic_fetchadd_64(&zone->uz_items, count); 3503 if (__predict_true(old + count <= max)) 3504 return (count); 3505 3506 /* 3507 * If we had some items and no sleepers just return the 3508 * truncated value. We have to release the excess space 3509 * though because that may wake sleepers who weren't woken 3510 * because we were temporarily over the limit. 3511 */ 3512 if (old < max) { 3513 zone_free_limit(zone, (old + count) - max); 3514 return (max - old); 3515 } 3516 return (zone_alloc_limit_hard(zone, count, flags)); 3517 } 3518 3519 /* 3520 * Free a number of items back to the limit. 3521 */ 3522 static void 3523 zone_free_limit(uma_zone_t zone, int count) 3524 { 3525 uint64_t old; 3526 3527 MPASS(count > 0); 3528 3529 /* 3530 * In the common case we either have no sleepers or 3531 * are still over the limit and can just return. 3532 */ 3533 old = atomic_fetchadd_64(&zone->uz_items, -count); 3534 if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 || 3535 UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items)) 3536 return; 3537 3538 /* 3539 * Moderate the rate of wakeups. Sleepers will continue 3540 * to generate wakeups if necessary. 
3541 */
3542 wakeup_one(&zone->uz_max_items);
3543 }
3544 
3545 static uma_bucket_t
3546 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
3547 {
3548 uma_bucket_t bucket;
3549 int maxbucket, cnt;
3550 
3551 CTR3(KTR_UMA, "zone_alloc_bucket zone %s(%p) domain %d", zone->uz_name,
3552 zone, domain);
3553 
3554 /* Avoid allocs targeting empty domains. */
3555 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3556 domain = UMA_ANYDOMAIN;
3557 
3558 if (zone->uz_max_items > 0)
3559 maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
3560 M_NOWAIT);
3561 else
3562 maxbucket = zone->uz_bucket_size;
3563 if (maxbucket == 0)
3564 return (NULL);
3565 
3566 /* Don't wait for buckets, preserve caller's NOVM setting. */
3567 bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
3568 if (bucket == NULL) {
3569 cnt = 0;
3570 goto out;
3571 }
3572 
3573 bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
3574 MIN(maxbucket, bucket->ub_entries), domain, flags);
3575 
3576 /*
3577 * Initialize the memory if necessary.
3578 */
3579 if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
3580 int i;
3581 
3582 for (i = 0; i < bucket->ub_cnt; i++)
3583 if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
3584 flags) != 0)
3585 break;
3586 /*
3587 * If we couldn't initialize the whole bucket, put the
3588 * rest back onto the freelist.
3589 */
3590 if (i != bucket->ub_cnt) {
3591 zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
3592 bucket->ub_cnt - i);
3593 #ifdef INVARIANTS
3594 bzero(&bucket->ub_bucket[i],
3595 sizeof(void *) * (bucket->ub_cnt - i));
3596 #endif
3597 bucket->ub_cnt = i;
3598 }
3599 }
3600 
3601 cnt = bucket->ub_cnt;
3602 if (bucket->ub_cnt == 0) {
3603 bucket_free(zone, bucket, udata);
3604 counter_u64_add(zone->uz_fails, 1);
3605 bucket = NULL;
3606 }
3607 out:
3608 if (zone->uz_max_items > 0 && cnt < maxbucket)
3609 zone_free_limit(zone, maxbucket - cnt);
3610 
3611 return (bucket);
3612 }
3613 
3614 /*
3615 * Allocates a single item from a zone.
3616 *
3617 * Arguments
3618 * zone The zone to alloc for.
3619 * udata The data to be passed to the constructor.
3620 * domain The domain to allocate from or UMA_ANYDOMAIN.
3621 * flags M_WAITOK, M_NOWAIT, M_ZERO.
3622 *
3623 * Returns
3624 * NULL if there is no memory and M_NOWAIT is set
3625 * An item if successful
3626 */
3627 
3628 static void *
3629 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3630 {
3631 void *item;
3632 
3633 if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0)
3634 return (NULL);
3635 
3636 /* Avoid allocs targeting empty domains. */
3637 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
3638 domain = UMA_ANYDOMAIN;
3639 
3640 if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3641 goto fail_cnt;
3642 
3643 /*
3644 * We have to call both the zone's init (not the keg's init)
3645 * and the zone's ctor. This is because the item is going from
3646 * a keg slab directly to the user, and the user is expecting it
3647 * to be both zone-init'd as well as zone-ctor'd.
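 * (Any keg-level uk_init is applied when the slab itself is created,
 * not here.)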
3648 */ 3649 if (zone->uz_init != NULL) { 3650 if (zone->uz_init(item, zone->uz_size, flags) != 0) { 3651 zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); 3652 goto fail_cnt; 3653 } 3654 } 3655 item = item_ctor(zone, zone->uz_size, udata, flags, item); 3656 if (item == NULL) 3657 goto fail; 3658 3659 counter_u64_add(zone->uz_allocs, 1); 3660 CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, 3661 zone->uz_name, zone); 3662 3663 return (item); 3664 3665 fail_cnt: 3666 counter_u64_add(zone->uz_fails, 1); 3667 fail: 3668 if (zone->uz_max_items > 0) 3669 zone_free_limit(zone, 1); 3670 CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", 3671 zone->uz_name, zone); 3672 3673 return (NULL); 3674 } 3675 3676 /* See uma.h */ 3677 void 3678 uma_zfree_arg(uma_zone_t zone, void *item, void *udata) 3679 { 3680 uma_cache_t cache; 3681 uma_cache_bucket_t bucket; 3682 int domain, itemdomain, uz_flags; 3683 3684 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3685 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3686 3687 CTR2(KTR_UMA, "uma_zfree_arg zone %s(%p)", zone->uz_name, zone); 3688 3689 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3690 ("uma_zfree_arg: called with spinlock or critical section held")); 3691 3692 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3693 if (item == NULL) 3694 return; 3695 #ifdef DEBUG_MEMGUARD 3696 if (is_memguard_addr(item)) { 3697 if (zone->uz_dtor != NULL) 3698 zone->uz_dtor(item, zone->uz_size, udata); 3699 if (zone->uz_fini != NULL) 3700 zone->uz_fini(item, zone->uz_size); 3701 memguard_free(item); 3702 return; 3703 } 3704 #endif 3705 3706 /* 3707 * We are accessing the per-cpu cache without a critical section to 3708 * fetch size and flags. This is acceptable, if we are preempted we 3709 * will simply read another cpu's line. 3710 */ 3711 cache = &zone->uz_cpu[curcpu]; 3712 uz_flags = cache_uz_flags(cache); 3713 if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 || 3714 UMA_ALWAYS_CTORDTOR)) 3715 item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE); 3716 3717 /* 3718 * The race here is acceptable. If we miss it we'll just have to wait 3719 * a little longer for the limits to be reset. 3720 */ 3721 if (__predict_false(uz_flags & UMA_ZFLAG_LIMIT)) { 3722 if (zone->uz_sleepers > 0) 3723 goto zfree_item; 3724 } 3725 3726 /* 3727 * If possible, free to the per-CPU cache. There are two 3728 * requirements for safe access to the per-CPU cache: (1) the thread 3729 * accessing the cache must not be preempted or yield during access, 3730 * and (2) the thread must not migrate CPUs without switching which 3731 * cache it accesses. We rely on a critical section to prevent 3732 * preemption and migration. We release the critical section in 3733 * order to acquire the zone mutex if we are unable to free to the 3734 * current cache; when we re-acquire the critical section, we must 3735 * detect and handle migration if it has occurred. 3736 */ 3737 domain = itemdomain = 0; 3738 #ifdef NUMA 3739 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) 3740 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); 3741 #endif 3742 critical_enter(); 3743 do { 3744 cache = &zone->uz_cpu[curcpu]; 3745 #ifdef NUMA 3746 domain = PCPU_GET(domain); 3747 if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 && 3748 domain != itemdomain) { 3749 bucket = &cache->uc_crossbucket; 3750 } else 3751 #endif 3752 { 3753 /* 3754 * Try to free into the allocbucket first to give LIFO 3755 * ordering for cache-hot datastructures. 
Spill over 3756 * into the freebucket if necessary. Alloc will swap 3757 * them if one runs dry. 3758 */ 3759 bucket = &cache->uc_allocbucket; 3760 if (__predict_false(bucket->ucb_cnt >= 3761 bucket->ucb_entries)) 3762 bucket = &cache->uc_freebucket; 3763 } 3764 if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) { 3765 cache_bucket_push(cache, bucket, item); 3766 critical_exit(); 3767 return; 3768 } 3769 } while (cache_free(zone, cache, udata, item, itemdomain)); 3770 critical_exit(); 3771 3772 /* 3773 * If nothing else caught this, we'll just do an internal free. 3774 */ 3775 zfree_item: 3776 zone_free_item(zone, item, udata, SKIP_DTOR); 3777 } 3778 3779 #ifdef NUMA 3780 /* 3781 * sort crossdomain free buckets to domain correct buckets and cache 3782 * them. 3783 */ 3784 static void 3785 zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, void *udata) 3786 { 3787 struct uma_bucketlist fullbuckets; 3788 uma_zone_domain_t zdom; 3789 uma_bucket_t b; 3790 void *item; 3791 int domain; 3792 3793 CTR3(KTR_UMA, 3794 "uma_zfree: zone %s(%p) draining cross bucket %p", 3795 zone->uz_name, zone, bucket); 3796 3797 TAILQ_INIT(&fullbuckets); 3798 3799 /* 3800 * To avoid having ndomain * ndomain buckets for sorting we have a 3801 * lock on the current crossfree bucket. A full matrix with 3802 * per-domain locking could be used if necessary. 3803 */ 3804 ZONE_CROSS_LOCK(zone); 3805 while (bucket->ub_cnt > 0) { 3806 item = bucket->ub_bucket[bucket->ub_cnt - 1]; 3807 domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); 3808 zdom = &zone->uz_domain[domain]; 3809 if (zdom->uzd_cross == NULL) { 3810 zdom->uzd_cross = bucket_alloc(zone, udata, M_NOWAIT); 3811 if (zdom->uzd_cross == NULL) 3812 break; 3813 } 3814 zdom->uzd_cross->ub_bucket[zdom->uzd_cross->ub_cnt++] = item; 3815 if (zdom->uzd_cross->ub_cnt == zdom->uzd_cross->ub_entries) { 3816 TAILQ_INSERT_HEAD(&fullbuckets, zdom->uzd_cross, 3817 ub_link); 3818 zdom->uzd_cross = NULL; 3819 } 3820 bucket->ub_cnt--; 3821 } 3822 ZONE_CROSS_UNLOCK(zone); 3823 if (!TAILQ_EMPTY(&fullbuckets)) { 3824 ZONE_LOCK(zone); 3825 while ((b = TAILQ_FIRST(&fullbuckets)) != NULL) { 3826 TAILQ_REMOVE(&fullbuckets, b, ub_link); 3827 if (zone->uz_bkt_count >= zone->uz_bkt_max) { 3828 ZONE_UNLOCK(zone); 3829 bucket_drain(zone, b); 3830 bucket_free(zone, b, udata); 3831 ZONE_LOCK(zone); 3832 } else { 3833 domain = _vm_phys_domain( 3834 pmap_kextract( 3835 (vm_offset_t)b->ub_bucket[0])); 3836 zdom = &zone->uz_domain[domain]; 3837 zone_put_bucket(zone, zdom, b, true); 3838 } 3839 } 3840 ZONE_UNLOCK(zone); 3841 } 3842 if (bucket->ub_cnt != 0) 3843 bucket_drain(zone, bucket); 3844 bucket_free(zone, bucket, udata); 3845 } 3846 #endif 3847 3848 static void 3849 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, 3850 int domain, int itemdomain) 3851 { 3852 uma_zone_domain_t zdom; 3853 3854 #ifdef NUMA 3855 /* 3856 * Buckets coming from the wrong domain will be entirely for the 3857 * only other domain on two domain systems. In this case we can 3858 * simply cache them. Otherwise we need to sort them back to 3859 * correct domains. 3860 */ 3861 if (domain != itemdomain && vm_ndomains > 2) { 3862 zone_free_cross(zone, bucket, udata); 3863 return; 3864 } 3865 #endif 3866 3867 /* 3868 * Attempt to save the bucket in the zone's domain bucket cache. 3869 * 3870 * We bump the uz count when the cache size is insufficient to 3871 * handle the working set. 3872 */ 3873 if (ZONE_TRYLOCK(zone) == 0) { 3874 /* Record contention to size the buckets. 
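 * This mirrors the bucket-size bump on the allocation side in
 * cache_alloc(); repeated trylock failures on either path grow the
 * per-CPU buckets up to uz_bucket_size_max.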
*/ 3875 ZONE_LOCK(zone); 3876 if (zone->uz_bucket_size < zone->uz_bucket_size_max) 3877 zone->uz_bucket_size++; 3878 } 3879 3880 CTR3(KTR_UMA, 3881 "uma_zfree: zone %s(%p) putting bucket %p on free list", 3882 zone->uz_name, zone, bucket); 3883 /* ub_cnt is pointing to the last free item */ 3884 KASSERT(bucket->ub_cnt == bucket->ub_entries, 3885 ("uma_zfree: Attempting to insert partial bucket onto the full list.\n")); 3886 if (zone->uz_bkt_count >= zone->uz_bkt_max) { 3887 ZONE_UNLOCK(zone); 3888 bucket_drain(zone, bucket); 3889 bucket_free(zone, bucket, udata); 3890 } else { 3891 zdom = &zone->uz_domain[itemdomain]; 3892 zone_put_bucket(zone, zdom, bucket, true); 3893 ZONE_UNLOCK(zone); 3894 } 3895 } 3896 3897 /* 3898 * Populate a free or cross bucket for the current cpu cache. Free any 3899 * existing full bucket either to the zone cache or back to the slab layer. 3900 * 3901 * Enters and returns in a critical section. false return indicates that 3902 * we can not satisfy this free in the cache layer. true indicates that 3903 * the caller should retry. 3904 */ 3905 static __noinline bool 3906 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item, 3907 int itemdomain) 3908 { 3909 uma_cache_bucket_t cbucket; 3910 uma_bucket_t bucket; 3911 int domain; 3912 3913 CRITICAL_ASSERT(curthread); 3914 3915 if (zone->uz_bucket_size == 0 || bucketdisable) 3916 return false; 3917 3918 cache = &zone->uz_cpu[curcpu]; 3919 3920 /* 3921 * FIRSTTOUCH domains need to free to the correct zdom. When 3922 * enabled this is the zdom of the item. The bucket is the 3923 * cross bucket if the current domain and itemdomain do not match. 3924 */ 3925 cbucket = &cache->uc_freebucket; 3926 #ifdef NUMA 3927 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) { 3928 domain = PCPU_GET(domain); 3929 if (domain != itemdomain) { 3930 cbucket = &cache->uc_crossbucket; 3931 if (cbucket->ucb_cnt != 0) 3932 atomic_add_64(&zone->uz_xdomain, 3933 cbucket->ucb_cnt); 3934 } 3935 } else 3936 #endif 3937 itemdomain = domain = 0; 3938 bucket = cache_bucket_unload(cbucket); 3939 3940 /* We are no longer associated with this CPU. */ 3941 critical_exit(); 3942 3943 if (bucket != NULL) 3944 zone_free_bucket(zone, bucket, udata, domain, itemdomain); 3945 3946 bucket = bucket_alloc(zone, udata, M_NOWAIT); 3947 CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p", 3948 zone->uz_name, zone, bucket); 3949 critical_enter(); 3950 if (bucket == NULL) 3951 return (false); 3952 cache = &zone->uz_cpu[curcpu]; 3953 #ifdef NUMA 3954 /* 3955 * Check to see if we should be populating the cross bucket. If it 3956 * is already populated we will fall through and attempt to populate 3957 * the free bucket. 3958 */ 3959 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) { 3960 domain = PCPU_GET(domain); 3961 if (domain != itemdomain && 3962 cache->uc_crossbucket.ucb_bucket == NULL) { 3963 cache_bucket_load_cross(cache, bucket); 3964 return (true); 3965 } 3966 } 3967 #endif 3968 /* 3969 * We may have lost the race to fill the bucket or switched CPUs. 
3970 */ 3971 if (cache->uc_freebucket.ucb_bucket != NULL) { 3972 critical_exit(); 3973 bucket_free(zone, bucket, udata); 3974 critical_enter(); 3975 } else 3976 cache_bucket_load_free(cache, bucket); 3977 3978 return (true); 3979 } 3980 3981 void 3982 uma_zfree_domain(uma_zone_t zone, void *item, void *udata) 3983 { 3984 3985 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3986 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3987 3988 CTR2(KTR_UMA, "uma_zfree_domain zone %s(%p)", zone->uz_name, zone); 3989 3990 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3991 ("uma_zfree_domain: called with spinlock or critical section held")); 3992 3993 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3994 if (item == NULL) 3995 return; 3996 zone_free_item(zone, item, udata, SKIP_NONE); 3997 } 3998 3999 static void 4000 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) 4001 { 4002 uma_keg_t keg; 4003 uma_domain_t dom; 4004 int freei; 4005 4006 keg = zone->uz_keg; 4007 KEG_LOCK_ASSERT(keg, slab->us_domain); 4008 4009 /* Do we need to remove from any lists? */ 4010 dom = &keg->uk_domain[slab->us_domain]; 4011 if (slab->us_freecount+1 == keg->uk_ipers) { 4012 LIST_REMOVE(slab, us_link); 4013 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); 4014 } else if (slab->us_freecount == 0) { 4015 LIST_REMOVE(slab, us_link); 4016 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 4017 } 4018 4019 /* Slab management. */ 4020 freei = slab_item_index(slab, keg, item); 4021 BIT_SET(keg->uk_ipers, freei, &slab->us_free); 4022 slab->us_freecount++; 4023 4024 /* Keg statistics. */ 4025 dom->ud_free++; 4026 } 4027 4028 static void 4029 zone_release(void *arg, void **bucket, int cnt) 4030 { 4031 struct mtx *lock; 4032 uma_zone_t zone; 4033 uma_slab_t slab; 4034 uma_keg_t keg; 4035 uint8_t *mem; 4036 void *item; 4037 int i; 4038 4039 zone = arg; 4040 keg = zone->uz_keg; 4041 lock = NULL; 4042 if (__predict_false((zone->uz_flags & UMA_ZFLAG_HASH) != 0)) 4043 lock = KEG_LOCK(keg, 0); 4044 for (i = 0; i < cnt; i++) { 4045 item = bucket[i]; 4046 if (__predict_true((zone->uz_flags & UMA_ZFLAG_VTOSLAB) != 0)) { 4047 slab = vtoslab((vm_offset_t)item); 4048 } else { 4049 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 4050 if ((zone->uz_flags & UMA_ZFLAG_HASH) != 0) 4051 slab = hash_sfind(&keg->uk_hash, mem); 4052 else 4053 slab = (uma_slab_t)(mem + keg->uk_pgoff); 4054 } 4055 if (lock != KEG_LOCKPTR(keg, slab->us_domain)) { 4056 if (lock != NULL) 4057 mtx_unlock(lock); 4058 lock = KEG_LOCK(keg, slab->us_domain); 4059 } 4060 slab_free_item(zone, slab, item); 4061 } 4062 if (lock != NULL) 4063 mtx_unlock(lock); 4064 } 4065 4066 /* 4067 * Frees a single item to any zone. 
4068 * 4069 * Arguments: 4070 * zone The zone to free to 4071 * item The item we're freeing 4072 * udata User supplied data for the dtor 4073 * skip Skip dtors and finis 4074 */ 4075 static void 4076 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 4077 { 4078 4079 item_dtor(zone, item, zone->uz_size, udata, skip); 4080 4081 if (skip < SKIP_FINI && zone->uz_fini) 4082 zone->uz_fini(item, zone->uz_size); 4083 4084 zone->uz_release(zone->uz_arg, &item, 1); 4085 4086 if (skip & SKIP_CNT) 4087 return; 4088 4089 counter_u64_add(zone->uz_frees, 1); 4090 4091 if (zone->uz_max_items > 0) 4092 zone_free_limit(zone, 1); 4093 } 4094 4095 /* See uma.h */ 4096 int 4097 uma_zone_set_max(uma_zone_t zone, int nitems) 4098 { 4099 struct uma_bucket_zone *ubz; 4100 int count; 4101 4102 /* 4103 * XXX This can misbehave if the zone has any allocations with 4104 * no limit and a limit is imposed. There is currently no 4105 * way to clear a limit. 4106 */ 4107 ZONE_LOCK(zone); 4108 ubz = bucket_zone_max(zone, nitems); 4109 count = ubz != NULL ? ubz->ubz_entries : 0; 4110 zone->uz_bucket_size_max = zone->uz_bucket_size = count; 4111 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 4112 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 4113 zone->uz_max_items = nitems; 4114 zone->uz_flags |= UMA_ZFLAG_LIMIT; 4115 zone_update_caches(zone); 4116 /* We may need to wake waiters. */ 4117 wakeup(&zone->uz_max_items); 4118 ZONE_UNLOCK(zone); 4119 4120 return (nitems); 4121 } 4122 4123 /* See uma.h */ 4124 void 4125 uma_zone_set_maxcache(uma_zone_t zone, int nitems) 4126 { 4127 struct uma_bucket_zone *ubz; 4128 int bpcpu; 4129 4130 ZONE_LOCK(zone); 4131 ubz = bucket_zone_max(zone, nitems); 4132 if (ubz != NULL) { 4133 bpcpu = 2; 4134 if ((zone->uz_flags & UMA_ZONE_FIRSTTOUCH) != 0) 4135 /* Count the cross-domain bucket. */ 4136 bpcpu++; 4137 nitems -= ubz->ubz_entries * bpcpu * mp_ncpus; 4138 zone->uz_bucket_size_max = ubz->ubz_entries; 4139 } else { 4140 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 4141 } 4142 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 4143 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 4144 zone->uz_bkt_max = nitems; 4145 ZONE_UNLOCK(zone); 4146 } 4147 4148 /* See uma.h */ 4149 int 4150 uma_zone_get_max(uma_zone_t zone) 4151 { 4152 int nitems; 4153 4154 nitems = atomic_load_64(&zone->uz_max_items); 4155 4156 return (nitems); 4157 } 4158 4159 /* See uma.h */ 4160 void 4161 uma_zone_set_warning(uma_zone_t zone, const char *warning) 4162 { 4163 4164 ZONE_ASSERT_COLD(zone); 4165 zone->uz_warning = warning; 4166 } 4167 4168 /* See uma.h */ 4169 void 4170 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) 4171 { 4172 4173 ZONE_ASSERT_COLD(zone); 4174 TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); 4175 } 4176 4177 /* See uma.h */ 4178 int 4179 uma_zone_get_cur(uma_zone_t zone) 4180 { 4181 int64_t nitems; 4182 u_int i; 4183 4184 nitems = 0; 4185 if (zone->uz_allocs != EARLY_COUNTER && zone->uz_frees != EARLY_COUNTER) 4186 nitems = counter_u64_fetch(zone->uz_allocs) - 4187 counter_u64_fetch(zone->uz_frees); 4188 CPU_FOREACH(i) 4189 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs) - 4190 atomic_load_64(&zone->uz_cpu[i].uc_frees); 4191 4192 return (nitems < 0 ? 
0 : nitems); 4193 } 4194 4195 static uint64_t 4196 uma_zone_get_allocs(uma_zone_t zone) 4197 { 4198 uint64_t nitems; 4199 u_int i; 4200 4201 nitems = 0; 4202 if (zone->uz_allocs != EARLY_COUNTER) 4203 nitems = counter_u64_fetch(zone->uz_allocs); 4204 CPU_FOREACH(i) 4205 nitems += atomic_load_64(&zone->uz_cpu[i].uc_allocs); 4206 4207 return (nitems); 4208 } 4209 4210 static uint64_t 4211 uma_zone_get_frees(uma_zone_t zone) 4212 { 4213 uint64_t nitems; 4214 u_int i; 4215 4216 nitems = 0; 4217 if (zone->uz_frees != EARLY_COUNTER) 4218 nitems = counter_u64_fetch(zone->uz_frees); 4219 CPU_FOREACH(i) 4220 nitems += atomic_load_64(&zone->uz_cpu[i].uc_frees); 4221 4222 return (nitems); 4223 } 4224 4225 #ifdef INVARIANTS 4226 /* Used only for KEG_ASSERT_COLD(). */ 4227 static uint64_t 4228 uma_keg_get_allocs(uma_keg_t keg) 4229 { 4230 uma_zone_t z; 4231 uint64_t nitems; 4232 4233 nitems = 0; 4234 LIST_FOREACH(z, &keg->uk_zones, uz_link) 4235 nitems += uma_zone_get_allocs(z); 4236 4237 return (nitems); 4238 } 4239 #endif 4240 4241 /* See uma.h */ 4242 void 4243 uma_zone_set_init(uma_zone_t zone, uma_init uminit) 4244 { 4245 uma_keg_t keg; 4246 4247 KEG_GET(zone, keg); 4248 KEG_ASSERT_COLD(keg); 4249 keg->uk_init = uminit; 4250 } 4251 4252 /* See uma.h */ 4253 void 4254 uma_zone_set_fini(uma_zone_t zone, uma_fini fini) 4255 { 4256 uma_keg_t keg; 4257 4258 KEG_GET(zone, keg); 4259 KEG_ASSERT_COLD(keg); 4260 keg->uk_fini = fini; 4261 } 4262 4263 /* See uma.h */ 4264 void 4265 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) 4266 { 4267 4268 ZONE_ASSERT_COLD(zone); 4269 zone->uz_init = zinit; 4270 } 4271 4272 /* See uma.h */ 4273 void 4274 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) 4275 { 4276 4277 ZONE_ASSERT_COLD(zone); 4278 zone->uz_fini = zfini; 4279 } 4280 4281 /* See uma.h */ 4282 void 4283 uma_zone_set_freef(uma_zone_t zone, uma_free freef) 4284 { 4285 uma_keg_t keg; 4286 4287 KEG_GET(zone, keg); 4288 KEG_ASSERT_COLD(keg); 4289 keg->uk_freef = freef; 4290 } 4291 4292 /* See uma.h */ 4293 void 4294 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) 4295 { 4296 uma_keg_t keg; 4297 4298 KEG_GET(zone, keg); 4299 KEG_ASSERT_COLD(keg); 4300 keg->uk_allocf = allocf; 4301 } 4302 4303 /* See uma.h */ 4304 void 4305 uma_zone_reserve(uma_zone_t zone, int items) 4306 { 4307 uma_keg_t keg; 4308 4309 KEG_GET(zone, keg); 4310 KEG_ASSERT_COLD(keg); 4311 keg->uk_reserve = items; 4312 } 4313 4314 /* See uma.h */ 4315 int 4316 uma_zone_reserve_kva(uma_zone_t zone, int count) 4317 { 4318 uma_keg_t keg; 4319 vm_offset_t kva; 4320 u_int pages; 4321 4322 KEG_GET(zone, keg); 4323 KEG_ASSERT_COLD(keg); 4324 ZONE_ASSERT_COLD(zone); 4325 4326 pages = howmany(count, keg->uk_ipers) * keg->uk_ppera; 4327 4328 #ifdef UMA_MD_SMALL_ALLOC 4329 if (keg->uk_ppera > 1) { 4330 #else 4331 if (1) { 4332 #endif 4333 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); 4334 if (kva == 0) 4335 return (0); 4336 } else 4337 kva = 0; 4338 4339 ZONE_LOCK(zone); 4340 MPASS(keg->uk_kva == 0); 4341 keg->uk_kva = kva; 4342 keg->uk_offset = 0; 4343 zone->uz_max_items = pages * keg->uk_ipers; 4344 #ifdef UMA_MD_SMALL_ALLOC 4345 keg->uk_allocf = (keg->uk_ppera > 1) ? 
noobj_alloc : uma_small_alloc; 4346 #else 4347 keg->uk_allocf = noobj_alloc; 4348 #endif 4349 keg->uk_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; 4350 zone->uz_flags |= UMA_ZFLAG_LIMIT | UMA_ZONE_NOFREE; 4351 zone_update_caches(zone); 4352 ZONE_UNLOCK(zone); 4353 4354 return (1); 4355 } 4356 4357 /* See uma.h */ 4358 void 4359 uma_prealloc(uma_zone_t zone, int items) 4360 { 4361 struct vm_domainset_iter di; 4362 uma_domain_t dom; 4363 uma_slab_t slab; 4364 uma_keg_t keg; 4365 int aflags, domain, slabs; 4366 4367 KEG_GET(zone, keg); 4368 slabs = howmany(items, keg->uk_ipers); 4369 while (slabs-- > 0) { 4370 aflags = M_NOWAIT; 4371 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 4372 &aflags); 4373 for (;;) { 4374 slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, 4375 aflags); 4376 if (slab != NULL) { 4377 dom = &keg->uk_domain[slab->us_domain]; 4378 LIST_REMOVE(slab, us_link); 4379 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, 4380 us_link); 4381 KEG_UNLOCK(keg, slab->us_domain); 4382 break; 4383 } 4384 if (vm_domainset_iter_policy(&di, &domain) != 0) 4385 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); 4386 } 4387 } 4388 } 4389 4390 /* See uma.h */ 4391 void 4392 uma_reclaim(int req) 4393 { 4394 4395 CTR0(KTR_UMA, "UMA: vm asked us to release pages!"); 4396 sx_xlock(&uma_reclaim_lock); 4397 bucket_enable(); 4398 4399 switch (req) { 4400 case UMA_RECLAIM_TRIM: 4401 zone_foreach(zone_trim, NULL); 4402 break; 4403 case UMA_RECLAIM_DRAIN: 4404 case UMA_RECLAIM_DRAIN_CPU: 4405 zone_foreach(zone_drain, NULL); 4406 if (req == UMA_RECLAIM_DRAIN_CPU) { 4407 pcpu_cache_drain_safe(NULL); 4408 zone_foreach(zone_drain, NULL); 4409 } 4410 break; 4411 default: 4412 panic("unhandled reclamation request %d", req); 4413 } 4414 4415 /* 4416 * Some slabs may have been freed, but the slab zones were visited early 4417 * in the loop above; drain them again so that we can free pages that 4418 * became empty once the other zones were drained. We have to do the same for buckets. 4419 */ 4420 zone_drain(slabzones[0], NULL); 4421 zone_drain(slabzones[1], NULL); 4422 bucket_zone_drain(); 4423 sx_xunlock(&uma_reclaim_lock); 4424 } 4425 4426 static volatile int uma_reclaim_needed; 4427 4428 void 4429 uma_reclaim_wakeup(void) 4430 { 4431 4432 if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) 4433 wakeup(uma_reclaim); 4434 } 4435 4436 void 4437 uma_reclaim_worker(void *arg __unused) 4438 { 4439 4440 for (;;) { 4441 sx_xlock(&uma_reclaim_lock); 4442 while (atomic_load_int(&uma_reclaim_needed) == 0) 4443 sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl", 4444 hz); 4445 sx_xunlock(&uma_reclaim_lock); 4446 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); 4447 uma_reclaim(UMA_RECLAIM_DRAIN_CPU); 4448 atomic_store_int(&uma_reclaim_needed, 0); 4449 /* Don't fire more than once per second.
*/ 4450 pause("umarclslp", hz); 4451 } 4452 } 4453 4454 /* See uma.h */ 4455 void 4456 uma_zone_reclaim(uma_zone_t zone, int req) 4457 { 4458 4459 switch (req) { 4460 case UMA_RECLAIM_TRIM: 4461 zone_trim(zone, NULL); 4462 break; 4463 case UMA_RECLAIM_DRAIN: 4464 zone_drain(zone, NULL); 4465 break; 4466 case UMA_RECLAIM_DRAIN_CPU: 4467 pcpu_cache_drain_safe(zone); 4468 zone_drain(zone, NULL); 4469 break; 4470 default: 4471 panic("unhandled reclamation request %d", req); 4472 } 4473 } 4474 4475 /* See uma.h */ 4476 int 4477 uma_zone_exhausted(uma_zone_t zone) 4478 { 4479 4480 return (atomic_load_32(&zone->uz_sleepers) > 0); 4481 } 4482 4483 unsigned long 4484 uma_limit(void) 4485 { 4486 4487 return (uma_kmem_limit); 4488 } 4489 4490 void 4491 uma_set_limit(unsigned long limit) 4492 { 4493 4494 uma_kmem_limit = limit; 4495 } 4496 4497 unsigned long 4498 uma_size(void) 4499 { 4500 4501 return (atomic_load_long(&uma_kmem_total)); 4502 } 4503 4504 long 4505 uma_avail(void) 4506 { 4507 4508 return (uma_kmem_limit - uma_size()); 4509 } 4510 4511 #ifdef DDB 4512 /* 4513 * Generate statistics across both the zone and its per-cpu cache's. Return 4514 * desired statistics if the pointer is non-NULL for that statistic. 4515 * 4516 * Note: does not update the zone statistics, as it can't safely clear the 4517 * per-CPU cache statistic. 4518 * 4519 */ 4520 static void 4521 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, 4522 uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) 4523 { 4524 uma_cache_t cache; 4525 uint64_t allocs, frees, sleeps, xdomain; 4526 int cachefree, cpu; 4527 4528 allocs = frees = sleeps = xdomain = 0; 4529 cachefree = 0; 4530 CPU_FOREACH(cpu) { 4531 cache = &z->uz_cpu[cpu]; 4532 cachefree += cache->uc_allocbucket.ucb_cnt; 4533 cachefree += cache->uc_freebucket.ucb_cnt; 4534 xdomain += cache->uc_crossbucket.ucb_cnt; 4535 cachefree += cache->uc_crossbucket.ucb_cnt; 4536 allocs += cache->uc_allocs; 4537 frees += cache->uc_frees; 4538 } 4539 allocs += counter_u64_fetch(z->uz_allocs); 4540 frees += counter_u64_fetch(z->uz_frees); 4541 sleeps += z->uz_sleeps; 4542 xdomain += z->uz_xdomain; 4543 if (cachefreep != NULL) 4544 *cachefreep = cachefree; 4545 if (allocsp != NULL) 4546 *allocsp = allocs; 4547 if (freesp != NULL) 4548 *freesp = frees; 4549 if (sleepsp != NULL) 4550 *sleepsp = sleeps; 4551 if (xdomainp != NULL) 4552 *xdomainp = xdomain; 4553 } 4554 #endif /* DDB */ 4555 4556 static int 4557 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) 4558 { 4559 uma_keg_t kz; 4560 uma_zone_t z; 4561 int count; 4562 4563 count = 0; 4564 rw_rlock(&uma_rwlock); 4565 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4566 LIST_FOREACH(z, &kz->uk_zones, uz_link) 4567 count++; 4568 } 4569 LIST_FOREACH(z, &uma_cachezones, uz_link) 4570 count++; 4571 4572 rw_runlock(&uma_rwlock); 4573 return (sysctl_handle_int(oidp, &count, 0, req)); 4574 } 4575 4576 static void 4577 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, 4578 struct uma_percpu_stat *ups, bool internal) 4579 { 4580 uma_zone_domain_t zdom; 4581 uma_cache_t cache; 4582 int i; 4583 4584 4585 for (i = 0; i < vm_ndomains; i++) { 4586 zdom = &z->uz_domain[i]; 4587 uth->uth_zone_free += zdom->uzd_nitems; 4588 } 4589 uth->uth_allocs = counter_u64_fetch(z->uz_allocs); 4590 uth->uth_frees = counter_u64_fetch(z->uz_frees); 4591 uth->uth_fails = counter_u64_fetch(z->uz_fails); 4592 uth->uth_sleeps = z->uz_sleeps; 4593 uth->uth_xdomain = z->uz_xdomain; 4594 4595 /* 4596 * While it is not normally safe to access the 
cache bucket pointers 4597 * while not on the CPU that owns the cache, we only allow the pointers 4598 * to be exchanged without the zone lock held, not invalidated, so 4599 * accept the possible race associated with bucket exchange during 4600 * monitoring. Use atomic_load_ptr() to ensure that the bucket pointers 4601 * are loaded only once. 4602 */ 4603 for (i = 0; i < mp_maxid + 1; i++) { 4604 bzero(&ups[i], sizeof(*ups)); 4605 if (internal || CPU_ABSENT(i)) 4606 continue; 4607 cache = &z->uz_cpu[i]; 4608 ups[i].ups_cache_free += cache->uc_allocbucket.ucb_cnt; 4609 ups[i].ups_cache_free += cache->uc_freebucket.ucb_cnt; 4610 ups[i].ups_cache_free += cache->uc_crossbucket.ucb_cnt; 4611 ups[i].ups_allocs = cache->uc_allocs; 4612 ups[i].ups_frees = cache->uc_frees; 4613 } 4614 } 4615 4616 static int 4617 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) 4618 { 4619 struct uma_stream_header ush; 4620 struct uma_type_header uth; 4621 struct uma_percpu_stat *ups; 4622 struct sbuf sbuf; 4623 uma_keg_t kz; 4624 uma_zone_t z; 4625 uint64_t items; 4626 uint32_t kfree, pages; 4627 int count, error, i; 4628 4629 error = sysctl_wire_old_buffer(req, 0); 4630 if (error != 0) 4631 return (error); 4632 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 4633 sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); 4634 ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); 4635 4636 count = 0; 4637 rw_rlock(&uma_rwlock); 4638 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4639 LIST_FOREACH(z, &kz->uk_zones, uz_link) 4640 count++; 4641 } 4642 4643 LIST_FOREACH(z, &uma_cachezones, uz_link) 4644 count++; 4645 4646 /* 4647 * Insert stream header. 4648 */ 4649 bzero(&ush, sizeof(ush)); 4650 ush.ush_version = UMA_STREAM_VERSION; 4651 ush.ush_maxcpus = (mp_maxid + 1); 4652 ush.ush_count = count; 4653 (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); 4654 4655 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4656 kfree = pages = 0; 4657 for (i = 0; i < vm_ndomains; i++) { 4658 kfree += kz->uk_domain[i].ud_free; 4659 pages += kz->uk_domain[i].ud_pages; 4660 } 4661 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 4662 bzero(&uth, sizeof(uth)); 4663 ZONE_LOCK(z); 4664 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 4665 uth.uth_align = kz->uk_align; 4666 uth.uth_size = kz->uk_size; 4667 uth.uth_rsize = kz->uk_rsize; 4668 if (z->uz_max_items > 0) { 4669 items = UZ_ITEMS_COUNT(z->uz_items); 4670 uth.uth_pages = (items / kz->uk_ipers) * 4671 kz->uk_ppera; 4672 } else 4673 uth.uth_pages = pages; 4674 uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * 4675 kz->uk_ppera; 4676 uth.uth_limit = z->uz_max_items; 4677 uth.uth_keg_free = kfree; 4678 4679 /* 4680 * A zone is secondary if it is not the first entry 4681 * on the keg's zone list.
4682 */ 4683 if ((z->uz_flags & UMA_ZONE_SECONDARY) && 4684 (LIST_FIRST(&kz->uk_zones) != z)) 4685 uth.uth_zone_flags = UTH_ZONE_SECONDARY; 4686 uma_vm_zone_stats(&uth, z, &sbuf, ups, 4687 kz->uk_flags & UMA_ZFLAG_INTERNAL); 4688 ZONE_UNLOCK(z); 4689 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); 4690 for (i = 0; i < mp_maxid + 1; i++) 4691 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); 4692 } 4693 } 4694 LIST_FOREACH(z, &uma_cachezones, uz_link) { 4695 bzero(&uth, sizeof(uth)); 4696 ZONE_LOCK(z); 4697 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 4698 uth.uth_size = z->uz_size; 4699 uma_vm_zone_stats(&uth, z, &sbuf, ups, false); 4700 ZONE_UNLOCK(z); 4701 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); 4702 for (i = 0; i < mp_maxid + 1; i++) 4703 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); 4704 } 4705 4706 rw_runlock(&uma_rwlock); 4707 error = sbuf_finish(&sbuf); 4708 sbuf_delete(&sbuf); 4709 free(ups, M_TEMP); 4710 return (error); 4711 } 4712 4713 int 4714 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) 4715 { 4716 uma_zone_t zone = *(uma_zone_t *)arg1; 4717 int error, max; 4718 4719 max = uma_zone_get_max(zone); 4720 error = sysctl_handle_int(oidp, &max, 0, req); 4721 if (error || !req->newptr) 4722 return (error); 4723 4724 uma_zone_set_max(zone, max); 4725 4726 return (0); 4727 } 4728 4729 int 4730 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) 4731 { 4732 uma_zone_t zone; 4733 int cur; 4734 4735 /* 4736 * Some callers want to add sysctls for global zones that 4737 * may not yet exist so they pass a pointer to a pointer. 4738 */ 4739 if (arg2 == 0) 4740 zone = *(uma_zone_t *)arg1; 4741 else 4742 zone = arg1; 4743 cur = uma_zone_get_cur(zone); 4744 return (sysctl_handle_int(oidp, &cur, 0, req)); 4745 } 4746 4747 static int 4748 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) 4749 { 4750 uma_zone_t zone = arg1; 4751 uint64_t cur; 4752 4753 cur = uma_zone_get_allocs(zone); 4754 return (sysctl_handle_64(oidp, &cur, 0, req)); 4755 } 4756 4757 static int 4758 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) 4759 { 4760 uma_zone_t zone = arg1; 4761 uint64_t cur; 4762 4763 cur = uma_zone_get_frees(zone); 4764 return (sysctl_handle_64(oidp, &cur, 0, req)); 4765 } 4766 4767 static int 4768 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) 4769 { 4770 struct sbuf sbuf; 4771 uma_zone_t zone = arg1; 4772 int error; 4773 4774 sbuf_new_for_sysctl(&sbuf, NULL, 0, req); 4775 if (zone->uz_flags != 0) 4776 sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); 4777 else 4778 sbuf_printf(&sbuf, "0"); 4779 error = sbuf_finish(&sbuf); 4780 sbuf_delete(&sbuf); 4781 4782 return (error); 4783 } 4784 4785 static int 4786 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) 4787 { 4788 uma_keg_t keg = arg1; 4789 int avail, effpct, total; 4790 4791 total = keg->uk_ppera * PAGE_SIZE; 4792 if ((keg->uk_flags & UMA_ZFLAG_OFFPAGE) != 0) 4793 total += slabzone(keg->uk_ipers)->uz_keg->uk_rsize; 4794 /* 4795 * We consider the client's requested size and alignment here, not the 4796 * real size determination uk_rsize, because we also adjust the real 4797 * size for internal implementation reasons (max bitset size). 
4798 */ 4799 avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); 4800 if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) 4801 avail *= mp_maxid + 1; 4802 effpct = 100 * avail / total; 4803 return (sysctl_handle_int(oidp, &effpct, 0, req)); 4804 } 4805 4806 static int 4807 sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS) 4808 { 4809 uma_zone_t zone = arg1; 4810 uint64_t cur; 4811 4812 cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items)); 4813 return (sysctl_handle_64(oidp, &cur, 0, req)); 4814 } 4815 4816 #ifdef INVARIANTS 4817 static uma_slab_t 4818 uma_dbg_getslab(uma_zone_t zone, void *item) 4819 { 4820 uma_slab_t slab; 4821 uma_keg_t keg; 4822 uint8_t *mem; 4823 4824 /* 4825 * It is safe to return the slab here even though the 4826 * zone is unlocked because the item's allocation state 4827 * essentially holds a reference. 4828 */ 4829 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 4830 if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) 4831 return (NULL); 4832 if (zone->uz_flags & UMA_ZFLAG_VTOSLAB) 4833 return (vtoslab((vm_offset_t)mem)); 4834 keg = zone->uz_keg; 4835 if ((keg->uk_flags & UMA_ZFLAG_HASH) == 0) 4836 return ((uma_slab_t)(mem + keg->uk_pgoff)); 4837 KEG_LOCK(keg, 0); 4838 slab = hash_sfind(&keg->uk_hash, mem); 4839 KEG_UNLOCK(keg, 0); 4840 4841 return (slab); 4842 } 4843 4844 static bool 4845 uma_dbg_zskip(uma_zone_t zone, void *mem) 4846 { 4847 4848 if ((zone->uz_flags & UMA_ZFLAG_CACHE) != 0) 4849 return (true); 4850 4851 return (uma_dbg_kskip(zone->uz_keg, mem)); 4852 } 4853 4854 static bool 4855 uma_dbg_kskip(uma_keg_t keg, void *mem) 4856 { 4857 uintptr_t idx; 4858 4859 if (dbg_divisor == 0) 4860 return (true); 4861 4862 if (dbg_divisor == 1) 4863 return (false); 4864 4865 idx = (uintptr_t)mem >> PAGE_SHIFT; 4866 if (keg->uk_ipers > 1) { 4867 idx *= keg->uk_ipers; 4868 idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; 4869 } 4870 4871 if ((idx / dbg_divisor) * dbg_divisor != idx) { 4872 counter_u64_add(uma_skip_cnt, 1); 4873 return (true); 4874 } 4875 counter_u64_add(uma_dbg_cnt, 1); 4876 4877 return (false); 4878 } 4879 4880 /* 4881 * Set up the slab's freei data such that uma_dbg_free can function. 4882 * 4883 */ 4884 static void 4885 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) 4886 { 4887 uma_keg_t keg; 4888 int freei; 4889 4890 if (slab == NULL) { 4891 slab = uma_dbg_getslab(zone, item); 4892 if (slab == NULL) 4893 panic("uma: item %p did not belong to zone %s\n", 4894 item, zone->uz_name); 4895 } 4896 keg = zone->uz_keg; 4897 freei = slab_item_index(slab, keg, item); 4898 4899 if (BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) 4900 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n", 4901 item, zone, zone->uz_name, slab, freei); 4902 BIT_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)); 4903 } 4904 4905 /* 4906 * Verifies freed addresses. Checks for alignment, valid slab membership 4907 * and duplicate frees. 
4908 * 4909 */ 4910 static void 4911 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) 4912 { 4913 uma_keg_t keg; 4914 int freei; 4915 4916 if (slab == NULL) { 4917 slab = uma_dbg_getslab(zone, item); 4918 if (slab == NULL) 4919 panic("uma: Freed item %p did not belong to zone %s\n", 4920 item, zone->uz_name); 4921 } 4922 keg = zone->uz_keg; 4923 freei = slab_item_index(slab, keg, item); 4924 4925 if (freei >= keg->uk_ipers) 4926 panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n", 4927 item, zone, zone->uz_name, slab, freei); 4928 4929 if (slab_item(slab, keg, freei) != item) 4930 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n", 4931 item, zone, zone->uz_name, slab, freei); 4932 4933 if (!BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) 4934 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n", 4935 item, zone, zone->uz_name, slab, freei); 4936 4937 BIT_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)); 4938 } 4939 #endif /* INVARIANTS */ 4940 4941 #ifdef DDB 4942 static int64_t 4943 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, 4944 uint64_t *sleeps, long *cachefree, uint64_t *xdomain) 4945 { 4946 uint64_t frees; 4947 int i; 4948 4949 if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { 4950 *allocs = counter_u64_fetch(z->uz_allocs); 4951 frees = counter_u64_fetch(z->uz_frees); 4952 *sleeps = z->uz_sleeps; 4953 *cachefree = 0; 4954 *xdomain = 0; 4955 } else 4956 uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, 4957 xdomain); 4958 for (i = 0; i < vm_ndomains; i++) { 4959 *cachefree += z->uz_domain[i].uzd_nitems; 4960 if (!((z->uz_flags & UMA_ZONE_SECONDARY) && 4961 (LIST_FIRST(&kz->uk_zones) != z))) 4962 *cachefree += kz->uk_domain[i].ud_free; 4963 } 4964 *used = *allocs - frees; 4965 return (((int64_t)*used + *cachefree) * kz->uk_size); 4966 } 4967 4968 DB_SHOW_COMMAND(uma, db_show_uma) 4969 { 4970 const char *fmt_hdr, *fmt_entry; 4971 uma_keg_t kz; 4972 uma_zone_t z; 4973 uint64_t allocs, used, sleeps, xdomain; 4974 long cachefree; 4975 /* variables for sorting */ 4976 uma_keg_t cur_keg; 4977 uma_zone_t cur_zone, last_zone; 4978 int64_t cur_size, last_size, size; 4979 int ties; 4980 4981 /* /i option produces machine-parseable CSV output */ 4982 if (modif[0] == 'i') { 4983 fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; 4984 fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; 4985 } else { 4986 fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; 4987 fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; 4988 } 4989 4990 db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", 4991 "Sleeps", "Bucket", "Total Mem", "XFree"); 4992 4993 /* Sort the zones with largest size first. */ 4994 last_zone = NULL; 4995 last_size = INT64_MAX; 4996 for (;;) { 4997 cur_zone = NULL; 4998 cur_size = -1; 4999 ties = 0; 5000 LIST_FOREACH(kz, &uma_kegs, uk_link) { 5001 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 5002 /* 5003 * In the case of size ties, print out zones 5004 * in the order they are encountered. That is, 5005 * when we encounter the most recently output 5006 * zone, we have already printed all preceding 5007 * ties, and we must print all following ties. 
5008 */ 5009 if (z == last_zone) { 5010 ties = 1; 5011 continue; 5012 } 5013 size = get_uma_stats(kz, z, &allocs, &used, 5014 &sleeps, &cachefree, &xdomain); 5015 if (size > cur_size && size < last_size + ties) 5016 { 5017 cur_size = size; 5018 cur_zone = z; 5019 cur_keg = kz; 5020 } 5021 } 5022 } 5023 if (cur_zone == NULL) 5024 break; 5025 5026 size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, 5027 &sleeps, &cachefree, &xdomain); 5028 db_printf(fmt_entry, cur_zone->uz_name, 5029 (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, 5030 (uintmax_t)allocs, (uintmax_t)sleeps, 5031 (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, 5032 xdomain); 5033 5034 if (db_pager_quit) 5035 return; 5036 last_zone = cur_zone; 5037 last_size = cur_size; 5038 } 5039 } 5040 5041 DB_SHOW_COMMAND(umacache, db_show_umacache) 5042 { 5043 uma_zone_t z; 5044 uint64_t allocs, frees; 5045 long cachefree; 5046 int i; 5047 5048 db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", 5049 "Requests", "Bucket"); 5050 LIST_FOREACH(z, &uma_cachezones, uz_link) { 5051 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); 5052 for (i = 0; i < vm_ndomains; i++) 5053 cachefree += z->uz_domain[i].uzd_nitems; 5054 db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", 5055 z->uz_name, (uintmax_t)z->uz_size, 5056 (intmax_t)(allocs - frees), cachefree, 5057 (uintmax_t)allocs, z->uz_bucket_size); 5058 if (db_pager_quit) 5059 return; 5060 } 5061 } 5062 #endif /* DDB */ 5063
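/*
 * Editor's note: an illustrative usage note, not part of the original source.
 * The DB_SHOW_COMMAND() handlers above are run from the in-kernel debugger
 * prompt.  Based on the modif[] check in db_show_uma(), the 'i' modifier
 * selects the machine-parseable CSV format (something like "show uma/i" at
 * the db> prompt), while plain "show uma" and "show umacache" print the
 * human-readable tables whose columns are given by the fmt_hdr strings.
 */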
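/*
 * Editor's note: a minimal consumer-side sketch, not part of the allocator
 * itself and kept under "#if 0" so it is never compiled.  It shows how the
 * zone limit and accounting interfaces defined earlier in this file
 * (uma_zone_set_max(), uma_zone_get_max(), uma_zone_get_cur()) are typically
 * combined by a subsystem.  The function name, the zone name "example", the
 * 128-byte item size and the 1024-item cap are made-up values for the
 * illustration only.
 */
#if 0
static void
example_zone_limit_usage(void)
{
	uma_zone_t zone;
	void *item;

	/* Create a plain zone for fixed-size 128-byte objects. */
	zone = uma_zcreate("example", 128, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	/*
	 * Install a cap of 1024 items.  uma_zone_set_max() also resizes the
	 * per-CPU buckets against the cap, so read the effective values back
	 * rather than assuming them.
	 */
	uma_zone_set_max(zone, 1024);
	printf("example zone limit: %d, in use: %d\n",
	    uma_zone_get_max(zone), uma_zone_get_cur(zone));

	/*
	 * Once the limit is reached, M_WAITOK allocations sleep for a free
	 * item while M_NOWAIT allocations fail immediately.
	 */
	item = uma_zalloc(zone, M_NOWAIT);
	if (item != NULL)
		uma_zfree(zone, item);

	uma_zdestroy(zone);
}
#endif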
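/*
 * Editor's note: a second illustrative sketch, also hypothetical and never
 * compiled, showing how the reclamation entry points above are meant to be
 * driven.  The reclaim worker sleeps until uma_reclaim_wakeup() is called
 * (the VM page daemon is its usual caller); a subsystem that only wants to
 * shrink its own caches calls uma_zone_reclaim() directly with one of the
 * UMA_RECLAIM_* requests.  The function name, the zone argument and the
 * "severe" flag are stand-ins for the caller's own state.
 */
#if 0
static void
example_reclaim_usage(uma_zone_t my_zone, bool severe)
{

	if (severe) {
		/*
		 * Drain this zone's per-CPU caches and its bucket cache,
		 * returning the backing slabs to the VM where possible.
		 */
		uma_zone_reclaim(my_zone, UMA_RECLAIM_DRAIN_CPU);
		/* Ask the global reclaim worker to sweep every zone. */
		uma_reclaim_wakeup();
	} else {
		/* Trim only the excess cached items in this zone. */
		uma_zone_reclaim(my_zone, UMA_RECLAIM_TRIM);
	}
}
#endif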