/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org>
 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uma_core.c  Implementation of the Universal Memory allocator
 *
 * This allocator is intended to replace the multitude of similar object caches
 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
 * efficient.  A primary design goal is to return unused memory to the rest of
 * the system.  This will make the system as a whole more flexible due to the
 * ability to move memory to subsystems which most need it instead of leaving
 * pools of reserved memory unused.
 *
 * The basic ideas stem from similar slab/zone based allocators whose algorithms
 * are well known.
 *
 */

/*
 * TODO:
 *	- Improve memory usage for large allocations
 *	- Investigate cache size adjustments
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_param.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/domainset.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/limits.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_domainset.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>

#include <ddb/ddb.h>

#ifdef DEBUG_MEMGUARD
#include <vm/memguard.h>
#endif

/*
 * This is the zone and keg from which all zones are spawned.
 */
static uma_zone_t kegs;
static uma_zone_t zones;

/* This is the zone from which all offpage uma_slab_ts are allocated. */
static uma_zone_t slabzone;

/*
 * The initial hash tables come out of this zone so they can be allocated
 * prior to malloc coming up.
 */
static uma_zone_t hashzone;

/* The boot-time adjusted value for cache line alignment. */
int uma_align_cache = 64 - 1;

static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc");

/*
 * Are we allowed to allocate buckets?
 */
static int bucketdisable = 1;

/* Linked list of all kegs in the system */
static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);

/* Linked list of all cache-only zones in the system */
static LIST_HEAD(,uma_zone) uma_cachezones =
    LIST_HEAD_INITIALIZER(uma_cachezones);

/* This RW lock protects the keg list */
static struct rwlock_padalign __exclusive_cache_line uma_rwlock;

/*
 * Pointer to and counter of the pool of pages that is preallocated at
 * startup to bootstrap UMA.
 */
static char *bootmem;
static int boot_pages;

static struct sx uma_reclaim_lock;

/*
 * kmem soft limit, initialized by uma_set_limit().  Ensure that early
 * allocations don't trigger a wakeup of the reclaim thread.
 */
unsigned long uma_kmem_limit = LONG_MAX;
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
    "UMA kernel memory soft limit");
unsigned long uma_kmem_total;
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
    "UMA kernel memory usage");

/* Is the VM done starting up? */
static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
    BOOT_RUNNING } booted = BOOT_COLD;

/*
 * This is the handle used to schedule events that need to happen
 * outside of the allocation fast path.
 */
static struct callout uma_callout;
#define	UMA_TIMEOUT	20		/* Seconds for callout interval. */

/*
 * This structure is passed as the zone ctor arg so that I don't have to create
 * a special allocation function just for zones.
 */
struct uma_zctor_args {
	const char *name;
	size_t size;
	uma_ctor ctor;
	uma_dtor dtor;
	uma_init uminit;
	uma_fini fini;
	uma_import import;
	uma_release release;
	void *arg;
	uma_keg_t keg;
	int align;
	uint32_t flags;
};

struct uma_kctor_args {
	uma_zone_t zone;
	size_t size;
	uma_init uminit;
	uma_fini fini;
	int align;
	uint32_t flags;
};

struct uma_bucket_zone {
	uma_zone_t	ubz_zone;
	char		*ubz_name;
	int		ubz_entries;	/* Number of items it can hold. */
	int		ubz_maxsize;	/* Maximum allocation size per-item. */
};

/*
 * Compute the actual number of bucket entries to pack them in power
 * of two sizes for more efficient space utilization.
 */
#define	BUCKET_SIZE(n)						\
    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))

#define	BUCKET_MAX	BUCKET_SIZE(256)
#define	BUCKET_MIN	BUCKET_SIZE(4)

struct uma_bucket_zone bucket_zones[] = {
	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
	{ NULL, NULL, 0}
};

/*
 * Flags and enumerations to be passed to internal functions.
 */
enum zfreeskip {
	SKIP_NONE =	0,
	SKIP_CNT =	0x00000001,
	SKIP_DTOR =	0x00010000,
	SKIP_FINI =	0x00020000,
};
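/*
 * These indicate which teardown steps the caller has already performed, or
 * wants suppressed, when an item is handed back: SKIP_DTOR and SKIP_FINI
 * suppress the destructor and fini calls, while SKIP_CNT indicates that the
 * zone's item accounting has already been adjusted.
 */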

/* Prototypes.. */

int	uma_startup_count(int);
void	uma_startup(void *, int);
void	uma_startup1(void);
void	uma_startup2(void);

static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void bucket_cache_reclaim(uma_zone_t zone, bool);
static int keg_ctor(void *, int, void *, int);
static void keg_dtor(void *, int, void *);
static int zone_ctor(void *, int, void *, int);
static void zone_dtor(void *, int, void *);
static int zero_init(void *, int, int);
static void keg_small_init(uma_keg_t keg);
static void keg_large_init(uma_keg_t keg);
static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
static void zone_timeout(uma_zone_t zone, void *);
static int hash_alloc(struct uma_hash *, u_int);
static int hash_expand(struct uma_hash *, struct uma_hash *);
static void hash_free(struct uma_hash *hash);
static void uma_timeout(void *);
static void uma_startup3(void);
static void *zone_alloc_item(uma_zone_t, void *, int, int);
static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
static void bucket_enable(void);
static void bucket_init(void);
static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
static void bucket_zone_drain(void);
static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
    uma_fini fini, int align, uint32_t flags);
static int zone_import(void *, void **, int, int, int);
static void zone_release(void *, void **, int);
static void uma_zero_item(void *, uma_zone_t);
static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int);
static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int);

static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);

#ifdef INVARIANTS
static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);

static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);

static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
    "Memory allocation debugging");

static u_int dbg_divisor = 1;
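/*
 * Only every dbg_divisor-th allocated item is trashed and verified, which
 * bounds the cost of INVARIANTS memory checking; the divisor may be set at
 * boot via the vm.debug.divisor tunable declared below.
 */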
SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
    "Debug & thrash every this item in memory allocator");

static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
    &uma_dbg_cnt, "memory items debugged");
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
    &uma_skip_cnt, "memory items skipped, not debugged");
#endif

SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);

SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW, 0, "Universal Memory Allocator");

SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");

SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");

static int zone_warnings = 1;
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
    "Warn when UMA zones becomes full");

/*
 * This routine checks to see whether or not it's safe to enable buckets.
 */
static void
bucket_enable(void)
{

	KASSERT(booted >= BOOT_BUCKETS, ("Bucket enable before init"));
	bucketdisable = vm_page_count_min();
}

/*
 * Initialize bucket_zones, the array of zones of buckets of various sizes.
 *
 * For each zone, calculate the memory required for each bucket, consisting
 * of the header and an array of pointers.
 */
static void
bucket_init(void)
{
	struct uma_bucket_zone *ubz;
	int size;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
		size += sizeof(void *) * ubz->ubz_entries;
		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
	}
}

/*
 * Given a desired number of entries for a bucket, return the zone from which
 * to allocate the bucket.
 */
static struct uma_bucket_zone *
bucket_zone_lookup(int entries)
{
	struct uma_bucket_zone *ubz;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_entries >= entries)
			return (ubz);
	ubz--;
	return (ubz);
}

static struct uma_bucket_zone *
bucket_zone_max(uma_zone_t zone, int nitems)
{
	struct uma_bucket_zone *ubz;
	int bpcpu;

	bpcpu = 2;
#ifdef UMA_XDOMAIN
	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
		/* Count the cross-domain bucket. */
		bpcpu++;
#endif

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems)
			break;
	if (ubz == &bucket_zones[0])
		ubz = NULL;
	else
		ubz--;
	return (ubz);
}

static int
bucket_select(int size)
{
	struct uma_bucket_zone *ubz;

	ubz = &bucket_zones[0];
	if (size > ubz->ubz_maxsize)
		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);

	for (; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_maxsize < size)
			break;
	ubz--;
	return (ubz->ubz_entries);
}

static uma_bucket_t
bucket_alloc(uma_zone_t zone, void *udata, int flags)
{
	struct uma_bucket_zone *ubz;
	uma_bucket_t bucket;

	/*
	 * This is to stop us from allocating per cpu buckets while we're
	 * running out of vm.boot_pages.
	 * Otherwise, we would exhaust the boot pages.  This also prevents
	 * us from allocating buckets in low memory situations.
	 */
	if (bucketdisable)
		return (NULL);
	/*
	 * To limit bucket recursion we store the original zone flags
	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
	 * NOVM flag to persist even through deep recursions.  We also
	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
	 * a bucket for a bucket zone so we do not allow infinite bucket
	 * recursion.  This cookie will even persist to frees of unused
	 * buckets via the allocation path or bucket allocations in the
	 * free path.
	 */
	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
		udata = (void *)(uintptr_t)zone->uz_flags;
	else {
		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
			return (NULL);
		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
	}
	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
		flags |= M_NOVM;
	ubz = bucket_zone_lookup(zone->uz_bucket_size);
	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
		ubz++;
	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
	if (bucket) {
#ifdef INVARIANTS
		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
#endif
		bucket->ub_cnt = 0;
		bucket->ub_entries = ubz->ubz_entries;
	}

	return (bucket);
}

static void
bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
{
	struct uma_bucket_zone *ubz;

	KASSERT(bucket->ub_cnt == 0,
	    ("bucket_free: Freeing a non free bucket."));
	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
		udata = (void *)(uintptr_t)zone->uz_flags;
	ubz = bucket_zone_lookup(bucket->ub_entries);
	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
}

static void
bucket_zone_drain(void)
{
	struct uma_bucket_zone *ubz;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
}

/*
 * Attempt to satisfy an allocation by retrieving a full bucket from one of the
 * zone's caches.
 */
static uma_bucket_t
zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
{
	uma_bucket_t bucket;

	ZONE_LOCK_ASSERT(zone);

	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
		zdom->uzd_nitems -= bucket->ub_cnt;
		if (zdom->uzd_imin > zdom->uzd_nitems)
			zdom->uzd_imin = zdom->uzd_nitems;
		zone->uz_bkt_count -= bucket->ub_cnt;
	}
	return (bucket);
}

/*
 * Insert a full bucket into the specified cache.  The "ws" parameter indicates
 * whether the bucket's contents should be counted as part of the zone's working
 * set.
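 *
 * Buckets queued with ws == false are placed at the tail of the per-domain
 * list, so zone_fetch_bucket() prefers working-set buckets at the head and
 * bucket_cache_reclaim() reclaims the non-working-set buckets first.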
516 */ 517 static void 518 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket, 519 const bool ws) 520 { 521 522 ZONE_LOCK_ASSERT(zone); 523 KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max, 524 ("%s: zone %p overflow", __func__, zone)); 525 526 if (ws) 527 TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); 528 else 529 TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); 530 zdom->uzd_nitems += bucket->ub_cnt; 531 if (ws && zdom->uzd_imax < zdom->uzd_nitems) 532 zdom->uzd_imax = zdom->uzd_nitems; 533 zone->uz_bkt_count += bucket->ub_cnt; 534 } 535 536 static void 537 zone_log_warning(uma_zone_t zone) 538 { 539 static const struct timeval warninterval = { 300, 0 }; 540 541 if (!zone_warnings || zone->uz_warning == NULL) 542 return; 543 544 if (ratecheck(&zone->uz_ratecheck, &warninterval)) 545 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); 546 } 547 548 static inline void 549 zone_maxaction(uma_zone_t zone) 550 { 551 552 if (zone->uz_maxaction.ta_func != NULL) 553 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); 554 } 555 556 /* 557 * Routine called by timeout which is used to fire off some time interval 558 * based calculations. (stats, hash size, etc.) 559 * 560 * Arguments: 561 * arg Unused 562 * 563 * Returns: 564 * Nothing 565 */ 566 static void 567 uma_timeout(void *unused) 568 { 569 bucket_enable(); 570 zone_foreach(zone_timeout, NULL); 571 572 /* Reschedule this event */ 573 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 574 } 575 576 /* 577 * Update the working set size estimate for the zone's bucket cache. 578 * The constants chosen here are somewhat arbitrary. With an update period of 579 * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the 580 * last 100s. 581 */ 582 static void 583 zone_domain_update_wss(uma_zone_domain_t zdom) 584 { 585 long wss; 586 587 MPASS(zdom->uzd_imax >= zdom->uzd_imin); 588 wss = zdom->uzd_imax - zdom->uzd_imin; 589 zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems; 590 zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5; 591 } 592 593 /* 594 * Routine to perform timeout driven calculations. This expands the 595 * hashes and does per cpu statistics aggregation. 596 * 597 * Returns nothing. 598 */ 599 static void 600 zone_timeout(uma_zone_t zone, void *unused) 601 { 602 uma_keg_t keg; 603 u_int slabs; 604 605 if ((zone->uz_flags & UMA_ZONE_HASH) == 0) 606 goto update_wss; 607 608 keg = zone->uz_keg; 609 KEG_LOCK(keg); 610 /* 611 * Expand the keg hash table. 612 * 613 * This is done if the number of slabs is larger than the hash size. 614 * What I'm trying to do here is completely reduce collisions. This 615 * may be a little aggressive. Should I allow for two collisions max? 616 */ 617 if (keg->uk_flags & UMA_ZONE_HASH && 618 (slabs = keg->uk_pages / keg->uk_ppera) > 619 keg->uk_hash.uh_hashsize) { 620 struct uma_hash newhash; 621 struct uma_hash oldhash; 622 int ret; 623 624 /* 625 * This is so involved because allocating and freeing 626 * while the keg lock is held will lead to deadlock. 627 * I have to do everything in stages and check for 628 * races. 
		 */
		KEG_UNLOCK(keg);
		ret = hash_alloc(&newhash, 1 << fls(slabs));
		KEG_LOCK(keg);
		if (ret) {
			if (hash_expand(&keg->uk_hash, &newhash)) {
				oldhash = keg->uk_hash;
				keg->uk_hash = newhash;
			} else
				oldhash = newhash;

			KEG_UNLOCK(keg);
			hash_free(&oldhash);
			return;
		}
	}
	KEG_UNLOCK(keg);

update_wss:
	ZONE_LOCK(zone);
	for (int i = 0; i < vm_ndomains; i++)
		zone_domain_update_wss(&zone->uz_domain[i]);
	ZONE_UNLOCK(zone);
}

/*
 * Allocate and zero fill the next sized hash table from the appropriate
 * backing store.
 *
 * Arguments:
 *	hash  A new hash structure to fill in
 *	size  The desired number of hash buckets; must be a power of 2
 *
 * Returns:
 *	1 on success and 0 on failure.
 */
static int
hash_alloc(struct uma_hash *hash, u_int size)
{
	size_t alloc;

	KASSERT(powerof2(size), ("hash size must be power of 2"));
	if (size > UMA_HASH_SIZE_INIT) {
		hash->uh_hashsize = size;
		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
		hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT);
	} else {
		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
		    UMA_ANYDOMAIN, M_WAITOK);
		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
	}
	if (hash->uh_slab_hash) {
		bzero(hash->uh_slab_hash, alloc);
		hash->uh_hashmask = hash->uh_hashsize - 1;
		return (1);
	}

	return (0);
}

/*
 * Expands the hash table for HASH zones.  This is done from zone_timeout
 * to reduce collisions.  This must not be done in the regular allocation
 * path, otherwise, we can recurse on the vm while allocating pages.
 *
 * Arguments:
 *	oldhash  The hash you want to expand
 *	newhash  The hash structure for the new table
 *
 * Returns:
 *	1 if the old entries were rehashed into the new table, 0 otherwise.
 */
static int
hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
{
	uma_hash_slab_t slab;
	u_int hval;
	u_int idx;

	if (!newhash->uh_slab_hash)
		return (0);

	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
		return (0);

	/*
	 * I need to investigate hash algorithms for resizing without a
	 * full rehash.
	 */

	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
		while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
			slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]);
			LIST_REMOVE(slab, uhs_hlink);
			hval = UMA_HASH(newhash, slab->uhs_data);
			LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
			    slab, uhs_hlink);
		}

	return (1);
}

/*
 * Free the hash bucket to the appropriate backing store.
 *
 * Arguments:
 *	hash  The hash structure whose bucket array is being freed; its
 *	      uh_hashsize indicates which backing store it came from
 *
 * Returns:
 *	Nothing
 */
static void
hash_free(struct uma_hash *hash)
{
	if (hash->uh_slab_hash == NULL)
		return;
	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
	else
		free(hash->uh_slab_hash, M_UMAHASH);
}

/*
 * Frees all outstanding items in a bucket
 *
 * Arguments:
 *	zone   The zone to free to, must be unlocked.
 *	bucket The free/alloc bucket with items, cpu queue must be locked.
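 *
 * If the zone enforces an item limit, the number of outstanding items is
 * adjusted under the zone lock and any sleeper waiting on the limit is
 * woken.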
 *
 * Returns:
 *	Nothing
 */

static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
	int i;

	if (bucket == NULL)
		return;

	if (zone->uz_fini)
		for (i = 0; i < bucket->ub_cnt; i++)
			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
	if (zone->uz_max_items > 0) {
		ZONE_LOCK(zone);
		zone->uz_items -= bucket->ub_cnt;
		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
			wakeup_one(zone);
		ZONE_UNLOCK(zone);
	}
	bucket->ub_cnt = 0;
}

/*
 * Drains the per cpu caches for a zone.
 *
 * NOTE: This may only be called while the zone is being torn down, and not
 * during normal operation.  This is necessary in order that we do not have
 * to migrate CPUs to drain the per-CPU caches.
 *
 * Arguments:
 *	zone     The zone to drain, must be unlocked.
 *
 * Returns:
 *	Nothing
 */
static void
cache_drain(uma_zone_t zone)
{
	uma_cache_t cache;
	int cpu;

	/*
	 * XXX: It is safe to not lock the per-CPU caches, because we're
	 * tearing down the zone anyway.  I.e., there will be no further use
	 * of the caches at this point.
	 *
	 * XXX: It would be good to be able to assert that the zone is being
	 * torn down to prevent improper use of cache_drain().
	 *
	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
	 * it is used elsewhere.  Should the tear-down path be made special
	 * there in some form?
	 */
	CPU_FOREACH(cpu) {
		cache = &zone->uz_cpu[cpu];
		bucket_drain(zone, cache->uc_allocbucket);
		if (cache->uc_allocbucket != NULL)
			bucket_free(zone, cache->uc_allocbucket, NULL);
		cache->uc_allocbucket = NULL;
		bucket_drain(zone, cache->uc_freebucket);
		if (cache->uc_freebucket != NULL)
			bucket_free(zone, cache->uc_freebucket, NULL);
		cache->uc_freebucket = NULL;
		bucket_drain(zone, cache->uc_crossbucket);
		if (cache->uc_crossbucket != NULL)
			bucket_free(zone, cache->uc_crossbucket, NULL);
		cache->uc_crossbucket = NULL;
	}
	ZONE_LOCK(zone);
	bucket_cache_reclaim(zone, true);
	ZONE_UNLOCK(zone);
}

static void
cache_shrink(uma_zone_t zone, void *unused)
{

	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
		return;

	ZONE_LOCK(zone);
	zone->uz_bucket_size =
	    (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2;
	ZONE_UNLOCK(zone);
}

static void
cache_drain_safe_cpu(uma_zone_t zone, void *unused)
{
	uma_cache_t cache;
	uma_bucket_t b1, b2, b3;
	int domain;

	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
		return;

	b1 = b2 = b3 = NULL;
	ZONE_LOCK(zone);
	critical_enter();
	if (zone->uz_flags & UMA_ZONE_NUMA)
		domain = PCPU_GET(domain);
	else
		domain = 0;
	cache = &zone->uz_cpu[curcpu];
	if (cache->uc_allocbucket) {
		if (cache->uc_allocbucket->ub_cnt != 0)
			zone_put_bucket(zone, &zone->uz_domain[domain],
			    cache->uc_allocbucket, false);
		else
			b1 = cache->uc_allocbucket;
		cache->uc_allocbucket = NULL;
	}
	if (cache->uc_freebucket) {
		if (cache->uc_freebucket->ub_cnt != 0)
			zone_put_bucket(zone, &zone->uz_domain[domain],
			    cache->uc_freebucket, false);
		else
			b2 = cache->uc_freebucket;
		cache->uc_freebucket = NULL;
	}
	b3 = cache->uc_crossbucket;
	cache->uc_crossbucket = NULL;
	critical_exit();
	ZONE_UNLOCK(zone);
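	/*
	 * Free the detached buckets outside the zone lock and the critical
	 * section; the cross bucket is drained before it is freed.
	 */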
	if (b1)
		bucket_free(zone, b1, NULL);
	if (b2)
		bucket_free(zone, b2, NULL);
	if (b3) {
		bucket_drain(zone, b3);
		bucket_free(zone, b3, NULL);
	}
}

/*
 * Safely drain per-CPU caches of a zone(s) to the zone's bucket cache.
 * This is an expensive call because it needs to bind to all CPUs
 * one by one and enter a critical section on each of them in order
 * to safely access their cache buckets.
 * Zone lock must not be held when calling this function.
 */
static void
pcpu_cache_drain_safe(uma_zone_t zone)
{
	int cpu;

	/*
	 * Polite bucket size shrinking was not enough, shrink aggressively.
	 */
	if (zone)
		cache_shrink(zone, NULL);
	else
		zone_foreach(cache_shrink, NULL);

	CPU_FOREACH(cpu) {
		thread_lock(curthread);
		sched_bind(curthread, cpu);
		thread_unlock(curthread);

		if (zone)
			cache_drain_safe_cpu(zone, NULL);
		else
			zone_foreach(cache_drain_safe_cpu, NULL);
	}
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);
}

/*
 * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
 * requested a drain, otherwise the per-domain caches are trimmed to their
 * estimated working set size.
 */
static void
bucket_cache_reclaim(uma_zone_t zone, bool drain)
{
	uma_zone_domain_t zdom;
	uma_bucket_t bucket;
	long target, tofree;
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		zdom = &zone->uz_domain[i];

		/*
		 * If we were asked to drain the zone, we are done only once
		 * this bucket cache is empty.  Otherwise, we reclaim items in
		 * excess of the zone's estimated working set size.  If the
		 * difference nitems - imin is larger than the WSS estimate,
		 * then the estimate will grow at the end of this interval and
		 * we ignore the historical average.
		 */
		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
		    zdom->uzd_imin);
		while (zdom->uzd_nitems > target) {
			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
			if (bucket == NULL)
				break;
			tofree = bucket->ub_cnt;
			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
			zdom->uzd_nitems -= tofree;

			/*
			 * Shift the bounds of the current WSS interval to avoid
			 * perturbing the estimate.
			 */
			zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
			zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);

			ZONE_UNLOCK(zone);
			bucket_drain(zone, bucket);
			bucket_free(zone, bucket, NULL);
			ZONE_LOCK(zone);
		}
	}

	/*
	 * Shrink the zone bucket size to ensure that the per-CPU caches
	 * don't grow too large.
	 */
	if (zone->uz_bucket_size > zone->uz_bucket_size_min)
		zone->uz_bucket_size--;
}

static void
keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
{
	uint8_t *mem;
	int i;
	uint8_t flags;

	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);

	mem = slab_data(slab, keg);
	flags = slab->us_flags;
	i = start;
	if (keg->uk_fini != NULL) {
		for (i--; i > -1; i--)
#ifdef INVARIANTS
		/*
		 * trash_fini implies that dtor was trash_dtor.  trash_fini
		 * would check that memory hasn't been modified since free,
		 * which executed trash_dtor.
		 * That's why we need to run uma_dbg_kskip() check here,
		 * albeit we don't make skip check for other init/fini
		 * invocations.
1013 */ 1014 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || 1015 keg->uk_fini != trash_fini) 1016 #endif 1017 keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); 1018 } 1019 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1020 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); 1021 keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); 1022 uma_total_dec(PAGE_SIZE * keg->uk_ppera); 1023 } 1024 1025 /* 1026 * Frees pages from a keg back to the system. This is done on demand from 1027 * the pageout daemon. 1028 * 1029 * Returns nothing. 1030 */ 1031 static void 1032 keg_drain(uma_keg_t keg) 1033 { 1034 struct slabhead freeslabs = { 0 }; 1035 uma_domain_t dom; 1036 uma_slab_t slab, tmp; 1037 int i; 1038 1039 /* 1040 * We don't want to take pages from statically allocated kegs at this 1041 * time 1042 */ 1043 if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) 1044 return; 1045 1046 CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u", 1047 keg->uk_name, keg, keg->uk_free); 1048 KEG_LOCK(keg); 1049 if (keg->uk_free == 0) 1050 goto finished; 1051 1052 for (i = 0; i < vm_ndomains; i++) { 1053 dom = &keg->uk_domain[i]; 1054 LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { 1055 /* We have nowhere to free these to. */ 1056 if (slab->us_flags & UMA_SLAB_BOOT) 1057 continue; 1058 1059 LIST_REMOVE(slab, us_link); 1060 keg->uk_pages -= keg->uk_ppera; 1061 keg->uk_free -= keg->uk_ipers; 1062 1063 if (keg->uk_flags & UMA_ZONE_HASH) 1064 UMA_HASH_REMOVE(&keg->uk_hash, slab); 1065 1066 LIST_INSERT_HEAD(&freeslabs, slab, us_link); 1067 } 1068 } 1069 1070 finished: 1071 KEG_UNLOCK(keg); 1072 1073 while ((slab = LIST_FIRST(&freeslabs)) != NULL) { 1074 LIST_REMOVE(slab, us_link); 1075 keg_free_slab(keg, slab, keg->uk_ipers); 1076 } 1077 } 1078 1079 static void 1080 zone_reclaim(uma_zone_t zone, int waitok, bool drain) 1081 { 1082 1083 /* 1084 * Set draining to interlock with zone_dtor() so we can release our 1085 * locks as we go. Only dtor() should do a WAITOK call since it 1086 * is the only call that knows the structure will still be available 1087 * when it wakes up. 1088 */ 1089 ZONE_LOCK(zone); 1090 while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) { 1091 if (waitok == M_NOWAIT) 1092 goto out; 1093 msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1); 1094 } 1095 zone->uz_flags |= UMA_ZFLAG_RECLAIMING; 1096 bucket_cache_reclaim(zone, drain); 1097 ZONE_UNLOCK(zone); 1098 1099 /* 1100 * The DRAINING flag protects us from being freed while 1101 * we're running. Normally the uma_rwlock would protect us but we 1102 * must be able to release and acquire the right lock for each keg. 1103 */ 1104 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) 1105 keg_drain(zone->uz_keg); 1106 ZONE_LOCK(zone); 1107 zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING; 1108 wakeup(zone); 1109 out: 1110 ZONE_UNLOCK(zone); 1111 } 1112 1113 static void 1114 zone_drain(uma_zone_t zone, void *unused) 1115 { 1116 1117 zone_reclaim(zone, M_NOWAIT, true); 1118 } 1119 1120 static void 1121 zone_trim(uma_zone_t zone, void *unused) 1122 { 1123 1124 zone_reclaim(zone, M_NOWAIT, false); 1125 } 1126 1127 /* 1128 * Allocate a new slab for a keg. This does not insert the slab onto a list. 1129 * If the allocation was successful, the keg lock will be held upon return, 1130 * otherwise the keg will be left unlocked. 
1131 * 1132 * Arguments: 1133 * flags Wait flags for the item initialization routine 1134 * aflags Wait flags for the slab allocation 1135 * 1136 * Returns: 1137 * The slab that was allocated or NULL if there is no memory and the 1138 * caller specified M_NOWAIT. 1139 */ 1140 static uma_slab_t 1141 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, 1142 int aflags) 1143 { 1144 uma_alloc allocf; 1145 uma_slab_t slab; 1146 unsigned long size; 1147 uint8_t *mem; 1148 uint8_t sflags; 1149 int i; 1150 1151 KASSERT(domain >= 0 && domain < vm_ndomains, 1152 ("keg_alloc_slab: domain %d out of range", domain)); 1153 KEG_LOCK_ASSERT(keg); 1154 MPASS(zone->uz_lockptr == &keg->uk_lock); 1155 1156 allocf = keg->uk_allocf; 1157 KEG_UNLOCK(keg); 1158 1159 slab = NULL; 1160 mem = NULL; 1161 if (keg->uk_flags & UMA_ZONE_OFFPAGE) { 1162 slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags); 1163 if (slab == NULL) 1164 goto out; 1165 } 1166 1167 /* 1168 * This reproduces the old vm_zone behavior of zero filling pages the 1169 * first time they are added to a zone. 1170 * 1171 * Malloced items are zeroed in uma_zalloc. 1172 */ 1173 1174 if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) 1175 aflags |= M_ZERO; 1176 else 1177 aflags &= ~M_ZERO; 1178 1179 if (keg->uk_flags & UMA_ZONE_NODUMP) 1180 aflags |= M_NODUMP; 1181 1182 /* zone is passed for legacy reasons. */ 1183 size = keg->uk_ppera * PAGE_SIZE; 1184 mem = allocf(zone, size, domain, &sflags, aflags); 1185 if (mem == NULL) { 1186 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1187 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); 1188 slab = NULL; 1189 goto out; 1190 } 1191 uma_total_inc(size); 1192 1193 /* Point the slab into the allocated memory */ 1194 if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) 1195 slab = (uma_slab_t )(mem + keg->uk_pgoff); 1196 else 1197 ((uma_hash_slab_t)slab)->uhs_data = mem; 1198 1199 if (keg->uk_flags & UMA_ZONE_VTOSLAB) 1200 for (i = 0; i < keg->uk_ppera; i++) 1201 vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), 1202 zone, slab); 1203 1204 slab->us_freecount = keg->uk_ipers; 1205 slab->us_flags = sflags; 1206 slab->us_domain = domain; 1207 BIT_FILL(keg->uk_ipers, &slab->us_free); 1208 #ifdef INVARIANTS 1209 BIT_ZERO(keg->uk_ipers, slab_dbg_bits(slab, keg)); 1210 #endif 1211 1212 if (keg->uk_init != NULL) { 1213 for (i = 0; i < keg->uk_ipers; i++) 1214 if (keg->uk_init(slab_item(slab, keg, i), 1215 keg->uk_size, flags) != 0) 1216 break; 1217 if (i != keg->uk_ipers) { 1218 keg_free_slab(keg, slab, i); 1219 slab = NULL; 1220 goto out; 1221 } 1222 } 1223 KEG_LOCK(keg); 1224 1225 CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", 1226 slab, keg->uk_name, keg); 1227 1228 if (keg->uk_flags & UMA_ZONE_HASH) 1229 UMA_HASH_INSERT(&keg->uk_hash, slab, mem); 1230 1231 keg->uk_pages += keg->uk_ppera; 1232 keg->uk_free += keg->uk_ipers; 1233 1234 out: 1235 return (slab); 1236 } 1237 1238 /* 1239 * This function is intended to be used early on in place of page_alloc() so 1240 * that we may use the boot time page cache to satisfy allocations before 1241 * the VM is ready. 1242 */ 1243 static void * 1244 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1245 int wait) 1246 { 1247 uma_keg_t keg; 1248 void *mem; 1249 int pages; 1250 1251 keg = zone->uz_keg; 1252 /* 1253 * If we are in BOOT_BUCKETS or higher, than switch to real 1254 * allocator. Zones with page sized slabs switch at BOOT_PAGEALLOC. 
1255 */ 1256 switch (booted) { 1257 case BOOT_COLD: 1258 case BOOT_STRAPPED: 1259 break; 1260 case BOOT_PAGEALLOC: 1261 if (keg->uk_ppera > 1) 1262 break; 1263 case BOOT_BUCKETS: 1264 case BOOT_RUNNING: 1265 #ifdef UMA_MD_SMALL_ALLOC 1266 keg->uk_allocf = (keg->uk_ppera > 1) ? 1267 page_alloc : uma_small_alloc; 1268 #else 1269 keg->uk_allocf = page_alloc; 1270 #endif 1271 return keg->uk_allocf(zone, bytes, domain, pflag, wait); 1272 } 1273 1274 /* 1275 * Check our small startup cache to see if it has pages remaining. 1276 */ 1277 pages = howmany(bytes, PAGE_SIZE); 1278 KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); 1279 if (pages > boot_pages) 1280 panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name); 1281 #ifdef DIAGNOSTIC 1282 printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name, 1283 boot_pages); 1284 #endif 1285 mem = bootmem; 1286 boot_pages -= pages; 1287 bootmem += pages * PAGE_SIZE; 1288 *pflag = UMA_SLAB_BOOT; 1289 1290 return (mem); 1291 } 1292 1293 /* 1294 * Allocates a number of pages from the system 1295 * 1296 * Arguments: 1297 * bytes The number of bytes requested 1298 * wait Shall we wait? 1299 * 1300 * Returns: 1301 * A pointer to the alloced memory or possibly 1302 * NULL if M_NOWAIT is set. 1303 */ 1304 static void * 1305 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1306 int wait) 1307 { 1308 void *p; /* Returned page */ 1309 1310 *pflag = UMA_SLAB_KERNEL; 1311 p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); 1312 1313 return (p); 1314 } 1315 1316 static void * 1317 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1318 int wait) 1319 { 1320 struct pglist alloctail; 1321 vm_offset_t addr, zkva; 1322 int cpu, flags; 1323 vm_page_t p, p_next; 1324 #ifdef NUMA 1325 struct pcpu *pc; 1326 #endif 1327 1328 MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); 1329 1330 TAILQ_INIT(&alloctail); 1331 flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1332 malloc2vm_flags(wait); 1333 *pflag = UMA_SLAB_KERNEL; 1334 for (cpu = 0; cpu <= mp_maxid; cpu++) { 1335 if (CPU_ABSENT(cpu)) { 1336 p = vm_page_alloc(NULL, 0, flags); 1337 } else { 1338 #ifndef NUMA 1339 p = vm_page_alloc(NULL, 0, flags); 1340 #else 1341 pc = pcpu_find(cpu); 1342 p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags); 1343 if (__predict_false(p == NULL)) 1344 p = vm_page_alloc(NULL, 0, flags); 1345 #endif 1346 } 1347 if (__predict_false(p == NULL)) 1348 goto fail; 1349 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1350 } 1351 if ((addr = kva_alloc(bytes)) == 0) 1352 goto fail; 1353 zkva = addr; 1354 TAILQ_FOREACH(p, &alloctail, listq) { 1355 pmap_qenter(zkva, &p, 1); 1356 zkva += PAGE_SIZE; 1357 } 1358 return ((void*)addr); 1359 fail: 1360 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1361 vm_page_unwire_noq(p); 1362 vm_page_free(p); 1363 } 1364 return (NULL); 1365 } 1366 1367 /* 1368 * Allocates a number of pages from within an object 1369 * 1370 * Arguments: 1371 * bytes The number of bytes requested 1372 * wait Shall we wait? 1373 * 1374 * Returns: 1375 * A pointer to the alloced memory or possibly 1376 * NULL if M_NOWAIT is set. 
1377 */ 1378 static void * 1379 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 1380 int wait) 1381 { 1382 TAILQ_HEAD(, vm_page) alloctail; 1383 u_long npages; 1384 vm_offset_t retkva, zkva; 1385 vm_page_t p, p_next; 1386 uma_keg_t keg; 1387 1388 TAILQ_INIT(&alloctail); 1389 keg = zone->uz_keg; 1390 1391 npages = howmany(bytes, PAGE_SIZE); 1392 while (npages > 0) { 1393 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | 1394 VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1395 ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : 1396 VM_ALLOC_NOWAIT)); 1397 if (p != NULL) { 1398 /* 1399 * Since the page does not belong to an object, its 1400 * listq is unused. 1401 */ 1402 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1403 npages--; 1404 continue; 1405 } 1406 /* 1407 * Page allocation failed, free intermediate pages and 1408 * exit. 1409 */ 1410 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1411 vm_page_unwire_noq(p); 1412 vm_page_free(p); 1413 } 1414 return (NULL); 1415 } 1416 *flags = UMA_SLAB_PRIV; 1417 zkva = keg->uk_kva + 1418 atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); 1419 retkva = zkva; 1420 TAILQ_FOREACH(p, &alloctail, listq) { 1421 pmap_qenter(zkva, &p, 1); 1422 zkva += PAGE_SIZE; 1423 } 1424 1425 return ((void *)retkva); 1426 } 1427 1428 /* 1429 * Frees a number of pages to the system 1430 * 1431 * Arguments: 1432 * mem A pointer to the memory to be freed 1433 * size The size of the memory being freed 1434 * flags The original p->us_flags field 1435 * 1436 * Returns: 1437 * Nothing 1438 */ 1439 static void 1440 page_free(void *mem, vm_size_t size, uint8_t flags) 1441 { 1442 1443 if ((flags & UMA_SLAB_KERNEL) == 0) 1444 panic("UMA: page_free used with invalid flags %x", flags); 1445 1446 kmem_free((vm_offset_t)mem, size); 1447 } 1448 1449 /* 1450 * Frees pcpu zone allocations 1451 * 1452 * Arguments: 1453 * mem A pointer to the memory to be freed 1454 * size The size of the memory being freed 1455 * flags The original p->us_flags field 1456 * 1457 * Returns: 1458 * Nothing 1459 */ 1460 static void 1461 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) 1462 { 1463 vm_offset_t sva, curva; 1464 vm_paddr_t paddr; 1465 vm_page_t m; 1466 1467 MPASS(size == (mp_maxid+1)*PAGE_SIZE); 1468 sva = (vm_offset_t)mem; 1469 for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { 1470 paddr = pmap_kextract(curva); 1471 m = PHYS_TO_VM_PAGE(paddr); 1472 vm_page_unwire_noq(m); 1473 vm_page_free(m); 1474 } 1475 pmap_qremove(sva, size >> PAGE_SHIFT); 1476 kva_free(sva, size); 1477 } 1478 1479 1480 /* 1481 * Zero fill initializer 1482 * 1483 * Arguments/Returns follow uma_init specifications 1484 */ 1485 static int 1486 zero_init(void *mem, int size, int flags) 1487 { 1488 bzero(mem, size); 1489 return (0); 1490 } 1491 1492 #ifdef INVARIANTS 1493 struct noslabbits * 1494 slab_dbg_bits(uma_slab_t slab, uma_keg_t keg) 1495 { 1496 1497 return ((void *)((char *)&slab->us_free + BITSET_SIZE(keg->uk_ipers))); 1498 } 1499 #endif 1500 1501 /* 1502 * Actual size of embedded struct slab (!OFFPAGE). 1503 */ 1504 size_t 1505 slab_sizeof(int nitems) 1506 { 1507 size_t s; 1508 1509 s = sizeof(struct uma_slab) + BITSET_SIZE(nitems) * SLAB_BITSETS; 1510 return (roundup(s, UMA_ALIGN_PTR + 1)); 1511 } 1512 1513 /* 1514 * Size of memory for embedded slabs (!OFFPAGE). 
1515 */ 1516 size_t 1517 slab_space(int nitems) 1518 { 1519 return (UMA_SLAB_SIZE - slab_sizeof(nitems)); 1520 } 1521 1522 /* 1523 * Compute the number of items that will fit in an embedded (!OFFPAGE) slab 1524 * with a given size and alignment. 1525 */ 1526 int 1527 slab_ipers(size_t size, int align) 1528 { 1529 int rsize; 1530 int nitems; 1531 1532 /* 1533 * Compute the ideal number of items that will fit in a page and 1534 * then compute the actual number based on a bitset nitems wide. 1535 */ 1536 rsize = roundup(size, align + 1); 1537 nitems = UMA_SLAB_SIZE / rsize; 1538 return (slab_space(nitems) / rsize); 1539 } 1540 1541 /* 1542 * Finish creating a small uma keg. This calculates ipers, and the keg size. 1543 * 1544 * Arguments 1545 * keg The zone we should initialize 1546 * 1547 * Returns 1548 * Nothing 1549 */ 1550 static void 1551 keg_small_init(uma_keg_t keg) 1552 { 1553 u_int rsize; 1554 u_int memused; 1555 u_int wastedspace; 1556 u_int shsize; 1557 u_int slabsize; 1558 1559 if (keg->uk_flags & UMA_ZONE_PCPU) { 1560 u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU; 1561 1562 slabsize = UMA_PCPU_ALLOC_SIZE; 1563 keg->uk_ppera = ncpus; 1564 } else { 1565 slabsize = UMA_SLAB_SIZE; 1566 keg->uk_ppera = 1; 1567 } 1568 1569 /* 1570 * Calculate the size of each allocation (rsize) according to 1571 * alignment. If the requested size is smaller than we have 1572 * allocation bits for we round it up. 1573 */ 1574 rsize = keg->uk_size; 1575 if (rsize < slabsize / SLAB_MAX_SETSIZE) 1576 rsize = slabsize / SLAB_MAX_SETSIZE; 1577 if (rsize & keg->uk_align) 1578 rsize = roundup(rsize, keg->uk_align + 1); 1579 keg->uk_rsize = rsize; 1580 1581 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || 1582 keg->uk_rsize < UMA_PCPU_ALLOC_SIZE, 1583 ("%s: size %u too large", __func__, keg->uk_rsize)); 1584 1585 /* 1586 * Use a pessimistic bit count for shsize. It may be possible to 1587 * squeeze one more item in for very particular sizes if we were 1588 * to loop and reduce the bitsize if there is waste. 1589 */ 1590 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1591 shsize = 0; 1592 else 1593 shsize = slab_sizeof(slabsize / rsize); 1594 1595 if (rsize <= slabsize - shsize) 1596 keg->uk_ipers = (slabsize - shsize) / rsize; 1597 else { 1598 /* Handle special case when we have 1 item per slab, so 1599 * alignment requirement can be relaxed. */ 1600 KASSERT(keg->uk_size <= slabsize - shsize, 1601 ("%s: size %u greater than slab", __func__, keg->uk_size)); 1602 keg->uk_ipers = 1; 1603 } 1604 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, 1605 ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); 1606 1607 memused = keg->uk_ipers * rsize + shsize; 1608 wastedspace = slabsize - memused; 1609 1610 /* 1611 * We can't do OFFPAGE if we're internal or if we've been 1612 * asked to not go to the VM for buckets. If we do this we 1613 * may end up going to the VM for slabs which we do not 1614 * want to do if we're UMA_ZFLAG_CACHEONLY as a result 1615 * of UMA_ZONE_VM, which clearly forbids it. 1616 */ 1617 if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || 1618 (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) 1619 return; 1620 1621 /* 1622 * See if using an OFFPAGE slab will limit our waste. Only do 1623 * this if it permits more items per-slab. 1624 * 1625 * XXX We could try growing slabsize to limit max waste as well. 1626 * Historically this was not done because the VM could not 1627 * efficiently handle contiguous allocations. 
1628 */ 1629 if ((wastedspace >= slabsize / UMA_MAX_WASTE) && 1630 (keg->uk_ipers < (slabsize / keg->uk_rsize))) { 1631 keg->uk_ipers = slabsize / keg->uk_rsize; 1632 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, 1633 ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); 1634 CTR6(KTR_UMA, "UMA decided we need offpage slab headers for " 1635 "keg: %s(%p), calculated wastedspace = %d, " 1636 "maximum wasted space allowed = %d, " 1637 "calculated ipers = %d, " 1638 "new wasted space = %d\n", keg->uk_name, keg, wastedspace, 1639 slabsize / UMA_MAX_WASTE, keg->uk_ipers, 1640 slabsize - keg->uk_ipers * keg->uk_rsize); 1641 /* 1642 * If we had access to memory to embed a slab header we 1643 * also have a page structure to use vtoslab() instead of 1644 * hash to find slabs. If the zone was explicitly created 1645 * OFFPAGE we can't necessarily touch the memory. 1646 */ 1647 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) 1648 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1649 } 1650 1651 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && 1652 (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) 1653 keg->uk_flags |= UMA_ZONE_HASH; 1654 } 1655 1656 /* 1657 * Finish creating a large (> UMA_SLAB_SIZE) uma kegs. Just give in and do 1658 * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be 1659 * more complicated. 1660 * 1661 * Arguments 1662 * keg The keg we should initialize 1663 * 1664 * Returns 1665 * Nothing 1666 */ 1667 static void 1668 keg_large_init(uma_keg_t keg) 1669 { 1670 1671 KASSERT(keg != NULL, ("Keg is null in keg_large_init")); 1672 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, 1673 ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__)); 1674 1675 keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE); 1676 keg->uk_ipers = 1; 1677 keg->uk_rsize = keg->uk_size; 1678 1679 /* Check whether we have enough space to not do OFFPAGE. */ 1680 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 && 1681 PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < 1682 slab_sizeof(SLAB_MIN_SETSIZE)) { 1683 /* 1684 * We can't do OFFPAGE if we're internal, in which case 1685 * we need an extra page per allocation to contain the 1686 * slab header. 1687 */ 1688 if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0) 1689 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1690 else 1691 keg->uk_ppera++; 1692 } 1693 1694 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && 1695 (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) 1696 keg->uk_flags |= UMA_ZONE_HASH; 1697 } 1698 1699 static void 1700 keg_cachespread_init(uma_keg_t keg) 1701 { 1702 int alignsize; 1703 int trailer; 1704 int pages; 1705 int rsize; 1706 1707 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, 1708 ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__)); 1709 1710 alignsize = keg->uk_align + 1; 1711 rsize = keg->uk_size; 1712 /* 1713 * We want one item to start on every align boundary in a page. To 1714 * do this we will span pages. We will also extend the item by the 1715 * size of align if it is an even multiple of align. Otherwise, it 1716 * would fall on the same boundary every time. 
1717 */ 1718 if (rsize & keg->uk_align) 1719 rsize = (rsize & ~keg->uk_align) + alignsize; 1720 if ((rsize & alignsize) == 0) 1721 rsize += alignsize; 1722 trailer = rsize - keg->uk_size; 1723 pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE; 1724 pages = MIN(pages, (128 * 1024) / PAGE_SIZE); 1725 keg->uk_rsize = rsize; 1726 keg->uk_ppera = pages; 1727 keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize; 1728 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1729 KASSERT(keg->uk_ipers <= SLAB_MAX_SETSIZE, 1730 ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__, 1731 keg->uk_ipers)); 1732 } 1733 1734 /* 1735 * Keg header ctor. This initializes all fields, locks, etc. And inserts 1736 * the keg onto the global keg list. 1737 * 1738 * Arguments/Returns follow uma_ctor specifications 1739 * udata Actually uma_kctor_args 1740 */ 1741 static int 1742 keg_ctor(void *mem, int size, void *udata, int flags) 1743 { 1744 struct uma_kctor_args *arg = udata; 1745 uma_keg_t keg = mem; 1746 uma_zone_t zone; 1747 1748 bzero(keg, size); 1749 keg->uk_size = arg->size; 1750 keg->uk_init = arg->uminit; 1751 keg->uk_fini = arg->fini; 1752 keg->uk_align = arg->align; 1753 keg->uk_free = 0; 1754 keg->uk_reserve = 0; 1755 keg->uk_pages = 0; 1756 keg->uk_flags = arg->flags; 1757 keg->uk_slabzone = NULL; 1758 1759 /* 1760 * We use a global round-robin policy by default. Zones with 1761 * UMA_ZONE_NUMA set will use first-touch instead, in which case the 1762 * iterator is never run. 1763 */ 1764 keg->uk_dr.dr_policy = DOMAINSET_RR(); 1765 keg->uk_dr.dr_iter = 0; 1766 1767 /* 1768 * The master zone is passed to us at keg-creation time. 1769 */ 1770 zone = arg->zone; 1771 keg->uk_name = zone->uz_name; 1772 1773 if (arg->flags & UMA_ZONE_VM) 1774 keg->uk_flags |= UMA_ZFLAG_CACHEONLY; 1775 1776 if (arg->flags & UMA_ZONE_ZINIT) 1777 keg->uk_init = zero_init; 1778 1779 if (arg->flags & UMA_ZONE_MALLOC) 1780 keg->uk_flags |= UMA_ZONE_VTOSLAB; 1781 1782 if (arg->flags & UMA_ZONE_PCPU) 1783 #ifdef SMP 1784 keg->uk_flags |= UMA_ZONE_OFFPAGE; 1785 #else 1786 keg->uk_flags &= ~UMA_ZONE_PCPU; 1787 #endif 1788 1789 if (keg->uk_flags & UMA_ZONE_CACHESPREAD) { 1790 keg_cachespread_init(keg); 1791 } else { 1792 if (keg->uk_size > slab_space(SLAB_MIN_SETSIZE)) 1793 keg_large_init(keg); 1794 else 1795 keg_small_init(keg); 1796 } 1797 1798 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1799 keg->uk_slabzone = slabzone; 1800 1801 /* 1802 * If we haven't booted yet we need allocations to go through the 1803 * startup cache until the vm is ready. 1804 */ 1805 if (booted < BOOT_PAGEALLOC) 1806 keg->uk_allocf = startup_alloc; 1807 #ifdef UMA_MD_SMALL_ALLOC 1808 else if (keg->uk_ppera == 1) 1809 keg->uk_allocf = uma_small_alloc; 1810 #endif 1811 else if (keg->uk_flags & UMA_ZONE_PCPU) 1812 keg->uk_allocf = pcpu_page_alloc; 1813 else 1814 keg->uk_allocf = page_alloc; 1815 #ifdef UMA_MD_SMALL_ALLOC 1816 if (keg->uk_ppera == 1) 1817 keg->uk_freef = uma_small_free; 1818 else 1819 #endif 1820 if (keg->uk_flags & UMA_ZONE_PCPU) 1821 keg->uk_freef = pcpu_page_free; 1822 else 1823 keg->uk_freef = page_free; 1824 1825 /* 1826 * Initialize keg's lock 1827 */ 1828 KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS)); 1829 1830 /* 1831 * If we're putting the slab header in the actual page we need to 1832 * figure out where in each page it goes. See slab_sizeof 1833 * definition. 
1834 */ 1835 if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { 1836 size_t shsize; 1837 1838 shsize = slab_sizeof(keg->uk_ipers); 1839 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; 1840 /* 1841 * The only way the following is possible is if with our 1842 * UMA_ALIGN_PTR adjustments we are now bigger than 1843 * UMA_SLAB_SIZE. I haven't checked whether this is 1844 * mathematically possible for all cases, so we make 1845 * sure here anyway. 1846 */ 1847 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, 1848 ("zone %s ipers %d rsize %d size %d slab won't fit", 1849 zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); 1850 } 1851 1852 if (keg->uk_flags & UMA_ZONE_HASH) 1853 hash_alloc(&keg->uk_hash, 0); 1854 1855 CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n", 1856 keg, zone->uz_name, zone, 1857 (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, 1858 keg->uk_free); 1859 1860 LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); 1861 1862 rw_wlock(&uma_rwlock); 1863 LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); 1864 rw_wunlock(&uma_rwlock); 1865 return (0); 1866 } 1867 1868 static void 1869 zone_alloc_counters(uma_zone_t zone, void *unused) 1870 { 1871 1872 zone->uz_allocs = counter_u64_alloc(M_WAITOK); 1873 zone->uz_frees = counter_u64_alloc(M_WAITOK); 1874 zone->uz_fails = counter_u64_alloc(M_WAITOK); 1875 } 1876 1877 static void 1878 zone_alloc_sysctl(uma_zone_t zone, void *unused) 1879 { 1880 uma_zone_domain_t zdom; 1881 uma_keg_t keg; 1882 struct sysctl_oid *oid, *domainoid; 1883 int domains, i, cnt; 1884 static const char *nokeg = "cache zone"; 1885 char *c; 1886 1887 /* 1888 * Make a sysctl safe copy of the zone name by removing 1889 * any special characters and handling dups by appending 1890 * an index. 1891 */ 1892 if (zone->uz_namecnt != 0) { 1893 /* Count the number of decimal digits and '_' separator. */ 1894 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) 1895 cnt /= 10; 1896 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, 1897 M_UMA, M_WAITOK); 1898 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, 1899 zone->uz_namecnt); 1900 } else 1901 zone->uz_ctlname = strdup(zone->uz_name, M_UMA); 1902 for (c = zone->uz_ctlname; *c != '\0'; c++) 1903 if (strchr("./\\ -", *c) != NULL) 1904 *c = '_'; 1905 1906 /* 1907 * Basic parameters at the root. 1908 */ 1909 zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), 1910 OID_AUTO, zone->uz_ctlname, CTLFLAG_RD, NULL, ""); 1911 oid = zone->uz_oid; 1912 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1913 "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); 1914 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1915 "flags", CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_MPSAFE, 1916 zone, 0, sysctl_handle_uma_zone_flags, "A", 1917 "Allocator configuration flags"); 1918 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1919 "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, 1920 "Desired per-cpu cache size"); 1921 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1922 "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, 1923 "Maximum allowed per-cpu cache size"); 1924 1925 /* 1926 * keg if present. 
1927 */ 1928 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 1929 "keg", CTLFLAG_RD, NULL, ""); 1930 keg = zone->uz_keg; 1931 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { 1932 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1933 "name", CTLFLAG_RD, keg->uk_name, "Keg name"); 1934 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1935 "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, 1936 "Real object size with alignment"); 1937 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1938 "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, 1939 "pages per-slab allocation"); 1940 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1941 "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, 1942 "items available per-slab"); 1943 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1944 "align", CTLFLAG_RD, &keg->uk_align, 0, 1945 "item alignment mask"); 1946 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1947 "pages", CTLFLAG_RD, &keg->uk_pages, 0, 1948 "Total pages currently allocated from VM"); 1949 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1950 "free", CTLFLAG_RD, &keg->uk_free, 0, 1951 "items free in the slab layer"); 1952 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1953 "efficiency", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 1954 keg, 0, sysctl_handle_uma_slab_efficiency, "I", 1955 "Slab utilization (100 - internal fragmentation %)"); 1956 } else 1957 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1958 "name", CTLFLAG_RD, nokeg, "Keg name"); 1959 1960 /* 1961 * Information about zone limits. 1962 */ 1963 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 1964 "limit", CTLFLAG_RD, NULL, ""); 1965 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1966 "items", CTLFLAG_RD, &zone->uz_items, 0, 1967 "current number of cached items"); 1968 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1969 "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, 1970 "Maximum number of cached items"); 1971 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1972 "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, 1973 "Number of threads sleeping at limit"); 1974 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1975 "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, 1976 "Total zone limit sleeps"); 1977 1978 /* 1979 * Per-domain information. 1980 */ 1981 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 1982 domains = vm_ndomains; 1983 else 1984 domains = 1; 1985 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), 1986 OID_AUTO, "domain", CTLFLAG_RD, NULL, ""); 1987 for (i = 0; i < domains; i++) { 1988 zdom = &zone->uz_domain[i]; 1989 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), 1990 OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD, NULL, ""); 1991 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1992 "nitems", CTLFLAG_RD, &zdom->uzd_nitems, 1993 "number of items in this domain"); 1994 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1995 "imax", CTLFLAG_RD, &zdom->uzd_imax, 1996 "maximum item count in this period"); 1997 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1998 "imin", CTLFLAG_RD, &zdom->uzd_imin, 1999 "minimum item count in this period"); 2000 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2001 "wss", CTLFLAG_RD, &zdom->uzd_wss, 2002 "Working set size"); 2003 } 2004 2005 /* 2006 * General statistics. 
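 * These appear as vm.uma.<zone>.stats.{current,allocs,frees,fails,
 * xdomain}; current, allocs and frees are served by sysctl handler
 * procedures, while fails and xdomain export the underlying counters
 * directly.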
2007 */ 2008 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 2009 "stats", CTLFLAG_RD, NULL, ""); 2010 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2011 "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 2012 zone, 1, sysctl_handle_uma_zone_cur, "I", 2013 "Current number of allocated items"); 2014 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2015 "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2016 zone, 0, sysctl_handle_uma_zone_allocs, "QU", 2017 "Total allocation calls"); 2018 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2019 "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2020 zone, 0, sysctl_handle_uma_zone_frees, "QU", 2021 "Total free calls"); 2022 SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2023 "fails", CTLFLAG_RD, &zone->uz_fails, 2024 "Number of allocation failures"); 2025 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2026 "xdomain", CTLFLAG_RD, &zone->uz_xdomain, 0, 2027 "Free calls from the wrong domain"); 2028 } 2029 2030 struct uma_zone_count { 2031 const char *name; 2032 int count; 2033 }; 2034 2035 static void 2036 zone_count(uma_zone_t zone, void *arg) 2037 { 2038 struct uma_zone_count *cnt; 2039 2040 cnt = arg; 2041 /* 2042 * Some zones are rapidly created with identical names and 2043 * destroyed out of order. This can lead to gaps in the count. 2044 * Use one greater than the maximum observed for this name. 2045 */ 2046 if (strcmp(zone->uz_name, cnt->name) == 0) 2047 cnt->count = MAX(cnt->count, 2048 zone->uz_namecnt + 1); 2049 } 2050 2051 /* 2052 * Zone header ctor. This initializes all fields, locks, etc. 2053 * 2054 * Arguments/Returns follow uma_ctor specifications 2055 * udata Actually uma_zctor_args 2056 */ 2057 static int 2058 zone_ctor(void *mem, int size, void *udata, int flags) 2059 { 2060 struct uma_zone_count cnt; 2061 struct uma_zctor_args *arg = udata; 2062 uma_zone_t zone = mem; 2063 uma_zone_t z; 2064 uma_keg_t keg; 2065 int i; 2066 2067 bzero(zone, size); 2068 zone->uz_name = arg->name; 2069 zone->uz_ctor = arg->ctor; 2070 zone->uz_dtor = arg->dtor; 2071 zone->uz_init = NULL; 2072 zone->uz_fini = NULL; 2073 zone->uz_sleeps = 0; 2074 zone->uz_xdomain = 0; 2075 zone->uz_bucket_size = 0; 2076 zone->uz_bucket_size_min = 0; 2077 zone->uz_bucket_size_max = BUCKET_MAX; 2078 zone->uz_flags = 0; 2079 zone->uz_warning = NULL; 2080 /* The domain structures follow the cpu structures. */ 2081 zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; 2082 zone->uz_bkt_max = ULONG_MAX; 2083 timevalclear(&zone->uz_ratecheck); 2084 2085 /* Count the number of duplicate names. */ 2086 cnt.name = arg->name; 2087 cnt.count = 0; 2088 zone_foreach(zone_count, &cnt); 2089 zone->uz_namecnt = cnt.count; 2090 2091 for (i = 0; i < vm_ndomains; i++) 2092 TAILQ_INIT(&zone->uz_domain[i].uzd_buckets); 2093 2094 #ifdef INVARIANTS 2095 if (arg->uminit == trash_init && arg->fini == trash_fini) 2096 zone->uz_flags |= UMA_ZFLAG_TRASH; 2097 #endif 2098 2099 /* 2100 * This is a pure cache zone, no kegs. 
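 * Cache zones are built by uma_zcache_create() and supply their own
 * import/release routines in place of the keg/slab backend.  A minimal
 * sketch of such a caller (the "foo" names are illustrative only):
 *
 *	zone = uma_zcache_create("foo cache", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, foo_import, foo_release, foo_arg, 0);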
2101 */ 2102 if (arg->import) { 2103 if (arg->flags & UMA_ZONE_VM) 2104 arg->flags |= UMA_ZFLAG_CACHEONLY; 2105 zone->uz_flags = arg->flags; 2106 zone->uz_size = arg->size; 2107 zone->uz_import = arg->import; 2108 zone->uz_release = arg->release; 2109 zone->uz_arg = arg->arg; 2110 zone->uz_lockptr = &zone->uz_lock; 2111 ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); 2112 rw_wlock(&uma_rwlock); 2113 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); 2114 rw_wunlock(&uma_rwlock); 2115 goto out; 2116 } 2117 2118 /* 2119 * Use the regular zone/keg/slab allocator. 2120 */ 2121 zone->uz_import = zone_import; 2122 zone->uz_release = zone_release; 2123 zone->uz_arg = zone; 2124 keg = arg->keg; 2125 2126 if (arg->flags & UMA_ZONE_SECONDARY) { 2127 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, 2128 ("Secondary zone requested UMA_ZFLAG_INTERNAL")); 2129 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); 2130 zone->uz_init = arg->uminit; 2131 zone->uz_fini = arg->fini; 2132 zone->uz_lockptr = &keg->uk_lock; 2133 zone->uz_flags |= UMA_ZONE_SECONDARY; 2134 rw_wlock(&uma_rwlock); 2135 ZONE_LOCK(zone); 2136 LIST_FOREACH(z, &keg->uk_zones, uz_link) { 2137 if (LIST_NEXT(z, uz_link) == NULL) { 2138 LIST_INSERT_AFTER(z, zone, uz_link); 2139 break; 2140 } 2141 } 2142 ZONE_UNLOCK(zone); 2143 rw_wunlock(&uma_rwlock); 2144 } else if (keg == NULL) { 2145 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, 2146 arg->align, arg->flags)) == NULL) 2147 return (ENOMEM); 2148 } else { 2149 struct uma_kctor_args karg; 2150 int error; 2151 2152 /* We should only be here from uma_startup() */ 2153 karg.size = arg->size; 2154 karg.uminit = arg->uminit; 2155 karg.fini = arg->fini; 2156 karg.align = arg->align; 2157 karg.flags = arg->flags; 2158 karg.zone = zone; 2159 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, 2160 flags); 2161 if (error) 2162 return (error); 2163 } 2164 2165 /* Inherit properties from the keg. */ 2166 zone->uz_keg = keg; 2167 zone->uz_size = keg->uk_size; 2168 zone->uz_flags |= (keg->uk_flags & 2169 (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); 2170 2171 out: 2172 if (__predict_true(booted == BOOT_RUNNING)) { 2173 zone_alloc_counters(zone, NULL); 2174 zone_alloc_sysctl(zone, NULL); 2175 } else { 2176 zone->uz_allocs = EARLY_COUNTER; 2177 zone->uz_frees = EARLY_COUNTER; 2178 zone->uz_fails = EARLY_COUNTER; 2179 } 2180 2181 KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != 2182 (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), 2183 ("Invalid zone flag combination")); 2184 if (arg->flags & UMA_ZFLAG_INTERNAL) 2185 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 2186 if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) 2187 zone->uz_bucket_size = BUCKET_MAX; 2188 else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) 2189 zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN; 2190 else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) 2191 zone->uz_bucket_size = 0; 2192 else 2193 zone->uz_bucket_size = bucket_select(zone->uz_size); 2194 zone->uz_bucket_size_min = zone->uz_bucket_size; 2195 2196 return (0); 2197 } 2198 2199 /* 2200 * Keg header dtor. This frees all data, destroys locks, frees the hash 2201 * table and removes the keg from the global list. 
2202 * 2203 * Arguments/Returns follow uma_dtor specifications 2204 * udata unused 2205 */ 2206 static void 2207 keg_dtor(void *arg, int size, void *udata) 2208 { 2209 uma_keg_t keg; 2210 2211 keg = (uma_keg_t)arg; 2212 KEG_LOCK(keg); 2213 if (keg->uk_free != 0) { 2214 printf("Freed UMA keg (%s) was not empty (%d items). " 2215 " Lost %d pages of memory.\n", 2216 keg->uk_name ? keg->uk_name : "", 2217 keg->uk_free, keg->uk_pages); 2218 } 2219 KEG_UNLOCK(keg); 2220 2221 hash_free(&keg->uk_hash); 2222 2223 KEG_LOCK_FINI(keg); 2224 } 2225 2226 /* 2227 * Zone header dtor. 2228 * 2229 * Arguments/Returns follow uma_dtor specifications 2230 * udata unused 2231 */ 2232 static void 2233 zone_dtor(void *arg, int size, void *udata) 2234 { 2235 uma_zone_t zone; 2236 uma_keg_t keg; 2237 2238 zone = (uma_zone_t)arg; 2239 2240 sysctl_remove_oid(zone->uz_oid, 1, 1); 2241 2242 if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) 2243 cache_drain(zone); 2244 2245 rw_wlock(&uma_rwlock); 2246 LIST_REMOVE(zone, uz_link); 2247 rw_wunlock(&uma_rwlock); 2248 /* 2249 * XXX there are some races here where 2250 * the zone can be drained but zone lock 2251 * released and then refilled before we 2252 * remove it... we dont care for now 2253 */ 2254 zone_reclaim(zone, M_WAITOK, true); 2255 /* 2256 * We only destroy kegs from non secondary/non cache zones. 2257 */ 2258 if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { 2259 keg = zone->uz_keg; 2260 rw_wlock(&uma_rwlock); 2261 LIST_REMOVE(keg, uk_link); 2262 rw_wunlock(&uma_rwlock); 2263 zone_free_item(kegs, keg, NULL, SKIP_NONE); 2264 } 2265 counter_u64_free(zone->uz_allocs); 2266 counter_u64_free(zone->uz_frees); 2267 counter_u64_free(zone->uz_fails); 2268 free(zone->uz_ctlname, M_UMA); 2269 if (zone->uz_lockptr == &zone->uz_lock) 2270 ZONE_LOCK_FINI(zone); 2271 } 2272 2273 /* 2274 * Traverses every zone in the system and calls a callback 2275 * 2276 * Arguments: 2277 * zfunc A pointer to a function which accepts a zone 2278 * as an argument. 2279 * 2280 * Returns: 2281 * Nothing 2282 */ 2283 static void 2284 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) 2285 { 2286 uma_keg_t keg; 2287 uma_zone_t zone; 2288 2289 /* 2290 * Before BOOT_RUNNING we are guaranteed to be single 2291 * threaded, so locking isn't needed. Startup functions 2292 * are allowed to use M_WAITOK. 2293 */ 2294 if (__predict_true(booted == BOOT_RUNNING)) 2295 rw_rlock(&uma_rwlock); 2296 LIST_FOREACH(keg, &uma_kegs, uk_link) { 2297 LIST_FOREACH(zone, &keg->uk_zones, uz_link) 2298 zfunc(zone, arg); 2299 } 2300 LIST_FOREACH(zone, &uma_cachezones, uz_link) 2301 zfunc(zone, arg); 2302 if (__predict_true(booted == BOOT_RUNNING)) 2303 rw_runlock(&uma_rwlock); 2304 } 2305 2306 /* 2307 * Count how many pages do we need to bootstrap. VM supplies 2308 * its need in early zones in the argument, we add up our zones, 2309 * which consist of the UMA Slabs, UMA Hash and 9 Bucket zones. The 2310 * zone of zones and zone of kegs are accounted separately. 2311 */ 2312 #define UMA_BOOT_ZONES 11 2313 /* Zone of zones and zone of kegs have arbitrary alignment. 
*/ 2314 #define UMA_BOOT_ALIGN 32 2315 static int zsize, ksize; 2316 int 2317 uma_startup_count(int vm_zones) 2318 { 2319 int zones, pages; 2320 size_t space, size; 2321 2322 ksize = sizeof(struct uma_keg) + 2323 (sizeof(struct uma_domain) * vm_ndomains); 2324 zsize = sizeof(struct uma_zone) + 2325 (sizeof(struct uma_cache) * (mp_maxid + 1)) + 2326 (sizeof(struct uma_zone_domain) * vm_ndomains); 2327 2328 /* 2329 * Memory for the zone of kegs and its keg, 2330 * and for zone of zones. 2331 */ 2332 pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 + 2333 roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE); 2334 2335 #ifdef UMA_MD_SMALL_ALLOC 2336 zones = UMA_BOOT_ZONES; 2337 #else 2338 zones = UMA_BOOT_ZONES + vm_zones; 2339 vm_zones = 0; 2340 #endif 2341 size = slab_sizeof(SLAB_MAX_SETSIZE); 2342 space = slab_space(SLAB_MAX_SETSIZE); 2343 2344 /* Memory for the rest of startup zones, UMA and VM, ... */ 2345 if (zsize > space) { 2346 /* See keg_large_init(). */ 2347 u_int ppera; 2348 2349 ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE); 2350 if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) < size) 2351 ppera++; 2352 pages += (zones + vm_zones) * ppera; 2353 } else if (roundup2(zsize, UMA_BOOT_ALIGN) > space) 2354 /* See keg_small_init() special case for uk_ppera = 1. */ 2355 pages += zones; 2356 else 2357 pages += howmany(zones, 2358 space / roundup2(zsize, UMA_BOOT_ALIGN)); 2359 2360 /* ... and their kegs. Note that zone of zones allocates a keg! */ 2361 pages += howmany(zones + 1, 2362 space / roundup2(ksize, UMA_BOOT_ALIGN)); 2363 2364 return (pages); 2365 } 2366 2367 void 2368 uma_startup(void *mem, int npages) 2369 { 2370 struct uma_zctor_args args; 2371 uma_keg_t masterkeg; 2372 uintptr_t m; 2373 2374 #ifdef DIAGNOSTIC 2375 printf("Entering %s with %d boot pages configured\n", __func__, npages); 2376 #endif 2377 2378 rw_init(&uma_rwlock, "UMA lock"); 2379 2380 /* Use bootpages memory for the zone of zones and zone of kegs. 
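 * The carve-out below takes the zone of zones, the zone of kegs and the
 * master keg from the front of the boot pages, each rounded up to a
 * cache line; the remainder is rounded to a page boundary and kept as
 * bootmem for the early startup allocator (startup_alloc()).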
*/ 2381 m = (uintptr_t)mem; 2382 zones = (uma_zone_t)m; 2383 m += roundup(zsize, CACHE_LINE_SIZE); 2384 kegs = (uma_zone_t)m; 2385 m += roundup(zsize, CACHE_LINE_SIZE); 2386 masterkeg = (uma_keg_t)m; 2387 m += roundup(ksize, CACHE_LINE_SIZE); 2388 m = roundup(m, PAGE_SIZE); 2389 npages -= (m - (uintptr_t)mem) / PAGE_SIZE; 2390 mem = (void *)m; 2391 2392 /* "manually" create the initial zone */ 2393 memset(&args, 0, sizeof(args)); 2394 args.name = "UMA Kegs"; 2395 args.size = ksize; 2396 args.ctor = keg_ctor; 2397 args.dtor = keg_dtor; 2398 args.uminit = zero_init; 2399 args.fini = NULL; 2400 args.keg = masterkeg; 2401 args.align = UMA_BOOT_ALIGN - 1; 2402 args.flags = UMA_ZFLAG_INTERNAL; 2403 zone_ctor(kegs, zsize, &args, M_WAITOK); 2404 2405 bootmem = mem; 2406 boot_pages = npages; 2407 2408 args.name = "UMA Zones"; 2409 args.size = zsize; 2410 args.ctor = zone_ctor; 2411 args.dtor = zone_dtor; 2412 args.uminit = zero_init; 2413 args.fini = NULL; 2414 args.keg = NULL; 2415 args.align = UMA_BOOT_ALIGN - 1; 2416 args.flags = UMA_ZFLAG_INTERNAL; 2417 zone_ctor(zones, zsize, &args, M_WAITOK); 2418 2419 /* Now make a zone for slab headers */ 2420 slabzone = uma_zcreate("UMA Slabs", sizeof(struct uma_hash_slab), 2421 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2422 2423 hashzone = uma_zcreate("UMA Hash", 2424 sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, 2425 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2426 2427 booted = BOOT_STRAPPED; 2428 } 2429 2430 void 2431 uma_startup1(void) 2432 { 2433 2434 #ifdef DIAGNOSTIC 2435 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2436 #endif 2437 booted = BOOT_PAGEALLOC; 2438 } 2439 2440 void 2441 uma_startup2(void) 2442 { 2443 2444 #ifdef DIAGNOSTIC 2445 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2446 #endif 2447 sx_init(&uma_reclaim_lock, "umareclaim"); 2448 bucket_init(); 2449 booted = BOOT_BUCKETS; 2450 bucket_enable(); 2451 } 2452 2453 /* 2454 * Initialize our callout handle 2455 * 2456 */ 2457 static void 2458 uma_startup3(void) 2459 { 2460 2461 #ifdef INVARIANTS 2462 TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); 2463 uma_dbg_cnt = counter_u64_alloc(M_WAITOK); 2464 uma_skip_cnt = counter_u64_alloc(M_WAITOK); 2465 #endif 2466 zone_foreach(zone_alloc_counters, NULL); 2467 zone_foreach(zone_alloc_sysctl, NULL); 2468 callout_init(&uma_callout, 1); 2469 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 2470 booted = BOOT_RUNNING; 2471 } 2472 2473 static uma_keg_t 2474 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, 2475 int align, uint32_t flags) 2476 { 2477 struct uma_kctor_args args; 2478 2479 args.size = size; 2480 args.uminit = uminit; 2481 args.fini = fini; 2482 args.align = (align == UMA_ALIGN_CACHE) ? 
uma_align_cache : align; 2483 args.flags = flags; 2484 args.zone = zone; 2485 return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); 2486 } 2487 2488 /* Public functions */ 2489 /* See uma.h */ 2490 void 2491 uma_set_align(int align) 2492 { 2493 2494 if (align != UMA_ALIGN_CACHE) 2495 uma_align_cache = align; 2496 } 2497 2498 /* See uma.h */ 2499 uma_zone_t 2500 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, 2501 uma_init uminit, uma_fini fini, int align, uint32_t flags) 2502 2503 { 2504 struct uma_zctor_args args; 2505 uma_zone_t res; 2506 bool locked; 2507 2508 KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", 2509 align, name)); 2510 2511 /* Sets all zones to a first-touch domain policy. */ 2512 #ifdef UMA_FIRSTTOUCH 2513 flags |= UMA_ZONE_NUMA; 2514 #endif 2515 2516 /* This stuff is essential for the zone ctor */ 2517 memset(&args, 0, sizeof(args)); 2518 args.name = name; 2519 args.size = size; 2520 args.ctor = ctor; 2521 args.dtor = dtor; 2522 args.uminit = uminit; 2523 args.fini = fini; 2524 #ifdef INVARIANTS 2525 /* 2526 * Inject procedures which check for memory use after free if we are 2527 * allowed to scramble the memory while it is not allocated. This 2528 * requires that: UMA is actually able to access the memory, no init 2529 * or fini procedures, no dependency on the initial value of the 2530 * memory, and no (legitimate) use of the memory after free. Note, 2531 * the ctor and dtor do not need to be empty. 2532 * 2533 * XXX UMA_ZONE_OFFPAGE. 2534 */ 2535 if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) && 2536 uminit == NULL && fini == NULL) { 2537 args.uminit = trash_init; 2538 args.fini = trash_fini; 2539 } 2540 #endif 2541 args.align = align; 2542 args.flags = flags; 2543 args.keg = NULL; 2544 2545 if (booted < BOOT_BUCKETS) { 2546 locked = false; 2547 } else { 2548 sx_slock(&uma_reclaim_lock); 2549 locked = true; 2550 } 2551 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2552 if (locked) 2553 sx_sunlock(&uma_reclaim_lock); 2554 return (res); 2555 } 2556 2557 /* See uma.h */ 2558 uma_zone_t 2559 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, 2560 uma_init zinit, uma_fini zfini, uma_zone_t master) 2561 { 2562 struct uma_zctor_args args; 2563 uma_keg_t keg; 2564 uma_zone_t res; 2565 bool locked; 2566 2567 keg = master->uz_keg; 2568 memset(&args, 0, sizeof(args)); 2569 args.name = name; 2570 args.size = keg->uk_size; 2571 args.ctor = ctor; 2572 args.dtor = dtor; 2573 args.uminit = zinit; 2574 args.fini = zfini; 2575 args.align = keg->uk_align; 2576 args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; 2577 args.keg = keg; 2578 2579 if (booted < BOOT_BUCKETS) { 2580 locked = false; 2581 } else { 2582 sx_slock(&uma_reclaim_lock); 2583 locked = true; 2584 } 2585 /* XXX Attaches only one keg of potentially many. 
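 * A secondary zone shares the master zone's keg, and therefore its
 * slabs and item size, while providing its own ctor/dtor and zone-level
 * init/fini.  A minimal sketch of a caller (names illustrative only):
 *
 *	foo_big_zone = uma_zsecond_create("foo_big", foo_big_ctor,
 *	    foo_big_dtor, NULL, NULL, foo_zone);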
*/ 2586 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2587 if (locked) 2588 sx_sunlock(&uma_reclaim_lock); 2589 return (res); 2590 } 2591 2592 /* See uma.h */ 2593 uma_zone_t 2594 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, 2595 uma_init zinit, uma_fini zfini, uma_import zimport, 2596 uma_release zrelease, void *arg, int flags) 2597 { 2598 struct uma_zctor_args args; 2599 2600 memset(&args, 0, sizeof(args)); 2601 args.name = name; 2602 args.size = size; 2603 args.ctor = ctor; 2604 args.dtor = dtor; 2605 args.uminit = zinit; 2606 args.fini = zfini; 2607 args.import = zimport; 2608 args.release = zrelease; 2609 args.arg = arg; 2610 args.align = 0; 2611 args.flags = flags | UMA_ZFLAG_CACHE; 2612 2613 return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); 2614 } 2615 2616 /* See uma.h */ 2617 void 2618 uma_zdestroy(uma_zone_t zone) 2619 { 2620 2621 sx_slock(&uma_reclaim_lock); 2622 zone_free_item(zones, zone, NULL, SKIP_NONE); 2623 sx_sunlock(&uma_reclaim_lock); 2624 } 2625 2626 void 2627 uma_zwait(uma_zone_t zone) 2628 { 2629 void *item; 2630 2631 item = uma_zalloc_arg(zone, NULL, M_WAITOK); 2632 uma_zfree(zone, item); 2633 } 2634 2635 void * 2636 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) 2637 { 2638 void *item; 2639 #ifdef SMP 2640 int i; 2641 2642 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2643 #endif 2644 item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); 2645 if (item != NULL && (flags & M_ZERO)) { 2646 #ifdef SMP 2647 for (i = 0; i <= mp_maxid; i++) 2648 bzero(zpcpu_get_cpu(item, i), zone->uz_size); 2649 #else 2650 bzero(item, zone->uz_size); 2651 #endif 2652 } 2653 return (item); 2654 } 2655 2656 /* 2657 * A stub while both regular and pcpu cases are identical. 2658 */ 2659 void 2660 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata) 2661 { 2662 2663 #ifdef SMP 2664 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2665 #endif 2666 uma_zfree_arg(zone, item, udata); 2667 } 2668 2669 static inline void * 2670 bucket_pop(uma_zone_t zone, uma_cache_t cache, uma_bucket_t bucket) 2671 { 2672 void *item; 2673 2674 bucket->ub_cnt--; 2675 item = bucket->ub_bucket[bucket->ub_cnt]; 2676 #ifdef INVARIANTS 2677 bucket->ub_bucket[bucket->ub_cnt] = NULL; 2678 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); 2679 #endif 2680 cache->uc_allocs++; 2681 2682 return (item); 2683 } 2684 2685 static inline void 2686 bucket_push(uma_zone_t zone, uma_cache_t cache, uma_bucket_t bucket, 2687 void *item) 2688 { 2689 KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, 2690 ("uma_zfree: Freeing to non free bucket index.")); 2691 bucket->ub_bucket[bucket->ub_cnt] = item; 2692 bucket->ub_cnt++; 2693 cache->uc_frees++; 2694 } 2695 2696 static void * 2697 item_ctor(uma_zone_t zone, void *udata, int flags, void *item) 2698 { 2699 #ifdef INVARIANTS 2700 bool skipdbg; 2701 2702 skipdbg = uma_dbg_zskip(zone, item); 2703 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 2704 zone->uz_ctor != trash_ctor) 2705 trash_ctor(item, zone->uz_size, udata, flags); 2706 #endif 2707 if (__predict_false(zone->uz_ctor != NULL) && 2708 zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { 2709 counter_u64_add(zone->uz_fails, 1); 2710 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); 2711 return (NULL); 2712 } 2713 #ifdef INVARIANTS 2714 if (!skipdbg) 2715 uma_dbg_alloc(zone, NULL, item); 2716 #endif 2717 if (flags & M_ZERO) 2718 uma_zero_item(item, zone); 2719 2720 return (item); 2721 } 2722 2723 static inline void 2724 
item_dtor(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 2725 { 2726 #ifdef INVARIANTS 2727 bool skipdbg; 2728 2729 skipdbg = uma_dbg_zskip(zone, item); 2730 if (skip == SKIP_NONE && !skipdbg) { 2731 if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) 2732 uma_dbg_free(zone, udata, item); 2733 else 2734 uma_dbg_free(zone, NULL, item); 2735 } 2736 #endif 2737 if (skip < SKIP_DTOR) { 2738 if (zone->uz_dtor != NULL) 2739 zone->uz_dtor(item, zone->uz_size, udata); 2740 #ifdef INVARIANTS 2741 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 2742 zone->uz_dtor != trash_dtor) 2743 trash_dtor(item, zone->uz_size, udata); 2744 #endif 2745 } 2746 } 2747 2748 /* See uma.h */ 2749 void * 2750 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) 2751 { 2752 uma_bucket_t bucket; 2753 uma_cache_t cache; 2754 void *item; 2755 int cpu, domain; 2756 2757 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2758 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2759 2760 /* This is the fast path allocation */ 2761 CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d", 2762 curthread, zone->uz_name, zone, flags); 2763 2764 if (flags & M_WAITOK) { 2765 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2766 "uma_zalloc_arg: zone \"%s\"", zone->uz_name); 2767 } 2768 KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC")); 2769 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2770 ("uma_zalloc_arg: called with spinlock or critical section held")); 2771 if (zone->uz_flags & UMA_ZONE_PCPU) 2772 KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone " 2773 "with M_ZERO passed")); 2774 2775 #ifdef DEBUG_MEMGUARD 2776 if (memguard_cmp_zone(zone)) { 2777 item = memguard_alloc(zone->uz_size, flags); 2778 if (item != NULL) { 2779 if (zone->uz_init != NULL && 2780 zone->uz_init(item, zone->uz_size, flags) != 0) 2781 return (NULL); 2782 if (zone->uz_ctor != NULL && 2783 zone->uz_ctor(item, zone->uz_size, udata, 2784 flags) != 0) { 2785 counter_u64_add(zone->uz_fails, 1); 2786 zone->uz_fini(item, zone->uz_size); 2787 return (NULL); 2788 } 2789 return (item); 2790 } 2791 /* This is unfortunate but should not be fatal. */ 2792 } 2793 #endif 2794 /* 2795 * If possible, allocate from the per-CPU cache. There are two 2796 * requirements for safe access to the per-CPU cache: (1) the thread 2797 * accessing the cache must not be preempted or yield during access, 2798 * and (2) the thread must not migrate CPUs without switching which 2799 * cache it accesses. We rely on a critical section to prevent 2800 * preemption and migration. We release the critical section in 2801 * order to acquire the zone mutex if we are unable to allocate from 2802 * the current cache; when we re-acquire the critical section, we 2803 * must detect and handle migration if it has occurred. 2804 */ 2805 critical_enter(); 2806 do { 2807 cpu = curcpu; 2808 cache = &zone->uz_cpu[cpu]; 2809 bucket = cache->uc_allocbucket; 2810 if (__predict_true(bucket != NULL && bucket->ub_cnt != 0)) { 2811 item = bucket_pop(zone, cache, bucket); 2812 critical_exit(); 2813 return (item_ctor(zone, udata, flags, item)); 2814 } 2815 } while (cache_alloc(zone, cache, udata, flags)); 2816 critical_exit(); 2817 2818 /* 2819 * We can not get a bucket so try to return a single item. 
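 * For first-touch (UMA_ZONE_NUMA) zones we pass the current CPU's
 * domain so the fallback allocation stays local; otherwise any domain
 * will do.  cache_alloc() returned false with the zone lock held, which
 * is why the locked variant of zone_alloc_item() is used below.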
2820 */ 2821 if (zone->uz_flags & UMA_ZONE_NUMA) 2822 domain = PCPU_GET(domain); 2823 else 2824 domain = UMA_ANYDOMAIN; 2825 return (zone_alloc_item_locked(zone, udata, domain, flags)); 2826 } 2827 2828 /* 2829 * Replenish an alloc bucket and possibly restore an old one. Called in 2830 * a critical section. Returns in a critical section. 2831 * 2832 * A false return value indicates failure and returns with the zone lock 2833 * held. A true return value indicates success and the caller should retry. 2834 */ 2835 static __noinline bool 2836 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) 2837 { 2838 uma_zone_domain_t zdom; 2839 uma_bucket_t bucket; 2840 int cpu, domain; 2841 bool lockfail; 2842 2843 CRITICAL_ASSERT(curthread); 2844 2845 /* 2846 * If we have run out of items in our alloc bucket see 2847 * if we can switch with the free bucket. 2848 */ 2849 bucket = cache->uc_freebucket; 2850 if (bucket != NULL && bucket->ub_cnt != 0) { 2851 cache->uc_freebucket = cache->uc_allocbucket; 2852 cache->uc_allocbucket = bucket; 2853 return (true); 2854 } 2855 2856 /* 2857 * Discard any empty allocation bucket while we hold no locks. 2858 */ 2859 bucket = cache->uc_allocbucket; 2860 cache->uc_allocbucket = NULL; 2861 critical_exit(); 2862 if (bucket != NULL) 2863 bucket_free(zone, bucket, udata); 2864 2865 /* 2866 * Attempt to retrieve the item from the per-CPU cache has failed, so 2867 * we must go back to the zone. This requires the zone lock, so we 2868 * must drop the critical section, then re-acquire it when we go back 2869 * to the cache. Since the critical section is released, we may be 2870 * preempted or migrate. As such, make sure not to maintain any 2871 * thread-local state specific to the cache from prior to releasing 2872 * the critical section. 2873 */ 2874 lockfail = 0; 2875 if (ZONE_TRYLOCK(zone) == 0) { 2876 /* Record contention to size the buckets. */ 2877 ZONE_LOCK(zone); 2878 lockfail = 1; 2879 } 2880 2881 critical_enter(); 2882 /* Short-circuit for zones without buckets and low memory. */ 2883 if (zone->uz_bucket_size == 0 || bucketdisable) 2884 return (false); 2885 2886 cpu = curcpu; 2887 cache = &zone->uz_cpu[cpu]; 2888 2889 /* See if we lost the race to fill the cache. */ 2890 if (cache->uc_allocbucket != NULL) { 2891 ZONE_UNLOCK(zone); 2892 return (true); 2893 } 2894 2895 /* 2896 * Check the zone's cache of buckets. 2897 */ 2898 if (zone->uz_flags & UMA_ZONE_NUMA) { 2899 domain = PCPU_GET(domain); 2900 zdom = &zone->uz_domain[domain]; 2901 } else { 2902 domain = UMA_ANYDOMAIN; 2903 zdom = &zone->uz_domain[0]; 2904 } 2905 2906 if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) { 2907 ZONE_UNLOCK(zone); 2908 KASSERT(bucket->ub_cnt != 0, 2909 ("uma_zalloc_arg: Returning an empty bucket.")); 2910 cache->uc_allocbucket = bucket; 2911 return (true); 2912 } 2913 /* We are no longer associated with this CPU. */ 2914 critical_exit(); 2915 2916 /* 2917 * We bump the uz count when the cache size is insufficient to 2918 * handle the working set. 2919 */ 2920 if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) 2921 zone->uz_bucket_size++; 2922 2923 /* 2924 * Fill a bucket and attempt to use it as the alloc bucket. 2925 */ 2926 bucket = zone_alloc_bucket(zone, udata, domain, flags); 2927 CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", 2928 zone->uz_name, zone, bucket); 2929 critical_enter(); 2930 if (bucket == NULL) 2931 return (false); 2932 2933 /* 2934 * See if we lost the race or were migrated. 
Cache the 2935 * initialized bucket to make this less likely or claim 2936 * the memory directly. 2937 */ 2938 cpu = curcpu; 2939 cache = &zone->uz_cpu[cpu]; 2940 if (cache->uc_allocbucket == NULL && 2941 ((zone->uz_flags & UMA_ZONE_NUMA) == 0 || 2942 domain == PCPU_GET(domain))) { 2943 cache->uc_allocbucket = bucket; 2944 zdom->uzd_imax += bucket->ub_cnt; 2945 } else if (zone->uz_bkt_count >= zone->uz_bkt_max) { 2946 critical_exit(); 2947 ZONE_UNLOCK(zone); 2948 bucket_drain(zone, bucket); 2949 bucket_free(zone, bucket, udata); 2950 critical_enter(); 2951 return (true); 2952 } else 2953 zone_put_bucket(zone, zdom, bucket, false); 2954 ZONE_UNLOCK(zone); 2955 return (true); 2956 } 2957 2958 void * 2959 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) 2960 { 2961 2962 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2963 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2964 2965 /* This is the fast path allocation */ 2966 CTR5(KTR_UMA, 2967 "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d", 2968 curthread, zone->uz_name, zone, domain, flags); 2969 2970 if (flags & M_WAITOK) { 2971 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2972 "uma_zalloc_domain: zone \"%s\"", zone->uz_name); 2973 } 2974 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2975 ("uma_zalloc_domain: called with spinlock or critical section held")); 2976 2977 return (zone_alloc_item(zone, udata, domain, flags)); 2978 } 2979 2980 /* 2981 * Find a slab with some space. Prefer slabs that are partially used over those 2982 * that are totally full. This helps to reduce fragmentation. 2983 * 2984 * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check 2985 * only 'domain'. 2986 */ 2987 static uma_slab_t 2988 keg_first_slab(uma_keg_t keg, int domain, bool rr) 2989 { 2990 uma_domain_t dom; 2991 uma_slab_t slab; 2992 int start; 2993 2994 KASSERT(domain >= 0 && domain < vm_ndomains, 2995 ("keg_first_slab: domain %d out of range", domain)); 2996 KEG_LOCK_ASSERT(keg); 2997 2998 slab = NULL; 2999 start = domain; 3000 do { 3001 dom = &keg->uk_domain[domain]; 3002 if (!LIST_EMPTY(&dom->ud_part_slab)) 3003 return (LIST_FIRST(&dom->ud_part_slab)); 3004 if (!LIST_EMPTY(&dom->ud_free_slab)) { 3005 slab = LIST_FIRST(&dom->ud_free_slab); 3006 LIST_REMOVE(slab, us_link); 3007 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3008 return (slab); 3009 } 3010 if (rr) 3011 domain = (domain + 1) % vm_ndomains; 3012 } while (domain != start); 3013 3014 return (NULL); 3015 } 3016 3017 static uma_slab_t 3018 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) 3019 { 3020 uint32_t reserve; 3021 3022 KEG_LOCK_ASSERT(keg); 3023 3024 reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; 3025 if (keg->uk_free <= reserve) 3026 return (NULL); 3027 return (keg_first_slab(keg, domain, rr)); 3028 } 3029 3030 static uma_slab_t 3031 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) 3032 { 3033 struct vm_domainset_iter di; 3034 uma_domain_t dom; 3035 uma_slab_t slab; 3036 int aflags, domain; 3037 bool rr; 3038 3039 restart: 3040 KEG_LOCK_ASSERT(keg); 3041 3042 /* 3043 * Use the keg's policy if upper layers haven't already specified a 3044 * domain (as happens with first-touch zones). 3045 * 3046 * To avoid races we run the iterator with the keg lock held, but that 3047 * means that we cannot allow the vm_domainset layer to sleep. Thus, 3048 * clear M_WAITOK and handle low memory conditions locally. 
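 * If every domain in the iterator comes up empty and the caller is
 * allowed to sleep, we drop the keg lock, wait for pages in the keg's
 * domain set via vm_wait_doms(), and restart the search from the top.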
3049 */ 3050 rr = rdomain == UMA_ANYDOMAIN; 3051 if (rr) { 3052 aflags = (flags & ~M_WAITOK) | M_NOWAIT; 3053 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 3054 &aflags); 3055 } else { 3056 aflags = flags; 3057 domain = rdomain; 3058 } 3059 3060 for (;;) { 3061 slab = keg_fetch_free_slab(keg, domain, rr, flags); 3062 if (slab != NULL) 3063 return (slab); 3064 3065 /* 3066 * M_NOVM means don't ask at all! 3067 */ 3068 if (flags & M_NOVM) 3069 break; 3070 3071 KASSERT(zone->uz_max_items == 0 || 3072 zone->uz_items <= zone->uz_max_items, 3073 ("%s: zone %p overflow", __func__, zone)); 3074 3075 slab = keg_alloc_slab(keg, zone, domain, flags, aflags); 3076 /* 3077 * If we got a slab here it's safe to mark it partially used 3078 * and return. We assume that the caller is going to remove 3079 * at least one item. 3080 */ 3081 if (slab) { 3082 dom = &keg->uk_domain[slab->us_domain]; 3083 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3084 return (slab); 3085 } 3086 KEG_LOCK(keg); 3087 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { 3088 if ((flags & M_WAITOK) != 0) { 3089 KEG_UNLOCK(keg); 3090 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); 3091 KEG_LOCK(keg); 3092 goto restart; 3093 } 3094 break; 3095 } 3096 } 3097 3098 /* 3099 * We might not have been able to get a slab but another cpu 3100 * could have while we were unlocked. Check again before we 3101 * fail. 3102 */ 3103 if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) { 3104 return (slab); 3105 } 3106 return (NULL); 3107 } 3108 3109 static void * 3110 slab_alloc_item(uma_keg_t keg, uma_slab_t slab) 3111 { 3112 uma_domain_t dom; 3113 void *item; 3114 uint8_t freei; 3115 3116 KEG_LOCK_ASSERT(keg); 3117 3118 freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; 3119 BIT_CLR(keg->uk_ipers, freei, &slab->us_free); 3120 item = slab_item(slab, keg, freei); 3121 slab->us_freecount--; 3122 keg->uk_free--; 3123 3124 /* Move this slab to the full list */ 3125 if (slab->us_freecount == 0) { 3126 LIST_REMOVE(slab, us_link); 3127 dom = &keg->uk_domain[slab->us_domain]; 3128 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); 3129 } 3130 3131 return (item); 3132 } 3133 3134 static int 3135 zone_import(void *arg, void **bucket, int max, int domain, int flags) 3136 { 3137 uma_zone_t zone; 3138 uma_slab_t slab; 3139 uma_keg_t keg; 3140 #ifdef NUMA 3141 int stripe; 3142 #endif 3143 int i; 3144 3145 zone = arg; 3146 slab = NULL; 3147 keg = zone->uz_keg; 3148 KEG_LOCK(keg); 3149 /* Try to keep the buckets totally full */ 3150 for (i = 0; i < max; ) { 3151 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) 3152 break; 3153 #ifdef NUMA 3154 stripe = howmany(max, vm_ndomains); 3155 #endif 3156 while (slab->us_freecount && i < max) { 3157 bucket[i++] = slab_alloc_item(keg, slab); 3158 if (keg->uk_free <= keg->uk_reserve) 3159 break; 3160 #ifdef NUMA 3161 /* 3162 * If the zone is striped we pick a new slab for every 3163 * N allocations. Eliminating this conditional will 3164 * instead pick a new domain for each bucket rather 3165 * than stripe within each bucket. The current option 3166 * produces more fragmentation and requires more cpu 3167 * time but yields better distribution. 3168 */ 3169 if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 && 3170 vm_ndomains > 1 && --stripe == 0) 3171 break; 3172 #endif 3173 } 3174 /* Don't block if we allocated any successfully. 
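 * After the first slab has been fetched, M_WAITOK is replaced with
 * M_NOWAIT so that a partially filled bucket is returned rather than
 * sleeping for more memory.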
*/ 3175 flags &= ~M_WAITOK; 3176 flags |= M_NOWAIT; 3177 } 3178 KEG_UNLOCK(keg); 3179 3180 return i; 3181 } 3182 3183 static uma_bucket_t 3184 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) 3185 { 3186 uma_bucket_t bucket; 3187 int maxbucket, cnt; 3188 3189 CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); 3190 3191 /* Avoid allocs targeting empty domains. */ 3192 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 3193 domain = UMA_ANYDOMAIN; 3194 3195 if (zone->uz_max_items > 0) { 3196 if (zone->uz_items >= zone->uz_max_items) 3197 return (false); 3198 maxbucket = MIN(zone->uz_bucket_size, 3199 zone->uz_max_items - zone->uz_items); 3200 zone->uz_items += maxbucket; 3201 } else 3202 maxbucket = zone->uz_bucket_size; 3203 ZONE_UNLOCK(zone); 3204 3205 /* Don't wait for buckets, preserve caller's NOVM setting. */ 3206 bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); 3207 if (bucket == NULL) { 3208 cnt = 0; 3209 goto out; 3210 } 3211 3212 bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, 3213 MIN(maxbucket, bucket->ub_entries), domain, flags); 3214 3215 /* 3216 * Initialize the memory if necessary. 3217 */ 3218 if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { 3219 int i; 3220 3221 for (i = 0; i < bucket->ub_cnt; i++) 3222 if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size, 3223 flags) != 0) 3224 break; 3225 /* 3226 * If we couldn't initialize the whole bucket, put the 3227 * rest back onto the freelist. 3228 */ 3229 if (i != bucket->ub_cnt) { 3230 zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], 3231 bucket->ub_cnt - i); 3232 #ifdef INVARIANTS 3233 bzero(&bucket->ub_bucket[i], 3234 sizeof(void *) * (bucket->ub_cnt - i)); 3235 #endif 3236 bucket->ub_cnt = i; 3237 } 3238 } 3239 3240 cnt = bucket->ub_cnt; 3241 if (bucket->ub_cnt == 0) { 3242 bucket_free(zone, bucket, udata); 3243 counter_u64_add(zone->uz_fails, 1); 3244 bucket = NULL; 3245 } 3246 out: 3247 ZONE_LOCK(zone); 3248 if (zone->uz_max_items > 0 && cnt < maxbucket) { 3249 MPASS(zone->uz_items >= maxbucket - cnt); 3250 zone->uz_items -= maxbucket - cnt; 3251 if (zone->uz_sleepers > 0 && 3252 (cnt == 0 ? zone->uz_items + 1 : zone->uz_items) < 3253 zone->uz_max_items) 3254 wakeup_one(zone); 3255 } 3256 3257 return (bucket); 3258 } 3259 3260 /* 3261 * Allocates a single item from a zone. 3262 * 3263 * Arguments 3264 * zone The zone to alloc for. 3265 * udata The data to be passed to the constructor. 3266 * domain The domain to allocate from or UMA_ANYDOMAIN. 3267 * flags M_WAITOK, M_NOWAIT, M_ZERO. 3268 * 3269 * Returns 3270 * NULL if there is no memory and M_NOWAIT is set 3271 * An item if successful 3272 */ 3273 3274 static void * 3275 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) 3276 { 3277 3278 ZONE_LOCK(zone); 3279 return (zone_alloc_item_locked(zone, udata, domain, flags)); 3280 } 3281 3282 /* 3283 * Returns with zone unlocked. 
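 * If the zone enforces a maximum item count and it has been reached,
 * the caller either fails immediately (M_NOWAIT) or sleeps in
 * "zonelimit" until items are freed; uz_sleeps and uz_sleepers record
 * this.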
3284 */ 3285 static void * 3286 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags) 3287 { 3288 void *item; 3289 3290 ZONE_LOCK_ASSERT(zone); 3291 3292 if (zone->uz_max_items > 0) { 3293 if (zone->uz_items >= zone->uz_max_items) { 3294 zone_log_warning(zone); 3295 zone_maxaction(zone); 3296 if (flags & M_NOWAIT) { 3297 ZONE_UNLOCK(zone); 3298 return (NULL); 3299 } 3300 zone->uz_sleeps++; 3301 zone->uz_sleepers++; 3302 while (zone->uz_items >= zone->uz_max_items) 3303 mtx_sleep(zone, zone->uz_lockptr, PVM, 3304 "zonelimit", 0); 3305 zone->uz_sleepers--; 3306 if (zone->uz_sleepers > 0 && 3307 zone->uz_items + 1 < zone->uz_max_items) 3308 wakeup_one(zone); 3309 } 3310 zone->uz_items++; 3311 } 3312 ZONE_UNLOCK(zone); 3313 3314 /* Avoid allocs targeting empty domains. */ 3315 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 3316 domain = UMA_ANYDOMAIN; 3317 3318 if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) 3319 goto fail_cnt; 3320 3321 /* 3322 * We have to call both the zone's init (not the keg's init) 3323 * and the zone's ctor. This is because the item is going from 3324 * a keg slab directly to the user, and the user is expecting it 3325 * to be both zone-init'd as well as zone-ctor'd. 3326 */ 3327 if (zone->uz_init != NULL) { 3328 if (zone->uz_init(item, zone->uz_size, flags) != 0) { 3329 zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); 3330 goto fail_cnt; 3331 } 3332 } 3333 item = item_ctor(zone, udata, flags, item); 3334 if (item == NULL) 3335 goto fail; 3336 3337 counter_u64_add(zone->uz_allocs, 1); 3338 CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, 3339 zone->uz_name, zone); 3340 3341 return (item); 3342 3343 fail_cnt: 3344 counter_u64_add(zone->uz_fails, 1); 3345 fail: 3346 if (zone->uz_max_items > 0) { 3347 ZONE_LOCK(zone); 3348 /* XXX Decrement without wakeup */ 3349 zone->uz_items--; 3350 ZONE_UNLOCK(zone); 3351 } 3352 CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", 3353 zone->uz_name, zone); 3354 return (NULL); 3355 } 3356 3357 /* See uma.h */ 3358 void 3359 uma_zfree_arg(uma_zone_t zone, void *item, void *udata) 3360 { 3361 uma_cache_t cache; 3362 uma_bucket_t bucket; 3363 int cpu, domain, itemdomain; 3364 3365 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3366 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3367 3368 CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, 3369 zone->uz_name); 3370 3371 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3372 ("uma_zfree_arg: called with spinlock or critical section held")); 3373 3374 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3375 if (item == NULL) 3376 return; 3377 #ifdef DEBUG_MEMGUARD 3378 if (is_memguard_addr(item)) { 3379 if (zone->uz_dtor != NULL) 3380 zone->uz_dtor(item, zone->uz_size, udata); 3381 if (zone->uz_fini != NULL) 3382 zone->uz_fini(item, zone->uz_size); 3383 memguard_free(item); 3384 return; 3385 } 3386 #endif 3387 item_dtor(zone, item, udata, SKIP_NONE); 3388 3389 /* 3390 * The race here is acceptable. If we miss it we'll just have to wait 3391 * a little longer for the limits to be reset. 3392 */ 3393 if (zone->uz_sleepers > 0) 3394 goto zfree_item; 3395 3396 /* 3397 * If possible, free to the per-CPU cache. There are two 3398 * requirements for safe access to the per-CPU cache: (1) the thread 3399 * accessing the cache must not be preempted or yield during access, 3400 * and (2) the thread must not migrate CPUs without switching which 3401 * cache it accesses. 
We rely on a critical section to prevent 3402 * preemption and migration. We release the critical section in 3403 * order to acquire the zone mutex if we are unable to free to the 3404 * current cache; when we re-acquire the critical section, we must 3405 * detect and handle migration if it has occurred. 3406 */ 3407 domain = itemdomain = 0; 3408 critical_enter(); 3409 do { 3410 cpu = curcpu; 3411 cache = &zone->uz_cpu[cpu]; 3412 bucket = cache->uc_allocbucket; 3413 #ifdef UMA_XDOMAIN 3414 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { 3415 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); 3416 domain = PCPU_GET(domain); 3417 } 3418 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0 && 3419 domain != itemdomain) { 3420 bucket = cache->uc_crossbucket; 3421 } else 3422 #endif 3423 3424 /* 3425 * Try to free into the allocbucket first to give LIFO ordering 3426 * for cache-hot datastructures. Spill over into the freebucket 3427 * if necessary. Alloc will swap them if one runs dry. 3428 */ 3429 if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) 3430 bucket = cache->uc_freebucket; 3431 if (__predict_true(bucket != NULL && 3432 bucket->ub_cnt < bucket->ub_entries)) { 3433 bucket_push(zone, cache, bucket, item); 3434 critical_exit(); 3435 return; 3436 } 3437 } while (cache_free(zone, cache, udata, item, itemdomain)); 3438 critical_exit(); 3439 3440 /* 3441 * If nothing else caught this, we'll just do an internal free. 3442 */ 3443 zfree_item: 3444 zone_free_item(zone, item, udata, SKIP_DTOR); 3445 } 3446 3447 static void 3448 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, 3449 int domain, int itemdomain) 3450 { 3451 uma_zone_domain_t zdom; 3452 3453 #ifdef UMA_XDOMAIN 3454 /* 3455 * Buckets coming from the wrong domain will be entirely for the 3456 * only other domain on two domain systems. In this case we can 3457 * simply cache them. Otherwise we need to sort them back to 3458 * correct domains by freeing the contents to the slab layer. 3459 */ 3460 if (domain != itemdomain && vm_ndomains > 2) { 3461 CTR3(KTR_UMA, 3462 "uma_zfree: zone %s(%p) draining cross bucket %p", 3463 zone->uz_name, zone, bucket); 3464 bucket_drain(zone, bucket); 3465 bucket_free(zone, bucket, udata); 3466 return; 3467 } 3468 #endif 3469 /* 3470 * Attempt to save the bucket in the zone's domain bucket cache. 3471 * 3472 * We bump the uz count when the cache size is insufficient to 3473 * handle the working set. 3474 */ 3475 if (ZONE_TRYLOCK(zone) == 0) { 3476 /* Record contention to size the buckets. */ 3477 ZONE_LOCK(zone); 3478 if (zone->uz_bucket_size < zone->uz_bucket_size_max) 3479 zone->uz_bucket_size++; 3480 } 3481 3482 CTR3(KTR_UMA, 3483 "uma_zfree: zone %s(%p) putting bucket %p on free list", 3484 zone->uz_name, zone, bucket); 3485 /* ub_cnt is pointing to the last free item */ 3486 KASSERT(bucket->ub_cnt == bucket->ub_entries, 3487 ("uma_zfree: Attempting to insert partial bucket onto the full list.\n")); 3488 if (zone->uz_bkt_count >= zone->uz_bkt_max) { 3489 ZONE_UNLOCK(zone); 3490 bucket_drain(zone, bucket); 3491 bucket_free(zone, bucket, udata); 3492 } else { 3493 zdom = &zone->uz_domain[itemdomain]; 3494 zone_put_bucket(zone, zdom, bucket, true); 3495 ZONE_UNLOCK(zone); 3496 } 3497 } 3498 3499 /* 3500 * Populate a free or cross bucket for the current cpu cache. Free any 3501 * existing full bucket either to the zone cache or back to the slab layer. 3502 * 3503 * Enters and returns in a critical section. 
false return indicates that 3504 * we can not satisfy this free in the cache layer. true indicates that 3505 * the caller should retry. 3506 */ 3507 static __noinline bool 3508 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item, 3509 int itemdomain) 3510 { 3511 uma_bucket_t bucket; 3512 int cpu, domain; 3513 3514 CRITICAL_ASSERT(curthread); 3515 3516 if (zone->uz_bucket_size == 0 || bucketdisable) 3517 return false; 3518 3519 cpu = curcpu; 3520 cache = &zone->uz_cpu[cpu]; 3521 3522 /* 3523 * NUMA domains need to free to the correct zdom. When XDOMAIN 3524 * is enabled this is the zdom of the item and the bucket may be 3525 * the cross bucket if they do not match. 3526 */ 3527 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 3528 #ifdef UMA_XDOMAIN 3529 domain = PCPU_GET(domain); 3530 #else 3531 itemdomain = domain = PCPU_GET(domain); 3532 #endif 3533 else 3534 itemdomain = domain = 0; 3535 #ifdef UMA_XDOMAIN 3536 if (domain != itemdomain) { 3537 bucket = cache->uc_crossbucket; 3538 cache->uc_crossbucket = NULL; 3539 if (bucket != NULL) 3540 atomic_add_64(&zone->uz_xdomain, bucket->ub_cnt); 3541 } else 3542 #endif 3543 { 3544 bucket = cache->uc_freebucket; 3545 cache->uc_freebucket = NULL; 3546 } 3547 3548 3549 /* We are no longer associated with this CPU. */ 3550 critical_exit(); 3551 3552 if (bucket != NULL) 3553 zone_free_bucket(zone, bucket, udata, domain, itemdomain); 3554 3555 bucket = bucket_alloc(zone, udata, M_NOWAIT); 3556 CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p", 3557 zone->uz_name, zone, bucket); 3558 critical_enter(); 3559 if (bucket == NULL) 3560 return (false); 3561 cpu = curcpu; 3562 cache = &zone->uz_cpu[cpu]; 3563 #ifdef UMA_XDOMAIN 3564 /* 3565 * Check to see if we should be populating the cross bucket. If it 3566 * is already populated we will fall through and attempt to populate 3567 * the free bucket. 3568 */ 3569 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { 3570 domain = PCPU_GET(domain); 3571 if (domain != itemdomain && cache->uc_crossbucket == NULL) { 3572 cache->uc_crossbucket = bucket; 3573 return (true); 3574 } 3575 } 3576 #endif 3577 /* 3578 * We may have lost the race to fill the bucket or switched CPUs. 3579 */ 3580 if (cache->uc_freebucket != NULL) { 3581 critical_exit(); 3582 bucket_free(zone, bucket, udata); 3583 critical_enter(); 3584 } else 3585 cache->uc_freebucket = bucket; 3586 3587 return (true); 3588 } 3589 3590 void 3591 uma_zfree_domain(uma_zone_t zone, void *item, void *udata) 3592 { 3593 3594 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3595 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3596 3597 CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread, 3598 zone->uz_name); 3599 3600 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3601 ("uma_zfree_domain: called with spinlock or critical section held")); 3602 3603 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3604 if (item == NULL) 3605 return; 3606 zone_free_item(zone, item, udata, SKIP_NONE); 3607 } 3608 3609 static void 3610 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) 3611 { 3612 uma_keg_t keg; 3613 uma_domain_t dom; 3614 uint8_t freei; 3615 3616 keg = zone->uz_keg; 3617 MPASS(zone->uz_lockptr == &keg->uk_lock); 3618 KEG_LOCK_ASSERT(keg); 3619 3620 dom = &keg->uk_domain[slab->us_domain]; 3621 3622 /* Do we need to remove from any lists? 
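 * Freeing this item may change which per-domain list the slab belongs
 * on: a slab that becomes entirely free moves to ud_free_slab, and a
 * slab that was previously full moves back to ud_part_slab.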
*/ 3623 if (slab->us_freecount+1 == keg->uk_ipers) { 3624 LIST_REMOVE(slab, us_link); 3625 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); 3626 } else if (slab->us_freecount == 0) { 3627 LIST_REMOVE(slab, us_link); 3628 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3629 } 3630 3631 /* Slab management. */ 3632 freei = slab_item_index(slab, keg, item); 3633 BIT_SET(keg->uk_ipers, freei, &slab->us_free); 3634 slab->us_freecount++; 3635 3636 /* Keg statistics. */ 3637 keg->uk_free++; 3638 } 3639 3640 static void 3641 zone_release(void *arg, void **bucket, int cnt) 3642 { 3643 uma_zone_t zone; 3644 void *item; 3645 uma_slab_t slab; 3646 uma_keg_t keg; 3647 uint8_t *mem; 3648 int i; 3649 3650 zone = arg; 3651 keg = zone->uz_keg; 3652 KEG_LOCK(keg); 3653 for (i = 0; i < cnt; i++) { 3654 item = bucket[i]; 3655 if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) { 3656 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 3657 if (zone->uz_flags & UMA_ZONE_HASH) { 3658 slab = hash_sfind(&keg->uk_hash, mem); 3659 } else { 3660 mem += keg->uk_pgoff; 3661 slab = (uma_slab_t)mem; 3662 } 3663 } else 3664 slab = vtoslab((vm_offset_t)item); 3665 slab_free_item(zone, slab, item); 3666 } 3667 KEG_UNLOCK(keg); 3668 } 3669 3670 /* 3671 * Frees a single item to any zone. 3672 * 3673 * Arguments: 3674 * zone The zone to free to 3675 * item The item we're freeing 3676 * udata User supplied data for the dtor 3677 * skip Skip dtors and finis 3678 */ 3679 static void 3680 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 3681 { 3682 3683 item_dtor(zone, item, udata, skip); 3684 3685 if (skip < SKIP_FINI && zone->uz_fini) 3686 zone->uz_fini(item, zone->uz_size); 3687 3688 zone->uz_release(zone->uz_arg, &item, 1); 3689 3690 if (skip & SKIP_CNT) 3691 return; 3692 3693 counter_u64_add(zone->uz_frees, 1); 3694 3695 if (zone->uz_max_items > 0) { 3696 ZONE_LOCK(zone); 3697 zone->uz_items--; 3698 if (zone->uz_sleepers > 0 && 3699 zone->uz_items < zone->uz_max_items) 3700 wakeup_one(zone); 3701 ZONE_UNLOCK(zone); 3702 } 3703 } 3704 3705 /* See uma.h */ 3706 int 3707 uma_zone_set_max(uma_zone_t zone, int nitems) 3708 { 3709 struct uma_bucket_zone *ubz; 3710 int count; 3711 3712 ZONE_LOCK(zone); 3713 ubz = bucket_zone_max(zone, nitems); 3714 count = ubz != NULL ? ubz->ubz_entries : 0; 3715 zone->uz_bucket_size_max = zone->uz_bucket_size = count; 3716 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 3717 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 3718 zone->uz_max_items = nitems; 3719 ZONE_UNLOCK(zone); 3720 3721 return (nitems); 3722 } 3723 3724 /* See uma.h */ 3725 void 3726 uma_zone_set_maxcache(uma_zone_t zone, int nitems) 3727 { 3728 struct uma_bucket_zone *ubz; 3729 int bpcpu; 3730 3731 ZONE_LOCK(zone); 3732 ubz = bucket_zone_max(zone, nitems); 3733 if (ubz != NULL) { 3734 bpcpu = 2; 3735 #ifdef UMA_XDOMAIN 3736 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 3737 /* Count the cross-domain bucket. 
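 * (each CPU may then hold an alloc, a free and a cross bucket, so three
 * buckets per CPU are subtracted from the requested cache size rather
 * than two)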
*/ 3738 bpcpu++; 3739 #endif 3740 nitems -= ubz->ubz_entries * bpcpu * mp_ncpus; 3741 zone->uz_bucket_size_max = ubz->ubz_entries; 3742 } else { 3743 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 3744 } 3745 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 3746 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 3747 zone->uz_bkt_max = nitems; 3748 ZONE_UNLOCK(zone); 3749 } 3750 3751 /* See uma.h */ 3752 int 3753 uma_zone_get_max(uma_zone_t zone) 3754 { 3755 int nitems; 3756 3757 ZONE_LOCK(zone); 3758 nitems = zone->uz_max_items; 3759 ZONE_UNLOCK(zone); 3760 3761 return (nitems); 3762 } 3763 3764 /* See uma.h */ 3765 void 3766 uma_zone_set_warning(uma_zone_t zone, const char *warning) 3767 { 3768 3769 ZONE_LOCK(zone); 3770 zone->uz_warning = warning; 3771 ZONE_UNLOCK(zone); 3772 } 3773 3774 /* See uma.h */ 3775 void 3776 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) 3777 { 3778 3779 ZONE_LOCK(zone); 3780 TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); 3781 ZONE_UNLOCK(zone); 3782 } 3783 3784 /* See uma.h */ 3785 int 3786 uma_zone_get_cur(uma_zone_t zone) 3787 { 3788 int64_t nitems; 3789 u_int i; 3790 3791 ZONE_LOCK(zone); 3792 nitems = counter_u64_fetch(zone->uz_allocs) - 3793 counter_u64_fetch(zone->uz_frees); 3794 if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) { 3795 CPU_FOREACH(i) { 3796 /* 3797 * See the comment in uma_vm_zone_stats() regarding 3798 * the safety of accessing the per-cpu caches. With 3799 * the zone lock held, it is safe, but can potentially 3800 * result in stale data. 3801 */ 3802 nitems += zone->uz_cpu[i].uc_allocs - 3803 zone->uz_cpu[i].uc_frees; 3804 } 3805 } 3806 ZONE_UNLOCK(zone); 3807 3808 return (nitems < 0 ? 0 : nitems); 3809 } 3810 3811 static uint64_t 3812 uma_zone_get_allocs(uma_zone_t zone) 3813 { 3814 uint64_t nitems; 3815 u_int i; 3816 3817 ZONE_LOCK(zone); 3818 nitems = counter_u64_fetch(zone->uz_allocs); 3819 if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) { 3820 CPU_FOREACH(i) { 3821 /* 3822 * See the comment in uma_vm_zone_stats() regarding 3823 * the safety of accessing the per-cpu caches. With 3824 * the zone lock held, it is safe, but can potentially 3825 * result in stale data. 3826 */ 3827 nitems += zone->uz_cpu[i].uc_allocs; 3828 } 3829 } 3830 ZONE_UNLOCK(zone); 3831 3832 return (nitems); 3833 } 3834 3835 static uint64_t 3836 uma_zone_get_frees(uma_zone_t zone) 3837 { 3838 uint64_t nitems; 3839 u_int i; 3840 3841 ZONE_LOCK(zone); 3842 nitems = counter_u64_fetch(zone->uz_frees); 3843 if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) { 3844 CPU_FOREACH(i) { 3845 /* 3846 * See the comment in uma_vm_zone_stats() regarding 3847 * the safety of accessing the per-cpu caches. With 3848 * the zone lock held, it is safe, but can potentially 3849 * result in stale data. 
3850 */ 3851 nitems += zone->uz_cpu[i].uc_frees; 3852 } 3853 } 3854 ZONE_UNLOCK(zone); 3855 3856 return (nitems); 3857 } 3858 3859 /* See uma.h */ 3860 void 3861 uma_zone_set_init(uma_zone_t zone, uma_init uminit) 3862 { 3863 uma_keg_t keg; 3864 3865 KEG_GET(zone, keg); 3866 KEG_LOCK(keg); 3867 KASSERT(keg->uk_pages == 0, 3868 ("uma_zone_set_init on non-empty keg")); 3869 keg->uk_init = uminit; 3870 KEG_UNLOCK(keg); 3871 } 3872 3873 /* See uma.h */ 3874 void 3875 uma_zone_set_fini(uma_zone_t zone, uma_fini fini) 3876 { 3877 uma_keg_t keg; 3878 3879 KEG_GET(zone, keg); 3880 KEG_LOCK(keg); 3881 KASSERT(keg->uk_pages == 0, 3882 ("uma_zone_set_fini on non-empty keg")); 3883 keg->uk_fini = fini; 3884 KEG_UNLOCK(keg); 3885 } 3886 3887 /* See uma.h */ 3888 void 3889 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) 3890 { 3891 3892 ZONE_LOCK(zone); 3893 KASSERT(zone->uz_keg->uk_pages == 0, 3894 ("uma_zone_set_zinit on non-empty keg")); 3895 zone->uz_init = zinit; 3896 ZONE_UNLOCK(zone); 3897 } 3898 3899 /* See uma.h */ 3900 void 3901 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) 3902 { 3903 3904 ZONE_LOCK(zone); 3905 KASSERT(zone->uz_keg->uk_pages == 0, 3906 ("uma_zone_set_zfini on non-empty keg")); 3907 zone->uz_fini = zfini; 3908 ZONE_UNLOCK(zone); 3909 } 3910 3911 /* See uma.h */ 3912 /* XXX uk_freef is not actually used with the zone locked */ 3913 void 3914 uma_zone_set_freef(uma_zone_t zone, uma_free freef) 3915 { 3916 uma_keg_t keg; 3917 3918 KEG_GET(zone, keg); 3919 KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type")); 3920 KEG_LOCK(keg); 3921 keg->uk_freef = freef; 3922 KEG_UNLOCK(keg); 3923 } 3924 3925 /* See uma.h */ 3926 /* XXX uk_allocf is not actually used with the zone locked */ 3927 void 3928 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) 3929 { 3930 uma_keg_t keg; 3931 3932 KEG_GET(zone, keg); 3933 KEG_LOCK(keg); 3934 keg->uk_allocf = allocf; 3935 KEG_UNLOCK(keg); 3936 } 3937 3938 /* See uma.h */ 3939 void 3940 uma_zone_reserve(uma_zone_t zone, int items) 3941 { 3942 uma_keg_t keg; 3943 3944 KEG_GET(zone, keg); 3945 KEG_LOCK(keg); 3946 keg->uk_reserve = items; 3947 KEG_UNLOCK(keg); 3948 } 3949 3950 /* See uma.h */ 3951 int 3952 uma_zone_reserve_kva(uma_zone_t zone, int count) 3953 { 3954 uma_keg_t keg; 3955 vm_offset_t kva; 3956 u_int pages; 3957 3958 KEG_GET(zone, keg); 3959 3960 pages = count / keg->uk_ipers; 3961 if (pages * keg->uk_ipers < count) 3962 pages++; 3963 pages *= keg->uk_ppera; 3964 3965 #ifdef UMA_MD_SMALL_ALLOC 3966 if (keg->uk_ppera > 1) { 3967 #else 3968 if (1) { 3969 #endif 3970 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); 3971 if (kva == 0) 3972 return (0); 3973 } else 3974 kva = 0; 3975 3976 ZONE_LOCK(zone); 3977 MPASS(keg->uk_kva == 0); 3978 keg->uk_kva = kva; 3979 keg->uk_offset = 0; 3980 zone->uz_max_items = pages * keg->uk_ipers; 3981 #ifdef UMA_MD_SMALL_ALLOC 3982 keg->uk_allocf = (keg->uk_ppera > 1) ? 
/* See uma.h */
int
uma_zone_reserve_kva(uma_zone_t zone, int count)
{
	uma_keg_t keg;
	vm_offset_t kva;
	u_int pages;

	KEG_GET(zone, keg);

	pages = count / keg->uk_ipers;
	if (pages * keg->uk_ipers < count)
		pages++;
	pages *= keg->uk_ppera;

#ifdef UMA_MD_SMALL_ALLOC
	if (keg->uk_ppera > 1) {
#else
	if (1) {
#endif
		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
		if (kva == 0)
			return (0);
	} else
		kva = 0;

	ZONE_LOCK(zone);
	MPASS(keg->uk_kva == 0);
	keg->uk_kva = kva;
	keg->uk_offset = 0;
	zone->uz_max_items = pages * keg->uk_ipers;
#ifdef UMA_MD_SMALL_ALLOC
	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
#else
	keg->uk_allocf = noobj_alloc;
#endif
	keg->uk_flags |= UMA_ZONE_NOFREE;
	ZONE_UNLOCK(zone);

	return (1);
}

/* See uma.h */
void
uma_prealloc(uma_zone_t zone, int items)
{
	struct vm_domainset_iter di;
	uma_domain_t dom;
	uma_slab_t slab;
	uma_keg_t keg;
	int aflags, domain, slabs;

	KEG_GET(zone, keg);
	KEG_LOCK(keg);
	slabs = items / keg->uk_ipers;
	if (slabs * keg->uk_ipers < items)
		slabs++;
	while (slabs-- > 0) {
		aflags = M_NOWAIT;
		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
		    &aflags);
		for (;;) {
			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
			    aflags);
			if (slab != NULL) {
				dom = &keg->uk_domain[slab->us_domain];
				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
				    us_link);
				break;
			}
			KEG_LOCK(keg);
			if (vm_domainset_iter_policy(&di, &domain) != 0) {
				KEG_UNLOCK(keg);
				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
				KEG_LOCK(keg);
			}
		}
	}
	KEG_UNLOCK(keg);
}

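/*
 * Example (editor's sketch): a zone that must be able to satisfy a known
 * number of early or non-blocking allocations can pre-reserve KVA and
 * pre-build slabs.  The "baz" zone and the item count are hypothetical.
 *
 *	baz_zone = uma_zcreate("baz", sizeof(struct baz),
 *	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 *	if (uma_zone_reserve_kva(baz_zone, desired_items) == 0)
 *		return (ENOMEM);
 *	uma_prealloc(baz_zone, desired_items);
 *
 * uma_zone_reserve_kva() caps the zone at roughly the requested item count
 * (rounded up to whole slabs) and marks the keg UMA_ZONE_NOFREE, so the
 * pages are never returned to the system; uma_prealloc() then populates
 * free slabs up front, waiting for memory in each domain if necessary.
 */
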
/* See uma.h */
void
uma_reclaim(int req)
{

	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
	sx_xlock(&uma_reclaim_lock);
	bucket_enable();

	switch (req) {
	case UMA_RECLAIM_TRIM:
		zone_foreach(zone_trim, NULL);
		break;
	case UMA_RECLAIM_DRAIN:
	case UMA_RECLAIM_DRAIN_CPU:
		zone_foreach(zone_drain, NULL);
		if (req == UMA_RECLAIM_DRAIN_CPU) {
			pcpu_cache_drain_safe(NULL);
			zone_foreach(zone_drain, NULL);
		}
		break;
	default:
		panic("unhandled reclamation request %d", req);
	}

	/*
	 * Some slabs may have been freed, but the slab zone is visited early
	 * in the pass above; drain it again so that we can free pages that
	 * became empty once the other zones were drained.  We have to do the
	 * same for buckets.
	 */
	zone_drain(slabzone, NULL);
	bucket_zone_drain();
	sx_xunlock(&uma_reclaim_lock);
}

static volatile int uma_reclaim_needed;

void
uma_reclaim_wakeup(void)
{

	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
		wakeup(uma_reclaim);
}

void
uma_reclaim_worker(void *arg __unused)
{

	for (;;) {
		sx_xlock(&uma_reclaim_lock);
		while (atomic_load_int(&uma_reclaim_needed) == 0)
			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
			    hz);
		sx_xunlock(&uma_reclaim_lock);
		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
		atomic_store_int(&uma_reclaim_needed, 0);
		/* Don't fire more than once per second. */
		pause("umarclslp", hz);
	}
}

/* See uma.h */
void
uma_zone_reclaim(uma_zone_t zone, int req)
{

	switch (req) {
	case UMA_RECLAIM_TRIM:
		zone_trim(zone, NULL);
		break;
	case UMA_RECLAIM_DRAIN:
		zone_drain(zone, NULL);
		break;
	case UMA_RECLAIM_DRAIN_CPU:
		pcpu_cache_drain_safe(zone);
		zone_drain(zone, NULL);
		break;
	default:
		panic("unhandled reclamation request %d", req);
	}
}

/* See uma.h */
int
uma_zone_exhausted(uma_zone_t zone)
{
	int full;

	ZONE_LOCK(zone);
	full = zone->uz_sleepers > 0;
	ZONE_UNLOCK(zone);
	return (full);
}

int
uma_zone_exhausted_nolock(uma_zone_t zone)
{
	return (zone->uz_sleepers > 0);
}

static void
uma_zero_item(void *item, uma_zone_t zone)
{

	bzero(item, zone->uz_size);
}

unsigned long
uma_limit(void)
{

	return (uma_kmem_limit);
}

void
uma_set_limit(unsigned long limit)
{

	uma_kmem_limit = limit;
}

unsigned long
uma_size(void)
{

	return (atomic_load_long(&uma_kmem_total));
}

long
uma_avail(void)
{

	return (uma_kmem_limit - uma_size());
}

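/*
 * Example (editor's sketch): reclamation is mostly driven by the VM itself,
 * via the vm_lowmem eventhandler and the reclaim worker above, but a
 * subsystem that knows its demand has dropped can also trim its own zone:
 *
 *	if (uma_avail() < some_threshold)
 *		uma_zone_reclaim(qux_zone, UMA_RECLAIM_TRIM);
 *
 * UMA_RECLAIM_TRIM releases only excess cached items, UMA_RECLAIM_DRAIN
 * frees all cached items, and UMA_RECLAIM_DRAIN_CPU additionally flushes
 * the per-CPU caches, which is the most expensive option.  "qux_zone" and
 * "some_threshold" are placeholders.
 */
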
#ifdef DDB
/*
 * Generate statistics across both the zone and its per-cpu caches.  Return
 * desired statistics if the pointer is non-NULL for that statistic.
 *
 * Note: does not update the zone statistics, as it can't safely clear the
 * per-CPU cache statistic.
 *
 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
 * safe from off-CPU; we should modify the caches to track this information
 * directly so that we don't have to.
 */
static void
uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
    uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
{
	uma_cache_t cache;
	uint64_t allocs, frees, sleeps, xdomain;
	int cachefree, cpu;

	allocs = frees = sleeps = xdomain = 0;
	cachefree = 0;
	CPU_FOREACH(cpu) {
		cache = &z->uz_cpu[cpu];
		if (cache->uc_allocbucket != NULL)
			cachefree += cache->uc_allocbucket->ub_cnt;
		if (cache->uc_freebucket != NULL)
			cachefree += cache->uc_freebucket->ub_cnt;
		if (cache->uc_crossbucket != NULL) {
			xdomain += cache->uc_crossbucket->ub_cnt;
			cachefree += cache->uc_crossbucket->ub_cnt;
		}
		allocs += cache->uc_allocs;
		frees += cache->uc_frees;
	}
	allocs += counter_u64_fetch(z->uz_allocs);
	frees += counter_u64_fetch(z->uz_frees);
	sleeps += z->uz_sleeps;
	xdomain += z->uz_xdomain;
	if (cachefreep != NULL)
		*cachefreep = cachefree;
	if (allocsp != NULL)
		*allocsp = allocs;
	if (freesp != NULL)
		*freesp = frees;
	if (sleepsp != NULL)
		*sleepsp = sleeps;
	if (xdomainp != NULL)
		*xdomainp = xdomain;
}
#endif /* DDB */

static int
sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
{
	uma_keg_t kz;
	uma_zone_t z;
	int count;

	count = 0;
	rw_rlock(&uma_rwlock);
	LIST_FOREACH(kz, &uma_kegs, uk_link) {
		LIST_FOREACH(z, &kz->uk_zones, uz_link)
			count++;
	}
	LIST_FOREACH(z, &uma_cachezones, uz_link)
		count++;

	rw_runlock(&uma_rwlock);
	return (sysctl_handle_int(oidp, &count, 0, req));
}

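/*
 * Example (editor's sketch): this handler is attached to a read-only OID
 * elsewhere in this file (vm.zone_count in stock FreeBSD); together with
 * the vm.zone_stats stream below it is what tools such as vmstat -z read
 * via libmemstat.  Assuming that OID name, a minimal userland reader is:
 *
 *	int count;
 *	size_t len = sizeof(count);
 *
 *	if (sysctlbyname("vm.zone_count", &count, &len, NULL, 0) == 0)
 *		printf("%d UMA zones\n", count);
 */
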
4267 */ 4268 for (i = 0; i < mp_maxid + 1; i++) { 4269 bzero(&ups[i], sizeof(*ups)); 4270 if (internal || CPU_ABSENT(i)) 4271 continue; 4272 cache = &z->uz_cpu[i]; 4273 bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_allocbucket); 4274 if (bucket != NULL) 4275 ups[i].ups_cache_free += bucket->ub_cnt; 4276 bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_freebucket); 4277 if (bucket != NULL) 4278 ups[i].ups_cache_free += bucket->ub_cnt; 4279 bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_crossbucket); 4280 if (bucket != NULL) 4281 ups[i].ups_cache_free += bucket->ub_cnt; 4282 ups[i].ups_allocs = cache->uc_allocs; 4283 ups[i].ups_frees = cache->uc_frees; 4284 } 4285 } 4286 4287 static int 4288 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) 4289 { 4290 struct uma_stream_header ush; 4291 struct uma_type_header uth; 4292 struct uma_percpu_stat *ups; 4293 struct sbuf sbuf; 4294 uma_keg_t kz; 4295 uma_zone_t z; 4296 int count, error, i; 4297 4298 error = sysctl_wire_old_buffer(req, 0); 4299 if (error != 0) 4300 return (error); 4301 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 4302 sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); 4303 ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); 4304 4305 count = 0; 4306 rw_rlock(&uma_rwlock); 4307 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4308 LIST_FOREACH(z, &kz->uk_zones, uz_link) 4309 count++; 4310 } 4311 4312 LIST_FOREACH(z, &uma_cachezones, uz_link) 4313 count++; 4314 4315 /* 4316 * Insert stream header. 4317 */ 4318 bzero(&ush, sizeof(ush)); 4319 ush.ush_version = UMA_STREAM_VERSION; 4320 ush.ush_maxcpus = (mp_maxid + 1); 4321 ush.ush_count = count; 4322 (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); 4323 4324 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4325 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 4326 bzero(&uth, sizeof(uth)); 4327 ZONE_LOCK(z); 4328 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 4329 uth.uth_align = kz->uk_align; 4330 uth.uth_size = kz->uk_size; 4331 uth.uth_rsize = kz->uk_rsize; 4332 if (z->uz_max_items > 0) 4333 uth.uth_pages = (z->uz_items / kz->uk_ipers) * 4334 kz->uk_ppera; 4335 else 4336 uth.uth_pages = kz->uk_pages; 4337 uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * 4338 kz->uk_ppera; 4339 uth.uth_limit = z->uz_max_items; 4340 uth.uth_keg_free = z->uz_keg->uk_free; 4341 4342 /* 4343 * A zone is secondary is it is not the first entry 4344 * on the keg's zone list. 
4345 */ 4346 if ((z->uz_flags & UMA_ZONE_SECONDARY) && 4347 (LIST_FIRST(&kz->uk_zones) != z)) 4348 uth.uth_zone_flags = UTH_ZONE_SECONDARY; 4349 uma_vm_zone_stats(&uth, z, &sbuf, ups, 4350 kz->uk_flags & UMA_ZFLAG_INTERNAL); 4351 ZONE_UNLOCK(z); 4352 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); 4353 for (i = 0; i < mp_maxid + 1; i++) 4354 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); 4355 } 4356 } 4357 LIST_FOREACH(z, &uma_cachezones, uz_link) { 4358 bzero(&uth, sizeof(uth)); 4359 ZONE_LOCK(z); 4360 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 4361 uth.uth_size = z->uz_size; 4362 uma_vm_zone_stats(&uth, z, &sbuf, ups, false); 4363 ZONE_UNLOCK(z); 4364 (void)sbuf_bcat(&sbuf, &uth, sizeof(uth)); 4365 for (i = 0; i < mp_maxid + 1; i++) 4366 (void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i])); 4367 } 4368 4369 rw_runlock(&uma_rwlock); 4370 error = sbuf_finish(&sbuf); 4371 sbuf_delete(&sbuf); 4372 free(ups, M_TEMP); 4373 return (error); 4374 } 4375 4376 int 4377 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS) 4378 { 4379 uma_zone_t zone = *(uma_zone_t *)arg1; 4380 int error, max; 4381 4382 max = uma_zone_get_max(zone); 4383 error = sysctl_handle_int(oidp, &max, 0, req); 4384 if (error || !req->newptr) 4385 return (error); 4386 4387 uma_zone_set_max(zone, max); 4388 4389 return (0); 4390 } 4391 4392 int 4393 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS) 4394 { 4395 uma_zone_t zone; 4396 int cur; 4397 4398 /* 4399 * Some callers want to add sysctls for global zones that 4400 * may not yet exist so they pass a pointer to a pointer. 4401 */ 4402 if (arg2 == 0) 4403 zone = *(uma_zone_t *)arg1; 4404 else 4405 zone = arg1; 4406 cur = uma_zone_get_cur(zone); 4407 return (sysctl_handle_int(oidp, &cur, 0, req)); 4408 } 4409 4410 static int 4411 sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS) 4412 { 4413 uma_zone_t zone = arg1; 4414 uint64_t cur; 4415 4416 cur = uma_zone_get_allocs(zone); 4417 return (sysctl_handle_64(oidp, &cur, 0, req)); 4418 } 4419 4420 static int 4421 sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS) 4422 { 4423 uma_zone_t zone = arg1; 4424 uint64_t cur; 4425 4426 cur = uma_zone_get_frees(zone); 4427 return (sysctl_handle_64(oidp, &cur, 0, req)); 4428 } 4429 4430 static int 4431 sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS) 4432 { 4433 struct sbuf sbuf; 4434 uma_zone_t zone = arg1; 4435 int error; 4436 4437 sbuf_new_for_sysctl(&sbuf, NULL, 0, req); 4438 if (zone->uz_flags != 0) 4439 sbuf_printf(&sbuf, "0x%b", zone->uz_flags, PRINT_UMA_ZFLAGS); 4440 else 4441 sbuf_printf(&sbuf, "0"); 4442 error = sbuf_finish(&sbuf); 4443 sbuf_delete(&sbuf); 4444 4445 return (error); 4446 } 4447 4448 static int 4449 sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS) 4450 { 4451 uma_keg_t keg = arg1; 4452 int avail, effpct, total; 4453 4454 total = keg->uk_ppera * PAGE_SIZE; 4455 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) != 0) 4456 total += slab_sizeof(SLAB_MAX_SETSIZE); 4457 /* 4458 * We consider the client's requested size and alignment here, not the 4459 * real size determination uk_rsize, because we also adjust the real 4460 * size for internal implementation reasons (max bitset size). 
4461 */ 4462 avail = keg->uk_ipers * roundup2(keg->uk_size, keg->uk_align + 1); 4463 if ((keg->uk_flags & UMA_ZONE_PCPU) != 0) 4464 avail *= mp_maxid + 1; 4465 effpct = 100 * avail / total; 4466 return (sysctl_handle_int(oidp, &effpct, 0, req)); 4467 } 4468 4469 #ifdef INVARIANTS 4470 static uma_slab_t 4471 uma_dbg_getslab(uma_zone_t zone, void *item) 4472 { 4473 uma_slab_t slab; 4474 uma_keg_t keg; 4475 uint8_t *mem; 4476 4477 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 4478 if (zone->uz_flags & UMA_ZONE_VTOSLAB) { 4479 slab = vtoslab((vm_offset_t)mem); 4480 } else { 4481 /* 4482 * It is safe to return the slab here even though the 4483 * zone is unlocked because the item's allocation state 4484 * essentially holds a reference. 4485 */ 4486 if (zone->uz_lockptr == &zone->uz_lock) 4487 return (NULL); 4488 ZONE_LOCK(zone); 4489 keg = zone->uz_keg; 4490 if (keg->uk_flags & UMA_ZONE_HASH) 4491 slab = hash_sfind(&keg->uk_hash, mem); 4492 else 4493 slab = (uma_slab_t)(mem + keg->uk_pgoff); 4494 ZONE_UNLOCK(zone); 4495 } 4496 4497 return (slab); 4498 } 4499 4500 static bool 4501 uma_dbg_zskip(uma_zone_t zone, void *mem) 4502 { 4503 4504 if (zone->uz_lockptr == &zone->uz_lock) 4505 return (true); 4506 4507 return (uma_dbg_kskip(zone->uz_keg, mem)); 4508 } 4509 4510 static bool 4511 uma_dbg_kskip(uma_keg_t keg, void *mem) 4512 { 4513 uintptr_t idx; 4514 4515 if (dbg_divisor == 0) 4516 return (true); 4517 4518 if (dbg_divisor == 1) 4519 return (false); 4520 4521 idx = (uintptr_t)mem >> PAGE_SHIFT; 4522 if (keg->uk_ipers > 1) { 4523 idx *= keg->uk_ipers; 4524 idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize; 4525 } 4526 4527 if ((idx / dbg_divisor) * dbg_divisor != idx) { 4528 counter_u64_add(uma_skip_cnt, 1); 4529 return (true); 4530 } 4531 counter_u64_add(uma_dbg_cnt, 1); 4532 4533 return (false); 4534 } 4535 4536 /* 4537 * Set up the slab's freei data such that uma_dbg_free can function. 4538 * 4539 */ 4540 static void 4541 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item) 4542 { 4543 uma_keg_t keg; 4544 int freei; 4545 4546 if (slab == NULL) { 4547 slab = uma_dbg_getslab(zone, item); 4548 if (slab == NULL) 4549 panic("uma: item %p did not belong to zone %s\n", 4550 item, zone->uz_name); 4551 } 4552 keg = zone->uz_keg; 4553 freei = slab_item_index(slab, keg, item); 4554 4555 if (BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) 4556 panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n", 4557 item, zone, zone->uz_name, slab, freei); 4558 BIT_SET_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)); 4559 } 4560 4561 /* 4562 * Verifies freed addresses. Checks for alignment, valid slab membership 4563 * and duplicate frees. 
4564 * 4565 */ 4566 static void 4567 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item) 4568 { 4569 uma_keg_t keg; 4570 int freei; 4571 4572 if (slab == NULL) { 4573 slab = uma_dbg_getslab(zone, item); 4574 if (slab == NULL) 4575 panic("uma: Freed item %p did not belong to zone %s\n", 4576 item, zone->uz_name); 4577 } 4578 keg = zone->uz_keg; 4579 freei = slab_item_index(slab, keg, item); 4580 4581 if (freei >= keg->uk_ipers) 4582 panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n", 4583 item, zone, zone->uz_name, slab, freei); 4584 4585 if (slab_item(slab, keg, freei) != item) 4586 panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n", 4587 item, zone, zone->uz_name, slab, freei); 4588 4589 if (!BIT_ISSET(keg->uk_ipers, freei, slab_dbg_bits(slab, keg))) 4590 panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n", 4591 item, zone, zone->uz_name, slab, freei); 4592 4593 BIT_CLR_ATOMIC(keg->uk_ipers, freei, slab_dbg_bits(slab, keg)); 4594 } 4595 #endif /* INVARIANTS */ 4596 4597 #ifdef DDB 4598 static int64_t 4599 get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used, 4600 uint64_t *sleeps, long *cachefree, uint64_t *xdomain) 4601 { 4602 uint64_t frees; 4603 int i; 4604 4605 if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { 4606 *allocs = counter_u64_fetch(z->uz_allocs); 4607 frees = counter_u64_fetch(z->uz_frees); 4608 *sleeps = z->uz_sleeps; 4609 *cachefree = 0; 4610 *xdomain = 0; 4611 } else 4612 uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps, 4613 xdomain); 4614 if (!((z->uz_flags & UMA_ZONE_SECONDARY) && 4615 (LIST_FIRST(&kz->uk_zones) != z))) 4616 *cachefree += kz->uk_free; 4617 for (i = 0; i < vm_ndomains; i++) 4618 *cachefree += z->uz_domain[i].uzd_nitems; 4619 *used = *allocs - frees; 4620 return (((int64_t)*used + *cachefree) * kz->uk_size); 4621 } 4622 4623 DB_SHOW_COMMAND(uma, db_show_uma) 4624 { 4625 const char *fmt_hdr, *fmt_entry; 4626 uma_keg_t kz; 4627 uma_zone_t z; 4628 uint64_t allocs, used, sleeps, xdomain; 4629 long cachefree; 4630 /* variables for sorting */ 4631 uma_keg_t cur_keg; 4632 uma_zone_t cur_zone, last_zone; 4633 int64_t cur_size, last_size, size; 4634 int ties; 4635 4636 /* /i option produces machine-parseable CSV output */ 4637 if (modif[0] == 'i') { 4638 fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; 4639 fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n"; 4640 } else { 4641 fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n"; 4642 fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n"; 4643 } 4644 4645 db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests", 4646 "Sleeps", "Bucket", "Total Mem", "XFree"); 4647 4648 /* Sort the zones with largest size first. */ 4649 last_zone = NULL; 4650 last_size = INT64_MAX; 4651 for (;;) { 4652 cur_zone = NULL; 4653 cur_size = -1; 4654 ties = 0; 4655 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4656 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 4657 /* 4658 * In the case of size ties, print out zones 4659 * in the order they are encountered. That is, 4660 * when we encounter the most recently output 4661 * zone, we have already printed all preceding 4662 * ties, and we must print all following ties. 
4663 */ 4664 if (z == last_zone) { 4665 ties = 1; 4666 continue; 4667 } 4668 size = get_uma_stats(kz, z, &allocs, &used, 4669 &sleeps, &cachefree, &xdomain); 4670 if (size > cur_size && size < last_size + ties) 4671 { 4672 cur_size = size; 4673 cur_zone = z; 4674 cur_keg = kz; 4675 } 4676 } 4677 } 4678 if (cur_zone == NULL) 4679 break; 4680 4681 size = get_uma_stats(cur_keg, cur_zone, &allocs, &used, 4682 &sleeps, &cachefree, &xdomain); 4683 db_printf(fmt_entry, cur_zone->uz_name, 4684 (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree, 4685 (uintmax_t)allocs, (uintmax_t)sleeps, 4686 (unsigned)cur_zone->uz_bucket_size, (intmax_t)size, 4687 xdomain); 4688 4689 if (db_pager_quit) 4690 return; 4691 last_zone = cur_zone; 4692 last_size = cur_size; 4693 } 4694 } 4695 4696 DB_SHOW_COMMAND(umacache, db_show_umacache) 4697 { 4698 uma_zone_t z; 4699 uint64_t allocs, frees; 4700 long cachefree; 4701 int i; 4702 4703 db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free", 4704 "Requests", "Bucket"); 4705 LIST_FOREACH(z, &uma_cachezones, uz_link) { 4706 uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL); 4707 for (i = 0; i < vm_ndomains; i++) 4708 cachefree += z->uz_domain[i].uzd_nitems; 4709 db_printf("%18s %8ju %8jd %8ld %12ju %8u\n", 4710 z->uz_name, (uintmax_t)z->uz_size, 4711 (intmax_t)(allocs - frees), cachefree, 4712 (uintmax_t)allocs, z->uz_bucket_size); 4713 if (db_pager_quit) 4714 return; 4715 } 4716 } 4717 #endif /* DDB */ 4718