1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2002-2019 Jeffrey Roberson <jeff@FreeBSD.org> 5 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org> 6 * Copyright (c) 2004-2006 Robert N. M. Watson 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice unmodified, this list of conditions, and the following 14 * disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31 /* 32 * uma_core.c Implementation of the Universal Memory allocator 33 * 34 * This allocator is intended to replace the multitude of similar object caches 35 * in the standard FreeBSD kernel. The intent is to be flexible as well as 36 * efficient. A primary design goal is to return unused memory to the rest of 37 * the system. This will make the system as a whole more flexible due to the 38 * ability to move memory to subsystems which most need it instead of leaving 39 * pools of reserved memory unused. 40 * 41 * The basic ideas stem from similar slab/zone based allocators whose algorithms 42 * are well known. 
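 *
 * For illustration, a typical consumer creates a zone once and then
 * allocates and frees items from it (the zone and structure names here
 * are hypothetical):
 *
 *	zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    NULL, NULL, UMA_ALIGN_PTR, 0);
 *	item = uma_zalloc(zone, M_WAITOK);
 *	...
 *	uma_zfree(zone, item);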
43 * 44 */ 45 46 /* 47 * TODO: 48 * - Improve memory usage for large allocations 49 * - Investigate cache size adjustments 50 */ 51 52 #include <sys/cdefs.h> 53 __FBSDID("$FreeBSD$"); 54 55 #include "opt_ddb.h" 56 #include "opt_param.h" 57 #include "opt_vm.h" 58 59 #include <sys/param.h> 60 #include <sys/systm.h> 61 #include <sys/bitset.h> 62 #include <sys/domainset.h> 63 #include <sys/eventhandler.h> 64 #include <sys/kernel.h> 65 #include <sys/types.h> 66 #include <sys/limits.h> 67 #include <sys/queue.h> 68 #include <sys/malloc.h> 69 #include <sys/ktr.h> 70 #include <sys/lock.h> 71 #include <sys/sysctl.h> 72 #include <sys/mutex.h> 73 #include <sys/proc.h> 74 #include <sys/random.h> 75 #include <sys/rwlock.h> 76 #include <sys/sbuf.h> 77 #include <sys/sched.h> 78 #include <sys/smp.h> 79 #include <sys/taskqueue.h> 80 #include <sys/vmmeter.h> 81 82 #include <vm/vm.h> 83 #include <vm/vm_domainset.h> 84 #include <vm/vm_object.h> 85 #include <vm/vm_page.h> 86 #include <vm/vm_pageout.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_phys.h> 89 #include <vm/vm_pagequeue.h> 90 #include <vm/vm_map.h> 91 #include <vm/vm_kern.h> 92 #include <vm/vm_extern.h> 93 #include <vm/uma.h> 94 #include <vm/uma_int.h> 95 #include <vm/uma_dbg.h> 96 97 #include <ddb/ddb.h> 98 99 #ifdef DEBUG_MEMGUARD 100 #include <vm/memguard.h> 101 #endif 102 103 /* 104 * This is the zone and keg from which all zones are spawned. 105 */ 106 static uma_zone_t kegs; 107 static uma_zone_t zones; 108 109 /* This is the zone from which all offpage uma_slab_ts are allocated. */ 110 static uma_zone_t slabzone; 111 112 /* 113 * The initial hash tables come out of this zone so they can be allocated 114 * prior to malloc coming up. 115 */ 116 static uma_zone_t hashzone; 117 118 /* The boot-time adjusted value for cache line alignment. */ 119 int uma_align_cache = 64 - 1; 120 121 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); 122 static MALLOC_DEFINE(M_UMA, "UMA", "UMA Misc"); 123 124 /* 125 * Are we allowed to allocate buckets? 126 */ 127 static int bucketdisable = 1; 128 129 /* Linked list of all kegs in the system */ 130 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs); 131 132 /* Linked list of all cache-only zones in the system */ 133 static LIST_HEAD(,uma_zone) uma_cachezones = 134 LIST_HEAD_INITIALIZER(uma_cachezones); 135 136 /* This RW lock protects the keg list */ 137 static struct rwlock_padalign __exclusive_cache_line uma_rwlock; 138 139 /* 140 * Pointer and counter to pool of pages, that is preallocated at 141 * startup to bootstrap UMA. 142 */ 143 static char *bootmem; 144 static int boot_pages; 145 146 static struct sx uma_reclaim_lock; 147 148 /* 149 * kmem soft limit, initialized by uma_set_limit(). Ensure that early 150 * allocations don't trigger a wakeup of the reclaim thread. 151 */ 152 unsigned long uma_kmem_limit = LONG_MAX; 153 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0, 154 "UMA kernel memory soft limit"); 155 unsigned long uma_kmem_total; 156 SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0, 157 "UMA kernel memory usage"); 158 159 /* Is the VM done starting up? */ 160 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS, 161 BOOT_RUNNING } booted = BOOT_COLD; 162 163 /* 164 * This is the handle used to schedule events that need to happen 165 * outside of the allocation fast path. 166 */ 167 static struct callout uma_callout; 168 #define UMA_TIMEOUT 20 /* Seconds for callout interval. 
*/ 169 170 /* 171 * This structure is passed as the zone ctor arg so that I don't have to create 172 * a special allocation function just for zones. 173 */ 174 struct uma_zctor_args { 175 const char *name; 176 size_t size; 177 uma_ctor ctor; 178 uma_dtor dtor; 179 uma_init uminit; 180 uma_fini fini; 181 uma_import import; 182 uma_release release; 183 void *arg; 184 uma_keg_t keg; 185 int align; 186 uint32_t flags; 187 }; 188 189 struct uma_kctor_args { 190 uma_zone_t zone; 191 size_t size; 192 uma_init uminit; 193 uma_fini fini; 194 int align; 195 uint32_t flags; 196 }; 197 198 struct uma_bucket_zone { 199 uma_zone_t ubz_zone; 200 char *ubz_name; 201 int ubz_entries; /* Number of items it can hold. */ 202 int ubz_maxsize; /* Maximum allocation size per-item. */ 203 }; 204 205 /* 206 * Compute the actual number of bucket entries to pack them in power 207 * of two sizes for more efficient space utilization. 208 */ 209 #define BUCKET_SIZE(n) \ 210 (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *)) 211 212 #define BUCKET_MAX BUCKET_SIZE(256) 213 #define BUCKET_MIN BUCKET_SIZE(4) 214 215 struct uma_bucket_zone bucket_zones[] = { 216 { NULL, "4 Bucket", BUCKET_SIZE(4), 4096 }, 217 { NULL, "6 Bucket", BUCKET_SIZE(6), 3072 }, 218 { NULL, "8 Bucket", BUCKET_SIZE(8), 2048 }, 219 { NULL, "12 Bucket", BUCKET_SIZE(12), 1536 }, 220 { NULL, "16 Bucket", BUCKET_SIZE(16), 1024 }, 221 { NULL, "32 Bucket", BUCKET_SIZE(32), 512 }, 222 { NULL, "64 Bucket", BUCKET_SIZE(64), 256 }, 223 { NULL, "128 Bucket", BUCKET_SIZE(128), 128 }, 224 { NULL, "256 Bucket", BUCKET_SIZE(256), 64 }, 225 { NULL, NULL, 0} 226 }; 227 228 /* 229 * Flags and enumerations to be passed to internal functions. 230 */ 231 enum zfreeskip { 232 SKIP_NONE = 0, 233 SKIP_CNT = 0x00000001, 234 SKIP_DTOR = 0x00010000, 235 SKIP_FINI = 0x00020000, 236 }; 237 238 /* Prototypes.. 
*/ 239 240 int uma_startup_count(int); 241 void uma_startup(void *, int); 242 void uma_startup1(void); 243 void uma_startup2(void); 244 245 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 246 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 247 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 248 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); 249 static void page_free(void *, vm_size_t, uint8_t); 250 static void pcpu_page_free(void *, vm_size_t, uint8_t); 251 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); 252 static void cache_drain(uma_zone_t); 253 static void bucket_drain(uma_zone_t, uma_bucket_t); 254 static void bucket_cache_reclaim(uma_zone_t zone, bool); 255 static int keg_ctor(void *, int, void *, int); 256 static void keg_dtor(void *, int, void *); 257 static int zone_ctor(void *, int, void *, int); 258 static void zone_dtor(void *, int, void *); 259 static int zero_init(void *, int, int); 260 static void keg_small_init(uma_keg_t keg); 261 static void keg_large_init(uma_keg_t keg); 262 static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *); 263 static void zone_timeout(uma_zone_t zone, void *); 264 static int hash_alloc(struct uma_hash *, u_int); 265 static int hash_expand(struct uma_hash *, struct uma_hash *); 266 static void hash_free(struct uma_hash *hash); 267 static void uma_timeout(void *); 268 static void uma_startup3(void); 269 static void *zone_alloc_item(uma_zone_t, void *, int, int); 270 static void *zone_alloc_item_locked(uma_zone_t, void *, int, int); 271 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip); 272 static void bucket_enable(void); 273 static void bucket_init(void); 274 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int); 275 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *); 276 static void bucket_zone_drain(void); 277 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int); 278 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab); 279 static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item); 280 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, 281 uma_fini fini, int align, uint32_t flags); 282 static int zone_import(void *, void **, int, int, int); 283 static void zone_release(void *, void **, int); 284 static void uma_zero_item(void *, uma_zone_t); 285 static bool cache_alloc(uma_zone_t, uma_cache_t, void *, int); 286 static bool cache_free(uma_zone_t, uma_cache_t, void *, void *, int); 287 288 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); 289 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); 290 static int sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS); 291 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS); 292 293 #ifdef INVARIANTS 294 static bool uma_dbg_kskip(uma_keg_t keg, void *mem); 295 static bool uma_dbg_zskip(uma_zone_t zone, void *mem); 296 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item); 297 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item); 298 299 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0, 300 "Memory allocation debugging"); 301 302 static u_int dbg_divisor = 1; 303 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor, 304 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0, 305 "Debug & thrash every this item in memory allocator"); 306 307 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER; 308 static counter_u64_t uma_skip_cnt = 
EARLY_COUNTER;
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
    &uma_dbg_cnt, "memory items debugged");
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
    &uma_skip_cnt, "memory items skipped, not debugged");
#endif

SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);

SYSCTL_NODE(_vm, OID_AUTO, uma, CTLFLAG_RW, 0, "Universal Memory Allocator");

SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");

SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");

static int zone_warnings = 1;
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
    "Warn when a UMA zone becomes full");

/*
 * This routine checks to see whether or not it's safe to enable buckets.
 */
static void
bucket_enable(void)
{
	bucketdisable = vm_page_count_min();
}

/*
 * Initialize bucket_zones, the array of zones of buckets of various sizes.
 *
 * For each zone, calculate the memory required for each bucket, consisting
 * of the header and an array of pointers.
 */
static void
bucket_init(void)
{
	struct uma_bucket_zone *ubz;
	int size;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
		size += sizeof(void *) * ubz->ubz_entries;
		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
	}
}

/*
 * Given a desired number of entries for a bucket, return the zone from which
 * to allocate the bucket.
 */
static struct uma_bucket_zone *
bucket_zone_lookup(int entries)
{
	struct uma_bucket_zone *ubz;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_entries >= entries)
			return (ubz);
	ubz--;
	return (ubz);
}

static struct uma_bucket_zone *
bucket_zone_max(uma_zone_t zone, int nitems)
{
	struct uma_bucket_zone *ubz;
	int bpcpu;

	bpcpu = 2;
#ifdef UMA_XDOMAIN
	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
		/* Count the cross-domain bucket. */
		bpcpu++;
#endif

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_entries * bpcpu * mp_ncpus > nitems)
			break;
	if (ubz == &bucket_zones[0])
		ubz = NULL;
	else
		ubz--;
	return (ubz);
}

static int
bucket_select(int size)
{
	struct uma_bucket_zone *ubz;

	ubz = &bucket_zones[0];
	if (size > ubz->ubz_maxsize)
		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);

	for (; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_maxsize < size)
			break;
	ubz--;
	return (ubz->ubz_entries);
}

static uma_bucket_t
bucket_alloc(uma_zone_t zone, void *udata, int flags)
{
	struct uma_bucket_zone *ubz;
	uma_bucket_t bucket;

	/*
	 * This is to stop us from allocating per cpu buckets while we're
	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
	 * boot pages.  This also prevents us from allocating buckets in
	 * low memory situations.
425 */ 426 if (bucketdisable) 427 return (NULL); 428 /* 429 * To limit bucket recursion we store the original zone flags 430 * in a cookie passed via zalloc_arg/zfree_arg. This allows the 431 * NOVM flag to persist even through deep recursions. We also 432 * store ZFLAG_BUCKET once we have recursed attempting to allocate 433 * a bucket for a bucket zone so we do not allow infinite bucket 434 * recursion. This cookie will even persist to frees of unused 435 * buckets via the allocation path or bucket allocations in the 436 * free path. 437 */ 438 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 439 udata = (void *)(uintptr_t)zone->uz_flags; 440 else { 441 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) 442 return (NULL); 443 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); 444 } 445 if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY) 446 flags |= M_NOVM; 447 ubz = bucket_zone_lookup(zone->uz_bucket_size); 448 if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) 449 ubz++; 450 bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); 451 if (bucket) { 452 #ifdef INVARIANTS 453 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); 454 #endif 455 bucket->ub_cnt = 0; 456 bucket->ub_entries = ubz->ubz_entries; 457 } 458 459 return (bucket); 460 } 461 462 static void 463 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) 464 { 465 struct uma_bucket_zone *ubz; 466 467 KASSERT(bucket->ub_cnt == 0, 468 ("bucket_free: Freeing a non free bucket.")); 469 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 470 udata = (void *)(uintptr_t)zone->uz_flags; 471 ubz = bucket_zone_lookup(bucket->ub_entries); 472 uma_zfree_arg(ubz->ubz_zone, bucket, udata); 473 } 474 475 static void 476 bucket_zone_drain(void) 477 { 478 struct uma_bucket_zone *ubz; 479 480 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) 481 uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN); 482 } 483 484 /* 485 * Attempt to satisfy an allocation by retrieving a full bucket from one of the 486 * zone's caches. 487 */ 488 static uma_bucket_t 489 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom) 490 { 491 uma_bucket_t bucket; 492 493 ZONE_LOCK_ASSERT(zone); 494 495 if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) { 496 MPASS(zdom->uzd_nitems >= bucket->ub_cnt); 497 TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link); 498 zdom->uzd_nitems -= bucket->ub_cnt; 499 if (zdom->uzd_imin > zdom->uzd_nitems) 500 zdom->uzd_imin = zdom->uzd_nitems; 501 zone->uz_bkt_count -= bucket->ub_cnt; 502 } 503 return (bucket); 504 } 505 506 /* 507 * Insert a full bucket into the specified cache. The "ws" parameter indicates 508 * whether the bucket's contents should be counted as part of the zone's working 509 * set. 
510 */ 511 static void 512 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket, 513 const bool ws) 514 { 515 516 ZONE_LOCK_ASSERT(zone); 517 KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max, 518 ("%s: zone %p overflow", __func__, zone)); 519 520 if (ws) 521 TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); 522 else 523 TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link); 524 zdom->uzd_nitems += bucket->ub_cnt; 525 if (ws && zdom->uzd_imax < zdom->uzd_nitems) 526 zdom->uzd_imax = zdom->uzd_nitems; 527 zone->uz_bkt_count += bucket->ub_cnt; 528 } 529 530 static void 531 zone_log_warning(uma_zone_t zone) 532 { 533 static const struct timeval warninterval = { 300, 0 }; 534 535 if (!zone_warnings || zone->uz_warning == NULL) 536 return; 537 538 if (ratecheck(&zone->uz_ratecheck, &warninterval)) 539 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); 540 } 541 542 static inline void 543 zone_maxaction(uma_zone_t zone) 544 { 545 546 if (zone->uz_maxaction.ta_func != NULL) 547 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); 548 } 549 550 /* 551 * Routine called by timeout which is used to fire off some time interval 552 * based calculations. (stats, hash size, etc.) 553 * 554 * Arguments: 555 * arg Unused 556 * 557 * Returns: 558 * Nothing 559 */ 560 static void 561 uma_timeout(void *unused) 562 { 563 bucket_enable(); 564 zone_foreach(zone_timeout, NULL); 565 566 /* Reschedule this event */ 567 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 568 } 569 570 /* 571 * Update the working set size estimate for the zone's bucket cache. 572 * The constants chosen here are somewhat arbitrary. With an update period of 573 * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the 574 * last 100s. 575 */ 576 static void 577 zone_domain_update_wss(uma_zone_domain_t zdom) 578 { 579 long wss; 580 581 MPASS(zdom->uzd_imax >= zdom->uzd_imin); 582 wss = zdom->uzd_imax - zdom->uzd_imin; 583 zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems; 584 zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5; 585 } 586 587 /* 588 * Routine to perform timeout driven calculations. This expands the 589 * hashes and does per cpu statistics aggregation. 590 * 591 * Returns nothing. 592 */ 593 static void 594 zone_timeout(uma_zone_t zone, void *unused) 595 { 596 uma_keg_t keg; 597 u_int slabs; 598 599 if ((zone->uz_flags & UMA_ZONE_HASH) == 0) 600 goto update_wss; 601 602 keg = zone->uz_keg; 603 KEG_LOCK(keg); 604 /* 605 * Expand the keg hash table. 606 * 607 * This is done if the number of slabs is larger than the hash size. 608 * What I'm trying to do here is completely reduce collisions. This 609 * may be a little aggressive. Should I allow for two collisions max? 610 */ 611 if (keg->uk_flags & UMA_ZONE_HASH && 612 (slabs = keg->uk_pages / keg->uk_ppera) > 613 keg->uk_hash.uh_hashsize) { 614 struct uma_hash newhash; 615 struct uma_hash oldhash; 616 int ret; 617 618 /* 619 * This is so involved because allocating and freeing 620 * while the keg lock is held will lead to deadlock. 621 * I have to do everything in stages and check for 622 * races. 
623 */ 624 KEG_UNLOCK(keg); 625 ret = hash_alloc(&newhash, 1 << fls(slabs)); 626 KEG_LOCK(keg); 627 if (ret) { 628 if (hash_expand(&keg->uk_hash, &newhash)) { 629 oldhash = keg->uk_hash; 630 keg->uk_hash = newhash; 631 } else 632 oldhash = newhash; 633 634 KEG_UNLOCK(keg); 635 hash_free(&oldhash); 636 return; 637 } 638 } 639 KEG_UNLOCK(keg); 640 641 update_wss: 642 ZONE_LOCK(zone); 643 for (int i = 0; i < vm_ndomains; i++) 644 zone_domain_update_wss(&zone->uz_domain[i]); 645 ZONE_UNLOCK(zone); 646 } 647 648 /* 649 * Allocate and zero fill the next sized hash table from the appropriate 650 * backing store. 651 * 652 * Arguments: 653 * hash A new hash structure with the old hash size in uh_hashsize 654 * 655 * Returns: 656 * 1 on success and 0 on failure. 657 */ 658 static int 659 hash_alloc(struct uma_hash *hash, u_int size) 660 { 661 size_t alloc; 662 663 KASSERT(powerof2(size), ("hash size must be power of 2")); 664 if (size > UMA_HASH_SIZE_INIT) { 665 hash->uh_hashsize = size; 666 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; 667 hash->uh_slab_hash = malloc(alloc, M_UMAHASH, M_NOWAIT); 668 } else { 669 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; 670 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, 671 UMA_ANYDOMAIN, M_WAITOK); 672 hash->uh_hashsize = UMA_HASH_SIZE_INIT; 673 } 674 if (hash->uh_slab_hash) { 675 bzero(hash->uh_slab_hash, alloc); 676 hash->uh_hashmask = hash->uh_hashsize - 1; 677 return (1); 678 } 679 680 return (0); 681 } 682 683 /* 684 * Expands the hash table for HASH zones. This is done from zone_timeout 685 * to reduce collisions. This must not be done in the regular allocation 686 * path, otherwise, we can recurse on the vm while allocating pages. 687 * 688 * Arguments: 689 * oldhash The hash you want to expand 690 * newhash The hash structure for the new table 691 * 692 * Returns: 693 * Nothing 694 * 695 * Discussion: 696 */ 697 static int 698 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) 699 { 700 uma_hash_slab_t slab; 701 u_int hval; 702 u_int idx; 703 704 if (!newhash->uh_slab_hash) 705 return (0); 706 707 if (oldhash->uh_hashsize >= newhash->uh_hashsize) 708 return (0); 709 710 /* 711 * I need to investigate hash algorithms for resizing without a 712 * full rehash. 713 */ 714 715 for (idx = 0; idx < oldhash->uh_hashsize; idx++) 716 while (!LIST_EMPTY(&oldhash->uh_slab_hash[idx])) { 717 slab = LIST_FIRST(&oldhash->uh_slab_hash[idx]); 718 LIST_REMOVE(slab, uhs_hlink); 719 hval = UMA_HASH(newhash, slab->uhs_data); 720 LIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], 721 slab, uhs_hlink); 722 } 723 724 return (1); 725 } 726 727 /* 728 * Free the hash bucket to the appropriate backing store. 729 * 730 * Arguments: 731 * slab_hash The hash bucket we're freeing 732 * hashsize The number of entries in that hash bucket 733 * 734 * Returns: 735 * Nothing 736 */ 737 static void 738 hash_free(struct uma_hash *hash) 739 { 740 if (hash->uh_slab_hash == NULL) 741 return; 742 if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) 743 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); 744 else 745 free(hash->uh_slab_hash, M_UMAHASH); 746 } 747 748 /* 749 * Frees all outstanding items in a bucket 750 * 751 * Arguments: 752 * zone The zone to free to, must be unlocked. 753 * bucket The free/alloc bucket with items, cpu queue must be locked. 
754 * 755 * Returns: 756 * Nothing 757 */ 758 759 static void 760 bucket_drain(uma_zone_t zone, uma_bucket_t bucket) 761 { 762 int i; 763 764 if (bucket == NULL) 765 return; 766 767 if (zone->uz_fini) 768 for (i = 0; i < bucket->ub_cnt; i++) 769 zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); 770 zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); 771 if (zone->uz_max_items > 0) { 772 ZONE_LOCK(zone); 773 zone->uz_items -= bucket->ub_cnt; 774 if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items) 775 wakeup_one(zone); 776 ZONE_UNLOCK(zone); 777 } 778 bucket->ub_cnt = 0; 779 } 780 781 /* 782 * Drains the per cpu caches for a zone. 783 * 784 * NOTE: This may only be called while the zone is being turn down, and not 785 * during normal operation. This is necessary in order that we do not have 786 * to migrate CPUs to drain the per-CPU caches. 787 * 788 * Arguments: 789 * zone The zone to drain, must be unlocked. 790 * 791 * Returns: 792 * Nothing 793 */ 794 static void 795 cache_drain(uma_zone_t zone) 796 { 797 uma_cache_t cache; 798 int cpu; 799 800 /* 801 * XXX: It is safe to not lock the per-CPU caches, because we're 802 * tearing down the zone anyway. I.e., there will be no further use 803 * of the caches at this point. 804 * 805 * XXX: It would good to be able to assert that the zone is being 806 * torn down to prevent improper use of cache_drain(). 807 * 808 * XXX: We lock the zone before passing into bucket_cache_reclaim() as 809 * it is used elsewhere. Should the tear-down path be made special 810 * there in some form? 811 */ 812 CPU_FOREACH(cpu) { 813 cache = &zone->uz_cpu[cpu]; 814 bucket_drain(zone, cache->uc_allocbucket); 815 if (cache->uc_allocbucket != NULL) 816 bucket_free(zone, cache->uc_allocbucket, NULL); 817 cache->uc_allocbucket = NULL; 818 bucket_drain(zone, cache->uc_freebucket); 819 if (cache->uc_freebucket != NULL) 820 bucket_free(zone, cache->uc_freebucket, NULL); 821 cache->uc_freebucket = NULL; 822 bucket_drain(zone, cache->uc_crossbucket); 823 if (cache->uc_crossbucket != NULL) 824 bucket_free(zone, cache->uc_crossbucket, NULL); 825 cache->uc_crossbucket = NULL; 826 } 827 ZONE_LOCK(zone); 828 bucket_cache_reclaim(zone, true); 829 ZONE_UNLOCK(zone); 830 } 831 832 static void 833 cache_shrink(uma_zone_t zone, void *unused) 834 { 835 836 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 837 return; 838 839 ZONE_LOCK(zone); 840 zone->uz_bucket_size = 841 (zone->uz_bucket_size_min + zone->uz_bucket_size) / 2; 842 ZONE_UNLOCK(zone); 843 } 844 845 static void 846 cache_drain_safe_cpu(uma_zone_t zone, void *unused) 847 { 848 uma_cache_t cache; 849 uma_bucket_t b1, b2, b3; 850 int domain; 851 852 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 853 return; 854 855 b1 = b2 = b3 = NULL; 856 ZONE_LOCK(zone); 857 critical_enter(); 858 if (zone->uz_flags & UMA_ZONE_NUMA) 859 domain = PCPU_GET(domain); 860 else 861 domain = 0; 862 cache = &zone->uz_cpu[curcpu]; 863 if (cache->uc_allocbucket) { 864 if (cache->uc_allocbucket->ub_cnt != 0) 865 zone_put_bucket(zone, &zone->uz_domain[domain], 866 cache->uc_allocbucket, false); 867 else 868 b1 = cache->uc_allocbucket; 869 cache->uc_allocbucket = NULL; 870 } 871 if (cache->uc_freebucket) { 872 if (cache->uc_freebucket->ub_cnt != 0) 873 zone_put_bucket(zone, &zone->uz_domain[domain], 874 cache->uc_freebucket, false); 875 else 876 b2 = cache->uc_freebucket; 877 cache->uc_freebucket = NULL; 878 } 879 b3 = cache->uc_crossbucket; 880 cache->uc_crossbucket = NULL; 881 critical_exit(); 882 ZONE_UNLOCK(zone); 883 if (b1) 884 
bucket_free(zone, b1, NULL); 885 if (b2) 886 bucket_free(zone, b2, NULL); 887 if (b3) { 888 bucket_drain(zone, b3); 889 bucket_free(zone, b3, NULL); 890 } 891 } 892 893 /* 894 * Safely drain per-CPU caches of a zone(s) to alloc bucket. 895 * This is an expensive call because it needs to bind to all CPUs 896 * one by one and enter a critical section on each of them in order 897 * to safely access their cache buckets. 898 * Zone lock must not be held on call this function. 899 */ 900 static void 901 pcpu_cache_drain_safe(uma_zone_t zone) 902 { 903 int cpu; 904 905 /* 906 * Polite bucket sizes shrinking was not enouth, shrink aggressively. 907 */ 908 if (zone) 909 cache_shrink(zone, NULL); 910 else 911 zone_foreach(cache_shrink, NULL); 912 913 CPU_FOREACH(cpu) { 914 thread_lock(curthread); 915 sched_bind(curthread, cpu); 916 thread_unlock(curthread); 917 918 if (zone) 919 cache_drain_safe_cpu(zone, NULL); 920 else 921 zone_foreach(cache_drain_safe_cpu, NULL); 922 } 923 thread_lock(curthread); 924 sched_unbind(curthread); 925 thread_unlock(curthread); 926 } 927 928 /* 929 * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller 930 * requested a drain, otherwise the per-domain caches are trimmed to either 931 * estimated working set size. 932 */ 933 static void 934 bucket_cache_reclaim(uma_zone_t zone, bool drain) 935 { 936 uma_zone_domain_t zdom; 937 uma_bucket_t bucket; 938 long target, tofree; 939 int i; 940 941 for (i = 0; i < vm_ndomains; i++) { 942 zdom = &zone->uz_domain[i]; 943 944 /* 945 * If we were asked to drain the zone, we are done only once 946 * this bucket cache is empty. Otherwise, we reclaim items in 947 * excess of the zone's estimated working set size. If the 948 * difference nitems - imin is larger than the WSS estimate, 949 * then the estimate will grow at the end of this interval and 950 * we ignore the historical average. 951 */ 952 target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems - 953 zdom->uzd_imin); 954 while (zdom->uzd_nitems > target) { 955 bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist); 956 if (bucket == NULL) 957 break; 958 tofree = bucket->ub_cnt; 959 TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link); 960 zdom->uzd_nitems -= tofree; 961 962 /* 963 * Shift the bounds of the current WSS interval to avoid 964 * perturbing the estimate. 965 */ 966 zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree); 967 zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree); 968 969 ZONE_UNLOCK(zone); 970 bucket_drain(zone, bucket); 971 bucket_free(zone, bucket, NULL); 972 ZONE_LOCK(zone); 973 } 974 } 975 976 /* 977 * Shrink the zone bucket size to ensure that the per-CPU caches 978 * don't grow too large. 979 */ 980 if (zone->uz_bucket_size > zone->uz_bucket_size_min) 981 zone->uz_bucket_size--; 982 } 983 984 static void 985 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) 986 { 987 uint8_t *mem; 988 int i; 989 uint8_t flags; 990 991 CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", 992 keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); 993 994 mem = slab_data(slab, keg); 995 flags = slab->us_flags; 996 i = start; 997 if (keg->uk_fini != NULL) { 998 for (i--; i > -1; i--) 999 #ifdef INVARIANTS 1000 /* 1001 * trash_fini implies that dtor was trash_dtor. trash_fini 1002 * would check that memory hasn't been modified since free, 1003 * which executed trash_dtor. 1004 * That's why we need to run uma_dbg_kskip() check here, 1005 * albeit we don't make skip check for other init/fini 1006 * invocations. 
1007 */ 1008 if (!uma_dbg_kskip(keg, slab_item(slab, keg, i)) || 1009 keg->uk_fini != trash_fini) 1010 #endif 1011 keg->uk_fini(slab_item(slab, keg, i), keg->uk_size); 1012 } 1013 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1014 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); 1015 keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); 1016 uma_total_dec(PAGE_SIZE * keg->uk_ppera); 1017 } 1018 1019 /* 1020 * Frees pages from a keg back to the system. This is done on demand from 1021 * the pageout daemon. 1022 * 1023 * Returns nothing. 1024 */ 1025 static void 1026 keg_drain(uma_keg_t keg) 1027 { 1028 struct slabhead freeslabs = { 0 }; 1029 uma_domain_t dom; 1030 uma_slab_t slab, tmp; 1031 int i; 1032 1033 /* 1034 * We don't want to take pages from statically allocated kegs at this 1035 * time 1036 */ 1037 if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) 1038 return; 1039 1040 CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u", 1041 keg->uk_name, keg, keg->uk_free); 1042 KEG_LOCK(keg); 1043 if (keg->uk_free == 0) 1044 goto finished; 1045 1046 for (i = 0; i < vm_ndomains; i++) { 1047 dom = &keg->uk_domain[i]; 1048 LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { 1049 /* We have nowhere to free these to. */ 1050 if (slab->us_flags & UMA_SLAB_BOOT) 1051 continue; 1052 1053 LIST_REMOVE(slab, us_link); 1054 keg->uk_pages -= keg->uk_ppera; 1055 keg->uk_free -= keg->uk_ipers; 1056 1057 if (keg->uk_flags & UMA_ZONE_HASH) 1058 UMA_HASH_REMOVE(&keg->uk_hash, slab); 1059 1060 LIST_INSERT_HEAD(&freeslabs, slab, us_link); 1061 } 1062 } 1063 1064 finished: 1065 KEG_UNLOCK(keg); 1066 1067 while ((slab = LIST_FIRST(&freeslabs)) != NULL) { 1068 LIST_REMOVE(slab, us_link); 1069 keg_free_slab(keg, slab, keg->uk_ipers); 1070 } 1071 } 1072 1073 static void 1074 zone_reclaim(uma_zone_t zone, int waitok, bool drain) 1075 { 1076 1077 /* 1078 * Set draining to interlock with zone_dtor() so we can release our 1079 * locks as we go. Only dtor() should do a WAITOK call since it 1080 * is the only call that knows the structure will still be available 1081 * when it wakes up. 1082 */ 1083 ZONE_LOCK(zone); 1084 while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) { 1085 if (waitok == M_NOWAIT) 1086 goto out; 1087 msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1); 1088 } 1089 zone->uz_flags |= UMA_ZFLAG_RECLAIMING; 1090 bucket_cache_reclaim(zone, drain); 1091 ZONE_UNLOCK(zone); 1092 1093 /* 1094 * The DRAINING flag protects us from being freed while 1095 * we're running. Normally the uma_rwlock would protect us but we 1096 * must be able to release and acquire the right lock for each keg. 1097 */ 1098 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) 1099 keg_drain(zone->uz_keg); 1100 ZONE_LOCK(zone); 1101 zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING; 1102 wakeup(zone); 1103 out: 1104 ZONE_UNLOCK(zone); 1105 } 1106 1107 static void 1108 zone_drain(uma_zone_t zone, void *unused) 1109 { 1110 1111 zone_reclaim(zone, M_NOWAIT, true); 1112 } 1113 1114 static void 1115 zone_trim(uma_zone_t zone, void *unused) 1116 { 1117 1118 zone_reclaim(zone, M_NOWAIT, false); 1119 } 1120 1121 /* 1122 * Allocate a new slab for a keg. This does not insert the slab onto a list. 1123 * If the allocation was successful, the keg lock will be held upon return, 1124 * otherwise the keg will be left unlocked. 
1125 * 1126 * Arguments: 1127 * flags Wait flags for the item initialization routine 1128 * aflags Wait flags for the slab allocation 1129 * 1130 * Returns: 1131 * The slab that was allocated or NULL if there is no memory and the 1132 * caller specified M_NOWAIT. 1133 */ 1134 static uma_slab_t 1135 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, 1136 int aflags) 1137 { 1138 uma_alloc allocf; 1139 uma_slab_t slab; 1140 unsigned long size; 1141 uint8_t *mem; 1142 uint8_t sflags; 1143 int i; 1144 1145 KASSERT(domain >= 0 && domain < vm_ndomains, 1146 ("keg_alloc_slab: domain %d out of range", domain)); 1147 KEG_LOCK_ASSERT(keg); 1148 MPASS(zone->uz_lockptr == &keg->uk_lock); 1149 1150 allocf = keg->uk_allocf; 1151 KEG_UNLOCK(keg); 1152 1153 slab = NULL; 1154 mem = NULL; 1155 if (keg->uk_flags & UMA_ZONE_OFFPAGE) { 1156 slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags); 1157 if (slab == NULL) 1158 goto out; 1159 } 1160 1161 /* 1162 * This reproduces the old vm_zone behavior of zero filling pages the 1163 * first time they are added to a zone. 1164 * 1165 * Malloced items are zeroed in uma_zalloc. 1166 */ 1167 1168 if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) 1169 aflags |= M_ZERO; 1170 else 1171 aflags &= ~M_ZERO; 1172 1173 if (keg->uk_flags & UMA_ZONE_NODUMP) 1174 aflags |= M_NODUMP; 1175 1176 /* zone is passed for legacy reasons. */ 1177 size = keg->uk_ppera * PAGE_SIZE; 1178 mem = allocf(zone, size, domain, &sflags, aflags); 1179 if (mem == NULL) { 1180 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1181 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); 1182 slab = NULL; 1183 goto out; 1184 } 1185 uma_total_inc(size); 1186 1187 /* Point the slab into the allocated memory */ 1188 if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) 1189 slab = (uma_slab_t )(mem + keg->uk_pgoff); 1190 else 1191 ((uma_hash_slab_t)slab)->uhs_data = mem; 1192 1193 if (keg->uk_flags & UMA_ZONE_VTOSLAB) 1194 for (i = 0; i < keg->uk_ppera; i++) 1195 vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), 1196 zone, slab); 1197 1198 slab->us_freecount = keg->uk_ipers; 1199 slab->us_flags = sflags; 1200 slab->us_domain = domain; 1201 BIT_FILL(keg->uk_ipers, &slab->us_free); 1202 #ifdef INVARIANTS 1203 BIT_ZERO(SLAB_MAX_SETSIZE, &slab->us_debugfree); 1204 #endif 1205 1206 if (keg->uk_init != NULL) { 1207 for (i = 0; i < keg->uk_ipers; i++) 1208 if (keg->uk_init(slab_item(slab, keg, i), 1209 keg->uk_size, flags) != 0) 1210 break; 1211 if (i != keg->uk_ipers) { 1212 keg_free_slab(keg, slab, i); 1213 slab = NULL; 1214 goto out; 1215 } 1216 } 1217 KEG_LOCK(keg); 1218 1219 CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)", 1220 slab, keg->uk_name, keg); 1221 1222 if (keg->uk_flags & UMA_ZONE_HASH) 1223 UMA_HASH_INSERT(&keg->uk_hash, slab, mem); 1224 1225 keg->uk_pages += keg->uk_ppera; 1226 keg->uk_free += keg->uk_ipers; 1227 1228 out: 1229 return (slab); 1230 } 1231 1232 /* 1233 * This function is intended to be used early on in place of page_alloc() so 1234 * that we may use the boot time page cache to satisfy allocations before 1235 * the VM is ready. 1236 */ 1237 static void * 1238 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1239 int wait) 1240 { 1241 uma_keg_t keg; 1242 void *mem; 1243 int pages; 1244 1245 keg = zone->uz_keg; 1246 /* 1247 * If we are in BOOT_BUCKETS or higher, than switch to real 1248 * allocator. Zones with page sized slabs switch at BOOT_PAGEALLOC. 
1249 */ 1250 switch (booted) { 1251 case BOOT_COLD: 1252 case BOOT_STRAPPED: 1253 break; 1254 case BOOT_PAGEALLOC: 1255 if (keg->uk_ppera > 1) 1256 break; 1257 case BOOT_BUCKETS: 1258 case BOOT_RUNNING: 1259 #ifdef UMA_MD_SMALL_ALLOC 1260 keg->uk_allocf = (keg->uk_ppera > 1) ? 1261 page_alloc : uma_small_alloc; 1262 #else 1263 keg->uk_allocf = page_alloc; 1264 #endif 1265 return keg->uk_allocf(zone, bytes, domain, pflag, wait); 1266 } 1267 1268 /* 1269 * Check our small startup cache to see if it has pages remaining. 1270 */ 1271 pages = howmany(bytes, PAGE_SIZE); 1272 KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__)); 1273 if (pages > boot_pages) 1274 panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name); 1275 #ifdef DIAGNOSTIC 1276 printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name, 1277 boot_pages); 1278 #endif 1279 mem = bootmem; 1280 boot_pages -= pages; 1281 bootmem += pages * PAGE_SIZE; 1282 *pflag = UMA_SLAB_BOOT; 1283 1284 return (mem); 1285 } 1286 1287 /* 1288 * Allocates a number of pages from the system 1289 * 1290 * Arguments: 1291 * bytes The number of bytes requested 1292 * wait Shall we wait? 1293 * 1294 * Returns: 1295 * A pointer to the alloced memory or possibly 1296 * NULL if M_NOWAIT is set. 1297 */ 1298 static void * 1299 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1300 int wait) 1301 { 1302 void *p; /* Returned page */ 1303 1304 *pflag = UMA_SLAB_KERNEL; 1305 p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); 1306 1307 return (p); 1308 } 1309 1310 static void * 1311 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1312 int wait) 1313 { 1314 struct pglist alloctail; 1315 vm_offset_t addr, zkva; 1316 int cpu, flags; 1317 vm_page_t p, p_next; 1318 #ifdef NUMA 1319 struct pcpu *pc; 1320 #endif 1321 1322 MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); 1323 1324 TAILQ_INIT(&alloctail); 1325 flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1326 malloc2vm_flags(wait); 1327 *pflag = UMA_SLAB_KERNEL; 1328 for (cpu = 0; cpu <= mp_maxid; cpu++) { 1329 if (CPU_ABSENT(cpu)) { 1330 p = vm_page_alloc(NULL, 0, flags); 1331 } else { 1332 #ifndef NUMA 1333 p = vm_page_alloc(NULL, 0, flags); 1334 #else 1335 pc = pcpu_find(cpu); 1336 p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags); 1337 if (__predict_false(p == NULL)) 1338 p = vm_page_alloc(NULL, 0, flags); 1339 #endif 1340 } 1341 if (__predict_false(p == NULL)) 1342 goto fail; 1343 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1344 } 1345 if ((addr = kva_alloc(bytes)) == 0) 1346 goto fail; 1347 zkva = addr; 1348 TAILQ_FOREACH(p, &alloctail, listq) { 1349 pmap_qenter(zkva, &p, 1); 1350 zkva += PAGE_SIZE; 1351 } 1352 return ((void*)addr); 1353 fail: 1354 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1355 vm_page_unwire_noq(p); 1356 vm_page_free(p); 1357 } 1358 return (NULL); 1359 } 1360 1361 /* 1362 * Allocates a number of pages from within an object 1363 * 1364 * Arguments: 1365 * bytes The number of bytes requested 1366 * wait Shall we wait? 1367 * 1368 * Returns: 1369 * A pointer to the alloced memory or possibly 1370 * NULL if M_NOWAIT is set. 
1371 */ 1372 static void * 1373 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 1374 int wait) 1375 { 1376 TAILQ_HEAD(, vm_page) alloctail; 1377 u_long npages; 1378 vm_offset_t retkva, zkva; 1379 vm_page_t p, p_next; 1380 uma_keg_t keg; 1381 1382 TAILQ_INIT(&alloctail); 1383 keg = zone->uz_keg; 1384 1385 npages = howmany(bytes, PAGE_SIZE); 1386 while (npages > 0) { 1387 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | 1388 VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1389 ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : 1390 VM_ALLOC_NOWAIT)); 1391 if (p != NULL) { 1392 /* 1393 * Since the page does not belong to an object, its 1394 * listq is unused. 1395 */ 1396 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1397 npages--; 1398 continue; 1399 } 1400 /* 1401 * Page allocation failed, free intermediate pages and 1402 * exit. 1403 */ 1404 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1405 vm_page_unwire_noq(p); 1406 vm_page_free(p); 1407 } 1408 return (NULL); 1409 } 1410 *flags = UMA_SLAB_PRIV; 1411 zkva = keg->uk_kva + 1412 atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); 1413 retkva = zkva; 1414 TAILQ_FOREACH(p, &alloctail, listq) { 1415 pmap_qenter(zkva, &p, 1); 1416 zkva += PAGE_SIZE; 1417 } 1418 1419 return ((void *)retkva); 1420 } 1421 1422 /* 1423 * Frees a number of pages to the system 1424 * 1425 * Arguments: 1426 * mem A pointer to the memory to be freed 1427 * size The size of the memory being freed 1428 * flags The original p->us_flags field 1429 * 1430 * Returns: 1431 * Nothing 1432 */ 1433 static void 1434 page_free(void *mem, vm_size_t size, uint8_t flags) 1435 { 1436 1437 if ((flags & UMA_SLAB_KERNEL) == 0) 1438 panic("UMA: page_free used with invalid flags %x", flags); 1439 1440 kmem_free((vm_offset_t)mem, size); 1441 } 1442 1443 /* 1444 * Frees pcpu zone allocations 1445 * 1446 * Arguments: 1447 * mem A pointer to the memory to be freed 1448 * size The size of the memory being freed 1449 * flags The original p->us_flags field 1450 * 1451 * Returns: 1452 * Nothing 1453 */ 1454 static void 1455 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) 1456 { 1457 vm_offset_t sva, curva; 1458 vm_paddr_t paddr; 1459 vm_page_t m; 1460 1461 MPASS(size == (mp_maxid+1)*PAGE_SIZE); 1462 sva = (vm_offset_t)mem; 1463 for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { 1464 paddr = pmap_kextract(curva); 1465 m = PHYS_TO_VM_PAGE(paddr); 1466 vm_page_unwire_noq(m); 1467 vm_page_free(m); 1468 } 1469 pmap_qremove(sva, size >> PAGE_SHIFT); 1470 kva_free(sva, size); 1471 } 1472 1473 1474 /* 1475 * Zero fill initializer 1476 * 1477 * Arguments/Returns follow uma_init specifications 1478 */ 1479 static int 1480 zero_init(void *mem, int size, int flags) 1481 { 1482 bzero(mem, size); 1483 return (0); 1484 } 1485 1486 /* 1487 * Actual size of embedded struct slab (!OFFPAGE). 1488 */ 1489 size_t 1490 slab_sizeof(int nitems) 1491 { 1492 size_t s; 1493 1494 s = sizeof(struct uma_slab) + BITSET_SIZE(nitems); 1495 return (roundup(s, UMA_ALIGN_PTR + 1)); 1496 } 1497 1498 /* 1499 * Size of memory for embedded slabs (!OFFPAGE). 1500 */ 1501 size_t 1502 slab_space(int nitems) 1503 { 1504 return (UMA_SLAB_SIZE - slab_sizeof(nitems)); 1505 } 1506 1507 /* 1508 * Compute the number of items that will fit in an embedded (!OFFPAGE) slab 1509 * with a given size and alignment. 
1510 */ 1511 int 1512 slab_ipers(size_t size, int align) 1513 { 1514 int rsize; 1515 int nitems; 1516 1517 /* 1518 * Compute the ideal number of items that will fit in a page and 1519 * then compute the actual number based on a bitset nitems wide. 1520 */ 1521 rsize = roundup(size, align + 1); 1522 nitems = UMA_SLAB_SIZE / rsize; 1523 return (slab_space(nitems) / rsize); 1524 } 1525 1526 /* 1527 * Finish creating a small uma keg. This calculates ipers, and the keg size. 1528 * 1529 * Arguments 1530 * keg The zone we should initialize 1531 * 1532 * Returns 1533 * Nothing 1534 */ 1535 static void 1536 keg_small_init(uma_keg_t keg) 1537 { 1538 u_int rsize; 1539 u_int memused; 1540 u_int wastedspace; 1541 u_int shsize; 1542 u_int slabsize; 1543 1544 if (keg->uk_flags & UMA_ZONE_PCPU) { 1545 u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU; 1546 1547 slabsize = UMA_PCPU_ALLOC_SIZE; 1548 keg->uk_ppera = ncpus; 1549 } else { 1550 slabsize = UMA_SLAB_SIZE; 1551 keg->uk_ppera = 1; 1552 } 1553 1554 /* 1555 * Calculate the size of each allocation (rsize) according to 1556 * alignment. If the requested size is smaller than we have 1557 * allocation bits for we round it up. 1558 */ 1559 rsize = keg->uk_size; 1560 if (rsize < slabsize / SLAB_MAX_SETSIZE) 1561 rsize = slabsize / SLAB_MAX_SETSIZE; 1562 if (rsize & keg->uk_align) 1563 rsize = roundup(rsize, keg->uk_align + 1); 1564 keg->uk_rsize = rsize; 1565 1566 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || 1567 keg->uk_rsize < UMA_PCPU_ALLOC_SIZE, 1568 ("%s: size %u too large", __func__, keg->uk_rsize)); 1569 1570 /* 1571 * Use a pessimistic bit count for shsize. It may be possible to 1572 * squeeze one more item in for very particular sizes if we were 1573 * to loop and reduce the bitsize if there is waste. 1574 */ 1575 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1576 shsize = 0; 1577 else 1578 shsize = slab_sizeof(slabsize / rsize); 1579 1580 if (rsize <= slabsize - shsize) 1581 keg->uk_ipers = (slabsize - shsize) / rsize; 1582 else { 1583 /* Handle special case when we have 1 item per slab, so 1584 * alignment requirement can be relaxed. */ 1585 KASSERT(keg->uk_size <= slabsize - shsize, 1586 ("%s: size %u greater than slab", __func__, keg->uk_size)); 1587 keg->uk_ipers = 1; 1588 } 1589 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, 1590 ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); 1591 1592 memused = keg->uk_ipers * rsize + shsize; 1593 wastedspace = slabsize - memused; 1594 1595 /* 1596 * We can't do OFFPAGE if we're internal or if we've been 1597 * asked to not go to the VM for buckets. If we do this we 1598 * may end up going to the VM for slabs which we do not 1599 * want to do if we're UMA_ZFLAG_CACHEONLY as a result 1600 * of UMA_ZONE_VM, which clearly forbids it. 1601 */ 1602 if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || 1603 (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) 1604 return; 1605 1606 /* 1607 * See if using an OFFPAGE slab will limit our waste. Only do 1608 * this if it permits more items per-slab. 1609 * 1610 * XXX We could try growing slabsize to limit max waste as well. 1611 * Historically this was not done because the VM could not 1612 * efficiently handle contiguous allocations. 
1613 */ 1614 if ((wastedspace >= slabsize / UMA_MAX_WASTE) && 1615 (keg->uk_ipers < (slabsize / keg->uk_rsize))) { 1616 keg->uk_ipers = slabsize / keg->uk_rsize; 1617 KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_MAX_SETSIZE, 1618 ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers)); 1619 CTR6(KTR_UMA, "UMA decided we need offpage slab headers for " 1620 "keg: %s(%p), calculated wastedspace = %d, " 1621 "maximum wasted space allowed = %d, " 1622 "calculated ipers = %d, " 1623 "new wasted space = %d\n", keg->uk_name, keg, wastedspace, 1624 slabsize / UMA_MAX_WASTE, keg->uk_ipers, 1625 slabsize - keg->uk_ipers * keg->uk_rsize); 1626 /* 1627 * If we had access to memory to embed a slab header we 1628 * also have a page structure to use vtoslab() instead of 1629 * hash to find slabs. If the zone was explicitly created 1630 * OFFPAGE we can't necessarily touch the memory. 1631 */ 1632 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) 1633 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1634 } 1635 1636 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && 1637 (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) 1638 keg->uk_flags |= UMA_ZONE_HASH; 1639 } 1640 1641 /* 1642 * Finish creating a large (> UMA_SLAB_SIZE) uma kegs. Just give in and do 1643 * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be 1644 * more complicated. 1645 * 1646 * Arguments 1647 * keg The keg we should initialize 1648 * 1649 * Returns 1650 * Nothing 1651 */ 1652 static void 1653 keg_large_init(uma_keg_t keg) 1654 { 1655 1656 KASSERT(keg != NULL, ("Keg is null in keg_large_init")); 1657 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, 1658 ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__)); 1659 1660 keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE); 1661 keg->uk_ipers = 1; 1662 keg->uk_rsize = keg->uk_size; 1663 1664 /* Check whether we have enough space to not do OFFPAGE. */ 1665 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 && 1666 PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < 1667 slab_sizeof(SLAB_MIN_SETSIZE)) { 1668 /* 1669 * We can't do OFFPAGE if we're internal, in which case 1670 * we need an extra page per allocation to contain the 1671 * slab header. 1672 */ 1673 if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0) 1674 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1675 else 1676 keg->uk_ppera++; 1677 } 1678 1679 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && 1680 (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) 1681 keg->uk_flags |= UMA_ZONE_HASH; 1682 } 1683 1684 static void 1685 keg_cachespread_init(uma_keg_t keg) 1686 { 1687 int alignsize; 1688 int trailer; 1689 int pages; 1690 int rsize; 1691 1692 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, 1693 ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__)); 1694 1695 alignsize = keg->uk_align + 1; 1696 rsize = keg->uk_size; 1697 /* 1698 * We want one item to start on every align boundary in a page. To 1699 * do this we will span pages. We will also extend the item by the 1700 * size of align if it is an even multiple of align. Otherwise, it 1701 * would fall on the same boundary every time. 
1702 */ 1703 if (rsize & keg->uk_align) 1704 rsize = (rsize & ~keg->uk_align) + alignsize; 1705 if ((rsize & alignsize) == 0) 1706 rsize += alignsize; 1707 trailer = rsize - keg->uk_size; 1708 pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE; 1709 pages = MIN(pages, (128 * 1024) / PAGE_SIZE); 1710 keg->uk_rsize = rsize; 1711 keg->uk_ppera = pages; 1712 keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize; 1713 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1714 KASSERT(keg->uk_ipers <= SLAB_MAX_SETSIZE, 1715 ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__, 1716 keg->uk_ipers)); 1717 } 1718 1719 /* 1720 * Keg header ctor. This initializes all fields, locks, etc. And inserts 1721 * the keg onto the global keg list. 1722 * 1723 * Arguments/Returns follow uma_ctor specifications 1724 * udata Actually uma_kctor_args 1725 */ 1726 static int 1727 keg_ctor(void *mem, int size, void *udata, int flags) 1728 { 1729 struct uma_kctor_args *arg = udata; 1730 uma_keg_t keg = mem; 1731 uma_zone_t zone; 1732 1733 bzero(keg, size); 1734 keg->uk_size = arg->size; 1735 keg->uk_init = arg->uminit; 1736 keg->uk_fini = arg->fini; 1737 keg->uk_align = arg->align; 1738 keg->uk_free = 0; 1739 keg->uk_reserve = 0; 1740 keg->uk_pages = 0; 1741 keg->uk_flags = arg->flags; 1742 keg->uk_slabzone = NULL; 1743 1744 /* 1745 * We use a global round-robin policy by default. Zones with 1746 * UMA_ZONE_NUMA set will use first-touch instead, in which case the 1747 * iterator is never run. 1748 */ 1749 keg->uk_dr.dr_policy = DOMAINSET_RR(); 1750 keg->uk_dr.dr_iter = 0; 1751 1752 /* 1753 * The master zone is passed to us at keg-creation time. 1754 */ 1755 zone = arg->zone; 1756 keg->uk_name = zone->uz_name; 1757 1758 if (arg->flags & UMA_ZONE_VM) 1759 keg->uk_flags |= UMA_ZFLAG_CACHEONLY; 1760 1761 if (arg->flags & UMA_ZONE_ZINIT) 1762 keg->uk_init = zero_init; 1763 1764 if (arg->flags & UMA_ZONE_MALLOC) 1765 keg->uk_flags |= UMA_ZONE_VTOSLAB; 1766 1767 if (arg->flags & UMA_ZONE_PCPU) 1768 #ifdef SMP 1769 keg->uk_flags |= UMA_ZONE_OFFPAGE; 1770 #else 1771 keg->uk_flags &= ~UMA_ZONE_PCPU; 1772 #endif 1773 1774 if (keg->uk_flags & UMA_ZONE_CACHESPREAD) { 1775 keg_cachespread_init(keg); 1776 } else { 1777 if (keg->uk_size > slab_space(SLAB_MIN_SETSIZE)) 1778 keg_large_init(keg); 1779 else 1780 keg_small_init(keg); 1781 } 1782 1783 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1784 keg->uk_slabzone = slabzone; 1785 1786 /* 1787 * If we haven't booted yet we need allocations to go through the 1788 * startup cache until the vm is ready. 1789 */ 1790 if (booted < BOOT_PAGEALLOC) 1791 keg->uk_allocf = startup_alloc; 1792 #ifdef UMA_MD_SMALL_ALLOC 1793 else if (keg->uk_ppera == 1) 1794 keg->uk_allocf = uma_small_alloc; 1795 #endif 1796 else if (keg->uk_flags & UMA_ZONE_PCPU) 1797 keg->uk_allocf = pcpu_page_alloc; 1798 else 1799 keg->uk_allocf = page_alloc; 1800 #ifdef UMA_MD_SMALL_ALLOC 1801 if (keg->uk_ppera == 1) 1802 keg->uk_freef = uma_small_free; 1803 else 1804 #endif 1805 if (keg->uk_flags & UMA_ZONE_PCPU) 1806 keg->uk_freef = pcpu_page_free; 1807 else 1808 keg->uk_freef = page_free; 1809 1810 /* 1811 * Initialize keg's lock 1812 */ 1813 KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS)); 1814 1815 /* 1816 * If we're putting the slab header in the actual page we need to 1817 * figure out where in each page it goes. See slab_sizeof 1818 * definition. 
1819 */ 1820 if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { 1821 size_t shsize; 1822 1823 shsize = slab_sizeof(keg->uk_ipers); 1824 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - shsize; 1825 /* 1826 * The only way the following is possible is if with our 1827 * UMA_ALIGN_PTR adjustments we are now bigger than 1828 * UMA_SLAB_SIZE. I haven't checked whether this is 1829 * mathematically possible for all cases, so we make 1830 * sure here anyway. 1831 */ 1832 KASSERT(keg->uk_pgoff + shsize <= PAGE_SIZE * keg->uk_ppera, 1833 ("zone %s ipers %d rsize %d size %d slab won't fit", 1834 zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); 1835 } 1836 1837 if (keg->uk_flags & UMA_ZONE_HASH) 1838 hash_alloc(&keg->uk_hash, 0); 1839 1840 CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n", 1841 keg, zone->uz_name, zone, 1842 (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, 1843 keg->uk_free); 1844 1845 LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); 1846 1847 rw_wlock(&uma_rwlock); 1848 LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); 1849 rw_wunlock(&uma_rwlock); 1850 return (0); 1851 } 1852 1853 static void 1854 zone_alloc_counters(uma_zone_t zone, void *unused) 1855 { 1856 1857 zone->uz_allocs = counter_u64_alloc(M_WAITOK); 1858 zone->uz_frees = counter_u64_alloc(M_WAITOK); 1859 zone->uz_fails = counter_u64_alloc(M_WAITOK); 1860 } 1861 1862 static void 1863 zone_alloc_sysctl(uma_zone_t zone, void *unused) 1864 { 1865 uma_zone_domain_t zdom; 1866 uma_keg_t keg; 1867 struct sysctl_oid *oid, *domainoid; 1868 int domains, i, cnt; 1869 static const char *nokeg = "cache zone"; 1870 char *c; 1871 1872 /* 1873 * Make a sysctl safe copy of the zone name by removing 1874 * any special characters and handling dups by appending 1875 * an index. 1876 */ 1877 if (zone->uz_namecnt != 0) { 1878 /* Count the number of decimal digits and '_' separator. */ 1879 for (i = 1, cnt = zone->uz_namecnt; cnt != 0; i++) 1880 cnt /= 10; 1881 zone->uz_ctlname = malloc(strlen(zone->uz_name) + i + 1, 1882 M_UMA, M_WAITOK); 1883 sprintf(zone->uz_ctlname, "%s_%d", zone->uz_name, 1884 zone->uz_namecnt); 1885 } else 1886 zone->uz_ctlname = strdup(zone->uz_name, M_UMA); 1887 for (c = zone->uz_ctlname; *c != '\0'; c++) 1888 if (strchr("./\\ -", *c) != NULL) 1889 *c = '_'; 1890 1891 /* 1892 * Basic parameters at the root. 1893 */ 1894 zone->uz_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm_uma), 1895 OID_AUTO, zone->uz_ctlname, CTLFLAG_RD, NULL, ""); 1896 oid = zone->uz_oid; 1897 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1898 "size", CTLFLAG_RD, &zone->uz_size, 0, "Allocation size"); 1899 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1900 "flags", CTLFLAG_RD, &zone->uz_flags, 0, 1901 "Allocator configuration flags"); 1902 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1903 "bucket_size", CTLFLAG_RD, &zone->uz_bucket_size, 0, 1904 "Desired per-cpu cache size"); 1905 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1906 "bucket_size_max", CTLFLAG_RD, &zone->uz_bucket_size_max, 0, 1907 "Maximum allowed per-cpu cache size"); 1908 1909 /* 1910 * keg if present. 
1911 */ 1912 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 1913 "keg", CTLFLAG_RD, NULL, ""); 1914 keg = zone->uz_keg; 1915 if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0) { 1916 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1917 "name", CTLFLAG_RD, keg->uk_name, "Keg name"); 1918 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1919 "rsize", CTLFLAG_RD, &keg->uk_rsize, 0, 1920 "Real object size with alignment"); 1921 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1922 "ppera", CTLFLAG_RD, &keg->uk_ppera, 0, 1923 "pages per-slab allocation"); 1924 SYSCTL_ADD_U16(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1925 "ipers", CTLFLAG_RD, &keg->uk_ipers, 0, 1926 "items available per-slab"); 1927 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1928 "align", CTLFLAG_RD, &keg->uk_align, 0, 1929 "item alignment mask"); 1930 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1931 "pages", CTLFLAG_RD, &keg->uk_pages, 0, 1932 "Total pages currently allocated from VM"); 1933 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1934 "free", CTLFLAG_RD, &keg->uk_free, 0, 1935 "items free in the slab layer"); 1936 } else 1937 SYSCTL_ADD_CONST_STRING(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1938 "name", CTLFLAG_RD, nokeg, "Keg name"); 1939 1940 /* 1941 * Information about zone limits. 1942 */ 1943 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 1944 "limit", CTLFLAG_RD, NULL, ""); 1945 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1946 "items", CTLFLAG_RD, &zone->uz_items, 0, 1947 "current number of cached items"); 1948 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1949 "max_items", CTLFLAG_RD, &zone->uz_max_items, 0, 1950 "Maximum number of cached items"); 1951 SYSCTL_ADD_U32(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1952 "sleepers", CTLFLAG_RD, &zone->uz_sleepers, 0, 1953 "Number of threads sleeping at limit"); 1954 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1955 "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0, 1956 "Total zone limit sleeps"); 1957 1958 /* 1959 * Per-domain information. 1960 */ 1961 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 1962 domains = vm_ndomains; 1963 else 1964 domains = 1; 1965 domainoid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), 1966 OID_AUTO, "domain", CTLFLAG_RD, NULL, ""); 1967 for (i = 0; i < domains; i++) { 1968 zdom = &zone->uz_domain[i]; 1969 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(domainoid), 1970 OID_AUTO, VM_DOMAIN(i)->vmd_name, CTLFLAG_RD, NULL, ""); 1971 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1972 "nitems", CTLFLAG_RD, &zdom->uzd_nitems, 1973 "number of items in this domain"); 1974 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1975 "imax", CTLFLAG_RD, &zdom->uzd_imax, 1976 "maximum item count in this period"); 1977 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1978 "imin", CTLFLAG_RD, &zdom->uzd_imin, 1979 "minimum item count in this period"); 1980 SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1981 "wss", CTLFLAG_RD, &zdom->uzd_wss, 1982 "Working set size"); 1983 } 1984 1985 /* 1986 * General statistics. 
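 * (Exported as vm.uma.<zone>.stats.*; e.g. vm.uma.<zone>.stats.current
 * reports the number of items currently allocated from the zone.)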
1987 */ 1988 oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO, 1989 "stats", CTLFLAG_RD, NULL, ""); 1990 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1991 "current", CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, 1992 zone, 1, sysctl_handle_uma_zone_cur, "I", 1993 "Current number of allocated items"); 1994 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1995 "allocs", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 1996 zone, 0, sysctl_handle_uma_zone_allocs, "QU", 1997 "Total allocation calls"); 1998 SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 1999 "frees", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE, 2000 zone, 0, sysctl_handle_uma_zone_frees, "QU", 2001 "Total free calls"); 2002 SYSCTL_ADD_COUNTER_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2003 "fails", CTLFLAG_RD, &zone->uz_fails, 2004 "Number of allocation failures"); 2005 SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, 2006 "xdomain", CTLFLAG_RD, &zone->uz_xdomain, 0, 2007 "Free calls from the wrong domain"); 2008 } 2009 2010 struct uma_zone_count { 2011 const char *name; 2012 int count; 2013 }; 2014 2015 static void 2016 zone_count(uma_zone_t zone, void *arg) 2017 { 2018 struct uma_zone_count *cnt; 2019 2020 cnt = arg; 2021 /* 2022 * Some zones are rapidly created with identical names and 2023 * destroyed out of order. This can lead to gaps in the count. 2024 * Use one greater than the maximum observed for this name. 2025 */ 2026 if (strcmp(zone->uz_name, cnt->name) == 0) 2027 cnt->count = MAX(cnt->count, 2028 zone->uz_namecnt + 1); 2029 } 2030 2031 /* 2032 * Zone header ctor. This initializes all fields, locks, etc. 2033 * 2034 * Arguments/Returns follow uma_ctor specifications 2035 * udata Actually uma_zctor_args 2036 */ 2037 static int 2038 zone_ctor(void *mem, int size, void *udata, int flags) 2039 { 2040 struct uma_zone_count cnt; 2041 struct uma_zctor_args *arg = udata; 2042 uma_zone_t zone = mem; 2043 uma_zone_t z; 2044 uma_keg_t keg; 2045 int i; 2046 2047 bzero(zone, size); 2048 zone->uz_name = arg->name; 2049 zone->uz_ctor = arg->ctor; 2050 zone->uz_dtor = arg->dtor; 2051 zone->uz_init = NULL; 2052 zone->uz_fini = NULL; 2053 zone->uz_sleeps = 0; 2054 zone->uz_xdomain = 0; 2055 zone->uz_bucket_size = 0; 2056 zone->uz_bucket_size_min = 0; 2057 zone->uz_bucket_size_max = BUCKET_MAX; 2058 zone->uz_flags = 0; 2059 zone->uz_warning = NULL; 2060 /* The domain structures follow the cpu structures. */ 2061 zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; 2062 zone->uz_bkt_max = ULONG_MAX; 2063 timevalclear(&zone->uz_ratecheck); 2064 2065 /* Count the number of duplicate names. */ 2066 cnt.name = arg->name; 2067 cnt.count = 0; 2068 zone_foreach(zone_count, &cnt); 2069 zone->uz_namecnt = cnt.count; 2070 2071 for (i = 0; i < vm_ndomains; i++) 2072 TAILQ_INIT(&zone->uz_domain[i].uzd_buckets); 2073 2074 #ifdef INVARIANTS 2075 if (arg->uminit == trash_init && arg->fini == trash_fini) 2076 zone->uz_flags |= UMA_ZFLAG_TRASH; 2077 #endif 2078 2079 /* 2080 * This is a pure cache zone, no kegs. 
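 * Such zones are created with uma_zcache_create(); items are produced
 * and retired through the caller-supplied import/release functions
 * rather than through keg slabs.  A rough sketch (my_import, my_release
 * and my_arg are hypothetical caller-provided names):
 *
 *     z = uma_zcache_create("foocache", size, NULL, NULL, NULL, NULL,
 *         my_import, my_release, my_arg, 0);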
2081 */ 2082 if (arg->import) { 2083 if (arg->flags & UMA_ZONE_VM) 2084 arg->flags |= UMA_ZFLAG_CACHEONLY; 2085 zone->uz_flags = arg->flags; 2086 zone->uz_size = arg->size; 2087 zone->uz_import = arg->import; 2088 zone->uz_release = arg->release; 2089 zone->uz_arg = arg->arg; 2090 zone->uz_lockptr = &zone->uz_lock; 2091 ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); 2092 rw_wlock(&uma_rwlock); 2093 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); 2094 rw_wunlock(&uma_rwlock); 2095 goto out; 2096 } 2097 2098 /* 2099 * Use the regular zone/keg/slab allocator. 2100 */ 2101 zone->uz_import = zone_import; 2102 zone->uz_release = zone_release; 2103 zone->uz_arg = zone; 2104 keg = arg->keg; 2105 2106 if (arg->flags & UMA_ZONE_SECONDARY) { 2107 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, 2108 ("Secondary zone requested UMA_ZFLAG_INTERNAL")); 2109 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); 2110 zone->uz_init = arg->uminit; 2111 zone->uz_fini = arg->fini; 2112 zone->uz_lockptr = &keg->uk_lock; 2113 zone->uz_flags |= UMA_ZONE_SECONDARY; 2114 rw_wlock(&uma_rwlock); 2115 ZONE_LOCK(zone); 2116 LIST_FOREACH(z, &keg->uk_zones, uz_link) { 2117 if (LIST_NEXT(z, uz_link) == NULL) { 2118 LIST_INSERT_AFTER(z, zone, uz_link); 2119 break; 2120 } 2121 } 2122 ZONE_UNLOCK(zone); 2123 rw_wunlock(&uma_rwlock); 2124 } else if (keg == NULL) { 2125 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, 2126 arg->align, arg->flags)) == NULL) 2127 return (ENOMEM); 2128 } else { 2129 struct uma_kctor_args karg; 2130 int error; 2131 2132 /* We should only be here from uma_startup() */ 2133 karg.size = arg->size; 2134 karg.uminit = arg->uminit; 2135 karg.fini = arg->fini; 2136 karg.align = arg->align; 2137 karg.flags = arg->flags; 2138 karg.zone = zone; 2139 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, 2140 flags); 2141 if (error) 2142 return (error); 2143 } 2144 2145 /* Inherit properties from the keg. */ 2146 zone->uz_keg = keg; 2147 zone->uz_size = keg->uk_size; 2148 zone->uz_flags |= (keg->uk_flags & 2149 (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); 2150 2151 out: 2152 if (__predict_true(booted == BOOT_RUNNING)) { 2153 zone_alloc_counters(zone, NULL); 2154 zone_alloc_sysctl(zone, NULL); 2155 } else { 2156 zone->uz_allocs = EARLY_COUNTER; 2157 zone->uz_frees = EARLY_COUNTER; 2158 zone->uz_fails = EARLY_COUNTER; 2159 } 2160 2161 KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != 2162 (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), 2163 ("Invalid zone flag combination")); 2164 if (arg->flags & UMA_ZFLAG_INTERNAL) 2165 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 2166 if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) 2167 zone->uz_bucket_size = BUCKET_MAX; 2168 else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) 2169 zone->uz_bucket_size_max = zone->uz_bucket_size = BUCKET_MIN; 2170 else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) 2171 zone->uz_bucket_size = 0; 2172 else 2173 zone->uz_bucket_size = bucket_select(zone->uz_size); 2174 zone->uz_bucket_size_min = zone->uz_bucket_size; 2175 2176 return (0); 2177 } 2178 2179 /* 2180 * Keg header dtor. This frees all data, destroys locks, frees the hash 2181 * table and removes the keg from the global list. 
2182 * 2183 * Arguments/Returns follow uma_dtor specifications 2184 * udata unused 2185 */ 2186 static void 2187 keg_dtor(void *arg, int size, void *udata) 2188 { 2189 uma_keg_t keg; 2190 2191 keg = (uma_keg_t)arg; 2192 KEG_LOCK(keg); 2193 if (keg->uk_free != 0) { 2194 printf("Freed UMA keg (%s) was not empty (%d items). " 2195 " Lost %d pages of memory.\n", 2196 keg->uk_name ? keg->uk_name : "", 2197 keg->uk_free, keg->uk_pages); 2198 } 2199 KEG_UNLOCK(keg); 2200 2201 hash_free(&keg->uk_hash); 2202 2203 KEG_LOCK_FINI(keg); 2204 } 2205 2206 /* 2207 * Zone header dtor. 2208 * 2209 * Arguments/Returns follow uma_dtor specifications 2210 * udata unused 2211 */ 2212 static void 2213 zone_dtor(void *arg, int size, void *udata) 2214 { 2215 uma_zone_t zone; 2216 uma_keg_t keg; 2217 2218 zone = (uma_zone_t)arg; 2219 2220 sysctl_remove_oid(zone->uz_oid, 1, 1); 2221 2222 if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL)) 2223 cache_drain(zone); 2224 2225 rw_wlock(&uma_rwlock); 2226 LIST_REMOVE(zone, uz_link); 2227 rw_wunlock(&uma_rwlock); 2228 /* 2229 * XXX there are some races here where 2230 * the zone can be drained but zone lock 2231 * released and then refilled before we 2232 * remove it... we dont care for now 2233 */ 2234 zone_reclaim(zone, M_WAITOK, true); 2235 /* 2236 * We only destroy kegs from non secondary/non cache zones. 2237 */ 2238 if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) { 2239 keg = zone->uz_keg; 2240 rw_wlock(&uma_rwlock); 2241 LIST_REMOVE(keg, uk_link); 2242 rw_wunlock(&uma_rwlock); 2243 zone_free_item(kegs, keg, NULL, SKIP_NONE); 2244 } 2245 counter_u64_free(zone->uz_allocs); 2246 counter_u64_free(zone->uz_frees); 2247 counter_u64_free(zone->uz_fails); 2248 free(zone->uz_ctlname, M_UMA); 2249 if (zone->uz_lockptr == &zone->uz_lock) 2250 ZONE_LOCK_FINI(zone); 2251 } 2252 2253 /* 2254 * Traverses every zone in the system and calls a callback 2255 * 2256 * Arguments: 2257 * zfunc A pointer to a function which accepts a zone 2258 * as an argument. 2259 * 2260 * Returns: 2261 * Nothing 2262 */ 2263 static void 2264 zone_foreach(void (*zfunc)(uma_zone_t, void *arg), void *arg) 2265 { 2266 uma_keg_t keg; 2267 uma_zone_t zone; 2268 2269 /* 2270 * Before BOOT_RUNNING we are guaranteed to be single 2271 * threaded, so locking isn't needed. Startup functions 2272 * are allowed to use M_WAITOK. 2273 */ 2274 if (__predict_true(booted == BOOT_RUNNING)) 2275 rw_rlock(&uma_rwlock); 2276 LIST_FOREACH(keg, &uma_kegs, uk_link) { 2277 LIST_FOREACH(zone, &keg->uk_zones, uz_link) 2278 zfunc(zone, arg); 2279 } 2280 LIST_FOREACH(zone, &uma_cachezones, uz_link) 2281 zfunc(zone, arg); 2282 if (__predict_true(booted == BOOT_RUNNING)) 2283 rw_runlock(&uma_rwlock); 2284 } 2285 2286 /* 2287 * Count how many pages do we need to bootstrap. VM supplies 2288 * its need in early zones in the argument, we add up our zones, 2289 * which consist of: UMA Slabs, UMA Hash and 9 Bucket zones. The 2290 * zone of zones and zone of kegs are accounted separately. 2291 */ 2292 #define UMA_BOOT_ZONES 11 2293 /* Zone of zones and zone of kegs have arbitrary alignment. 
*/ 2294 #define UMA_BOOT_ALIGN 32 2295 static int zsize, ksize; 2296 int 2297 uma_startup_count(int vm_zones) 2298 { 2299 int zones, pages; 2300 size_t space, size; 2301 2302 ksize = sizeof(struct uma_keg) + 2303 (sizeof(struct uma_domain) * vm_ndomains); 2304 zsize = sizeof(struct uma_zone) + 2305 (sizeof(struct uma_cache) * (mp_maxid + 1)) + 2306 (sizeof(struct uma_zone_domain) * vm_ndomains); 2307 2308 /* 2309 * Memory for the zone of kegs and its keg, 2310 * and for zone of zones. 2311 */ 2312 pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 + 2313 roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE); 2314 2315 #ifdef UMA_MD_SMALL_ALLOC 2316 zones = UMA_BOOT_ZONES; 2317 #else 2318 zones = UMA_BOOT_ZONES + vm_zones; 2319 vm_zones = 0; 2320 #endif 2321 size = slab_sizeof(SLAB_MAX_SETSIZE); 2322 space = slab_space(SLAB_MAX_SETSIZE); 2323 2324 /* Memory for the rest of startup zones, UMA and VM, ... */ 2325 if (zsize > space) { 2326 /* See keg_large_init(). */ 2327 u_int ppera; 2328 2329 ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE); 2330 if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) < size) 2331 ppera++; 2332 pages += (zones + vm_zones) * ppera; 2333 } else if (roundup2(zsize, UMA_BOOT_ALIGN) > space) 2334 /* See keg_small_init() special case for uk_ppera = 1. */ 2335 pages += zones; 2336 else 2337 pages += howmany(zones, 2338 space / roundup2(zsize, UMA_BOOT_ALIGN)); 2339 2340 /* ... and their kegs. Note that zone of zones allocates a keg! */ 2341 pages += howmany(zones + 1, 2342 space / roundup2(ksize, UMA_BOOT_ALIGN)); 2343 2344 return (pages); 2345 } 2346 2347 void 2348 uma_startup(void *mem, int npages) 2349 { 2350 struct uma_zctor_args args; 2351 uma_keg_t masterkeg; 2352 uintptr_t m; 2353 2354 #ifdef DIAGNOSTIC 2355 printf("Entering %s with %d boot pages configured\n", __func__, npages); 2356 #endif 2357 2358 rw_init(&uma_rwlock, "UMA lock"); 2359 2360 /* Use bootpages memory for the zone of zones and zone of kegs. 
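 * The carve-up below is: the "UMA Zones" zone header, the "UMA Kegs"
 * zone header and the master keg, each rounded up to a cache line; the
 * remainder is rounded up to a page boundary and handed to the startup
 * allocator as bootmem/boot_pages.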
*/ 2361 m = (uintptr_t)mem; 2362 zones = (uma_zone_t)m; 2363 m += roundup(zsize, CACHE_LINE_SIZE); 2364 kegs = (uma_zone_t)m; 2365 m += roundup(zsize, CACHE_LINE_SIZE); 2366 masterkeg = (uma_keg_t)m; 2367 m += roundup(ksize, CACHE_LINE_SIZE); 2368 m = roundup(m, PAGE_SIZE); 2369 npages -= (m - (uintptr_t)mem) / PAGE_SIZE; 2370 mem = (void *)m; 2371 2372 /* "manually" create the initial zone */ 2373 memset(&args, 0, sizeof(args)); 2374 args.name = "UMA Kegs"; 2375 args.size = ksize; 2376 args.ctor = keg_ctor; 2377 args.dtor = keg_dtor; 2378 args.uminit = zero_init; 2379 args.fini = NULL; 2380 args.keg = masterkeg; 2381 args.align = UMA_BOOT_ALIGN - 1; 2382 args.flags = UMA_ZFLAG_INTERNAL; 2383 zone_ctor(kegs, zsize, &args, M_WAITOK); 2384 2385 bootmem = mem; 2386 boot_pages = npages; 2387 2388 args.name = "UMA Zones"; 2389 args.size = zsize; 2390 args.ctor = zone_ctor; 2391 args.dtor = zone_dtor; 2392 args.uminit = zero_init; 2393 args.fini = NULL; 2394 args.keg = NULL; 2395 args.align = UMA_BOOT_ALIGN - 1; 2396 args.flags = UMA_ZFLAG_INTERNAL; 2397 zone_ctor(zones, zsize, &args, M_WAITOK); 2398 2399 /* Now make a zone for slab headers */ 2400 slabzone = uma_zcreate("UMA Slabs", sizeof(struct uma_hash_slab), 2401 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2402 2403 hashzone = uma_zcreate("UMA Hash", 2404 sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, 2405 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2406 2407 bucket_init(); 2408 2409 booted = BOOT_STRAPPED; 2410 } 2411 2412 void 2413 uma_startup1(void) 2414 { 2415 2416 #ifdef DIAGNOSTIC 2417 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2418 #endif 2419 booted = BOOT_PAGEALLOC; 2420 } 2421 2422 void 2423 uma_startup2(void) 2424 { 2425 2426 #ifdef DIAGNOSTIC 2427 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2428 #endif 2429 booted = BOOT_BUCKETS; 2430 sx_init(&uma_reclaim_lock, "umareclaim"); 2431 bucket_enable(); 2432 } 2433 2434 /* 2435 * Initialize our callout handle 2436 * 2437 */ 2438 static void 2439 uma_startup3(void) 2440 { 2441 2442 #ifdef INVARIANTS 2443 TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); 2444 uma_dbg_cnt = counter_u64_alloc(M_WAITOK); 2445 uma_skip_cnt = counter_u64_alloc(M_WAITOK); 2446 #endif 2447 zone_foreach(zone_alloc_counters, NULL); 2448 zone_foreach(zone_alloc_sysctl, NULL); 2449 callout_init(&uma_callout, 1); 2450 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 2451 booted = BOOT_RUNNING; 2452 } 2453 2454 static uma_keg_t 2455 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, 2456 int align, uint32_t flags) 2457 { 2458 struct uma_kctor_args args; 2459 2460 args.size = size; 2461 args.uminit = uminit; 2462 args.fini = fini; 2463 args.align = (align == UMA_ALIGN_CACHE) ? 
uma_align_cache : align; 2464 args.flags = flags; 2465 args.zone = zone; 2466 return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); 2467 } 2468 2469 /* Public functions */ 2470 /* See uma.h */ 2471 void 2472 uma_set_align(int align) 2473 { 2474 2475 if (align != UMA_ALIGN_CACHE) 2476 uma_align_cache = align; 2477 } 2478 2479 /* See uma.h */ 2480 uma_zone_t 2481 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, 2482 uma_init uminit, uma_fini fini, int align, uint32_t flags) 2483 2484 { 2485 struct uma_zctor_args args; 2486 uma_zone_t res; 2487 bool locked; 2488 2489 KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", 2490 align, name)); 2491 2492 /* Sets all zones to a first-touch domain policy. */ 2493 #ifdef UMA_FIRSTTOUCH 2494 flags |= UMA_ZONE_NUMA; 2495 #endif 2496 2497 /* This stuff is essential for the zone ctor */ 2498 memset(&args, 0, sizeof(args)); 2499 args.name = name; 2500 args.size = size; 2501 args.ctor = ctor; 2502 args.dtor = dtor; 2503 args.uminit = uminit; 2504 args.fini = fini; 2505 #ifdef INVARIANTS 2506 /* 2507 * Inject procedures which check for memory use after free if we are 2508 * allowed to scramble the memory while it is not allocated. This 2509 * requires that: UMA is actually able to access the memory, no init 2510 * or fini procedures, no dependency on the initial value of the 2511 * memory, and no (legitimate) use of the memory after free. Note, 2512 * the ctor and dtor do not need to be empty. 2513 * 2514 * XXX UMA_ZONE_OFFPAGE. 2515 */ 2516 if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) && 2517 uminit == NULL && fini == NULL) { 2518 args.uminit = trash_init; 2519 args.fini = trash_fini; 2520 } 2521 #endif 2522 args.align = align; 2523 args.flags = flags; 2524 args.keg = NULL; 2525 2526 if (booted < BOOT_BUCKETS) { 2527 locked = false; 2528 } else { 2529 sx_slock(&uma_reclaim_lock); 2530 locked = true; 2531 } 2532 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2533 if (locked) 2534 sx_sunlock(&uma_reclaim_lock); 2535 return (res); 2536 } 2537 2538 /* See uma.h */ 2539 uma_zone_t 2540 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, 2541 uma_init zinit, uma_fini zfini, uma_zone_t master) 2542 { 2543 struct uma_zctor_args args; 2544 uma_keg_t keg; 2545 uma_zone_t res; 2546 bool locked; 2547 2548 keg = master->uz_keg; 2549 memset(&args, 0, sizeof(args)); 2550 args.name = name; 2551 args.size = keg->uk_size; 2552 args.ctor = ctor; 2553 args.dtor = dtor; 2554 args.uminit = zinit; 2555 args.fini = zfini; 2556 args.align = keg->uk_align; 2557 args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; 2558 args.keg = keg; 2559 2560 if (booted < BOOT_BUCKETS) { 2561 locked = false; 2562 } else { 2563 sx_slock(&uma_reclaim_lock); 2564 locked = true; 2565 } 2566 /* XXX Attaches only one keg of potentially many. 
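 * A secondary zone shares the master zone's keg (and therefore its item
 * size, alignment and slabs) while supplying its own zinit/zfini, so
 * several zones can layer different initialization policies over a
 * single backing store.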
*/ 2567 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2568 if (locked) 2569 sx_sunlock(&uma_reclaim_lock); 2570 return (res); 2571 } 2572 2573 /* See uma.h */ 2574 uma_zone_t 2575 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, 2576 uma_init zinit, uma_fini zfini, uma_import zimport, 2577 uma_release zrelease, void *arg, int flags) 2578 { 2579 struct uma_zctor_args args; 2580 2581 memset(&args, 0, sizeof(args)); 2582 args.name = name; 2583 args.size = size; 2584 args.ctor = ctor; 2585 args.dtor = dtor; 2586 args.uminit = zinit; 2587 args.fini = zfini; 2588 args.import = zimport; 2589 args.release = zrelease; 2590 args.arg = arg; 2591 args.align = 0; 2592 args.flags = flags | UMA_ZFLAG_CACHE; 2593 2594 return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); 2595 } 2596 2597 /* See uma.h */ 2598 void 2599 uma_zdestroy(uma_zone_t zone) 2600 { 2601 2602 sx_slock(&uma_reclaim_lock); 2603 zone_free_item(zones, zone, NULL, SKIP_NONE); 2604 sx_sunlock(&uma_reclaim_lock); 2605 } 2606 2607 void 2608 uma_zwait(uma_zone_t zone) 2609 { 2610 void *item; 2611 2612 item = uma_zalloc_arg(zone, NULL, M_WAITOK); 2613 uma_zfree(zone, item); 2614 } 2615 2616 void * 2617 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) 2618 { 2619 void *item; 2620 #ifdef SMP 2621 int i; 2622 2623 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2624 #endif 2625 item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); 2626 if (item != NULL && (flags & M_ZERO)) { 2627 #ifdef SMP 2628 for (i = 0; i <= mp_maxid; i++) 2629 bzero(zpcpu_get_cpu(item, i), zone->uz_size); 2630 #else 2631 bzero(item, zone->uz_size); 2632 #endif 2633 } 2634 return (item); 2635 } 2636 2637 /* 2638 * A stub while both regular and pcpu cases are identical. 2639 */ 2640 void 2641 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata) 2642 { 2643 2644 #ifdef SMP 2645 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2646 #endif 2647 uma_zfree_arg(zone, item, udata); 2648 } 2649 2650 static inline void * 2651 bucket_pop(uma_zone_t zone, uma_cache_t cache, uma_bucket_t bucket) 2652 { 2653 void *item; 2654 2655 bucket->ub_cnt--; 2656 item = bucket->ub_bucket[bucket->ub_cnt]; 2657 #ifdef INVARIANTS 2658 bucket->ub_bucket[bucket->ub_cnt] = NULL; 2659 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); 2660 #endif 2661 cache->uc_allocs++; 2662 2663 return (item); 2664 } 2665 2666 static inline void 2667 bucket_push(uma_zone_t zone, uma_cache_t cache, uma_bucket_t bucket, 2668 void *item) 2669 { 2670 KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, 2671 ("uma_zfree: Freeing to non free bucket index.")); 2672 bucket->ub_bucket[bucket->ub_cnt] = item; 2673 bucket->ub_cnt++; 2674 cache->uc_frees++; 2675 } 2676 2677 static void * 2678 item_ctor(uma_zone_t zone, void *udata, int flags, void *item) 2679 { 2680 #ifdef INVARIANTS 2681 bool skipdbg; 2682 2683 skipdbg = uma_dbg_zskip(zone, item); 2684 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 2685 zone->uz_ctor != trash_ctor) 2686 trash_ctor(item, zone->uz_size, udata, flags); 2687 #endif 2688 if (__predict_false(zone->uz_ctor != NULL) && 2689 zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { 2690 counter_u64_add(zone->uz_fails, 1); 2691 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); 2692 return (NULL); 2693 } 2694 #ifdef INVARIANTS 2695 if (!skipdbg) 2696 uma_dbg_alloc(zone, NULL, item); 2697 #endif 2698 if (flags & M_ZERO) 2699 uma_zero_item(item, zone); 2700 2701 return (item); 2702 } 2703 2704 static inline void 2705 
item_dtor(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 2706 { 2707 #ifdef INVARIANTS 2708 bool skipdbg; 2709 2710 skipdbg = uma_dbg_zskip(zone, item); 2711 if (skip == SKIP_NONE && !skipdbg) { 2712 if ((zone->uz_flags & UMA_ZONE_MALLOC) != 0) 2713 uma_dbg_free(zone, udata, item); 2714 else 2715 uma_dbg_free(zone, NULL, item); 2716 } 2717 #endif 2718 if (skip < SKIP_DTOR) { 2719 if (zone->uz_dtor != NULL) 2720 zone->uz_dtor(item, zone->uz_size, udata); 2721 #ifdef INVARIANTS 2722 if (!skipdbg && (zone->uz_flags & UMA_ZFLAG_TRASH) != 0 && 2723 zone->uz_dtor != trash_dtor) 2724 trash_dtor(item, zone->uz_size, udata); 2725 #endif 2726 } 2727 } 2728 2729 /* See uma.h */ 2730 void * 2731 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) 2732 { 2733 uma_bucket_t bucket; 2734 uma_cache_t cache; 2735 void *item; 2736 int cpu, domain; 2737 2738 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2739 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2740 2741 /* This is the fast path allocation */ 2742 CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d", 2743 curthread, zone->uz_name, zone, flags); 2744 2745 if (flags & M_WAITOK) { 2746 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2747 "uma_zalloc_arg: zone \"%s\"", zone->uz_name); 2748 } 2749 KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC")); 2750 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2751 ("uma_zalloc_arg: called with spinlock or critical section held")); 2752 if (zone->uz_flags & UMA_ZONE_PCPU) 2753 KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone " 2754 "with M_ZERO passed")); 2755 2756 #ifdef DEBUG_MEMGUARD 2757 if (memguard_cmp_zone(zone)) { 2758 item = memguard_alloc(zone->uz_size, flags); 2759 if (item != NULL) { 2760 if (zone->uz_init != NULL && 2761 zone->uz_init(item, zone->uz_size, flags) != 0) 2762 return (NULL); 2763 if (zone->uz_ctor != NULL && 2764 zone->uz_ctor(item, zone->uz_size, udata, 2765 flags) != 0) { 2766 counter_u64_add(zone->uz_fails, 1); 2767 zone->uz_fini(item, zone->uz_size); 2768 return (NULL); 2769 } 2770 return (item); 2771 } 2772 /* This is unfortunate but should not be fatal. */ 2773 } 2774 #endif 2775 /* 2776 * If possible, allocate from the per-CPU cache. There are two 2777 * requirements for safe access to the per-CPU cache: (1) the thread 2778 * accessing the cache must not be preempted or yield during access, 2779 * and (2) the thread must not migrate CPUs without switching which 2780 * cache it accesses. We rely on a critical section to prevent 2781 * preemption and migration. We release the critical section in 2782 * order to acquire the zone mutex if we are unable to allocate from 2783 * the current cache; when we re-acquire the critical section, we 2784 * must detect and handle migration if it has occurred. 2785 */ 2786 critical_enter(); 2787 do { 2788 cpu = curcpu; 2789 cache = &zone->uz_cpu[cpu]; 2790 bucket = cache->uc_allocbucket; 2791 if (__predict_true(bucket != NULL && bucket->ub_cnt != 0)) { 2792 item = bucket_pop(zone, cache, bucket); 2793 critical_exit(); 2794 return (item_ctor(zone, udata, flags, item)); 2795 } 2796 } while (cache_alloc(zone, cache, udata, flags)); 2797 critical_exit(); 2798 2799 /* 2800 * We can not get a bucket so try to return a single item. 
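 * For first-touch (UMA_ZONE_NUMA) zones the item is requested from the
 * current CPU's domain; otherwise any domain will do.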
2801 */ 2802 if (zone->uz_flags & UMA_ZONE_NUMA) 2803 domain = PCPU_GET(domain); 2804 else 2805 domain = UMA_ANYDOMAIN; 2806 return (zone_alloc_item_locked(zone, udata, domain, flags)); 2807 } 2808 2809 /* 2810 * Replenish an alloc bucket and possibly restore an old one. Called in 2811 * a critical section. Returns in a critical section. 2812 * 2813 * A false return value indicates failure and returns with the zone lock 2814 * held. A true return value indicates success and the caller should retry. 2815 */ 2816 static __noinline bool 2817 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags) 2818 { 2819 uma_zone_domain_t zdom; 2820 uma_bucket_t bucket; 2821 int cpu, domain; 2822 bool lockfail; 2823 2824 CRITICAL_ASSERT(curthread); 2825 2826 /* 2827 * If we have run out of items in our alloc bucket see 2828 * if we can switch with the free bucket. 2829 */ 2830 bucket = cache->uc_freebucket; 2831 if (bucket != NULL && bucket->ub_cnt != 0) { 2832 cache->uc_freebucket = cache->uc_allocbucket; 2833 cache->uc_allocbucket = bucket; 2834 return (true); 2835 } 2836 2837 /* 2838 * Discard any empty allocation bucket while we hold no locks. 2839 */ 2840 bucket = cache->uc_allocbucket; 2841 cache->uc_allocbucket = NULL; 2842 critical_exit(); 2843 if (bucket != NULL) 2844 bucket_free(zone, bucket, udata); 2845 2846 /* 2847 * Attempt to retrieve the item from the per-CPU cache has failed, so 2848 * we must go back to the zone. This requires the zone lock, so we 2849 * must drop the critical section, then re-acquire it when we go back 2850 * to the cache. Since the critical section is released, we may be 2851 * preempted or migrate. As such, make sure not to maintain any 2852 * thread-local state specific to the cache from prior to releasing 2853 * the critical section. 2854 */ 2855 lockfail = 0; 2856 if (ZONE_TRYLOCK(zone) == 0) { 2857 /* Record contention to size the buckets. */ 2858 ZONE_LOCK(zone); 2859 lockfail = 1; 2860 } 2861 2862 critical_enter(); 2863 /* Short-circuit for zones without buckets and low memory. */ 2864 if (zone->uz_bucket_size == 0 || bucketdisable) 2865 return (false); 2866 2867 cpu = curcpu; 2868 cache = &zone->uz_cpu[cpu]; 2869 2870 /* See if we lost the race to fill the cache. */ 2871 if (cache->uc_allocbucket != NULL) { 2872 ZONE_UNLOCK(zone); 2873 return (true); 2874 } 2875 2876 /* 2877 * Check the zone's cache of buckets. 2878 */ 2879 if (zone->uz_flags & UMA_ZONE_NUMA) { 2880 domain = PCPU_GET(domain); 2881 zdom = &zone->uz_domain[domain]; 2882 } else { 2883 domain = UMA_ANYDOMAIN; 2884 zdom = &zone->uz_domain[0]; 2885 } 2886 2887 if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) { 2888 ZONE_UNLOCK(zone); 2889 KASSERT(bucket->ub_cnt != 0, 2890 ("uma_zalloc_arg: Returning an empty bucket.")); 2891 cache->uc_allocbucket = bucket; 2892 return (true); 2893 } 2894 /* We are no longer associated with this CPU. */ 2895 critical_exit(); 2896 2897 /* 2898 * We bump the uz count when the cache size is insufficient to 2899 * handle the working set. 2900 */ 2901 if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max) 2902 zone->uz_bucket_size++; 2903 2904 /* 2905 * Fill a bucket and attempt to use it as the alloc bucket. 2906 */ 2907 bucket = zone_alloc_bucket(zone, udata, domain, flags); 2908 CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", 2909 zone->uz_name, zone, bucket); 2910 critical_enter(); 2911 if (bucket == NULL) 2912 return (false); 2913 2914 /* 2915 * See if we lost the race or were migrated. 
Cache the 2916 * initialized bucket to make this less likely or claim 2917 * the memory directly. 2918 */ 2919 cpu = curcpu; 2920 cache = &zone->uz_cpu[cpu]; 2921 if (cache->uc_allocbucket == NULL && 2922 ((zone->uz_flags & UMA_ZONE_NUMA) == 0 || 2923 domain == PCPU_GET(domain))) { 2924 cache->uc_allocbucket = bucket; 2925 zdom->uzd_imax += bucket->ub_cnt; 2926 } else if (zone->uz_bkt_count >= zone->uz_bkt_max) { 2927 critical_exit(); 2928 ZONE_UNLOCK(zone); 2929 bucket_drain(zone, bucket); 2930 bucket_free(zone, bucket, udata); 2931 critical_enter(); 2932 return (true); 2933 } else 2934 zone_put_bucket(zone, zdom, bucket, false); 2935 ZONE_UNLOCK(zone); 2936 return (true); 2937 } 2938 2939 void * 2940 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) 2941 { 2942 2943 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2944 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2945 2946 /* This is the fast path allocation */ 2947 CTR5(KTR_UMA, 2948 "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d", 2949 curthread, zone->uz_name, zone, domain, flags); 2950 2951 if (flags & M_WAITOK) { 2952 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2953 "uma_zalloc_domain: zone \"%s\"", zone->uz_name); 2954 } 2955 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2956 ("uma_zalloc_domain: called with spinlock or critical section held")); 2957 2958 return (zone_alloc_item(zone, udata, domain, flags)); 2959 } 2960 2961 /* 2962 * Find a slab with some space. Prefer slabs that are partially used over those 2963 * that are totally full. This helps to reduce fragmentation. 2964 * 2965 * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check 2966 * only 'domain'. 2967 */ 2968 static uma_slab_t 2969 keg_first_slab(uma_keg_t keg, int domain, bool rr) 2970 { 2971 uma_domain_t dom; 2972 uma_slab_t slab; 2973 int start; 2974 2975 KASSERT(domain >= 0 && domain < vm_ndomains, 2976 ("keg_first_slab: domain %d out of range", domain)); 2977 KEG_LOCK_ASSERT(keg); 2978 2979 slab = NULL; 2980 start = domain; 2981 do { 2982 dom = &keg->uk_domain[domain]; 2983 if (!LIST_EMPTY(&dom->ud_part_slab)) 2984 return (LIST_FIRST(&dom->ud_part_slab)); 2985 if (!LIST_EMPTY(&dom->ud_free_slab)) { 2986 slab = LIST_FIRST(&dom->ud_free_slab); 2987 LIST_REMOVE(slab, us_link); 2988 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 2989 return (slab); 2990 } 2991 if (rr) 2992 domain = (domain + 1) % vm_ndomains; 2993 } while (domain != start); 2994 2995 return (NULL); 2996 } 2997 2998 static uma_slab_t 2999 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) 3000 { 3001 uint32_t reserve; 3002 3003 KEG_LOCK_ASSERT(keg); 3004 3005 reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; 3006 if (keg->uk_free <= reserve) 3007 return (NULL); 3008 return (keg_first_slab(keg, domain, rr)); 3009 } 3010 3011 static uma_slab_t 3012 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) 3013 { 3014 struct vm_domainset_iter di; 3015 uma_domain_t dom; 3016 uma_slab_t slab; 3017 int aflags, domain; 3018 bool rr; 3019 3020 restart: 3021 KEG_LOCK_ASSERT(keg); 3022 3023 /* 3024 * Use the keg's policy if upper layers haven't already specified a 3025 * domain (as happens with first-touch zones). 3026 * 3027 * To avoid races we run the iterator with the keg lock held, but that 3028 * means that we cannot allow the vm_domainset layer to sleep. Thus, 3029 * clear M_WAITOK and handle low memory conditions locally. 
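 * (When the caller did pass M_WAITOK, the sleep happens below instead:
 * the keg lock is dropped, vm_wait_doms() waits on the policy's domain
 * set, and the whole iteration is restarted.)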
3030 */ 3031 rr = rdomain == UMA_ANYDOMAIN; 3032 if (rr) { 3033 aflags = (flags & ~M_WAITOK) | M_NOWAIT; 3034 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 3035 &aflags); 3036 } else { 3037 aflags = flags; 3038 domain = rdomain; 3039 } 3040 3041 for (;;) { 3042 slab = keg_fetch_free_slab(keg, domain, rr, flags); 3043 if (slab != NULL) 3044 return (slab); 3045 3046 /* 3047 * M_NOVM means don't ask at all! 3048 */ 3049 if (flags & M_NOVM) 3050 break; 3051 3052 KASSERT(zone->uz_max_items == 0 || 3053 zone->uz_items <= zone->uz_max_items, 3054 ("%s: zone %p overflow", __func__, zone)); 3055 3056 slab = keg_alloc_slab(keg, zone, domain, flags, aflags); 3057 /* 3058 * If we got a slab here it's safe to mark it partially used 3059 * and return. We assume that the caller is going to remove 3060 * at least one item. 3061 */ 3062 if (slab) { 3063 dom = &keg->uk_domain[slab->us_domain]; 3064 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3065 return (slab); 3066 } 3067 KEG_LOCK(keg); 3068 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { 3069 if ((flags & M_WAITOK) != 0) { 3070 KEG_UNLOCK(keg); 3071 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); 3072 KEG_LOCK(keg); 3073 goto restart; 3074 } 3075 break; 3076 } 3077 } 3078 3079 /* 3080 * We might not have been able to get a slab but another cpu 3081 * could have while we were unlocked. Check again before we 3082 * fail. 3083 */ 3084 if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) { 3085 return (slab); 3086 } 3087 return (NULL); 3088 } 3089 3090 static void * 3091 slab_alloc_item(uma_keg_t keg, uma_slab_t slab) 3092 { 3093 uma_domain_t dom; 3094 void *item; 3095 uint8_t freei; 3096 3097 KEG_LOCK_ASSERT(keg); 3098 3099 freei = BIT_FFS(keg->uk_ipers, &slab->us_free) - 1; 3100 BIT_CLR(keg->uk_ipers, freei, &slab->us_free); 3101 item = slab_item(slab, keg, freei); 3102 slab->us_freecount--; 3103 keg->uk_free--; 3104 3105 /* Move this slab to the full list */ 3106 if (slab->us_freecount == 0) { 3107 LIST_REMOVE(slab, us_link); 3108 dom = &keg->uk_domain[slab->us_domain]; 3109 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); 3110 } 3111 3112 return (item); 3113 } 3114 3115 static int 3116 zone_import(void *arg, void **bucket, int max, int domain, int flags) 3117 { 3118 uma_zone_t zone; 3119 uma_slab_t slab; 3120 uma_keg_t keg; 3121 #ifdef NUMA 3122 int stripe; 3123 #endif 3124 int i; 3125 3126 zone = arg; 3127 slab = NULL; 3128 keg = zone->uz_keg; 3129 KEG_LOCK(keg); 3130 /* Try to keep the buckets totally full */ 3131 for (i = 0; i < max; ) { 3132 if ((slab = keg_fetch_slab(keg, zone, domain, flags)) == NULL) 3133 break; 3134 #ifdef NUMA 3135 stripe = howmany(max, vm_ndomains); 3136 #endif 3137 while (slab->us_freecount && i < max) { 3138 bucket[i++] = slab_alloc_item(keg, slab); 3139 if (keg->uk_free <= keg->uk_reserve) 3140 break; 3141 #ifdef NUMA 3142 /* 3143 * If the zone is striped we pick a new slab for every 3144 * N allocations. Eliminating this conditional will 3145 * instead pick a new domain for each bucket rather 3146 * than stripe within each bucket. The current option 3147 * produces more fragmentation and requires more cpu 3148 * time but yields better distribution. 3149 */ 3150 if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 && 3151 vm_ndomains > 1 && --stripe == 0) 3152 break; 3153 #endif 3154 } 3155 /* Don't block if we allocated any successfully. 
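 * (Subsequent slab fetches for this bucket are therefore made with
 * M_NOWAIT.)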
*/ 3156 flags &= ~M_WAITOK; 3157 flags |= M_NOWAIT; 3158 } 3159 KEG_UNLOCK(keg); 3160 3161 return i; 3162 } 3163 3164 static uma_bucket_t 3165 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags) 3166 { 3167 uma_bucket_t bucket; 3168 int maxbucket, cnt; 3169 3170 CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); 3171 3172 /* Avoid allocs targeting empty domains. */ 3173 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 3174 domain = UMA_ANYDOMAIN; 3175 3176 if (zone->uz_max_items > 0) { 3177 if (zone->uz_items >= zone->uz_max_items) 3178 return (false); 3179 maxbucket = MIN(zone->uz_bucket_size, 3180 zone->uz_max_items - zone->uz_items); 3181 zone->uz_items += maxbucket; 3182 } else 3183 maxbucket = zone->uz_bucket_size; 3184 ZONE_UNLOCK(zone); 3185 3186 /* Don't wait for buckets, preserve caller's NOVM setting. */ 3187 bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); 3188 if (bucket == NULL) { 3189 cnt = 0; 3190 goto out; 3191 } 3192 3193 bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, 3194 MIN(maxbucket, bucket->ub_entries), domain, flags); 3195 3196 /* 3197 * Initialize the memory if necessary. 3198 */ 3199 if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { 3200 int i; 3201 3202 for (i = 0; i < bucket->ub_cnt; i++) 3203 if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size, 3204 flags) != 0) 3205 break; 3206 /* 3207 * If we couldn't initialize the whole bucket, put the 3208 * rest back onto the freelist. 3209 */ 3210 if (i != bucket->ub_cnt) { 3211 zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], 3212 bucket->ub_cnt - i); 3213 #ifdef INVARIANTS 3214 bzero(&bucket->ub_bucket[i], 3215 sizeof(void *) * (bucket->ub_cnt - i)); 3216 #endif 3217 bucket->ub_cnt = i; 3218 } 3219 } 3220 3221 cnt = bucket->ub_cnt; 3222 if (bucket->ub_cnt == 0) { 3223 bucket_free(zone, bucket, udata); 3224 counter_u64_add(zone->uz_fails, 1); 3225 bucket = NULL; 3226 } 3227 out: 3228 ZONE_LOCK(zone); 3229 if (zone->uz_max_items > 0 && cnt < maxbucket) { 3230 MPASS(zone->uz_items >= maxbucket - cnt); 3231 zone->uz_items -= maxbucket - cnt; 3232 if (zone->uz_sleepers > 0 && 3233 (cnt == 0 ? zone->uz_items + 1 : zone->uz_items) < 3234 zone->uz_max_items) 3235 wakeup_one(zone); 3236 } 3237 3238 return (bucket); 3239 } 3240 3241 /* 3242 * Allocates a single item from a zone. 3243 * 3244 * Arguments 3245 * zone The zone to alloc for. 3246 * udata The data to be passed to the constructor. 3247 * domain The domain to allocate from or UMA_ANYDOMAIN. 3248 * flags M_WAITOK, M_NOWAIT, M_ZERO. 3249 * 3250 * Returns 3251 * NULL if there is no memory and M_NOWAIT is set 3252 * An item if successful 3253 */ 3254 3255 static void * 3256 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) 3257 { 3258 3259 ZONE_LOCK(zone); 3260 return (zone_alloc_item_locked(zone, udata, domain, flags)); 3261 } 3262 3263 /* 3264 * Returns with zone unlocked. 
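 * (Expects the zone lock to be held on entry; the lock is dropped before
 * the item is imported and constructed, and is re-taken on failure only
 * to undo the limit accounting.)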
3265 */ 3266 static void * 3267 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags) 3268 { 3269 void *item; 3270 3271 ZONE_LOCK_ASSERT(zone); 3272 3273 if (zone->uz_max_items > 0) { 3274 if (zone->uz_items >= zone->uz_max_items) { 3275 zone_log_warning(zone); 3276 zone_maxaction(zone); 3277 if (flags & M_NOWAIT) { 3278 ZONE_UNLOCK(zone); 3279 return (NULL); 3280 } 3281 zone->uz_sleeps++; 3282 zone->uz_sleepers++; 3283 while (zone->uz_items >= zone->uz_max_items) 3284 mtx_sleep(zone, zone->uz_lockptr, PVM, 3285 "zonelimit", 0); 3286 zone->uz_sleepers--; 3287 if (zone->uz_sleepers > 0 && 3288 zone->uz_items + 1 < zone->uz_max_items) 3289 wakeup_one(zone); 3290 } 3291 zone->uz_items++; 3292 } 3293 ZONE_UNLOCK(zone); 3294 3295 /* Avoid allocs targeting empty domains. */ 3296 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 3297 domain = UMA_ANYDOMAIN; 3298 3299 if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) 3300 goto fail_cnt; 3301 3302 /* 3303 * We have to call both the zone's init (not the keg's init) 3304 * and the zone's ctor. This is because the item is going from 3305 * a keg slab directly to the user, and the user is expecting it 3306 * to be both zone-init'd as well as zone-ctor'd. 3307 */ 3308 if (zone->uz_init != NULL) { 3309 if (zone->uz_init(item, zone->uz_size, flags) != 0) { 3310 zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); 3311 goto fail_cnt; 3312 } 3313 } 3314 item = item_ctor(zone, udata, flags, item); 3315 if (item == NULL) 3316 goto fail; 3317 3318 counter_u64_add(zone->uz_allocs, 1); 3319 CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, 3320 zone->uz_name, zone); 3321 3322 return (item); 3323 3324 fail_cnt: 3325 counter_u64_add(zone->uz_fails, 1); 3326 fail: 3327 if (zone->uz_max_items > 0) { 3328 ZONE_LOCK(zone); 3329 /* XXX Decrement without wakeup */ 3330 zone->uz_items--; 3331 ZONE_UNLOCK(zone); 3332 } 3333 CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", 3334 zone->uz_name, zone); 3335 return (NULL); 3336 } 3337 3338 /* See uma.h */ 3339 void 3340 uma_zfree_arg(uma_zone_t zone, void *item, void *udata) 3341 { 3342 uma_cache_t cache; 3343 uma_bucket_t bucket; 3344 int cpu, domain, itemdomain; 3345 3346 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3347 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3348 3349 CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, 3350 zone->uz_name); 3351 3352 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3353 ("uma_zfree_arg: called with spinlock or critical section held")); 3354 3355 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3356 if (item == NULL) 3357 return; 3358 #ifdef DEBUG_MEMGUARD 3359 if (is_memguard_addr(item)) { 3360 if (zone->uz_dtor != NULL) 3361 zone->uz_dtor(item, zone->uz_size, udata); 3362 if (zone->uz_fini != NULL) 3363 zone->uz_fini(item, zone->uz_size); 3364 memguard_free(item); 3365 return; 3366 } 3367 #endif 3368 item_dtor(zone, item, udata, SKIP_NONE); 3369 3370 /* 3371 * The race here is acceptable. If we miss it we'll just have to wait 3372 * a little longer for the limits to be reset. 3373 */ 3374 if (zone->uz_sleepers > 0) 3375 goto zfree_item; 3376 3377 /* 3378 * If possible, free to the per-CPU cache. There are two 3379 * requirements for safe access to the per-CPU cache: (1) the thread 3380 * accessing the cache must not be preempted or yield during access, 3381 * and (2) the thread must not migrate CPUs without switching which 3382 * cache it accesses. 
We rely on a critical section to prevent 3383 * preemption and migration. We release the critical section in 3384 * order to acquire the zone mutex if we are unable to free to the 3385 * current cache; when we re-acquire the critical section, we must 3386 * detect and handle migration if it has occurred. 3387 */ 3388 domain = itemdomain = 0; 3389 critical_enter(); 3390 do { 3391 cpu = curcpu; 3392 cache = &zone->uz_cpu[cpu]; 3393 bucket = cache->uc_allocbucket; 3394 #ifdef UMA_XDOMAIN 3395 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { 3396 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); 3397 domain = PCPU_GET(domain); 3398 } 3399 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0 && 3400 domain != itemdomain) { 3401 bucket = cache->uc_crossbucket; 3402 } else 3403 #endif 3404 3405 /* 3406 * Try to free into the allocbucket first to give LIFO ordering 3407 * for cache-hot datastructures. Spill over into the freebucket 3408 * if necessary. Alloc will swap them if one runs dry. 3409 */ 3410 if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) 3411 bucket = cache->uc_freebucket; 3412 if (__predict_true(bucket != NULL && 3413 bucket->ub_cnt < bucket->ub_entries)) { 3414 bucket_push(zone, cache, bucket, item); 3415 critical_exit(); 3416 return; 3417 } 3418 } while (cache_free(zone, cache, udata, item, itemdomain)); 3419 critical_exit(); 3420 3421 /* 3422 * If nothing else caught this, we'll just do an internal free. 3423 */ 3424 zfree_item: 3425 zone_free_item(zone, item, udata, SKIP_DTOR); 3426 } 3427 3428 static void 3429 zone_free_bucket(uma_zone_t zone, uma_bucket_t bucket, void *udata, 3430 int domain, int itemdomain) 3431 { 3432 uma_zone_domain_t zdom; 3433 3434 #ifdef UMA_XDOMAIN 3435 /* 3436 * Buckets coming from the wrong domain will be entirely for the 3437 * only other domain on two domain systems. In this case we can 3438 * simply cache them. Otherwise we need to sort them back to 3439 * correct domains by freeing the contents to the slab layer. 3440 */ 3441 if (domain != itemdomain && vm_ndomains > 2) { 3442 CTR3(KTR_UMA, 3443 "uma_zfree: zone %s(%p) draining cross bucket %p", 3444 zone->uz_name, zone, bucket); 3445 bucket_drain(zone, bucket); 3446 bucket_free(zone, bucket, udata); 3447 return; 3448 } 3449 #endif 3450 /* 3451 * Attempt to save the bucket in the zone's domain bucket cache. 3452 * 3453 * We bump the uz count when the cache size is insufficient to 3454 * handle the working set. 3455 */ 3456 if (ZONE_TRYLOCK(zone) == 0) { 3457 /* Record contention to size the buckets. */ 3458 ZONE_LOCK(zone); 3459 if (zone->uz_bucket_size < zone->uz_bucket_size_max) 3460 zone->uz_bucket_size++; 3461 } 3462 3463 CTR3(KTR_UMA, 3464 "uma_zfree: zone %s(%p) putting bucket %p on free list", 3465 zone->uz_name, zone, bucket); 3466 /* ub_cnt is pointing to the last free item */ 3467 KASSERT(bucket->ub_cnt == bucket->ub_entries, 3468 ("uma_zfree: Attempting to insert partial bucket onto the full list.\n")); 3469 if (zone->uz_bkt_count >= zone->uz_bkt_max) { 3470 ZONE_UNLOCK(zone); 3471 bucket_drain(zone, bucket); 3472 bucket_free(zone, bucket, udata); 3473 } else { 3474 zdom = &zone->uz_domain[itemdomain]; 3475 zone_put_bucket(zone, zdom, bucket, true); 3476 ZONE_UNLOCK(zone); 3477 } 3478 } 3479 3480 /* 3481 * Populate a free or cross bucket for the current cpu cache. Free any 3482 * existing full bucket either to the zone cache or back to the slab layer. 3483 * 3484 * Enters and returns in a critical section. 
false return indicates that 3485 * we can not satisfy this free in the cache layer. true indicates that 3486 * the caller should retry. 3487 */ 3488 static __noinline bool 3489 cache_free(uma_zone_t zone, uma_cache_t cache, void *udata, void *item, 3490 int itemdomain) 3491 { 3492 uma_bucket_t bucket; 3493 int cpu, domain; 3494 3495 CRITICAL_ASSERT(curthread); 3496 3497 if (zone->uz_bucket_size == 0 || bucketdisable) 3498 return false; 3499 3500 cpu = curcpu; 3501 cache = &zone->uz_cpu[cpu]; 3502 3503 /* 3504 * NUMA domains need to free to the correct zdom. When XDOMAIN 3505 * is enabled this is the zdom of the item and the bucket may be 3506 * the cross bucket if they do not match. 3507 */ 3508 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 3509 #ifdef UMA_XDOMAIN 3510 domain = PCPU_GET(domain); 3511 #else 3512 itemdomain = domain = PCPU_GET(domain); 3513 #endif 3514 else 3515 itemdomain = domain = 0; 3516 #ifdef UMA_XDOMAIN 3517 if (domain != itemdomain) { 3518 bucket = cache->uc_crossbucket; 3519 cache->uc_crossbucket = NULL; 3520 if (bucket != NULL) 3521 atomic_add_64(&zone->uz_xdomain, bucket->ub_cnt); 3522 } else 3523 #endif 3524 { 3525 bucket = cache->uc_freebucket; 3526 cache->uc_freebucket = NULL; 3527 } 3528 3529 3530 /* We are no longer associated with this CPU. */ 3531 critical_exit(); 3532 3533 if (bucket != NULL) 3534 zone_free_bucket(zone, bucket, udata, domain, itemdomain); 3535 3536 bucket = bucket_alloc(zone, udata, M_NOWAIT); 3537 CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p", 3538 zone->uz_name, zone, bucket); 3539 critical_enter(); 3540 if (bucket == NULL) 3541 return (false); 3542 cpu = curcpu; 3543 cache = &zone->uz_cpu[cpu]; 3544 #ifdef UMA_XDOMAIN 3545 /* 3546 * Check to see if we should be populating the cross bucket. If it 3547 * is already populated we will fall through and attempt to populate 3548 * the free bucket. 3549 */ 3550 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { 3551 domain = PCPU_GET(domain); 3552 if (domain != itemdomain && cache->uc_crossbucket == NULL) { 3553 cache->uc_crossbucket = bucket; 3554 return (true); 3555 } 3556 } 3557 #endif 3558 /* 3559 * We may have lost the race to fill the bucket or switched CPUs. 3560 */ 3561 if (cache->uc_freebucket != NULL) { 3562 critical_exit(); 3563 bucket_free(zone, bucket, udata); 3564 critical_enter(); 3565 } else 3566 cache->uc_freebucket = bucket; 3567 3568 return (true); 3569 } 3570 3571 void 3572 uma_zfree_domain(uma_zone_t zone, void *item, void *udata) 3573 { 3574 3575 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3576 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3577 3578 CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread, 3579 zone->uz_name); 3580 3581 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3582 ("uma_zfree_domain: called with spinlock or critical section held")); 3583 3584 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3585 if (item == NULL) 3586 return; 3587 zone_free_item(zone, item, udata, SKIP_NONE); 3588 } 3589 3590 static void 3591 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) 3592 { 3593 uma_keg_t keg; 3594 uma_domain_t dom; 3595 uint8_t freei; 3596 3597 keg = zone->uz_keg; 3598 MPASS(zone->uz_lockptr == &keg->uk_lock); 3599 KEG_LOCK_ASSERT(keg); 3600 3601 dom = &keg->uk_domain[slab->us_domain]; 3602 3603 /* Do we need to remove from any lists? 
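 * (A slab that becomes entirely free moves to ud_free_slab; a slab that
 * was completely full moves back to ud_part_slab.)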
*/ 3604 if (slab->us_freecount+1 == keg->uk_ipers) { 3605 LIST_REMOVE(slab, us_link); 3606 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); 3607 } else if (slab->us_freecount == 0) { 3608 LIST_REMOVE(slab, us_link); 3609 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3610 } 3611 3612 /* Slab management. */ 3613 freei = slab_item_index(slab, keg, item); 3614 BIT_SET(keg->uk_ipers, freei, &slab->us_free); 3615 slab->us_freecount++; 3616 3617 /* Keg statistics. */ 3618 keg->uk_free++; 3619 } 3620 3621 static void 3622 zone_release(void *arg, void **bucket, int cnt) 3623 { 3624 uma_zone_t zone; 3625 void *item; 3626 uma_slab_t slab; 3627 uma_keg_t keg; 3628 uint8_t *mem; 3629 int i; 3630 3631 zone = arg; 3632 keg = zone->uz_keg; 3633 KEG_LOCK(keg); 3634 for (i = 0; i < cnt; i++) { 3635 item = bucket[i]; 3636 if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) { 3637 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 3638 if (zone->uz_flags & UMA_ZONE_HASH) { 3639 slab = hash_sfind(&keg->uk_hash, mem); 3640 } else { 3641 mem += keg->uk_pgoff; 3642 slab = (uma_slab_t)mem; 3643 } 3644 } else 3645 slab = vtoslab((vm_offset_t)item); 3646 slab_free_item(zone, slab, item); 3647 } 3648 KEG_UNLOCK(keg); 3649 } 3650 3651 /* 3652 * Frees a single item to any zone. 3653 * 3654 * Arguments: 3655 * zone The zone to free to 3656 * item The item we're freeing 3657 * udata User supplied data for the dtor 3658 * skip Skip dtors and finis 3659 */ 3660 static void 3661 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 3662 { 3663 3664 item_dtor(zone, item, udata, skip); 3665 3666 if (skip < SKIP_FINI && zone->uz_fini) 3667 zone->uz_fini(item, zone->uz_size); 3668 3669 zone->uz_release(zone->uz_arg, &item, 1); 3670 3671 if (skip & SKIP_CNT) 3672 return; 3673 3674 counter_u64_add(zone->uz_frees, 1); 3675 3676 if (zone->uz_max_items > 0) { 3677 ZONE_LOCK(zone); 3678 zone->uz_items--; 3679 if (zone->uz_sleepers > 0 && 3680 zone->uz_items < zone->uz_max_items) 3681 wakeup_one(zone); 3682 ZONE_UNLOCK(zone); 3683 } 3684 } 3685 3686 /* See uma.h */ 3687 int 3688 uma_zone_set_max(uma_zone_t zone, int nitems) 3689 { 3690 struct uma_bucket_zone *ubz; 3691 int count; 3692 3693 ZONE_LOCK(zone); 3694 ubz = bucket_zone_max(zone, nitems); 3695 count = ubz != NULL ? ubz->ubz_entries : 0; 3696 zone->uz_bucket_size_max = zone->uz_bucket_size = count; 3697 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 3698 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 3699 zone->uz_max_items = nitems; 3700 ZONE_UNLOCK(zone); 3701 3702 return (nitems); 3703 } 3704 3705 /* See uma.h */ 3706 void 3707 uma_zone_set_maxcache(uma_zone_t zone, int nitems) 3708 { 3709 struct uma_bucket_zone *ubz; 3710 int bpcpu; 3711 3712 ZONE_LOCK(zone); 3713 ubz = bucket_zone_max(zone, nitems); 3714 if (ubz != NULL) { 3715 bpcpu = 2; 3716 #ifdef UMA_XDOMAIN 3717 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 3718 /* Count the cross-domain bucket. 
*/ 3719 bpcpu++; 3720 #endif 3721 nitems -= ubz->ubz_entries * bpcpu * mp_ncpus; 3722 zone->uz_bucket_size_max = ubz->ubz_entries; 3723 } else { 3724 zone->uz_bucket_size_max = zone->uz_bucket_size = 0; 3725 } 3726 if (zone->uz_bucket_size_min > zone->uz_bucket_size_max) 3727 zone->uz_bucket_size_min = zone->uz_bucket_size_max; 3728 zone->uz_bkt_max = nitems; 3729 ZONE_UNLOCK(zone); 3730 } 3731 3732 /* See uma.h */ 3733 int 3734 uma_zone_get_max(uma_zone_t zone) 3735 { 3736 int nitems; 3737 3738 ZONE_LOCK(zone); 3739 nitems = zone->uz_max_items; 3740 ZONE_UNLOCK(zone); 3741 3742 return (nitems); 3743 } 3744 3745 /* See uma.h */ 3746 void 3747 uma_zone_set_warning(uma_zone_t zone, const char *warning) 3748 { 3749 3750 ZONE_LOCK(zone); 3751 zone->uz_warning = warning; 3752 ZONE_UNLOCK(zone); 3753 } 3754 3755 /* See uma.h */ 3756 void 3757 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) 3758 { 3759 3760 ZONE_LOCK(zone); 3761 TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); 3762 ZONE_UNLOCK(zone); 3763 } 3764 3765 /* See uma.h */ 3766 int 3767 uma_zone_get_cur(uma_zone_t zone) 3768 { 3769 int64_t nitems; 3770 u_int i; 3771 3772 ZONE_LOCK(zone); 3773 nitems = counter_u64_fetch(zone->uz_allocs) - 3774 counter_u64_fetch(zone->uz_frees); 3775 if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) { 3776 CPU_FOREACH(i) { 3777 /* 3778 * See the comment in uma_vm_zone_stats() regarding 3779 * the safety of accessing the per-cpu caches. With 3780 * the zone lock held, it is safe, but can potentially 3781 * result in stale data. 3782 */ 3783 nitems += zone->uz_cpu[i].uc_allocs - 3784 zone->uz_cpu[i].uc_frees; 3785 } 3786 } 3787 ZONE_UNLOCK(zone); 3788 3789 return (nitems < 0 ? 0 : nitems); 3790 } 3791 3792 static uint64_t 3793 uma_zone_get_allocs(uma_zone_t zone) 3794 { 3795 uint64_t nitems; 3796 u_int i; 3797 3798 ZONE_LOCK(zone); 3799 nitems = counter_u64_fetch(zone->uz_allocs); 3800 if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) { 3801 CPU_FOREACH(i) { 3802 /* 3803 * See the comment in uma_vm_zone_stats() regarding 3804 * the safety of accessing the per-cpu caches. With 3805 * the zone lock held, it is safe, but can potentially 3806 * result in stale data. 3807 */ 3808 nitems += zone->uz_cpu[i].uc_allocs; 3809 } 3810 } 3811 ZONE_UNLOCK(zone); 3812 3813 return (nitems); 3814 } 3815 3816 static uint64_t 3817 uma_zone_get_frees(uma_zone_t zone) 3818 { 3819 uint64_t nitems; 3820 u_int i; 3821 3822 ZONE_LOCK(zone); 3823 nitems = counter_u64_fetch(zone->uz_frees); 3824 if ((zone->uz_flags & UMA_ZFLAG_INTERNAL) == 0) { 3825 CPU_FOREACH(i) { 3826 /* 3827 * See the comment in uma_vm_zone_stats() regarding 3828 * the safety of accessing the per-cpu caches. With 3829 * the zone lock held, it is safe, but can potentially 3830 * result in stale data. 
3831 */ 3832 nitems += zone->uz_cpu[i].uc_frees; 3833 } 3834 } 3835 ZONE_UNLOCK(zone); 3836 3837 return (nitems); 3838 } 3839 3840 /* See uma.h */ 3841 void 3842 uma_zone_set_init(uma_zone_t zone, uma_init uminit) 3843 { 3844 uma_keg_t keg; 3845 3846 KEG_GET(zone, keg); 3847 KEG_LOCK(keg); 3848 KASSERT(keg->uk_pages == 0, 3849 ("uma_zone_set_init on non-empty keg")); 3850 keg->uk_init = uminit; 3851 KEG_UNLOCK(keg); 3852 } 3853 3854 /* See uma.h */ 3855 void 3856 uma_zone_set_fini(uma_zone_t zone, uma_fini fini) 3857 { 3858 uma_keg_t keg; 3859 3860 KEG_GET(zone, keg); 3861 KEG_LOCK(keg); 3862 KASSERT(keg->uk_pages == 0, 3863 ("uma_zone_set_fini on non-empty keg")); 3864 keg->uk_fini = fini; 3865 KEG_UNLOCK(keg); 3866 } 3867 3868 /* See uma.h */ 3869 void 3870 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) 3871 { 3872 3873 ZONE_LOCK(zone); 3874 KASSERT(zone->uz_keg->uk_pages == 0, 3875 ("uma_zone_set_zinit on non-empty keg")); 3876 zone->uz_init = zinit; 3877 ZONE_UNLOCK(zone); 3878 } 3879 3880 /* See uma.h */ 3881 void 3882 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) 3883 { 3884 3885 ZONE_LOCK(zone); 3886 KASSERT(zone->uz_keg->uk_pages == 0, 3887 ("uma_zone_set_zfini on non-empty keg")); 3888 zone->uz_fini = zfini; 3889 ZONE_UNLOCK(zone); 3890 } 3891 3892 /* See uma.h */ 3893 /* XXX uk_freef is not actually used with the zone locked */ 3894 void 3895 uma_zone_set_freef(uma_zone_t zone, uma_free freef) 3896 { 3897 uma_keg_t keg; 3898 3899 KEG_GET(zone, keg); 3900 KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type")); 3901 KEG_LOCK(keg); 3902 keg->uk_freef = freef; 3903 KEG_UNLOCK(keg); 3904 } 3905 3906 /* See uma.h */ 3907 /* XXX uk_allocf is not actually used with the zone locked */ 3908 void 3909 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) 3910 { 3911 uma_keg_t keg; 3912 3913 KEG_GET(zone, keg); 3914 KEG_LOCK(keg); 3915 keg->uk_allocf = allocf; 3916 KEG_UNLOCK(keg); 3917 } 3918 3919 /* See uma.h */ 3920 void 3921 uma_zone_reserve(uma_zone_t zone, int items) 3922 { 3923 uma_keg_t keg; 3924 3925 KEG_GET(zone, keg); 3926 KEG_LOCK(keg); 3927 keg->uk_reserve = items; 3928 KEG_UNLOCK(keg); 3929 } 3930 3931 /* See uma.h */ 3932 int 3933 uma_zone_reserve_kva(uma_zone_t zone, int count) 3934 { 3935 uma_keg_t keg; 3936 vm_offset_t kva; 3937 u_int pages; 3938 3939 KEG_GET(zone, keg); 3940 3941 pages = count / keg->uk_ipers; 3942 if (pages * keg->uk_ipers < count) 3943 pages++; 3944 pages *= keg->uk_ppera; 3945 3946 #ifdef UMA_MD_SMALL_ALLOC 3947 if (keg->uk_ppera > 1) { 3948 #else 3949 if (1) { 3950 #endif 3951 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); 3952 if (kva == 0) 3953 return (0); 3954 } else 3955 kva = 0; 3956 3957 ZONE_LOCK(zone); 3958 MPASS(keg->uk_kva == 0); 3959 keg->uk_kva = kva; 3960 keg->uk_offset = 0; 3961 zone->uz_max_items = pages * keg->uk_ipers; 3962 #ifdef UMA_MD_SMALL_ALLOC 3963 keg->uk_allocf = (keg->uk_ppera > 1) ? 

/* See uma.h */
void
uma_prealloc(uma_zone_t zone, int items)
{
	struct vm_domainset_iter di;
	uma_domain_t dom;
	uma_slab_t slab;
	uma_keg_t keg;
	int aflags, domain, slabs;

	KEG_GET(zone, keg);
	KEG_LOCK(keg);
	slabs = items / keg->uk_ipers;
	if (slabs * keg->uk_ipers < items)
		slabs++;
	while (slabs-- > 0) {
		aflags = M_NOWAIT;
		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
		    &aflags);
		for (;;) {
			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
			    aflags);
			if (slab != NULL) {
				dom = &keg->uk_domain[slab->us_domain];
				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
				    us_link);
				break;
			}
			KEG_LOCK(keg);
			if (vm_domainset_iter_policy(&di, &domain) != 0) {
				KEG_UNLOCK(keg);
				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask);
				KEG_LOCK(keg);
			}
		}
	}
	KEG_UNLOCK(keg);
}

/* See uma.h */
void
uma_reclaim(int req)
{

	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
	sx_xlock(&uma_reclaim_lock);
	bucket_enable();

	switch (req) {
	case UMA_RECLAIM_TRIM:
		zone_foreach(zone_trim, NULL);
		break;
	case UMA_RECLAIM_DRAIN:
	case UMA_RECLAIM_DRAIN_CPU:
		zone_foreach(zone_drain, NULL);
		if (req == UMA_RECLAIM_DRAIN_CPU) {
			pcpu_cache_drain_safe(NULL);
			zone_foreach(zone_drain, NULL);
		}
		break;
	default:
		panic("unhandled reclamation request %d", req);
	}

	/*
	 * Some slabs may have been freed but this zone will be visited early
	 * in this pass, so visit it again so that we can free pages that are
	 * empty once other zones are drained.  We have to do the same for
	 * buckets.
	 */
	zone_drain(slabzone, NULL);
	bucket_zone_drain();
	sx_xunlock(&uma_reclaim_lock);
}

static volatile int uma_reclaim_needed;

void
uma_reclaim_wakeup(void)
{

	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
		wakeup(uma_reclaim);
}

void
uma_reclaim_worker(void *arg __unused)
{

	for (;;) {
		sx_xlock(&uma_reclaim_lock);
		while (atomic_load_int(&uma_reclaim_needed) == 0)
			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
			    hz);
		sx_xunlock(&uma_reclaim_lock);
		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
		atomic_store_int(&uma_reclaim_needed, 0);
		/* Don't fire more than once per second. */
		pause("umarclslp", hz);
	}
}
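
/*
 * Example: pre-filling a zone and returning cached memory to the system.
 * Illustrative sketch only, kept inside a comment; "foo_zone" is
 * hypothetical and the calls are the interfaces implemented above.
 *
 * Populate the keg with enough slabs for 256 items up front, e.g. so that
 * later M_NOWAIT allocations are unlikely to fail:
 *
 *	uma_prealloc(foo_zone, 256);
 *
 * Hand cached memory back, either by trimming every zone or by draining
 * all buckets including the per-CPU caches:
 *
 *	uma_reclaim(UMA_RECLAIM_TRIM);
 *	uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
 *
 * From a context that cannot sleep, defer the work to the reclaim worker
 * thread instead:
 *
 *	uma_reclaim_wakeup();
 */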

/* See uma.h */
void
uma_zone_reclaim(uma_zone_t zone, int req)
{

	switch (req) {
	case UMA_RECLAIM_TRIM:
		zone_trim(zone, NULL);
		break;
	case UMA_RECLAIM_DRAIN:
		zone_drain(zone, NULL);
		break;
	case UMA_RECLAIM_DRAIN_CPU:
		pcpu_cache_drain_safe(zone);
		zone_drain(zone, NULL);
		break;
	default:
		panic("unhandled reclamation request %d", req);
	}
}

/* See uma.h */
int
uma_zone_exhausted(uma_zone_t zone)
{
	int full;

	ZONE_LOCK(zone);
	full = zone->uz_sleepers > 0;
	ZONE_UNLOCK(zone);
	return (full);
}

int
uma_zone_exhausted_nolock(uma_zone_t zone)
{
	return (zone->uz_sleepers > 0);
}

static void
uma_zero_item(void *item, uma_zone_t zone)
{

	bzero(item, zone->uz_size);
}

unsigned long
uma_limit(void)
{

	return (uma_kmem_limit);
}

void
uma_set_limit(unsigned long limit)
{

	uma_kmem_limit = limit;
}

unsigned long
uma_size(void)
{

	return (atomic_load_long(&uma_kmem_total));
}

long
uma_avail(void)
{

	return (uma_kmem_limit - uma_size());
}
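
/*
 * Example: working with the kmem soft limit and per-zone reclamation.
 * Illustrative sketch only, kept inside a comment; the limit value and
 * "foo_zone" are hypothetical.  In the stock kernel uma_set_limit() is
 * normally called once during startup, based on the size of the kmem arena.
 *
 *	uma_set_limit(512UL * 1024 * 1024);
 *
 * Check the remaining headroom before an optional allocation and nudge the
 * reclaim worker when UMA is close to the limit:
 *
 *	if (uma_avail() < 16 * PAGE_SIZE)
 *		uma_reclaim_wakeup();
 *
 * Reclaim the caches of a single zone, e.g. when its subsystem goes idle,
 * without draining the rest of the system:
 *
 *	uma_zone_reclaim(foo_zone, UMA_RECLAIM_DRAIN);
 */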

#ifdef DDB
/*
 * Generate statistics across both the zone and its per-cpu caches.  Return
 * desired statistics if the pointer is non-NULL for that statistic.
 *
 * Note: does not update the zone statistics, as it can't safely clear the
 * per-CPU cache statistic.
 *
 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
 * safe from off-CPU; we should modify the caches to track this information
 * directly so that we don't have to.
 */
static void
uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
    uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
{
	uma_cache_t cache;
	uint64_t allocs, frees, sleeps, xdomain;
	int cachefree, cpu;

	allocs = frees = sleeps = xdomain = 0;
	cachefree = 0;
	CPU_FOREACH(cpu) {
		cache = &z->uz_cpu[cpu];
		if (cache->uc_allocbucket != NULL)
			cachefree += cache->uc_allocbucket->ub_cnt;
		if (cache->uc_freebucket != NULL)
			cachefree += cache->uc_freebucket->ub_cnt;
		if (cache->uc_crossbucket != NULL) {
			xdomain += cache->uc_crossbucket->ub_cnt;
			cachefree += cache->uc_crossbucket->ub_cnt;
		}
		allocs += cache->uc_allocs;
		frees += cache->uc_frees;
	}
	allocs += counter_u64_fetch(z->uz_allocs);
	frees += counter_u64_fetch(z->uz_frees);
	sleeps += z->uz_sleeps;
	xdomain += z->uz_xdomain;
	if (cachefreep != NULL)
		*cachefreep = cachefree;
	if (allocsp != NULL)
		*allocsp = allocs;
	if (freesp != NULL)
		*freesp = frees;
	if (sleepsp != NULL)
		*sleepsp = sleeps;
	if (xdomainp != NULL)
		*xdomainp = xdomain;
}
#endif /* DDB */

static int
sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
{
	uma_keg_t kz;
	uma_zone_t z;
	int count;

	count = 0;
	rw_rlock(&uma_rwlock);
	LIST_FOREACH(kz, &uma_kegs, uk_link) {
		LIST_FOREACH(z, &kz->uk_zones, uz_link)
			count++;
	}
	LIST_FOREACH(z, &uma_cachezones, uz_link)
		count++;

	rw_runlock(&uma_rwlock);
	return (sysctl_handle_int(oidp, &count, 0, req));
}

static void
uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
    struct uma_percpu_stat *ups, bool internal)
{
	uma_zone_domain_t zdom;
	uma_bucket_t bucket;
	uma_cache_t cache;
	int i;

	for (i = 0; i < vm_ndomains; i++) {
		zdom = &z->uz_domain[i];
		uth->uth_zone_free += zdom->uzd_nitems;
	}
	uth->uth_allocs = counter_u64_fetch(z->uz_allocs);
	uth->uth_frees = counter_u64_fetch(z->uz_frees);
	uth->uth_fails = counter_u64_fetch(z->uz_fails);
	uth->uth_sleeps = z->uz_sleeps;
	uth->uth_xdomain = z->uz_xdomain;

	/*
	 * While it is not normally safe to access the cache bucket pointers
	 * while not on the CPU that owns the cache, we only allow the pointers
	 * to be exchanged without the zone lock held, not invalidated, so
	 * accept the possible race associated with bucket exchange during
	 * monitoring.  Use atomic_load_ptr() to ensure that the bucket pointers
	 * are loaded only once.
	 */
	for (i = 0; i < mp_maxid + 1; i++) {
		bzero(&ups[i], sizeof(*ups));
		if (internal || CPU_ABSENT(i))
			continue;
		cache = &z->uz_cpu[i];
		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_allocbucket);
		if (bucket != NULL)
			ups[i].ups_cache_free += bucket->ub_cnt;
		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_freebucket);
		if (bucket != NULL)
			ups[i].ups_cache_free += bucket->ub_cnt;
		bucket = (uma_bucket_t)atomic_load_ptr(&cache->uc_crossbucket);
		if (bucket != NULL)
			ups[i].ups_cache_free += bucket->ub_cnt;
		ups[i].ups_allocs = cache->uc_allocs;
		ups[i].ups_frees = cache->uc_frees;
	}
}

static int
sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
{
	struct uma_stream_header ush;
	struct uma_type_header uth;
	struct uma_percpu_stat *ups;
	struct sbuf sbuf;
	uma_keg_t kz;
	uma_zone_t z;
	int count, error, i;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);

	count = 0;
	rw_rlock(&uma_rwlock);
	LIST_FOREACH(kz, &uma_kegs, uk_link) {
		LIST_FOREACH(z, &kz->uk_zones, uz_link)
			count++;
	}

	LIST_FOREACH(z, &uma_cachezones, uz_link)
		count++;

	/*
	 * Insert stream header.
	 */
	bzero(&ush, sizeof(ush));
	ush.ush_version = UMA_STREAM_VERSION;
	ush.ush_maxcpus = (mp_maxid + 1);
	ush.ush_count = count;
	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));

	LIST_FOREACH(kz, &uma_kegs, uk_link) {
		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
			bzero(&uth, sizeof(uth));
			ZONE_LOCK(z);
			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
			uth.uth_align = kz->uk_align;
			uth.uth_size = kz->uk_size;
			uth.uth_rsize = kz->uk_rsize;
			if (z->uz_max_items > 0)
				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
				    kz->uk_ppera;
			else
				uth.uth_pages = kz->uk_pages;
			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
			    kz->uk_ppera;
			uth.uth_limit = z->uz_max_items;
			uth.uth_keg_free = z->uz_keg->uk_free;

			/*
			 * A zone is secondary if it is not the first entry
			 * on the keg's zone list.
			 */
			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
			    (LIST_FIRST(&kz->uk_zones) != z))
				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
			uma_vm_zone_stats(&uth, z, &sbuf, ups,
			    kz->uk_flags & UMA_ZFLAG_INTERNAL);
			ZONE_UNLOCK(z);
			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
			for (i = 0; i < mp_maxid + 1; i++)
				(void)sbuf_bcat(&sbuf, &ups[i],
				    sizeof(ups[i]));
		}
	}
	LIST_FOREACH(z, &uma_cachezones, uz_link) {
		bzero(&uth, sizeof(uth));
		ZONE_LOCK(z);
		strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
		uth.uth_size = z->uz_size;
		uma_vm_zone_stats(&uth, z, &sbuf, ups, false);
		ZONE_UNLOCK(z);
		(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
		for (i = 0; i < mp_maxid + 1; i++)
			(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
	}

	rw_runlock(&uma_rwlock);
	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);
	free(ups, M_TEMP);
	return (error);
}

int
sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
{
	uma_zone_t zone = *(uma_zone_t *)arg1;
	int error, max;

	max = uma_zone_get_max(zone);
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);

	uma_zone_set_max(zone, max);

	return (0);
}

int
sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
{
	uma_zone_t zone;
	int cur;

	/*
	 * Some callers want to add sysctls for global zones that
	 * may not yet exist, so they pass a pointer to a pointer.
	 */
	if (arg2 == 0)
		zone = *(uma_zone_t *)arg1;
	else
		zone = arg1;
	cur = uma_zone_get_cur(zone);
	return (sysctl_handle_int(oidp, &cur, 0, req));
}

static int
sysctl_handle_uma_zone_allocs(SYSCTL_HANDLER_ARGS)
{
	uma_zone_t zone = arg1;
	uint64_t cur;

	cur = uma_zone_get_allocs(zone);
	return (sysctl_handle_64(oidp, &cur, 0, req));
}

static int
sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS)
{
	uma_zone_t zone = arg1;
	uint64_t cur;

	cur = uma_zone_get_frees(zone);
	return (sysctl_handle_64(oidp, &cur, 0, req));
}

#ifdef INVARIANTS
static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
	uma_slab_t slab;
	uma_keg_t keg;
	uint8_t *mem;

	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
		slab = vtoslab((vm_offset_t)mem);
	} else {
		/*
		 * It is safe to return the slab here even though the
		 * zone is unlocked because the item's allocation state
		 * essentially holds a reference.
		 */
		if (zone->uz_lockptr == &zone->uz_lock)
			return (NULL);
		ZONE_LOCK(zone);
		keg = zone->uz_keg;
		if (keg->uk_flags & UMA_ZONE_HASH)
			slab = hash_sfind(&keg->uk_hash, mem);
		else
			slab = (uma_slab_t)(mem + keg->uk_pgoff);
		ZONE_UNLOCK(zone);
	}

	return (slab);
}

static bool
uma_dbg_zskip(uma_zone_t zone, void *mem)
{

	if (zone->uz_lockptr == &zone->uz_lock)
		return (true);

	return (uma_dbg_kskip(zone->uz_keg, mem));
}

static bool
uma_dbg_kskip(uma_keg_t keg, void *mem)
{
	uintptr_t idx;

	if (dbg_divisor == 0)
		return (true);

	if (dbg_divisor == 1)
		return (false);

	idx = (uintptr_t)mem >> PAGE_SHIFT;
	if (keg->uk_ipers > 1) {
		idx *= keg->uk_ipers;
		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
	}

	if ((idx / dbg_divisor) * dbg_divisor != idx) {
		counter_u64_add(uma_skip_cnt, 1);
		return (true);
	}
	counter_u64_add(uma_dbg_cnt, 1);

	return (false);
}

/*
 * Set up the slab's freei data such that uma_dbg_free can function.
 */
static void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
	uma_keg_t keg;
	int freei;

	if (slab == NULL) {
		slab = uma_dbg_getslab(zone, item);
		if (slab == NULL)
			panic("uma: item %p did not belong to zone %s\n",
			    item, zone->uz_name);
	}
	keg = zone->uz_keg;
	freei = slab_item_index(slab, keg, item);

	if (BIT_ISSET(SLAB_MAX_SETSIZE, freei, &slab->us_debugfree))
		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
		    item, zone, zone->uz_name, slab, freei);
	BIT_SET_ATOMIC(SLAB_MAX_SETSIZE, freei, &slab->us_debugfree);

	return;
}

/*
 * Verifies freed addresses.  Checks for alignment, valid slab membership
 * and duplicate frees.
 */
static void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
	uma_keg_t keg;
	int freei;

	if (slab == NULL) {
		slab = uma_dbg_getslab(zone, item);
		if (slab == NULL)
			panic("uma: Freed item %p did not belong to zone %s\n",
			    item, zone->uz_name);
	}
	keg = zone->uz_keg;
	freei = slab_item_index(slab, keg, item);

	if (freei >= keg->uk_ipers)
		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
		    item, zone, zone->uz_name, slab, freei);

	if (slab_item(slab, keg, freei) != item)
		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
		    item, zone, zone->uz_name, slab, freei);

	if (!BIT_ISSET(SLAB_MAX_SETSIZE, freei, &slab->us_debugfree))
		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
		    item, zone, zone->uz_name, slab, freei);

	BIT_CLR_ATOMIC(SLAB_MAX_SETSIZE, freei, &slab->us_debugfree);
}
#endif /* INVARIANTS */

#ifdef DDB
static int64_t
get_uma_stats(uma_keg_t kz, uma_zone_t z, uint64_t *allocs, uint64_t *used,
    uint64_t *sleeps, long *cachefree, uint64_t *xdomain)
{
	uint64_t frees;
	int i;

	if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
		*allocs = counter_u64_fetch(z->uz_allocs);
		frees = counter_u64_fetch(z->uz_frees);
		*sleeps = z->uz_sleeps;
		*cachefree = 0;
		*xdomain = 0;
	} else
		uma_zone_sumstat(z, cachefree, allocs, &frees, sleeps,
		    xdomain);
	if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
	    (LIST_FIRST(&kz->uk_zones) != z)))
		*cachefree += kz->uk_free;
	for (i = 0; i < vm_ndomains; i++)
		*cachefree += z->uz_domain[i].uzd_nitems;
	*used = *allocs - frees;
	return (((int64_t)*used + *cachefree) * kz->uk_size);
}

DB_SHOW_COMMAND(uma, db_show_uma)
{
	const char *fmt_hdr, *fmt_entry;
	uma_keg_t kz;
	uma_zone_t z;
	uint64_t allocs, used, sleeps, xdomain;
	long cachefree;
	/* variables for sorting */
	uma_keg_t cur_keg;
	uma_zone_t cur_zone, last_zone;
	int64_t cur_size, last_size, size;
	int ties;

	/* /i option produces machine-parseable CSV output */
	if (modif[0] == 'i') {
		fmt_hdr = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n";
		fmt_entry = "\"%s\",%ju,%jd,%ld,%ju,%ju,%u,%jd,%ju\n";
	} else {
		fmt_hdr = "%18s %6s %7s %7s %11s %7s %7s %10s %8s\n";
		fmt_entry = "%18s %6ju %7jd %7ld %11ju %7ju %7u %10jd %8ju\n";
	}

	db_printf(fmt_hdr, "Zone", "Size", "Used", "Free", "Requests",
	    "Sleeps", "Bucket", "Total Mem", "XFree");

	/* Sort the zones with largest size first. */
	last_zone = NULL;
	last_size = INT64_MAX;
	for (;;) {
		cur_zone = NULL;
		cur_size = -1;
		ties = 0;
		LIST_FOREACH(kz, &uma_kegs, uk_link) {
			LIST_FOREACH(z, &kz->uk_zones, uz_link) {
				/*
				 * In the case of size ties, print out zones
				 * in the order they are encountered.  That is,
				 * when we encounter the most recently output
				 * zone, we have already printed all preceding
				 * ties, and we must print all following ties.
				 */
				if (z == last_zone) {
					ties = 1;
					continue;
				}
				size = get_uma_stats(kz, z, &allocs, &used,
				    &sleeps, &cachefree, &xdomain);
				if (size > cur_size && size < last_size + ties)
				{
					cur_size = size;
					cur_zone = z;
					cur_keg = kz;
				}
			}
		}
		if (cur_zone == NULL)
			break;

		size = get_uma_stats(cur_keg, cur_zone, &allocs, &used,
		    &sleeps, &cachefree, &xdomain);
		db_printf(fmt_entry, cur_zone->uz_name,
		    (uintmax_t)cur_keg->uk_size, (intmax_t)used, cachefree,
		    (uintmax_t)allocs, (uintmax_t)sleeps,
		    (unsigned)cur_zone->uz_bucket_size, (intmax_t)size,
		    xdomain);

		if (db_pager_quit)
			return;
		last_zone = cur_zone;
		last_size = cur_size;
	}
}

DB_SHOW_COMMAND(umacache, db_show_umacache)
{
	uma_zone_t z;
	uint64_t allocs, frees;
	long cachefree;
	int i;

	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
	    "Requests", "Bucket");
	LIST_FOREACH(z, &uma_cachezones, uz_link) {
		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
		for (i = 0; i < vm_ndomains; i++)
			cachefree += z->uz_domain[i].uzd_nitems;
		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
		    z->uz_name, (uintmax_t)z->uz_size,
		    (intmax_t)(allocs - frees), cachefree,
		    (uintmax_t)allocs, z->uz_bucket_size);
		if (db_pager_quit)
			return;
	}
}
#endif /* DDB */
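
/*
 * Example: exporting a zone's limit and usage with the handlers above, and
 * inspecting zones from the debugger.  Illustrative sketch only, kept inside
 * a comment; "foo_zone" and the sysctl names are hypothetical, while
 * sysctl_handle_uma_zone_max() and sysctl_handle_uma_zone_cur() are declared
 * in <vm/uma.h>.  Both handlers take a pointer to the zone pointer (with
 * arg2 == 0), so the sysctls may be registered before the zone exists.
 *
 *	static uma_zone_t foo_zone;
 *
 *	SYSCTL_PROC(_kern, OID_AUTO, foo_max,
 *	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum number of foo items");
 *	SYSCTL_PROC(_kern, OID_AUTO, foo_count,
 *	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_cur, "I", "Current number of foo items");
 *
 * From ddb(4), the commands defined above provide a system-wide view:
 *
 *	db> show uma		(keg-backed zones, largest first)
 *	db> show uma /i		(same data as machine-parseable CSV)
 *	db> show umacache	(cache-only zones)
 *
 * Userland normally consumes the binary vm.zone_stats stream through
 * libmemstat(3), e.g. via vmstat -z.
 */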