/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
 * Copyright (c) 2004-2006 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * uma_core.c  Implementation of the Universal Memory allocator
 *
 * This allocator is intended to replace the multitude of similar object caches
 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
 * efficient.  A primary design goal is to return unused memory to the rest of
 * the system.  This will make the system as a whole more flexible due to the
 * ability to move memory to subsystems which most need it instead of leaving
 * pools of reserved memory unused.
 *
 * The basic ideas stem from similar slab/zone based allocators whose algorithms
 * are well known.
 *
 */

/*
 * TODO:
 *	- Improve memory usage for large allocations
 *	- Investigate cache size adjustments
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_param.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitset.h>
#include <sys/domainset.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/limits.h>
#include <sys/queue.h>
#include <sys/malloc.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/taskqueue.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_domainset.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>

#include <ddb/ddb.h>

#ifdef DEBUG_MEMGUARD
#include <vm/memguard.h>
#endif

/*
 * This is the zone and keg from which all zones are spawned.
 */
static uma_zone_t kegs;
static uma_zone_t zones;

/* This is the zone from which all offpage uma_slab_ts are allocated. */
static uma_zone_t slabzone;

/*
 * The initial hash tables come out of this zone so they can be allocated
 * prior to malloc coming up.
 */
static uma_zone_t hashzone;

/* The boot-time adjusted value for cache line alignment. */
int uma_align_cache = 64 - 1;

static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");

/*
 * Are we allowed to allocate buckets?
 */
static int bucketdisable = 1;

/* Linked list of all kegs in the system */
static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);

/* Linked list of all cache-only zones in the system */
static LIST_HEAD(,uma_zone) uma_cachezones =
    LIST_HEAD_INITIALIZER(uma_cachezones);

/* This RW lock protects the keg list */
static struct rwlock_padalign __exclusive_cache_line uma_rwlock;

/*
 * Pointer and counter to pool of pages, that is preallocated at
 * startup to bootstrap UMA.
 */
static char *bootmem;
static int boot_pages;

static struct sx uma_drain_lock;

/*
 * kmem soft limit, initialized by uma_set_limit().  Ensure that early
 * allocations don't trigger a wakeup of the reclaim thread.
 */
static unsigned long uma_kmem_limit = LONG_MAX;
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
    "UMA kernel memory soft limit");
static unsigned long uma_kmem_total;
SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
    "UMA kernel memory usage");

/* Is the VM done starting up? */
static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
    BOOT_RUNNING } booted = BOOT_COLD;

/*
 * This is the handle used to schedule events that need to happen
 * outside of the allocation fast path.
 */
static struct callout uma_callout;
#define	UMA_TIMEOUT	20		/* Seconds for callout interval. */

/*
 * This structure is passed as the zone ctor arg so that I don't have to create
 * a special allocation function just for zones.
 */
struct uma_zctor_args {
	const char *name;
	size_t size;
	uma_ctor ctor;
	uma_dtor dtor;
	uma_init uminit;
	uma_fini fini;
	uma_import import;
	uma_release release;
	void *arg;
	uma_keg_t keg;
	int align;
	uint32_t flags;
};

struct uma_kctor_args {
	uma_zone_t zone;
	size_t size;
	uma_init uminit;
	uma_fini fini;
	int align;
	uint32_t flags;
};

struct uma_bucket_zone {
	uma_zone_t	ubz_zone;
	char		*ubz_name;
	int		ubz_entries;	/* Number of items it can hold. */
	int		ubz_maxsize;	/* Maximum allocation size per-item. */
};

/*
 * Compute the actual number of bucket entries to pack them in power
 * of two sizes for more efficient space utilization.
 */
#define	BUCKET_SIZE(n)						\
    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))

#define	BUCKET_MAX	BUCKET_SIZE(256)
#define	BUCKET_MIN	BUCKET_SIZE(4)

struct uma_bucket_zone bucket_zones[] = {
	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
	{ NULL, NULL, 0}
};

/*
 * Flags and enumerations to be passed to internal functions.
 */
enum zfreeskip {
	SKIP_NONE =	0,
	SKIP_CNT =	0x00000001,
	SKIP_DTOR =	0x00010000,
	SKIP_FINI =	0x00020000,
};

#define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
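
/*
 * Illustrative note (not part of the original source): BUCKET_SIZE() above
 * expresses bucket capacity in terms of total pointer-sized slots.  Assuming
 * 8-byte pointers and a bucket header (ub_link, ub_cnt, ub_entries) that
 * rounds up to 24 bytes, BUCKET_SIZE(128) = (8 * 128 - 24) / 8 = 125, so the
 * "128 Bucket" zone holds 125 items and bucket_init() below sizes its
 * allocations at 24 + 125 * 8 = 1024 bytes, a power of two.  The exact header
 * size depends on the uma_bucket layout in uma_int.h; the arithmetic here is
 * only a sketch.
 */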

/* Prototypes */

int	uma_startup_count(int);
void	uma_startup(void *, int);
void	uma_startup1(void);
void	uma_startup2(void);

static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
static void page_free(void *, vm_size_t, uint8_t);
static void pcpu_page_free(void *, vm_size_t, uint8_t);
static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
static void cache_drain(uma_zone_t);
static void bucket_drain(uma_zone_t, uma_bucket_t);
static void bucket_cache_drain(uma_zone_t zone);
static int keg_ctor(void *, int, void *, int);
static void keg_dtor(void *, int, void *);
static int zone_ctor(void *, int, void *, int);
static void zone_dtor(void *, int, void *);
static int zero_init(void *, int, int);
static void keg_small_init(uma_keg_t keg);
static void keg_large_init(uma_keg_t keg);
static void zone_foreach(void (*zfunc)(uma_zone_t));
static void zone_timeout(uma_zone_t zone);
static int hash_alloc(struct uma_hash *, u_int);
static int hash_expand(struct uma_hash *, struct uma_hash *);
static void hash_free(struct uma_hash *hash);
static void uma_timeout(void *);
static void uma_startup3(void);
static void *zone_alloc_item(uma_zone_t, void *, int, int);
static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
static void bucket_enable(void);
static void bucket_init(void);
static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
static void bucket_zone_drain(void);
static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int, int);
static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
static void slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item);
static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
    uma_fini fini, int align, uint32_t flags);
static int zone_import(uma_zone_t, void **, int, int, int);
static void zone_release(uma_zone_t, void **, int);
static void uma_zero_item(void *, uma_zone_t);

void uma_print_zone(uma_zone_t);
void uma_print_stats(void);
static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);

#ifdef INVARIANTS
static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);

static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
    "Memory allocation debugging");

static u_int dbg_divisor = 1;
SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
    "Debug & thrash every nth item in the memory allocator");

static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
    &uma_dbg_cnt, "memory items debugged");
SYSCTL_COUNTER_U64(_vm_debug,
    OID_AUTO, skipped, CTLFLAG_RD,
    &uma_skip_cnt, "memory items skipped, not debugged");
#endif

SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);

SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");

SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");

static int zone_warnings = 1;
SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
    "Warn when UMA zones become full");

/* Adjust bytes under management by UMA. */
static inline void
uma_total_dec(unsigned long size)
{

	atomic_subtract_long(&uma_kmem_total, size);
}

static inline void
uma_total_inc(unsigned long size)
{

	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
		uma_reclaim_wakeup();
}

/*
 * This routine checks to see whether or not it's safe to enable buckets.
 */
static void
bucket_enable(void)
{
	bucketdisable = vm_page_count_min();
}

/*
 * Initialize bucket_zones, the array of zones of buckets of various sizes.
 *
 * For each zone, calculate the memory required for each bucket, consisting
 * of the header and an array of pointers.
 */
static void
bucket_init(void)
{
	struct uma_bucket_zone *ubz;
	int size;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
		size += sizeof(void *) * ubz->ubz_entries;
		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
	}
}

/*
 * Given a desired number of entries for a bucket, return the zone from which
 * to allocate the bucket.
 */
static struct uma_bucket_zone *
bucket_zone_lookup(int entries)
{
	struct uma_bucket_zone *ubz;

	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_entries >= entries)
			return (ubz);
	ubz--;
	return (ubz);
}

static int
bucket_select(int size)
{
	struct uma_bucket_zone *ubz;

	ubz = &bucket_zones[0];
	if (size > ubz->ubz_maxsize)
		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);

	for (; ubz->ubz_entries != 0; ubz++)
		if (ubz->ubz_maxsize < size)
			break;
	ubz--;
	return (ubz->ubz_entries);
}

static uma_bucket_t
bucket_alloc(uma_zone_t zone, void *udata, int flags)
{
	struct uma_bucket_zone *ubz;
	uma_bucket_t bucket;

	/*
	 * This is to stop us from allocating per cpu buckets while we're
	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
	 * boot pages.  This also prevents us from allocating buckets in
	 * low memory situations.
	 */
	if (bucketdisable)
		return (NULL);
	/*
	 * To limit bucket recursion we store the original zone flags
	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
	 * NOVM flag to persist even through deep recursions.  We also
	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
	 * a bucket for a bucket zone so we do not allow infinite bucket
	 * recursion.  This cookie will even persist to frees of unused
	 * buckets via the allocation path or bucket allocations in the
	 * free path.
428 */ 429 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 430 udata = (void *)(uintptr_t)zone->uz_flags; 431 else { 432 if ((uintptr_t)udata & UMA_ZFLAG_BUCKET) 433 return (NULL); 434 udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET); 435 } 436 if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY) 437 flags |= M_NOVM; 438 ubz = bucket_zone_lookup(zone->uz_count); 439 if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0) 440 ubz++; 441 bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags); 442 if (bucket) { 443 #ifdef INVARIANTS 444 bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); 445 #endif 446 bucket->ub_cnt = 0; 447 bucket->ub_entries = ubz->ubz_entries; 448 } 449 450 return (bucket); 451 } 452 453 static void 454 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata) 455 { 456 struct uma_bucket_zone *ubz; 457 458 KASSERT(bucket->ub_cnt == 0, 459 ("bucket_free: Freeing a non free bucket.")); 460 if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0) 461 udata = (void *)(uintptr_t)zone->uz_flags; 462 ubz = bucket_zone_lookup(bucket->ub_entries); 463 uma_zfree_arg(ubz->ubz_zone, bucket, udata); 464 } 465 466 static void 467 bucket_zone_drain(void) 468 { 469 struct uma_bucket_zone *ubz; 470 471 for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) 472 zone_drain(ubz->ubz_zone); 473 } 474 475 static uma_bucket_t 476 zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws) 477 { 478 uma_bucket_t bucket; 479 480 ZONE_LOCK_ASSERT(zone); 481 482 if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) { 483 MPASS(zdom->uzd_nitems >= bucket->ub_cnt); 484 LIST_REMOVE(bucket, ub_link); 485 zdom->uzd_nitems -= bucket->ub_cnt; 486 if (ws && zdom->uzd_imin > zdom->uzd_nitems) 487 zdom->uzd_imin = zdom->uzd_nitems; 488 zone->uz_bkt_count -= bucket->ub_cnt; 489 } 490 return (bucket); 491 } 492 493 static void 494 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket, 495 const bool ws) 496 { 497 498 ZONE_LOCK_ASSERT(zone); 499 KASSERT(zone->uz_bkt_count < zone->uz_bkt_max, ("%s: zone %p overflow", 500 __func__, zone)); 501 502 LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link); 503 zdom->uzd_nitems += bucket->ub_cnt; 504 if (ws && zdom->uzd_imax < zdom->uzd_nitems) 505 zdom->uzd_imax = zdom->uzd_nitems; 506 zone->uz_bkt_count += bucket->ub_cnt; 507 } 508 509 static void 510 zone_log_warning(uma_zone_t zone) 511 { 512 static const struct timeval warninterval = { 300, 0 }; 513 514 if (!zone_warnings || zone->uz_warning == NULL) 515 return; 516 517 if (ratecheck(&zone->uz_ratecheck, &warninterval)) 518 printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning); 519 } 520 521 static inline void 522 zone_maxaction(uma_zone_t zone) 523 { 524 525 if (zone->uz_maxaction.ta_func != NULL) 526 taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction); 527 } 528 529 /* 530 * Routine called by timeout which is used to fire off some time interval 531 * based calculations. (stats, hash size, etc.) 532 * 533 * Arguments: 534 * arg Unused 535 * 536 * Returns: 537 * Nothing 538 */ 539 static void 540 uma_timeout(void *unused) 541 { 542 bucket_enable(); 543 zone_foreach(zone_timeout); 544 545 /* Reschedule this event */ 546 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 547 } 548 549 /* 550 * Update the working set size estimate for the zone's bucket cache. 551 * The constants chosen here are somewhat arbitrary. With an update period of 552 * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the 553 * last 100s. 
554 */ 555 static void 556 zone_domain_update_wss(uma_zone_domain_t zdom) 557 { 558 long wss; 559 560 MPASS(zdom->uzd_imax >= zdom->uzd_imin); 561 wss = zdom->uzd_imax - zdom->uzd_imin; 562 zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems; 563 zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5; 564 } 565 566 /* 567 * Routine to perform timeout driven calculations. This expands the 568 * hashes and does per cpu statistics aggregation. 569 * 570 * Returns nothing. 571 */ 572 static void 573 zone_timeout(uma_zone_t zone) 574 { 575 uma_keg_t keg = zone->uz_keg; 576 u_int slabs; 577 578 KEG_LOCK(keg); 579 /* 580 * Expand the keg hash table. 581 * 582 * This is done if the number of slabs is larger than the hash size. 583 * What I'm trying to do here is completely reduce collisions. This 584 * may be a little aggressive. Should I allow for two collisions max? 585 */ 586 if (keg->uk_flags & UMA_ZONE_HASH && 587 (slabs = keg->uk_pages / keg->uk_ppera) > 588 keg->uk_hash.uh_hashsize) { 589 struct uma_hash newhash; 590 struct uma_hash oldhash; 591 int ret; 592 593 /* 594 * This is so involved because allocating and freeing 595 * while the keg lock is held will lead to deadlock. 596 * I have to do everything in stages and check for 597 * races. 598 */ 599 KEG_UNLOCK(keg); 600 ret = hash_alloc(&newhash, 1 << fls(slabs)); 601 KEG_LOCK(keg); 602 if (ret) { 603 if (hash_expand(&keg->uk_hash, &newhash)) { 604 oldhash = keg->uk_hash; 605 keg->uk_hash = newhash; 606 } else 607 oldhash = newhash; 608 609 KEG_UNLOCK(keg); 610 hash_free(&oldhash); 611 return; 612 } 613 } 614 615 for (int i = 0; i < vm_ndomains; i++) 616 zone_domain_update_wss(&zone->uz_domain[i]); 617 618 KEG_UNLOCK(keg); 619 } 620 621 /* 622 * Allocate and zero fill the next sized hash table from the appropriate 623 * backing store. 624 * 625 * Arguments: 626 * hash A new hash structure with the old hash size in uh_hashsize 627 * 628 * Returns: 629 * 1 on success and 0 on failure. 630 */ 631 static int 632 hash_alloc(struct uma_hash *hash, u_int size) 633 { 634 size_t alloc; 635 636 KASSERT(powerof2(size), ("hash size must be power of 2")); 637 if (size > UMA_HASH_SIZE_INIT) { 638 hash->uh_hashsize = size; 639 alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; 640 hash->uh_slab_hash = (struct slabhead *)malloc(alloc, 641 M_UMAHASH, M_NOWAIT); 642 } else { 643 alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; 644 hash->uh_slab_hash = zone_alloc_item(hashzone, NULL, 645 UMA_ANYDOMAIN, M_WAITOK); 646 hash->uh_hashsize = UMA_HASH_SIZE_INIT; 647 } 648 if (hash->uh_slab_hash) { 649 bzero(hash->uh_slab_hash, alloc); 650 hash->uh_hashmask = hash->uh_hashsize - 1; 651 return (1); 652 } 653 654 return (0); 655 } 656 657 /* 658 * Expands the hash table for HASH zones. This is done from zone_timeout 659 * to reduce collisions. This must not be done in the regular allocation 660 * path, otherwise, we can recurse on the vm while allocating pages. 661 * 662 * Arguments: 663 * oldhash The hash you want to expand 664 * newhash The hash structure for the new table 665 * 666 * Returns: 667 * Nothing 668 * 669 * Discussion: 670 */ 671 static int 672 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) 673 { 674 uma_slab_t slab; 675 u_int hval; 676 u_int idx; 677 678 if (!newhash->uh_slab_hash) 679 return (0); 680 681 if (oldhash->uh_hashsize >= newhash->uh_hashsize) 682 return (0); 683 684 /* 685 * I need to investigate hash algorithms for resizing without a 686 * full rehash. 
687 */ 688 689 for (idx = 0; idx < oldhash->uh_hashsize; idx++) 690 while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) { 691 slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]); 692 SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink); 693 hval = UMA_HASH(newhash, slab->us_data); 694 SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], 695 slab, us_hlink); 696 } 697 698 return (1); 699 } 700 701 /* 702 * Free the hash bucket to the appropriate backing store. 703 * 704 * Arguments: 705 * slab_hash The hash bucket we're freeing 706 * hashsize The number of entries in that hash bucket 707 * 708 * Returns: 709 * Nothing 710 */ 711 static void 712 hash_free(struct uma_hash *hash) 713 { 714 if (hash->uh_slab_hash == NULL) 715 return; 716 if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) 717 zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE); 718 else 719 free(hash->uh_slab_hash, M_UMAHASH); 720 } 721 722 /* 723 * Frees all outstanding items in a bucket 724 * 725 * Arguments: 726 * zone The zone to free to, must be unlocked. 727 * bucket The free/alloc bucket with items, cpu queue must be locked. 728 * 729 * Returns: 730 * Nothing 731 */ 732 733 static void 734 bucket_drain(uma_zone_t zone, uma_bucket_t bucket) 735 { 736 int i; 737 738 if (bucket == NULL) 739 return; 740 741 if (zone->uz_fini) 742 for (i = 0; i < bucket->ub_cnt; i++) 743 zone->uz_fini(bucket->ub_bucket[i], zone->uz_size); 744 zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt); 745 if (zone->uz_max_items > 0) { 746 ZONE_LOCK(zone); 747 zone->uz_items -= bucket->ub_cnt; 748 if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items) 749 wakeup_one(zone); 750 ZONE_UNLOCK(zone); 751 } 752 bucket->ub_cnt = 0; 753 } 754 755 /* 756 * Drains the per cpu caches for a zone. 757 * 758 * NOTE: This may only be called while the zone is being turn down, and not 759 * during normal operation. This is necessary in order that we do not have 760 * to migrate CPUs to drain the per-CPU caches. 761 * 762 * Arguments: 763 * zone The zone to drain, must be unlocked. 764 * 765 * Returns: 766 * Nothing 767 */ 768 static void 769 cache_drain(uma_zone_t zone) 770 { 771 uma_cache_t cache; 772 int cpu; 773 774 /* 775 * XXX: It is safe to not lock the per-CPU caches, because we're 776 * tearing down the zone anyway. I.e., there will be no further use 777 * of the caches at this point. 778 * 779 * XXX: It would good to be able to assert that the zone is being 780 * torn down to prevent improper use of cache_drain(). 781 * 782 * XXX: We lock the zone before passing into bucket_cache_drain() as 783 * it is used elsewhere. Should the tear-down path be made special 784 * there in some form? 
785 */ 786 CPU_FOREACH(cpu) { 787 cache = &zone->uz_cpu[cpu]; 788 bucket_drain(zone, cache->uc_allocbucket); 789 if (cache->uc_allocbucket != NULL) 790 bucket_free(zone, cache->uc_allocbucket, NULL); 791 cache->uc_allocbucket = NULL; 792 bucket_drain(zone, cache->uc_freebucket); 793 if (cache->uc_freebucket != NULL) 794 bucket_free(zone, cache->uc_freebucket, NULL); 795 cache->uc_freebucket = NULL; 796 bucket_drain(zone, cache->uc_crossbucket); 797 if (cache->uc_crossbucket != NULL) 798 bucket_free(zone, cache->uc_crossbucket, NULL); 799 cache->uc_crossbucket = NULL; 800 } 801 ZONE_LOCK(zone); 802 bucket_cache_drain(zone); 803 ZONE_UNLOCK(zone); 804 } 805 806 static void 807 cache_shrink(uma_zone_t zone) 808 { 809 810 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 811 return; 812 813 ZONE_LOCK(zone); 814 zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2; 815 ZONE_UNLOCK(zone); 816 } 817 818 static void 819 cache_drain_safe_cpu(uma_zone_t zone) 820 { 821 uma_cache_t cache; 822 uma_bucket_t b1, b2, b3; 823 int domain; 824 825 if (zone->uz_flags & UMA_ZFLAG_INTERNAL) 826 return; 827 828 b1 = b2 = b3 = NULL; 829 ZONE_LOCK(zone); 830 critical_enter(); 831 if (zone->uz_flags & UMA_ZONE_NUMA) 832 domain = PCPU_GET(domain); 833 else 834 domain = 0; 835 cache = &zone->uz_cpu[curcpu]; 836 if (cache->uc_allocbucket) { 837 if (cache->uc_allocbucket->ub_cnt != 0) 838 zone_put_bucket(zone, &zone->uz_domain[domain], 839 cache->uc_allocbucket, false); 840 else 841 b1 = cache->uc_allocbucket; 842 cache->uc_allocbucket = NULL; 843 } 844 if (cache->uc_freebucket) { 845 if (cache->uc_freebucket->ub_cnt != 0) 846 zone_put_bucket(zone, &zone->uz_domain[domain], 847 cache->uc_freebucket, false); 848 else 849 b2 = cache->uc_freebucket; 850 cache->uc_freebucket = NULL; 851 } 852 b3 = cache->uc_crossbucket; 853 cache->uc_crossbucket = NULL; 854 critical_exit(); 855 ZONE_UNLOCK(zone); 856 if (b1) 857 bucket_free(zone, b1, NULL); 858 if (b2) 859 bucket_free(zone, b2, NULL); 860 if (b3) { 861 bucket_drain(zone, b3); 862 bucket_free(zone, b3, NULL); 863 } 864 } 865 866 /* 867 * Safely drain per-CPU caches of a zone(s) to alloc bucket. 868 * This is an expensive call because it needs to bind to all CPUs 869 * one by one and enter a critical section on each of them in order 870 * to safely access their cache buckets. 871 * Zone lock must not be held on call this function. 872 */ 873 static void 874 cache_drain_safe(uma_zone_t zone) 875 { 876 int cpu; 877 878 /* 879 * Polite bucket sizes shrinking was not enouth, shrink aggressively. 880 */ 881 if (zone) 882 cache_shrink(zone); 883 else 884 zone_foreach(cache_shrink); 885 886 CPU_FOREACH(cpu) { 887 thread_lock(curthread); 888 sched_bind(curthread, cpu); 889 thread_unlock(curthread); 890 891 if (zone) 892 cache_drain_safe_cpu(zone); 893 else 894 zone_foreach(cache_drain_safe_cpu); 895 } 896 thread_lock(curthread); 897 sched_unbind(curthread); 898 thread_unlock(curthread); 899 } 900 901 /* 902 * Drain the cached buckets from a zone. Expects a locked zone on entry. 903 */ 904 static void 905 bucket_cache_drain(uma_zone_t zone) 906 { 907 uma_zone_domain_t zdom; 908 uma_bucket_t bucket; 909 int i; 910 911 /* 912 * Drain the bucket queues and free the buckets. 
913 */ 914 for (i = 0; i < vm_ndomains; i++) { 915 zdom = &zone->uz_domain[i]; 916 while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) != 917 NULL) { 918 ZONE_UNLOCK(zone); 919 bucket_drain(zone, bucket); 920 bucket_free(zone, bucket, NULL); 921 ZONE_LOCK(zone); 922 } 923 } 924 925 /* 926 * Shrink further bucket sizes. Price of single zone lock collision 927 * is probably lower then price of global cache drain. 928 */ 929 if (zone->uz_count > zone->uz_count_min) 930 zone->uz_count--; 931 } 932 933 static void 934 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start) 935 { 936 uint8_t *mem; 937 int i; 938 uint8_t flags; 939 940 CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes", 941 keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera); 942 943 mem = slab->us_data; 944 flags = slab->us_flags; 945 i = start; 946 if (keg->uk_fini != NULL) { 947 for (i--; i > -1; i--) 948 #ifdef INVARIANTS 949 /* 950 * trash_fini implies that dtor was trash_dtor. trash_fini 951 * would check that memory hasn't been modified since free, 952 * which executed trash_dtor. 953 * That's why we need to run uma_dbg_kskip() check here, 954 * albeit we don't make skip check for other init/fini 955 * invocations. 956 */ 957 if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) || 958 keg->uk_fini != trash_fini) 959 #endif 960 keg->uk_fini(slab->us_data + (keg->uk_rsize * i), 961 keg->uk_size); 962 } 963 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 964 zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE); 965 keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags); 966 uma_total_dec(PAGE_SIZE * keg->uk_ppera); 967 } 968 969 /* 970 * Frees pages from a keg back to the system. This is done on demand from 971 * the pageout daemon. 972 * 973 * Returns nothing. 974 */ 975 static void 976 keg_drain(uma_keg_t keg) 977 { 978 struct slabhead freeslabs = { 0 }; 979 uma_domain_t dom; 980 uma_slab_t slab, tmp; 981 int i; 982 983 /* 984 * We don't want to take pages from statically allocated kegs at this 985 * time 986 */ 987 if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) 988 return; 989 990 CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u", 991 keg->uk_name, keg, keg->uk_free); 992 KEG_LOCK(keg); 993 if (keg->uk_free == 0) 994 goto finished; 995 996 for (i = 0; i < vm_ndomains; i++) { 997 dom = &keg->uk_domain[i]; 998 LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) { 999 /* We have nowhere to free these to. */ 1000 if (slab->us_flags & UMA_SLAB_BOOT) 1001 continue; 1002 1003 LIST_REMOVE(slab, us_link); 1004 keg->uk_pages -= keg->uk_ppera; 1005 keg->uk_free -= keg->uk_ipers; 1006 1007 if (keg->uk_flags & UMA_ZONE_HASH) 1008 UMA_HASH_REMOVE(&keg->uk_hash, slab, 1009 slab->us_data); 1010 1011 SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); 1012 } 1013 } 1014 1015 finished: 1016 KEG_UNLOCK(keg); 1017 1018 while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { 1019 SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); 1020 keg_free_slab(keg, slab, keg->uk_ipers); 1021 } 1022 } 1023 1024 static void 1025 zone_drain_wait(uma_zone_t zone, int waitok) 1026 { 1027 1028 /* 1029 * Set draining to interlock with zone_dtor() so we can release our 1030 * locks as we go. Only dtor() should do a WAITOK call since it 1031 * is the only call that knows the structure will still be available 1032 * when it wakes up. 
1033 */ 1034 ZONE_LOCK(zone); 1035 while (zone->uz_flags & UMA_ZFLAG_DRAINING) { 1036 if (waitok == M_NOWAIT) 1037 goto out; 1038 msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1); 1039 } 1040 zone->uz_flags |= UMA_ZFLAG_DRAINING; 1041 bucket_cache_drain(zone); 1042 ZONE_UNLOCK(zone); 1043 /* 1044 * The DRAINING flag protects us from being freed while 1045 * we're running. Normally the uma_rwlock would protect us but we 1046 * must be able to release and acquire the right lock for each keg. 1047 */ 1048 keg_drain(zone->uz_keg); 1049 ZONE_LOCK(zone); 1050 zone->uz_flags &= ~UMA_ZFLAG_DRAINING; 1051 wakeup(zone); 1052 out: 1053 ZONE_UNLOCK(zone); 1054 } 1055 1056 void 1057 zone_drain(uma_zone_t zone) 1058 { 1059 1060 zone_drain_wait(zone, M_NOWAIT); 1061 } 1062 1063 /* 1064 * Allocate a new slab for a keg. This does not insert the slab onto a list. 1065 * If the allocation was successful, the keg lock will be held upon return, 1066 * otherwise the keg will be left unlocked. 1067 * 1068 * Arguments: 1069 * flags Wait flags for the item initialization routine 1070 * aflags Wait flags for the slab allocation 1071 * 1072 * Returns: 1073 * The slab that was allocated or NULL if there is no memory and the 1074 * caller specified M_NOWAIT. 1075 */ 1076 static uma_slab_t 1077 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags, 1078 int aflags) 1079 { 1080 uma_alloc allocf; 1081 uma_slab_t slab; 1082 unsigned long size; 1083 uint8_t *mem; 1084 uint8_t sflags; 1085 int i; 1086 1087 KASSERT(domain >= 0 && domain < vm_ndomains, 1088 ("keg_alloc_slab: domain %d out of range", domain)); 1089 KEG_LOCK_ASSERT(keg); 1090 MPASS(zone->uz_lockptr == &keg->uk_lock); 1091 1092 allocf = keg->uk_allocf; 1093 KEG_UNLOCK(keg); 1094 1095 slab = NULL; 1096 mem = NULL; 1097 if (keg->uk_flags & UMA_ZONE_OFFPAGE) { 1098 slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags); 1099 if (slab == NULL) 1100 goto out; 1101 } 1102 1103 /* 1104 * This reproduces the old vm_zone behavior of zero filling pages the 1105 * first time they are added to a zone. 1106 * 1107 * Malloced items are zeroed in uma_zalloc. 1108 */ 1109 1110 if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) 1111 aflags |= M_ZERO; 1112 else 1113 aflags &= ~M_ZERO; 1114 1115 if (keg->uk_flags & UMA_ZONE_NODUMP) 1116 aflags |= M_NODUMP; 1117 1118 /* zone is passed for legacy reasons. 
	size = keg->uk_ppera * PAGE_SIZE;
	mem = allocf(zone, size, domain, &sflags, aflags);
	if (mem == NULL) {
		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
		slab = NULL;
		goto out;
	}
	uma_total_inc(size);

	/* Point the slab into the allocated memory */
	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
		slab = (uma_slab_t)(mem + keg->uk_pgoff);

	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
		for (i = 0; i < keg->uk_ppera; i++)
			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);

	slab->us_keg = keg;
	slab->us_data = mem;
	slab->us_freecount = keg->uk_ipers;
	slab->us_flags = sflags;
	slab->us_domain = domain;
	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
#ifdef INVARIANTS
	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
#endif

	if (keg->uk_init != NULL) {
		for (i = 0; i < keg->uk_ipers; i++)
			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
			    keg->uk_size, flags) != 0)
				break;
		if (i != keg->uk_ipers) {
			keg_free_slab(keg, slab, i);
			slab = NULL;
			goto out;
		}
	}
	KEG_LOCK(keg);

	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
	    slab, keg->uk_name, keg);

	if (keg->uk_flags & UMA_ZONE_HASH)
		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);

	keg->uk_pages += keg->uk_ppera;
	keg->uk_free += keg->uk_ipers;

out:
	return (slab);
}

/*
 * This function is intended to be used early on in place of page_alloc() so
 * that we may use the boot time page cache to satisfy allocations before
 * the VM is ready.
 */
static void *
startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
    int wait)
{
	uma_keg_t keg;
	void *mem;
	int pages;

	keg = zone->uz_keg;
	/*
	 * If we are in BOOT_BUCKETS or higher, then switch to the real
	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
	 */
	switch (booted) {
	case BOOT_COLD:
	case BOOT_STRAPPED:
		break;
	case BOOT_PAGEALLOC:
		if (keg->uk_ppera > 1)
			break;
	case BOOT_BUCKETS:
	case BOOT_RUNNING:
#ifdef UMA_MD_SMALL_ALLOC
		keg->uk_allocf = (keg->uk_ppera > 1) ?
		    page_alloc : uma_small_alloc;
#else
		keg->uk_allocf = page_alloc;
#endif
		return keg->uk_allocf(zone, bytes, domain, pflag, wait);
	}

	/*
	 * Check our small startup cache to see if it has pages remaining.
	 */
	pages = howmany(bytes, PAGE_SIZE);
	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
	if (pages > boot_pages)
		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
#ifdef DIAGNOSTIC
	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
	    boot_pages);
#endif
	mem = bootmem;
	boot_pages -= pages;
	bootmem += pages * PAGE_SIZE;
	*pflag = UMA_SLAB_BOOT;

	return (mem);
}

/*
 * Allocates a number of pages from the system
 *
 * Arguments:
 *	bytes  The number of bytes requested
 *	wait   Shall we wait?
 *
 * Returns:
 *	A pointer to the alloced memory or possibly
 *	NULL if M_NOWAIT is set.
1238 */ 1239 static void * 1240 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1241 int wait) 1242 { 1243 void *p; /* Returned page */ 1244 1245 *pflag = UMA_SLAB_KERNEL; 1246 p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait); 1247 1248 return (p); 1249 } 1250 1251 static void * 1252 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, 1253 int wait) 1254 { 1255 struct pglist alloctail; 1256 vm_offset_t addr, zkva; 1257 int cpu, flags; 1258 vm_page_t p, p_next; 1259 #ifdef NUMA 1260 struct pcpu *pc; 1261 #endif 1262 1263 MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE); 1264 1265 TAILQ_INIT(&alloctail); 1266 flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1267 malloc2vm_flags(wait); 1268 *pflag = UMA_SLAB_KERNEL; 1269 for (cpu = 0; cpu <= mp_maxid; cpu++) { 1270 if (CPU_ABSENT(cpu)) { 1271 p = vm_page_alloc(NULL, 0, flags); 1272 } else { 1273 #ifndef NUMA 1274 p = vm_page_alloc(NULL, 0, flags); 1275 #else 1276 pc = pcpu_find(cpu); 1277 p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags); 1278 if (__predict_false(p == NULL)) 1279 p = vm_page_alloc(NULL, 0, flags); 1280 #endif 1281 } 1282 if (__predict_false(p == NULL)) 1283 goto fail; 1284 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1285 } 1286 if ((addr = kva_alloc(bytes)) == 0) 1287 goto fail; 1288 zkva = addr; 1289 TAILQ_FOREACH(p, &alloctail, listq) { 1290 pmap_qenter(zkva, &p, 1); 1291 zkva += PAGE_SIZE; 1292 } 1293 return ((void*)addr); 1294 fail: 1295 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1296 vm_page_unwire_noq(p); 1297 vm_page_free(p); 1298 } 1299 return (NULL); 1300 } 1301 1302 /* 1303 * Allocates a number of pages from within an object 1304 * 1305 * Arguments: 1306 * bytes The number of bytes requested 1307 * wait Shall we wait? 1308 * 1309 * Returns: 1310 * A pointer to the alloced memory or possibly 1311 * NULL if M_NOWAIT is set. 1312 */ 1313 static void * 1314 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, 1315 int wait) 1316 { 1317 TAILQ_HEAD(, vm_page) alloctail; 1318 u_long npages; 1319 vm_offset_t retkva, zkva; 1320 vm_page_t p, p_next; 1321 uma_keg_t keg; 1322 1323 TAILQ_INIT(&alloctail); 1324 keg = zone->uz_keg; 1325 1326 npages = howmany(bytes, PAGE_SIZE); 1327 while (npages > 0) { 1328 p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT | 1329 VM_ALLOC_WIRED | VM_ALLOC_NOOBJ | 1330 ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK : 1331 VM_ALLOC_NOWAIT)); 1332 if (p != NULL) { 1333 /* 1334 * Since the page does not belong to an object, its 1335 * listq is unused. 1336 */ 1337 TAILQ_INSERT_TAIL(&alloctail, p, listq); 1338 npages--; 1339 continue; 1340 } 1341 /* 1342 * Page allocation failed, free intermediate pages and 1343 * exit. 
1344 */ 1345 TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) { 1346 vm_page_unwire_noq(p); 1347 vm_page_free(p); 1348 } 1349 return (NULL); 1350 } 1351 *flags = UMA_SLAB_PRIV; 1352 zkva = keg->uk_kva + 1353 atomic_fetchadd_long(&keg->uk_offset, round_page(bytes)); 1354 retkva = zkva; 1355 TAILQ_FOREACH(p, &alloctail, listq) { 1356 pmap_qenter(zkva, &p, 1); 1357 zkva += PAGE_SIZE; 1358 } 1359 1360 return ((void *)retkva); 1361 } 1362 1363 /* 1364 * Frees a number of pages to the system 1365 * 1366 * Arguments: 1367 * mem A pointer to the memory to be freed 1368 * size The size of the memory being freed 1369 * flags The original p->us_flags field 1370 * 1371 * Returns: 1372 * Nothing 1373 */ 1374 static void 1375 page_free(void *mem, vm_size_t size, uint8_t flags) 1376 { 1377 1378 if ((flags & UMA_SLAB_KERNEL) == 0) 1379 panic("UMA: page_free used with invalid flags %x", flags); 1380 1381 kmem_free((vm_offset_t)mem, size); 1382 } 1383 1384 /* 1385 * Frees pcpu zone allocations 1386 * 1387 * Arguments: 1388 * mem A pointer to the memory to be freed 1389 * size The size of the memory being freed 1390 * flags The original p->us_flags field 1391 * 1392 * Returns: 1393 * Nothing 1394 */ 1395 static void 1396 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags) 1397 { 1398 vm_offset_t sva, curva; 1399 vm_paddr_t paddr; 1400 vm_page_t m; 1401 1402 MPASS(size == (mp_maxid+1)*PAGE_SIZE); 1403 sva = (vm_offset_t)mem; 1404 for (curva = sva; curva < sva + size; curva += PAGE_SIZE) { 1405 paddr = pmap_kextract(curva); 1406 m = PHYS_TO_VM_PAGE(paddr); 1407 vm_page_unwire_noq(m); 1408 vm_page_free(m); 1409 } 1410 pmap_qremove(sva, size >> PAGE_SHIFT); 1411 kva_free(sva, size); 1412 } 1413 1414 1415 /* 1416 * Zero fill initializer 1417 * 1418 * Arguments/Returns follow uma_init specifications 1419 */ 1420 static int 1421 zero_init(void *mem, int size, int flags) 1422 { 1423 bzero(mem, size); 1424 return (0); 1425 } 1426 1427 /* 1428 * Finish creating a small uma keg. This calculates ipers, and the keg size. 1429 * 1430 * Arguments 1431 * keg The zone we should initialize 1432 * 1433 * Returns 1434 * Nothing 1435 */ 1436 static void 1437 keg_small_init(uma_keg_t keg) 1438 { 1439 u_int rsize; 1440 u_int memused; 1441 u_int wastedspace; 1442 u_int shsize; 1443 u_int slabsize; 1444 1445 if (keg->uk_flags & UMA_ZONE_PCPU) { 1446 u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU; 1447 1448 slabsize = UMA_PCPU_ALLOC_SIZE; 1449 keg->uk_ppera = ncpus; 1450 } else { 1451 slabsize = UMA_SLAB_SIZE; 1452 keg->uk_ppera = 1; 1453 } 1454 1455 /* 1456 * Calculate the size of each allocation (rsize) according to 1457 * alignment. If the requested size is smaller than we have 1458 * allocation bits for we round it up. 1459 */ 1460 rsize = keg->uk_size; 1461 if (rsize < slabsize / SLAB_SETSIZE) 1462 rsize = slabsize / SLAB_SETSIZE; 1463 if (rsize & keg->uk_align) 1464 rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); 1465 keg->uk_rsize = rsize; 1466 1467 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 || 1468 keg->uk_rsize < UMA_PCPU_ALLOC_SIZE, 1469 ("%s: size %u too large", __func__, keg->uk_rsize)); 1470 1471 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1472 shsize = 0; 1473 else 1474 shsize = SIZEOF_UMA_SLAB; 1475 1476 if (rsize <= slabsize - shsize) 1477 keg->uk_ipers = (slabsize - shsize) / rsize; 1478 else { 1479 /* Handle special case when we have 1 item per slab, so 1480 * alignment requirement can be relaxed. 
		 */
		KASSERT(keg->uk_size <= slabsize - shsize,
		    ("%s: size %u greater than slab", __func__, keg->uk_size));
		keg->uk_ipers = 1;
	}
	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));

	memused = keg->uk_ipers * rsize + shsize;
	wastedspace = slabsize - memused;

	/*
	 * We can't do OFFPAGE if we're internal or if we've been
	 * asked to not go to the VM for buckets.  If we do this we
	 * may end up going to the VM for slabs which we do not
	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
	 * of UMA_ZONE_VM, which clearly forbids it.
	 */
	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
		return;

	/*
	 * See if using an OFFPAGE slab will limit our waste.  Only do
	 * this if it permits more items per-slab.
	 *
	 * XXX We could try growing slabsize to limit max waste as well.
	 *     Historically this was not done because the VM could not
	 *     efficiently handle contiguous allocations.
	 */
	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
		keg->uk_ipers = slabsize / keg->uk_rsize;
		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
		    "keg: %s(%p), calculated wastedspace = %d, "
		    "maximum wasted space allowed = %d, "
		    "calculated ipers = %d, "
		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
		    slabsize - keg->uk_ipers * keg->uk_rsize);
		keg->uk_flags |= UMA_ZONE_OFFPAGE;
	}

	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
		keg->uk_flags |= UMA_ZONE_HASH;
}

/*
 * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
 * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
 * more complicated.
 *
 * Arguments
 *	keg  The keg we should initialize
 *
 * Returns
 *	Nothing
 */
static void
keg_large_init(uma_keg_t keg)
{

	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));

	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
	keg->uk_ipers = 1;
	keg->uk_rsize = keg->uk_size;

	/* Check whether we have enough space to not do OFFPAGE. */
	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0 &&
	    PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < SIZEOF_UMA_SLAB) {
		/*
		 * We can't do OFFPAGE if we're internal, in which case
		 * we need an extra page per allocation to contain the
		 * slab header.
1560 */ 1561 if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0) 1562 keg->uk_flags |= UMA_ZONE_OFFPAGE; 1563 else 1564 keg->uk_ppera++; 1565 } 1566 1567 if ((keg->uk_flags & UMA_ZONE_OFFPAGE) && 1568 (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0) 1569 keg->uk_flags |= UMA_ZONE_HASH; 1570 } 1571 1572 static void 1573 keg_cachespread_init(uma_keg_t keg) 1574 { 1575 int alignsize; 1576 int trailer; 1577 int pages; 1578 int rsize; 1579 1580 KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0, 1581 ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__)); 1582 1583 alignsize = keg->uk_align + 1; 1584 rsize = keg->uk_size; 1585 /* 1586 * We want one item to start on every align boundary in a page. To 1587 * do this we will span pages. We will also extend the item by the 1588 * size of align if it is an even multiple of align. Otherwise, it 1589 * would fall on the same boundary every time. 1590 */ 1591 if (rsize & keg->uk_align) 1592 rsize = (rsize & ~keg->uk_align) + alignsize; 1593 if ((rsize & alignsize) == 0) 1594 rsize += alignsize; 1595 trailer = rsize - keg->uk_size; 1596 pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE; 1597 pages = MIN(pages, (128 * 1024) / PAGE_SIZE); 1598 keg->uk_rsize = rsize; 1599 keg->uk_ppera = pages; 1600 keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize; 1601 keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB; 1602 KASSERT(keg->uk_ipers <= SLAB_SETSIZE, 1603 ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__, 1604 keg->uk_ipers)); 1605 } 1606 1607 /* 1608 * Keg header ctor. This initializes all fields, locks, etc. And inserts 1609 * the keg onto the global keg list. 1610 * 1611 * Arguments/Returns follow uma_ctor specifications 1612 * udata Actually uma_kctor_args 1613 */ 1614 static int 1615 keg_ctor(void *mem, int size, void *udata, int flags) 1616 { 1617 struct uma_kctor_args *arg = udata; 1618 uma_keg_t keg = mem; 1619 uma_zone_t zone; 1620 1621 bzero(keg, size); 1622 keg->uk_size = arg->size; 1623 keg->uk_init = arg->uminit; 1624 keg->uk_fini = arg->fini; 1625 keg->uk_align = arg->align; 1626 keg->uk_free = 0; 1627 keg->uk_reserve = 0; 1628 keg->uk_pages = 0; 1629 keg->uk_flags = arg->flags; 1630 keg->uk_slabzone = NULL; 1631 1632 /* 1633 * We use a global round-robin policy by default. Zones with 1634 * UMA_ZONE_NUMA set will use first-touch instead, in which case the 1635 * iterator is never run. 1636 */ 1637 keg->uk_dr.dr_policy = DOMAINSET_RR(); 1638 keg->uk_dr.dr_iter = 0; 1639 1640 /* 1641 * The master zone is passed to us at keg-creation time. 1642 */ 1643 zone = arg->zone; 1644 keg->uk_name = zone->uz_name; 1645 1646 if (arg->flags & UMA_ZONE_VM) 1647 keg->uk_flags |= UMA_ZFLAG_CACHEONLY; 1648 1649 if (arg->flags & UMA_ZONE_ZINIT) 1650 keg->uk_init = zero_init; 1651 1652 if (arg->flags & UMA_ZONE_MALLOC) 1653 keg->uk_flags |= UMA_ZONE_VTOSLAB; 1654 1655 if (arg->flags & UMA_ZONE_PCPU) 1656 #ifdef SMP 1657 keg->uk_flags |= UMA_ZONE_OFFPAGE; 1658 #else 1659 keg->uk_flags &= ~UMA_ZONE_PCPU; 1660 #endif 1661 1662 if (keg->uk_flags & UMA_ZONE_CACHESPREAD) { 1663 keg_cachespread_init(keg); 1664 } else { 1665 if (keg->uk_size > UMA_SLAB_SPACE) 1666 keg_large_init(keg); 1667 else 1668 keg_small_init(keg); 1669 } 1670 1671 if (keg->uk_flags & UMA_ZONE_OFFPAGE) 1672 keg->uk_slabzone = slabzone; 1673 1674 /* 1675 * If we haven't booted yet we need allocations to go through the 1676 * startup cache until the vm is ready. 
1677 */ 1678 if (booted < BOOT_PAGEALLOC) 1679 keg->uk_allocf = startup_alloc; 1680 #ifdef UMA_MD_SMALL_ALLOC 1681 else if (keg->uk_ppera == 1) 1682 keg->uk_allocf = uma_small_alloc; 1683 #endif 1684 else if (keg->uk_flags & UMA_ZONE_PCPU) 1685 keg->uk_allocf = pcpu_page_alloc; 1686 else 1687 keg->uk_allocf = page_alloc; 1688 #ifdef UMA_MD_SMALL_ALLOC 1689 if (keg->uk_ppera == 1) 1690 keg->uk_freef = uma_small_free; 1691 else 1692 #endif 1693 if (keg->uk_flags & UMA_ZONE_PCPU) 1694 keg->uk_freef = pcpu_page_free; 1695 else 1696 keg->uk_freef = page_free; 1697 1698 /* 1699 * Initialize keg's lock 1700 */ 1701 KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS)); 1702 1703 /* 1704 * If we're putting the slab header in the actual page we need to 1705 * figure out where in each page it goes. See SIZEOF_UMA_SLAB 1706 * macro definition. 1707 */ 1708 if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { 1709 keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - SIZEOF_UMA_SLAB; 1710 /* 1711 * The only way the following is possible is if with our 1712 * UMA_ALIGN_PTR adjustments we are now bigger than 1713 * UMA_SLAB_SIZE. I haven't checked whether this is 1714 * mathematically possible for all cases, so we make 1715 * sure here anyway. 1716 */ 1717 KASSERT(keg->uk_pgoff + sizeof(struct uma_slab) <= 1718 PAGE_SIZE * keg->uk_ppera, 1719 ("zone %s ipers %d rsize %d size %d slab won't fit", 1720 zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size)); 1721 } 1722 1723 if (keg->uk_flags & UMA_ZONE_HASH) 1724 hash_alloc(&keg->uk_hash, 0); 1725 1726 CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n", 1727 keg, zone->uz_name, zone, 1728 (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, 1729 keg->uk_free); 1730 1731 LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); 1732 1733 rw_wlock(&uma_rwlock); 1734 LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); 1735 rw_wunlock(&uma_rwlock); 1736 return (0); 1737 } 1738 1739 static void 1740 zone_alloc_counters(uma_zone_t zone) 1741 { 1742 1743 zone->uz_allocs = counter_u64_alloc(M_WAITOK); 1744 zone->uz_frees = counter_u64_alloc(M_WAITOK); 1745 zone->uz_fails = counter_u64_alloc(M_WAITOK); 1746 } 1747 1748 /* 1749 * Zone header ctor. This initializes all fields, locks, etc. 1750 * 1751 * Arguments/Returns follow uma_ctor specifications 1752 * udata Actually uma_zctor_args 1753 */ 1754 static int 1755 zone_ctor(void *mem, int size, void *udata, int flags) 1756 { 1757 struct uma_zctor_args *arg = udata; 1758 uma_zone_t zone = mem; 1759 uma_zone_t z; 1760 uma_keg_t keg; 1761 1762 bzero(zone, size); 1763 zone->uz_name = arg->name; 1764 zone->uz_ctor = arg->ctor; 1765 zone->uz_dtor = arg->dtor; 1766 zone->uz_init = NULL; 1767 zone->uz_fini = NULL; 1768 zone->uz_sleeps = 0; 1769 zone->uz_xdomain = 0; 1770 zone->uz_count = 0; 1771 zone->uz_count_min = 0; 1772 zone->uz_count_max = BUCKET_MAX; 1773 zone->uz_flags = 0; 1774 zone->uz_warning = NULL; 1775 /* The domain structures follow the cpu structures. */ 1776 zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus]; 1777 zone->uz_bkt_max = ULONG_MAX; 1778 timevalclear(&zone->uz_ratecheck); 1779 1780 if (__predict_true(booted == BOOT_RUNNING)) 1781 zone_alloc_counters(zone); 1782 else { 1783 zone->uz_allocs = EARLY_COUNTER; 1784 zone->uz_frees = EARLY_COUNTER; 1785 zone->uz_fails = EARLY_COUNTER; 1786 } 1787 1788 /* 1789 * This is a pure cache zone, no kegs. 
1790 */ 1791 if (arg->import) { 1792 if (arg->flags & UMA_ZONE_VM) 1793 arg->flags |= UMA_ZFLAG_CACHEONLY; 1794 zone->uz_flags = arg->flags; 1795 zone->uz_size = arg->size; 1796 zone->uz_import = arg->import; 1797 zone->uz_release = arg->release; 1798 zone->uz_arg = arg->arg; 1799 zone->uz_lockptr = &zone->uz_lock; 1800 ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS)); 1801 rw_wlock(&uma_rwlock); 1802 LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link); 1803 rw_wunlock(&uma_rwlock); 1804 goto out; 1805 } 1806 1807 /* 1808 * Use the regular zone/keg/slab allocator. 1809 */ 1810 zone->uz_import = (uma_import)zone_import; 1811 zone->uz_release = (uma_release)zone_release; 1812 zone->uz_arg = zone; 1813 keg = arg->keg; 1814 1815 if (arg->flags & UMA_ZONE_SECONDARY) { 1816 KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); 1817 zone->uz_init = arg->uminit; 1818 zone->uz_fini = arg->fini; 1819 zone->uz_lockptr = &keg->uk_lock; 1820 zone->uz_flags |= UMA_ZONE_SECONDARY; 1821 rw_wlock(&uma_rwlock); 1822 ZONE_LOCK(zone); 1823 LIST_FOREACH(z, &keg->uk_zones, uz_link) { 1824 if (LIST_NEXT(z, uz_link) == NULL) { 1825 LIST_INSERT_AFTER(z, zone, uz_link); 1826 break; 1827 } 1828 } 1829 ZONE_UNLOCK(zone); 1830 rw_wunlock(&uma_rwlock); 1831 } else if (keg == NULL) { 1832 if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini, 1833 arg->align, arg->flags)) == NULL) 1834 return (ENOMEM); 1835 } else { 1836 struct uma_kctor_args karg; 1837 int error; 1838 1839 /* We should only be here from uma_startup() */ 1840 karg.size = arg->size; 1841 karg.uminit = arg->uminit; 1842 karg.fini = arg->fini; 1843 karg.align = arg->align; 1844 karg.flags = arg->flags; 1845 karg.zone = zone; 1846 error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, 1847 flags); 1848 if (error) 1849 return (error); 1850 } 1851 1852 zone->uz_keg = keg; 1853 zone->uz_size = keg->uk_size; 1854 zone->uz_flags |= (keg->uk_flags & 1855 (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT)); 1856 1857 /* 1858 * Some internal zones don't have room allocated for the per cpu 1859 * caches. If we're internal, bail out here. 1860 */ 1861 if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { 1862 KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0, 1863 ("Secondary zone requested UMA_ZFLAG_INTERNAL")); 1864 return (0); 1865 } 1866 1867 out: 1868 KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != 1869 (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), 1870 ("Invalid zone flag combination")); 1871 if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0) { 1872 zone->uz_count = BUCKET_MAX; 1873 } else if ((arg->flags & UMA_ZONE_MINBUCKET) != 0) { 1874 zone->uz_count = BUCKET_MIN; 1875 zone->uz_count_max = BUCKET_MIN; 1876 } else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0) 1877 zone->uz_count = 0; 1878 else 1879 zone->uz_count = bucket_select(zone->uz_size); 1880 zone->uz_count_min = zone->uz_count; 1881 1882 return (0); 1883 } 1884 1885 /* 1886 * Keg header dtor. This frees all data, destroys locks, frees the hash 1887 * table and removes the keg from the global list. 1888 * 1889 * Arguments/Returns follow uma_dtor specifications 1890 * udata unused 1891 */ 1892 static void 1893 keg_dtor(void *arg, int size, void *udata) 1894 { 1895 uma_keg_t keg; 1896 1897 keg = (uma_keg_t)arg; 1898 KEG_LOCK(keg); 1899 if (keg->uk_free != 0) { 1900 printf("Freed UMA keg (%s) was not empty (%d items). " 1901 " Lost %d pages of memory.\n", 1902 keg->uk_name ? 
		    keg->uk_free, keg->uk_pages);
	}
	KEG_UNLOCK(keg);

	hash_free(&keg->uk_hash);

	KEG_LOCK_FINI(keg);
}

/*
 * Zone header dtor.
 *
 * Arguments/Returns follow uma_dtor specifications
 *	udata  unused
 */
static void
zone_dtor(void *arg, int size, void *udata)
{
	uma_zone_t zone;
	uma_keg_t keg;

	zone = (uma_zone_t)arg;

	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
		cache_drain(zone);

	rw_wlock(&uma_rwlock);
	LIST_REMOVE(zone, uz_link);
	rw_wunlock(&uma_rwlock);
	/*
	 * XXX there are some races here where
	 * the zone can be drained but zone lock
	 * released and then refilled before we
	 * remove it... we don't care for now
	 */
	zone_drain_wait(zone, M_WAITOK);
	/*
	 * We only destroy kegs from non secondary/non cache zones.
	 */
	if ((zone->uz_flags & (UMA_ZONE_SECONDARY | UMA_ZFLAG_CACHE)) == 0) {
		keg = zone->uz_keg;
		rw_wlock(&uma_rwlock);
		LIST_REMOVE(keg, uk_link);
		rw_wunlock(&uma_rwlock);
		zone_free_item(kegs, keg, NULL, SKIP_NONE);
	}
	counter_u64_free(zone->uz_allocs);
	counter_u64_free(zone->uz_frees);
	counter_u64_free(zone->uz_fails);
	if (zone->uz_lockptr == &zone->uz_lock)
		ZONE_LOCK_FINI(zone);
}

/*
 * Traverses every zone in the system and calls a callback
 *
 * Arguments:
 *	zfunc  A pointer to a function which accepts a zone
 *	       as an argument.
 *
 * Returns:
 *	Nothing
 */
static void
zone_foreach(void (*zfunc)(uma_zone_t))
{
	uma_keg_t keg;
	uma_zone_t zone;

	/*
	 * Before BOOT_RUNNING we are guaranteed to be single
	 * threaded, so locking isn't needed.  Startup functions
	 * are allowed to use M_WAITOK.
	 */
	if (__predict_true(booted == BOOT_RUNNING))
		rw_rlock(&uma_rwlock);
	LIST_FOREACH(keg, &uma_kegs, uk_link) {
		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
			zfunc(zone);
	}
	if (__predict_true(booted == BOOT_RUNNING))
		rw_runlock(&uma_rwlock);
}

/*
 * Count how many pages we need to bootstrap.  The VM supplies its need for
 * early zones in the argument; we add up our zones, which consist of: UMA
 * Slabs, UMA Hash and 9 Bucket zones.  The zone of zones and zone of kegs
 * are accounted separately.
 */
#define	UMA_BOOT_ZONES	11
/* Zone of zones and zone of kegs have arbitrary alignment. */
#define	UMA_BOOT_ALIGN	32
static int zsize, ksize;
int
uma_startup_count(int vm_zones)
{
	int zones, pages;

	ksize = sizeof(struct uma_keg) +
	    (sizeof(struct uma_domain) * vm_ndomains);
	zsize = sizeof(struct uma_zone) +
	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
	    (sizeof(struct uma_zone_domain) * vm_ndomains);

	/*
	 * Memory for the zone of kegs and its keg,
	 * and for zone of zones.
	 */
	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);

#ifdef UMA_MD_SMALL_ALLOC
	zones = UMA_BOOT_ZONES;
#else
	zones = UMA_BOOT_ZONES + vm_zones;
	vm_zones = 0;
#endif

	/* Memory for the rest of startup zones, UMA and VM, ... */
	if (zsize > UMA_SLAB_SPACE) {
		/* See keg_large_init(). */
*/ 2025 u_int ppera; 2026 2027 ppera = howmany(roundup2(zsize, UMA_BOOT_ALIGN), PAGE_SIZE); 2028 if (PAGE_SIZE * ppera - roundup2(zsize, UMA_BOOT_ALIGN) < 2029 SIZEOF_UMA_SLAB) 2030 ppera++; 2031 pages += (zones + vm_zones) * ppera; 2032 } else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE) 2033 /* See keg_small_init() special case for uk_ppera = 1. */ 2034 pages += zones; 2035 else 2036 pages += howmany(zones, 2037 UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN)); 2038 2039 /* ... and their kegs. Note that zone of zones allocates a keg! */ 2040 pages += howmany(zones + 1, 2041 UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN)); 2042 2043 /* 2044 * Most of startup zones are not going to be offpages, that's 2045 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all 2046 * calculations. Some large bucket zones will be offpage, and 2047 * thus will allocate hashes. We take conservative approach 2048 * and assume that all zones may allocate hash. This may give 2049 * us some positive inaccuracy, usually an extra single page. 2050 */ 2051 pages += howmany(zones, UMA_SLAB_SPACE / 2052 (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT)); 2053 2054 return (pages); 2055 } 2056 2057 void 2058 uma_startup(void *mem, int npages) 2059 { 2060 struct uma_zctor_args args; 2061 uma_keg_t masterkeg; 2062 uintptr_t m; 2063 2064 #ifdef DIAGNOSTIC 2065 printf("Entering %s with %d boot pages configured\n", __func__, npages); 2066 #endif 2067 2068 rw_init(&uma_rwlock, "UMA lock"); 2069 2070 /* Use bootpages memory for the zone of zones and zone of kegs. */ 2071 m = (uintptr_t)mem; 2072 zones = (uma_zone_t)m; 2073 m += roundup(zsize, CACHE_LINE_SIZE); 2074 kegs = (uma_zone_t)m; 2075 m += roundup(zsize, CACHE_LINE_SIZE); 2076 masterkeg = (uma_keg_t)m; 2077 m += roundup(ksize, CACHE_LINE_SIZE); 2078 m = roundup(m, PAGE_SIZE); 2079 npages -= (m - (uintptr_t)mem) / PAGE_SIZE; 2080 mem = (void *)m; 2081 2082 /* "manually" create the initial zone */ 2083 memset(&args, 0, sizeof(args)); 2084 args.name = "UMA Kegs"; 2085 args.size = ksize; 2086 args.ctor = keg_ctor; 2087 args.dtor = keg_dtor; 2088 args.uminit = zero_init; 2089 args.fini = NULL; 2090 args.keg = masterkeg; 2091 args.align = UMA_BOOT_ALIGN - 1; 2092 args.flags = UMA_ZFLAG_INTERNAL; 2093 zone_ctor(kegs, zsize, &args, M_WAITOK); 2094 2095 bootmem = mem; 2096 boot_pages = npages; 2097 2098 args.name = "UMA Zones"; 2099 args.size = zsize; 2100 args.ctor = zone_ctor; 2101 args.dtor = zone_dtor; 2102 args.uminit = zero_init; 2103 args.fini = NULL; 2104 args.keg = NULL; 2105 args.align = UMA_BOOT_ALIGN - 1; 2106 args.flags = UMA_ZFLAG_INTERNAL; 2107 zone_ctor(zones, zsize, &args, M_WAITOK); 2108 2109 /* Now make a zone for slab headers */ 2110 slabzone = uma_zcreate("UMA Slabs", 2111 sizeof(struct uma_slab), 2112 NULL, NULL, NULL, NULL, 2113 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2114 2115 hashzone = uma_zcreate("UMA Hash", 2116 sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, 2117 NULL, NULL, NULL, NULL, 2118 UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); 2119 2120 bucket_init(); 2121 2122 booted = BOOT_STRAPPED; 2123 } 2124 2125 void 2126 uma_startup1(void) 2127 { 2128 2129 #ifdef DIAGNOSTIC 2130 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2131 #endif 2132 booted = BOOT_PAGEALLOC; 2133 } 2134 2135 void 2136 uma_startup2(void) 2137 { 2138 2139 #ifdef DIAGNOSTIC 2140 printf("Entering %s with %d boot pages left\n", __func__, boot_pages); 2141 #endif 2142 booted = BOOT_BUCKETS; 2143 sx_init(&uma_drain_lock, "umadrain"); 2144 bucket_enable(); 
2145 } 2146 2147 /* 2148 * Initialize our callout handle 2149 * 2150 */ 2151 static void 2152 uma_startup3(void) 2153 { 2154 2155 #ifdef INVARIANTS 2156 TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor); 2157 uma_dbg_cnt = counter_u64_alloc(M_WAITOK); 2158 uma_skip_cnt = counter_u64_alloc(M_WAITOK); 2159 #endif 2160 zone_foreach(zone_alloc_counters); 2161 callout_init(&uma_callout, 1); 2162 callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); 2163 booted = BOOT_RUNNING; 2164 } 2165 2166 static uma_keg_t 2167 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, 2168 int align, uint32_t flags) 2169 { 2170 struct uma_kctor_args args; 2171 2172 args.size = size; 2173 args.uminit = uminit; 2174 args.fini = fini; 2175 args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; 2176 args.flags = flags; 2177 args.zone = zone; 2178 return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK)); 2179 } 2180 2181 /* Public functions */ 2182 /* See uma.h */ 2183 void 2184 uma_set_align(int align) 2185 { 2186 2187 if (align != UMA_ALIGN_CACHE) 2188 uma_align_cache = align; 2189 } 2190 2191 /* See uma.h */ 2192 uma_zone_t 2193 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor, 2194 uma_init uminit, uma_fini fini, int align, uint32_t flags) 2195 2196 { 2197 struct uma_zctor_args args; 2198 uma_zone_t res; 2199 bool locked; 2200 2201 KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"", 2202 align, name)); 2203 2204 /* Sets all zones to a first-touch domain policy. */ 2205 #ifdef UMA_FIRSTTOUCH 2206 flags |= UMA_ZONE_NUMA; 2207 #endif 2208 2209 /* This stuff is essential for the zone ctor */ 2210 memset(&args, 0, sizeof(args)); 2211 args.name = name; 2212 args.size = size; 2213 args.ctor = ctor; 2214 args.dtor = dtor; 2215 args.uminit = uminit; 2216 args.fini = fini; 2217 #ifdef INVARIANTS 2218 /* 2219 * If a zone is being created with an empty constructor and 2220 * destructor, pass UMA constructor/destructor which checks for 2221 * memory use after free. 2222 */ 2223 if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) && 2224 ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) { 2225 args.ctor = trash_ctor; 2226 args.dtor = trash_dtor; 2227 args.uminit = trash_init; 2228 args.fini = trash_fini; 2229 } 2230 #endif 2231 args.align = align; 2232 args.flags = flags; 2233 args.keg = NULL; 2234 2235 if (booted < BOOT_BUCKETS) { 2236 locked = false; 2237 } else { 2238 sx_slock(&uma_drain_lock); 2239 locked = true; 2240 } 2241 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2242 if (locked) 2243 sx_sunlock(&uma_drain_lock); 2244 return (res); 2245 } 2246 2247 /* See uma.h */ 2248 uma_zone_t 2249 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, 2250 uma_init zinit, uma_fini zfini, uma_zone_t master) 2251 { 2252 struct uma_zctor_args args; 2253 uma_keg_t keg; 2254 uma_zone_t res; 2255 bool locked; 2256 2257 keg = master->uz_keg; 2258 memset(&args, 0, sizeof(args)); 2259 args.name = name; 2260 args.size = keg->uk_size; 2261 args.ctor = ctor; 2262 args.dtor = dtor; 2263 args.uminit = zinit; 2264 args.fini = zfini; 2265 args.align = keg->uk_align; 2266 args.flags = keg->uk_flags | UMA_ZONE_SECONDARY; 2267 args.keg = keg; 2268 2269 if (booted < BOOT_BUCKETS) { 2270 locked = false; 2271 } else { 2272 sx_slock(&uma_drain_lock); 2273 locked = true; 2274 } 2275 /* XXX Attaches only one keg of potentially many. 
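 *
 * Illustrative sketch only (the "foo"/"bar" names are hypothetical):
 * a secondary zone shares the master's keg, and therefore its item
 * size and slab layout, but layers its own ctor/dtor on top:
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
 *	    foo_init, foo_fini, UMA_ALIGN_PTR, 0);
 *	bar_zone = uma_zsecond_create("bar", bar_ctor, bar_dtor,
 *	    NULL, NULL, foo_zone);
 *
 * Frees from either zone return items to the shared keg.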
*/ 2276 res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK); 2277 if (locked) 2278 sx_sunlock(&uma_drain_lock); 2279 return (res); 2280 } 2281 2282 /* See uma.h */ 2283 uma_zone_t 2284 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor, 2285 uma_init zinit, uma_fini zfini, uma_import zimport, 2286 uma_release zrelease, void *arg, int flags) 2287 { 2288 struct uma_zctor_args args; 2289 2290 memset(&args, 0, sizeof(args)); 2291 args.name = name; 2292 args.size = size; 2293 args.ctor = ctor; 2294 args.dtor = dtor; 2295 args.uminit = zinit; 2296 args.fini = zfini; 2297 args.import = zimport; 2298 args.release = zrelease; 2299 args.arg = arg; 2300 args.align = 0; 2301 args.flags = flags | UMA_ZFLAG_CACHE; 2302 2303 return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK)); 2304 } 2305 2306 /* See uma.h */ 2307 void 2308 uma_zdestroy(uma_zone_t zone) 2309 { 2310 2311 sx_slock(&uma_drain_lock); 2312 zone_free_item(zones, zone, NULL, SKIP_NONE); 2313 sx_sunlock(&uma_drain_lock); 2314 } 2315 2316 void 2317 uma_zwait(uma_zone_t zone) 2318 { 2319 void *item; 2320 2321 item = uma_zalloc_arg(zone, NULL, M_WAITOK); 2322 uma_zfree(zone, item); 2323 } 2324 2325 void * 2326 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags) 2327 { 2328 void *item; 2329 #ifdef SMP 2330 int i; 2331 2332 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2333 #endif 2334 item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO); 2335 if (item != NULL && (flags & M_ZERO)) { 2336 #ifdef SMP 2337 for (i = 0; i <= mp_maxid; i++) 2338 bzero(zpcpu_get_cpu(item, i), zone->uz_size); 2339 #else 2340 bzero(item, zone->uz_size); 2341 #endif 2342 } 2343 return (item); 2344 } 2345 2346 /* 2347 * A stub while both regular and pcpu cases are identical. 2348 */ 2349 void 2350 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata) 2351 { 2352 2353 #ifdef SMP 2354 MPASS(zone->uz_flags & UMA_ZONE_PCPU); 2355 #endif 2356 uma_zfree_arg(zone, item, udata); 2357 } 2358 2359 /* See uma.h */ 2360 void * 2361 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) 2362 { 2363 uma_zone_domain_t zdom; 2364 uma_bucket_t bucket; 2365 uma_cache_t cache; 2366 void *item; 2367 int cpu, domain, lockfail, maxbucket; 2368 #ifdef INVARIANTS 2369 bool skipdbg; 2370 #endif 2371 2372 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2373 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2374 2375 /* This is the fast path allocation */ 2376 CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d", 2377 curthread, zone->uz_name, zone, flags); 2378 2379 if (flags & M_WAITOK) { 2380 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2381 "uma_zalloc_arg: zone \"%s\"", zone->uz_name); 2382 } 2383 KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC")); 2384 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2385 ("uma_zalloc_arg: called with spinlock or critical section held")); 2386 if (zone->uz_flags & UMA_ZONE_PCPU) 2387 KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone " 2388 "with M_ZERO passed")); 2389 2390 #ifdef DEBUG_MEMGUARD 2391 if (memguard_cmp_zone(zone)) { 2392 item = memguard_alloc(zone->uz_size, flags); 2393 if (item != NULL) { 2394 if (zone->uz_init != NULL && 2395 zone->uz_init(item, zone->uz_size, flags) != 0) 2396 return (NULL); 2397 if (zone->uz_ctor != NULL && 2398 zone->uz_ctor(item, zone->uz_size, udata, 2399 flags) != 0) { 2400 zone->uz_fini(item, zone->uz_size); 2401 return (NULL); 2402 } 2403 return (item); 2404 } 2405 /* This is unfortunate but 
should not be fatal. */ 2406 } 2407 #endif 2408 /* 2409 * If possible, allocate from the per-CPU cache. There are two 2410 * requirements for safe access to the per-CPU cache: (1) the thread 2411 * accessing the cache must not be preempted or yield during access, 2412 * and (2) the thread must not migrate CPUs without switching which 2413 * cache it accesses. We rely on a critical section to prevent 2414 * preemption and migration. We release the critical section in 2415 * order to acquire the zone mutex if we are unable to allocate from 2416 * the current cache; when we re-acquire the critical section, we 2417 * must detect and handle migration if it has occurred. 2418 */ 2419 zalloc_restart: 2420 critical_enter(); 2421 cpu = curcpu; 2422 cache = &zone->uz_cpu[cpu]; 2423 2424 zalloc_start: 2425 bucket = cache->uc_allocbucket; 2426 if (bucket != NULL && bucket->ub_cnt > 0) { 2427 bucket->ub_cnt--; 2428 item = bucket->ub_bucket[bucket->ub_cnt]; 2429 #ifdef INVARIANTS 2430 bucket->ub_bucket[bucket->ub_cnt] = NULL; 2431 #endif 2432 KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); 2433 cache->uc_allocs++; 2434 critical_exit(); 2435 #ifdef INVARIANTS 2436 skipdbg = uma_dbg_zskip(zone, item); 2437 #endif 2438 if (zone->uz_ctor != NULL && 2439 #ifdef INVARIANTS 2440 (!skipdbg || zone->uz_ctor != trash_ctor || 2441 zone->uz_dtor != trash_dtor) && 2442 #endif 2443 zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { 2444 counter_u64_add(zone->uz_fails, 1); 2445 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); 2446 return (NULL); 2447 } 2448 #ifdef INVARIANTS 2449 if (!skipdbg) 2450 uma_dbg_alloc(zone, NULL, item); 2451 #endif 2452 if (flags & M_ZERO) 2453 uma_zero_item(item, zone); 2454 return (item); 2455 } 2456 2457 /* 2458 * We have run out of items in our alloc bucket. 2459 * See if we can switch with our free bucket. 2460 */ 2461 bucket = cache->uc_freebucket; 2462 if (bucket != NULL && bucket->ub_cnt > 0) { 2463 CTR2(KTR_UMA, 2464 "uma_zalloc: zone %s(%p) swapping empty with alloc", 2465 zone->uz_name, zone); 2466 cache->uc_freebucket = cache->uc_allocbucket; 2467 cache->uc_allocbucket = bucket; 2468 goto zalloc_start; 2469 } 2470 2471 /* 2472 * Discard any empty allocation bucket while we hold no locks. 2473 */ 2474 bucket = cache->uc_allocbucket; 2475 cache->uc_allocbucket = NULL; 2476 critical_exit(); 2477 if (bucket != NULL) 2478 bucket_free(zone, bucket, udata); 2479 2480 /* Short-circuit for zones without buckets and low memory. */ 2481 if (zone->uz_count == 0 || bucketdisable) { 2482 ZONE_LOCK(zone); 2483 if (zone->uz_flags & UMA_ZONE_NUMA) 2484 domain = PCPU_GET(domain); 2485 else 2486 domain = UMA_ANYDOMAIN; 2487 goto zalloc_item; 2488 } 2489 2490 /* 2491 * Attempt to retrieve the item from the per-CPU cache has failed, so 2492 * we must go back to the zone. This requires the zone lock, so we 2493 * must drop the critical section, then re-acquire it when we go back 2494 * to the cache. Since the critical section is released, we may be 2495 * preempted or migrate. As such, make sure not to maintain any 2496 * thread-local state specific to the cache from prior to releasing 2497 * the critical section. 2498 */ 2499 lockfail = 0; 2500 if (ZONE_TRYLOCK(zone) == 0) { 2501 /* Record contention to size the buckets. */ 2502 ZONE_LOCK(zone); 2503 lockfail = 1; 2504 } 2505 critical_enter(); 2506 cpu = curcpu; 2507 cache = &zone->uz_cpu[cpu]; 2508 2509 /* See if we lost the race to fill the cache. 
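 *
 * Another thread, or this thread after migrating to a different CPU
 * while the zone lock was being acquired, may already have installed
 * a bucket in uc_allocbucket; in that case drop the zone lock and
 * retry the fast path instead of allocating another bucket.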
*/ 2510 if (cache->uc_allocbucket != NULL) { 2511 ZONE_UNLOCK(zone); 2512 goto zalloc_start; 2513 } 2514 2515 /* 2516 * Check the zone's cache of buckets. 2517 */ 2518 if (zone->uz_flags & UMA_ZONE_NUMA) { 2519 domain = PCPU_GET(domain); 2520 zdom = &zone->uz_domain[domain]; 2521 } else { 2522 domain = UMA_ANYDOMAIN; 2523 zdom = &zone->uz_domain[0]; 2524 } 2525 2526 if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) { 2527 KASSERT(bucket->ub_cnt != 0, 2528 ("uma_zalloc_arg: Returning an empty bucket.")); 2529 cache->uc_allocbucket = bucket; 2530 ZONE_UNLOCK(zone); 2531 goto zalloc_start; 2532 } 2533 /* We are no longer associated with this CPU. */ 2534 critical_exit(); 2535 2536 /* 2537 * We bump the uz count when the cache size is insufficient to 2538 * handle the working set. 2539 */ 2540 if (lockfail && zone->uz_count < zone->uz_count_max) 2541 zone->uz_count++; 2542 2543 if (zone->uz_max_items > 0) { 2544 if (zone->uz_items >= zone->uz_max_items) 2545 goto zalloc_item; 2546 maxbucket = MIN(zone->uz_count, 2547 zone->uz_max_items - zone->uz_items); 2548 zone->uz_items += maxbucket; 2549 } else 2550 maxbucket = zone->uz_count; 2551 ZONE_UNLOCK(zone); 2552 2553 /* 2554 * Now lets just fill a bucket and put it on the free list. If that 2555 * works we'll restart the allocation from the beginning and it 2556 * will use the just filled bucket. 2557 */ 2558 bucket = zone_alloc_bucket(zone, udata, domain, flags, maxbucket); 2559 CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p", 2560 zone->uz_name, zone, bucket); 2561 ZONE_LOCK(zone); 2562 if (bucket != NULL) { 2563 if (zone->uz_max_items > 0 && bucket->ub_cnt < maxbucket) { 2564 MPASS(zone->uz_items >= maxbucket - bucket->ub_cnt); 2565 zone->uz_items -= maxbucket - bucket->ub_cnt; 2566 if (zone->uz_sleepers > 0 && 2567 zone->uz_items < zone->uz_max_items) 2568 wakeup_one(zone); 2569 } 2570 critical_enter(); 2571 cpu = curcpu; 2572 cache = &zone->uz_cpu[cpu]; 2573 2574 /* 2575 * See if we lost the race or were migrated. Cache the 2576 * initialized bucket to make this less likely or claim 2577 * the memory directly. 2578 */ 2579 if (cache->uc_allocbucket == NULL && 2580 ((zone->uz_flags & UMA_ZONE_NUMA) == 0 || 2581 domain == PCPU_GET(domain))) { 2582 cache->uc_allocbucket = bucket; 2583 zdom->uzd_imax += bucket->ub_cnt; 2584 } else if (zone->uz_bkt_count >= zone->uz_bkt_max) { 2585 critical_exit(); 2586 ZONE_UNLOCK(zone); 2587 bucket_drain(zone, bucket); 2588 bucket_free(zone, bucket, udata); 2589 goto zalloc_restart; 2590 } else 2591 zone_put_bucket(zone, zdom, bucket, false); 2592 ZONE_UNLOCK(zone); 2593 goto zalloc_start; 2594 } else if (zone->uz_max_items > 0) { 2595 zone->uz_items -= maxbucket; 2596 if (zone->uz_sleepers > 0 && 2597 zone->uz_items + 1 < zone->uz_max_items) 2598 wakeup_one(zone); 2599 } 2600 2601 /* 2602 * We may not be able to get a bucket so return an actual item. 
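 *
 * Every path that reaches zalloc_item below still holds the zone
 * lock; zone_alloc_item_locked() consumes it and returns with the
 * zone unlocked.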
2603 */ 2604 zalloc_item: 2605 item = zone_alloc_item_locked(zone, udata, domain, flags); 2606 2607 return (item); 2608 } 2609 2610 void * 2611 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags) 2612 { 2613 2614 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 2615 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 2616 2617 /* This is the fast path allocation */ 2618 CTR5(KTR_UMA, 2619 "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d", 2620 curthread, zone->uz_name, zone, domain, flags); 2621 2622 if (flags & M_WAITOK) { 2623 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 2624 "uma_zalloc_domain: zone \"%s\"", zone->uz_name); 2625 } 2626 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 2627 ("uma_zalloc_domain: called with spinlock or critical section held")); 2628 2629 return (zone_alloc_item(zone, udata, domain, flags)); 2630 } 2631 2632 /* 2633 * Find a slab with some space. Prefer slabs that are partially used over those 2634 * that are totally full. This helps to reduce fragmentation. 2635 * 2636 * If 'rr' is 1, search all domains starting from 'domain'. Otherwise check 2637 * only 'domain'. 2638 */ 2639 static uma_slab_t 2640 keg_first_slab(uma_keg_t keg, int domain, bool rr) 2641 { 2642 uma_domain_t dom; 2643 uma_slab_t slab; 2644 int start; 2645 2646 KASSERT(domain >= 0 && domain < vm_ndomains, 2647 ("keg_first_slab: domain %d out of range", domain)); 2648 KEG_LOCK_ASSERT(keg); 2649 2650 slab = NULL; 2651 start = domain; 2652 do { 2653 dom = &keg->uk_domain[domain]; 2654 if (!LIST_EMPTY(&dom->ud_part_slab)) 2655 return (LIST_FIRST(&dom->ud_part_slab)); 2656 if (!LIST_EMPTY(&dom->ud_free_slab)) { 2657 slab = LIST_FIRST(&dom->ud_free_slab); 2658 LIST_REMOVE(slab, us_link); 2659 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 2660 return (slab); 2661 } 2662 if (rr) 2663 domain = (domain + 1) % vm_ndomains; 2664 } while (domain != start); 2665 2666 return (NULL); 2667 } 2668 2669 static uma_slab_t 2670 keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags) 2671 { 2672 uint32_t reserve; 2673 2674 KEG_LOCK_ASSERT(keg); 2675 2676 reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve; 2677 if (keg->uk_free <= reserve) 2678 return (NULL); 2679 return (keg_first_slab(keg, domain, rr)); 2680 } 2681 2682 static uma_slab_t 2683 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags) 2684 { 2685 struct vm_domainset_iter di; 2686 uma_domain_t dom; 2687 uma_slab_t slab; 2688 int aflags, domain; 2689 bool rr; 2690 2691 restart: 2692 KEG_LOCK_ASSERT(keg); 2693 2694 /* 2695 * Use the keg's policy if upper layers haven't already specified a 2696 * domain (as happens with first-touch zones). 2697 * 2698 * To avoid races we run the iterator with the keg lock held, but that 2699 * means that we cannot allow the vm_domainset layer to sleep. Thus, 2700 * clear M_WAITOK and handle low memory conditions locally. 2701 */ 2702 rr = rdomain == UMA_ANYDOMAIN; 2703 if (rr) { 2704 aflags = (flags & ~M_WAITOK) | M_NOWAIT; 2705 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 2706 &aflags); 2707 } else { 2708 aflags = flags; 2709 domain = rdomain; 2710 } 2711 2712 for (;;) { 2713 slab = keg_fetch_free_slab(keg, domain, rr, flags); 2714 if (slab != NULL) { 2715 MPASS(slab->us_keg == keg); 2716 return (slab); 2717 } 2718 2719 /* 2720 * M_NOVM means don't ask at all! 
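 *
 * A caller passing M_NOVM is only willing to consume slabs the keg
 * already owns, so we never call keg_alloc_slab() on its behalf.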
2721 */ 2722 if (flags & M_NOVM) 2723 break; 2724 2725 KASSERT(zone->uz_max_items == 0 || 2726 zone->uz_items <= zone->uz_max_items, 2727 ("%s: zone %p overflow", __func__, zone)); 2728 2729 slab = keg_alloc_slab(keg, zone, domain, flags, aflags); 2730 /* 2731 * If we got a slab here it's safe to mark it partially used 2732 * and return. We assume that the caller is going to remove 2733 * at least one item. 2734 */ 2735 if (slab) { 2736 MPASS(slab->us_keg == keg); 2737 dom = &keg->uk_domain[slab->us_domain]; 2738 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 2739 return (slab); 2740 } 2741 KEG_LOCK(keg); 2742 if (rr && vm_domainset_iter_policy(&di, &domain) != 0) { 2743 if ((flags & M_WAITOK) != 0) { 2744 KEG_UNLOCK(keg); 2745 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); 2746 KEG_LOCK(keg); 2747 goto restart; 2748 } 2749 break; 2750 } 2751 } 2752 2753 /* 2754 * We might not have been able to get a slab but another cpu 2755 * could have while we were unlocked. Check again before we 2756 * fail. 2757 */ 2758 if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) { 2759 MPASS(slab->us_keg == keg); 2760 return (slab); 2761 } 2762 return (NULL); 2763 } 2764 2765 static uma_slab_t 2766 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags) 2767 { 2768 uma_slab_t slab; 2769 2770 if (keg == NULL) { 2771 keg = zone->uz_keg; 2772 KEG_LOCK(keg); 2773 } 2774 2775 for (;;) { 2776 slab = keg_fetch_slab(keg, zone, domain, flags); 2777 if (slab) 2778 return (slab); 2779 if (flags & (M_NOWAIT | M_NOVM)) 2780 break; 2781 } 2782 KEG_UNLOCK(keg); 2783 return (NULL); 2784 } 2785 2786 static void * 2787 slab_alloc_item(uma_keg_t keg, uma_slab_t slab) 2788 { 2789 uma_domain_t dom; 2790 void *item; 2791 uint8_t freei; 2792 2793 MPASS(keg == slab->us_keg); 2794 KEG_LOCK_ASSERT(keg); 2795 2796 freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1; 2797 BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free); 2798 item = slab->us_data + (keg->uk_rsize * freei); 2799 slab->us_freecount--; 2800 keg->uk_free--; 2801 2802 /* Move this slab to the full list */ 2803 if (slab->us_freecount == 0) { 2804 LIST_REMOVE(slab, us_link); 2805 dom = &keg->uk_domain[slab->us_domain]; 2806 LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link); 2807 } 2808 2809 return (item); 2810 } 2811 2812 static int 2813 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags) 2814 { 2815 uma_slab_t slab; 2816 uma_keg_t keg; 2817 #ifdef NUMA 2818 int stripe; 2819 #endif 2820 int i; 2821 2822 slab = NULL; 2823 keg = NULL; 2824 /* Try to keep the buckets totally full */ 2825 for (i = 0; i < max; ) { 2826 if ((slab = zone_fetch_slab(zone, keg, domain, flags)) == NULL) 2827 break; 2828 keg = slab->us_keg; 2829 #ifdef NUMA 2830 stripe = howmany(max, vm_ndomains); 2831 #endif 2832 while (slab->us_freecount && i < max) { 2833 bucket[i++] = slab_alloc_item(keg, slab); 2834 if (keg->uk_free <= keg->uk_reserve) 2835 break; 2836 #ifdef NUMA 2837 /* 2838 * If the zone is striped we pick a new slab for every 2839 * N allocations. Eliminating this conditional will 2840 * instead pick a new domain for each bucket rather 2841 * than stripe within each bucket. The current option 2842 * produces more fragmentation and requires more cpu 2843 * time but yields better distribution. 2844 */ 2845 if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 && 2846 vm_ndomains > 1 && --stripe == 0) 2847 break; 2848 #endif 2849 } 2850 /* Don't block if we allocated any successfully. 
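 *
 * Once at least one slab has yielded items it is better to hand the
 * caller a partially filled bucket than to sleep for more memory, so
 * subsequent slab fetches are downgraded to M_NOWAIT.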
*/ 2851 flags &= ~M_WAITOK; 2852 flags |= M_NOWAIT; 2853 } 2854 if (slab != NULL) 2855 KEG_UNLOCK(keg); 2856 2857 return i; 2858 } 2859 2860 static uma_bucket_t 2861 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max) 2862 { 2863 uma_bucket_t bucket; 2864 2865 CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain); 2866 2867 /* Avoid allocs targeting empty domains. */ 2868 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 2869 domain = UMA_ANYDOMAIN; 2870 2871 /* Don't wait for buckets, preserve caller's NOVM setting. */ 2872 bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM)); 2873 if (bucket == NULL) 2874 return (NULL); 2875 2876 bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket, 2877 MIN(max, bucket->ub_entries), domain, flags); 2878 2879 /* 2880 * Initialize the memory if necessary. 2881 */ 2882 if (bucket->ub_cnt != 0 && zone->uz_init != NULL) { 2883 int i; 2884 2885 for (i = 0; i < bucket->ub_cnt; i++) 2886 if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size, 2887 flags) != 0) 2888 break; 2889 /* 2890 * If we couldn't initialize the whole bucket, put the 2891 * rest back onto the freelist. 2892 */ 2893 if (i != bucket->ub_cnt) { 2894 zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i], 2895 bucket->ub_cnt - i); 2896 #ifdef INVARIANTS 2897 bzero(&bucket->ub_bucket[i], 2898 sizeof(void *) * (bucket->ub_cnt - i)); 2899 #endif 2900 bucket->ub_cnt = i; 2901 } 2902 } 2903 2904 if (bucket->ub_cnt == 0) { 2905 bucket_free(zone, bucket, udata); 2906 counter_u64_add(zone->uz_fails, 1); 2907 return (NULL); 2908 } 2909 2910 return (bucket); 2911 } 2912 2913 /* 2914 * Allocates a single item from a zone. 2915 * 2916 * Arguments 2917 * zone The zone to alloc for. 2918 * udata The data to be passed to the constructor. 2919 * domain The domain to allocate from or UMA_ANYDOMAIN. 2920 * flags M_WAITOK, M_NOWAIT, M_ZERO. 2921 * 2922 * Returns 2923 * NULL if there is no memory and M_NOWAIT is set 2924 * An item if successful 2925 */ 2926 2927 static void * 2928 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags) 2929 { 2930 2931 ZONE_LOCK(zone); 2932 return (zone_alloc_item_locked(zone, udata, domain, flags)); 2933 } 2934 2935 /* 2936 * Returns with zone unlocked. 2937 */ 2938 static void * 2939 zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags) 2940 { 2941 void *item; 2942 #ifdef INVARIANTS 2943 bool skipdbg; 2944 #endif 2945 2946 ZONE_LOCK_ASSERT(zone); 2947 2948 if (zone->uz_max_items > 0) { 2949 if (zone->uz_items >= zone->uz_max_items) { 2950 zone_log_warning(zone); 2951 zone_maxaction(zone); 2952 if (flags & M_NOWAIT) { 2953 ZONE_UNLOCK(zone); 2954 return (NULL); 2955 } 2956 zone->uz_sleeps++; 2957 zone->uz_sleepers++; 2958 while (zone->uz_items >= zone->uz_max_items) 2959 mtx_sleep(zone, zone->uz_lockptr, PVM, 2960 "zonelimit", 0); 2961 zone->uz_sleepers--; 2962 if (zone->uz_sleepers > 0 && 2963 zone->uz_items + 1 < zone->uz_max_items) 2964 wakeup_one(zone); 2965 } 2966 zone->uz_items++; 2967 } 2968 ZONE_UNLOCK(zone); 2969 2970 /* Avoid allocs targeting empty domains. */ 2971 if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain)) 2972 domain = UMA_ANYDOMAIN; 2973 2974 if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1) 2975 goto fail; 2976 2977 #ifdef INVARIANTS 2978 skipdbg = uma_dbg_zskip(zone, item); 2979 #endif 2980 /* 2981 * We have to call both the zone's init (not the keg's init) 2982 * and the zone's ctor. 
This is because the item is going from 2983 * a keg slab directly to the user, and the user is expecting it 2984 * to be both zone-init'd as well as zone-ctor'd. 2985 */ 2986 if (zone->uz_init != NULL) { 2987 if (zone->uz_init(item, zone->uz_size, flags) != 0) { 2988 zone_free_item(zone, item, udata, SKIP_FINI | SKIP_CNT); 2989 goto fail; 2990 } 2991 } 2992 if (zone->uz_ctor != NULL && 2993 #ifdef INVARIANTS 2994 (!skipdbg || zone->uz_ctor != trash_ctor || 2995 zone->uz_dtor != trash_dtor) && 2996 #endif 2997 zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) { 2998 zone_free_item(zone, item, udata, SKIP_DTOR | SKIP_CNT); 2999 goto fail; 3000 } 3001 #ifdef INVARIANTS 3002 if (!skipdbg) 3003 uma_dbg_alloc(zone, NULL, item); 3004 #endif 3005 if (flags & M_ZERO) 3006 uma_zero_item(item, zone); 3007 3008 counter_u64_add(zone->uz_allocs, 1); 3009 CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item, 3010 zone->uz_name, zone); 3011 3012 return (item); 3013 3014 fail: 3015 if (zone->uz_max_items > 0) { 3016 ZONE_LOCK(zone); 3017 zone->uz_items--; 3018 ZONE_UNLOCK(zone); 3019 } 3020 counter_u64_add(zone->uz_fails, 1); 3021 CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)", 3022 zone->uz_name, zone); 3023 return (NULL); 3024 } 3025 3026 /* See uma.h */ 3027 void 3028 uma_zfree_arg(uma_zone_t zone, void *item, void *udata) 3029 { 3030 uma_cache_t cache; 3031 uma_bucket_t bucket; 3032 uma_zone_domain_t zdom; 3033 int cpu, domain; 3034 #ifdef UMA_XDOMAIN 3035 int itemdomain; 3036 #endif 3037 bool lockfail; 3038 #ifdef INVARIANTS 3039 bool skipdbg; 3040 #endif 3041 3042 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3043 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3044 3045 CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, 3046 zone->uz_name); 3047 3048 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3049 ("uma_zfree_arg: called with spinlock or critical section held")); 3050 3051 /* uma_zfree(..., NULL) does nothing, to match free(9). */ 3052 if (item == NULL) 3053 return; 3054 #ifdef DEBUG_MEMGUARD 3055 if (is_memguard_addr(item)) { 3056 if (zone->uz_dtor != NULL) 3057 zone->uz_dtor(item, zone->uz_size, udata); 3058 if (zone->uz_fini != NULL) 3059 zone->uz_fini(item, zone->uz_size); 3060 memguard_free(item); 3061 return; 3062 } 3063 #endif 3064 #ifdef INVARIANTS 3065 skipdbg = uma_dbg_zskip(zone, item); 3066 if (skipdbg == false) { 3067 if (zone->uz_flags & UMA_ZONE_MALLOC) 3068 uma_dbg_free(zone, udata, item); 3069 else 3070 uma_dbg_free(zone, NULL, item); 3071 } 3072 if (zone->uz_dtor != NULL && (!skipdbg || 3073 zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor)) 3074 #else 3075 if (zone->uz_dtor != NULL) 3076 #endif 3077 zone->uz_dtor(item, zone->uz_size, udata); 3078 3079 /* 3080 * The race here is acceptable. If we miss it we'll just have to wait 3081 * a little longer for the limits to be reset. 3082 */ 3083 if (zone->uz_sleepers > 0) 3084 goto zfree_item; 3085 3086 #ifdef UMA_XDOMAIN 3087 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) 3088 itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); 3089 #endif 3090 3091 /* 3092 * If possible, free to the per-CPU cache. There are two 3093 * requirements for safe access to the per-CPU cache: (1) the thread 3094 * accessing the cache must not be preempted or yield during access, 3095 * and (2) the thread must not migrate CPUs without switching which 3096 * cache it accesses. We rely on a critical section to prevent 3097 * preemption and migration. 
We release the critical section in 3098 * order to acquire the zone mutex if we are unable to free to the 3099 * current cache; when we re-acquire the critical section, we must 3100 * detect and handle migration if it has occurred. 3101 */ 3102 zfree_restart: 3103 critical_enter(); 3104 cpu = curcpu; 3105 cache = &zone->uz_cpu[cpu]; 3106 3107 zfree_start: 3108 domain = PCPU_GET(domain); 3109 #ifdef UMA_XDOMAIN 3110 if ((zone->uz_flags & UMA_ZONE_NUMA) == 0) 3111 itemdomain = domain; 3112 #endif 3113 /* 3114 * Try to free into the allocbucket first to give LIFO ordering 3115 * for cache-hot data structures. Spill over into the freebucket 3116 * if necessary. Alloc will swap them if one runs dry. 3117 */ 3118 #ifdef UMA_XDOMAIN 3119 if (domain != itemdomain) { 3120 bucket = cache->uc_crossbucket; 3121 } else 3122 #endif 3123 { 3124 bucket = cache->uc_allocbucket; 3125 if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries) 3126 bucket = cache->uc_freebucket; 3127 } 3128 if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { 3129 KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, 3130 ("uma_zfree: Freeing to non free bucket index.")); 3131 bucket->ub_bucket[bucket->ub_cnt] = item; 3132 bucket->ub_cnt++; 3133 cache->uc_frees++; 3134 critical_exit(); 3135 return; 3136 } 3137 3138 /* 3139 * We must go back to the zone, which requires acquiring the zone lock, 3140 * which in turn means we must release and re-acquire the critical 3141 * section. Since the critical section is released, we may be 3142 * preempted or migrate. As such, make sure not to maintain any 3143 * thread-local state specific to the cache from prior to releasing 3144 * the critical section. 3145 */ 3146 critical_exit(); 3147 if (zone->uz_count == 0 || bucketdisable) 3148 goto zfree_item; 3149 3150 lockfail = false; 3151 if (ZONE_TRYLOCK(zone) == 0) { 3152 /* Record contention to size the buckets. */ 3153 ZONE_LOCK(zone); 3154 lockfail = true; 3155 } 3156 critical_enter(); 3157 cpu = curcpu; 3158 domain = PCPU_GET(domain); 3159 cache = &zone->uz_cpu[cpu]; 3160 3161 #ifdef UMA_XDOMAIN 3162 if (domain != itemdomain) 3163 bucket = cache->uc_crossbucket; 3164 else 3165 #endif 3166 bucket = cache->uc_freebucket; 3167 if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) { 3168 ZONE_UNLOCK(zone); 3169 goto zfree_start; 3170 } 3171 #ifdef UMA_XDOMAIN 3172 if (domain != itemdomain) 3173 cache->uc_crossbucket = NULL; 3174 else 3175 #endif 3176 cache->uc_freebucket = NULL; 3177 /* We are no longer associated with this CPU.
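 *
 * The bucket detached from the per-CPU cache above (if any) is now
 * private to this thread, so it can be handed to the zone or drained
 * without the protection of the critical section.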
*/ 3178 critical_exit(); 3179 3180 #ifdef UMA_XDOMAIN 3181 if (domain != itemdomain) { 3182 if (bucket != NULL) { 3183 zone->uz_xdomain += bucket->ub_cnt; 3184 if (vm_ndomains > 2 || 3185 zone->uz_bkt_count >= zone->uz_bkt_max) { 3186 ZONE_UNLOCK(zone); 3187 bucket_drain(zone, bucket); 3188 bucket_free(zone, bucket, udata); 3189 } else { 3190 zdom = &zone->uz_domain[itemdomain]; 3191 zone_put_bucket(zone, zdom, bucket, true); 3192 ZONE_UNLOCK(zone); 3193 } 3194 } else 3195 ZONE_UNLOCK(zone); 3196 bucket = bucket_alloc(zone, udata, M_NOWAIT); 3197 if (bucket == NULL) 3198 goto zfree_item; 3199 critical_enter(); 3200 cpu = curcpu; 3201 cache = &zone->uz_cpu[cpu]; 3202 if (cache->uc_crossbucket == NULL) { 3203 cache->uc_crossbucket = bucket; 3204 goto zfree_start; 3205 } 3206 critical_exit(); 3207 bucket_free(zone, bucket, udata); 3208 goto zfree_restart; 3209 } 3210 #endif 3211 3212 if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) { 3213 zdom = &zone->uz_domain[domain]; 3214 } else { 3215 domain = 0; 3216 zdom = &zone->uz_domain[0]; 3217 } 3218 3219 /* Can we throw this on the zone full list? */ 3220 if (bucket != NULL) { 3221 CTR3(KTR_UMA, 3222 "uma_zfree: zone %s(%p) putting bucket %p on free list", 3223 zone->uz_name, zone, bucket); 3224 /* ub_cnt is pointing to the last free item */ 3225 KASSERT(bucket->ub_cnt == bucket->ub_entries, 3226 ("uma_zfree: Attempting to insert not full bucket onto the full list.\n")); 3227 if (zone->uz_bkt_count >= zone->uz_bkt_max) { 3228 ZONE_UNLOCK(zone); 3229 bucket_drain(zone, bucket); 3230 bucket_free(zone, bucket, udata); 3231 goto zfree_restart; 3232 } else 3233 zone_put_bucket(zone, zdom, bucket, true); 3234 } 3235 3236 /* 3237 * We bump the uz count when the cache size is insufficient to 3238 * handle the working set. 3239 */ 3240 if (lockfail && zone->uz_count < zone->uz_count_max) 3241 zone->uz_count++; 3242 ZONE_UNLOCK(zone); 3243 3244 bucket = bucket_alloc(zone, udata, M_NOWAIT); 3245 CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p", 3246 zone->uz_name, zone, bucket); 3247 if (bucket) { 3248 critical_enter(); 3249 cpu = curcpu; 3250 cache = &zone->uz_cpu[cpu]; 3251 if (cache->uc_freebucket == NULL && 3252 ((zone->uz_flags & UMA_ZONE_NUMA) == 0 || 3253 domain == PCPU_GET(domain))) { 3254 cache->uc_freebucket = bucket; 3255 goto zfree_start; 3256 } 3257 /* 3258 * We lost the race, start over. We have to drop our 3259 * critical section to free the bucket. 3260 */ 3261 critical_exit(); 3262 bucket_free(zone, bucket, udata); 3263 goto zfree_restart; 3264 } 3265 3266 /* 3267 * If nothing else caught this, we'll just do an internal free. 3268 */ 3269 zfree_item: 3270 zone_free_item(zone, item, udata, SKIP_DTOR); 3271 } 3272 3273 void 3274 uma_zfree_domain(uma_zone_t zone, void *item, void *udata) 3275 { 3276 3277 /* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */ 3278 random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA); 3279 3280 CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread, 3281 zone->uz_name); 3282 3283 KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(), 3284 ("uma_zfree_domain: called with spinlock or critical section held")); 3285 3286 /* uma_zfree(..., NULL) does nothing, to match free(9). 
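 *
 * Illustrative sketch only (hypothetical "foo" names): this lets
 * error and cleanup paths free unconditionally, e.g.
 *
 *	foo = uma_zalloc(foo_zone, M_NOWAIT);
 *	...
 *	uma_zfree(foo_zone, foo);	(harmless even when foo is NULL)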
*/ 3287 if (item == NULL) 3288 return; 3289 zone_free_item(zone, item, udata, SKIP_NONE); 3290 } 3291 3292 static void 3293 slab_free_item(uma_zone_t zone, uma_slab_t slab, void *item) 3294 { 3295 uma_keg_t keg; 3296 uma_domain_t dom; 3297 uint8_t freei; 3298 3299 keg = zone->uz_keg; 3300 MPASS(zone->uz_lockptr == &keg->uk_lock); 3301 KEG_LOCK_ASSERT(keg); 3302 MPASS(keg == slab->us_keg); 3303 3304 dom = &keg->uk_domain[slab->us_domain]; 3305 3306 /* Do we need to remove from any lists? */ 3307 if (slab->us_freecount+1 == keg->uk_ipers) { 3308 LIST_REMOVE(slab, us_link); 3309 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link); 3310 } else if (slab->us_freecount == 0) { 3311 LIST_REMOVE(slab, us_link); 3312 LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link); 3313 } 3314 3315 /* Slab management. */ 3316 freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize; 3317 BIT_SET(SLAB_SETSIZE, freei, &slab->us_free); 3318 slab->us_freecount++; 3319 3320 /* Keg statistics. */ 3321 keg->uk_free++; 3322 } 3323 3324 static void 3325 zone_release(uma_zone_t zone, void **bucket, int cnt) 3326 { 3327 void *item; 3328 uma_slab_t slab; 3329 uma_keg_t keg; 3330 uint8_t *mem; 3331 int i; 3332 3333 keg = zone->uz_keg; 3334 KEG_LOCK(keg); 3335 for (i = 0; i < cnt; i++) { 3336 item = bucket[i]; 3337 if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) { 3338 mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK)); 3339 if (zone->uz_flags & UMA_ZONE_HASH) { 3340 slab = hash_sfind(&keg->uk_hash, mem); 3341 } else { 3342 mem += keg->uk_pgoff; 3343 slab = (uma_slab_t)mem; 3344 } 3345 } else { 3346 slab = vtoslab((vm_offset_t)item); 3347 MPASS(slab->us_keg == keg); 3348 } 3349 slab_free_item(zone, slab, item); 3350 } 3351 KEG_UNLOCK(keg); 3352 } 3353 3354 /* 3355 * Frees a single item to any zone. 3356 * 3357 * Arguments: 3358 * zone The zone to free to 3359 * item The item we're freeing 3360 * udata User supplied data for the dtor 3361 * skip Skip dtors and finis 3362 */ 3363 static void 3364 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip) 3365 { 3366 #ifdef INVARIANTS 3367 bool skipdbg; 3368 3369 skipdbg = uma_dbg_zskip(zone, item); 3370 if (skip == SKIP_NONE && !skipdbg) { 3371 if (zone->uz_flags & UMA_ZONE_MALLOC) 3372 uma_dbg_free(zone, udata, item); 3373 else 3374 uma_dbg_free(zone, NULL, item); 3375 } 3376 3377 if (skip < SKIP_DTOR && zone->uz_dtor != NULL && 3378 (!skipdbg || zone->uz_dtor != trash_dtor || 3379 zone->uz_ctor != trash_ctor)) 3380 #else 3381 if (skip < SKIP_DTOR && zone->uz_dtor != NULL) 3382 #endif 3383 zone->uz_dtor(item, zone->uz_size, udata); 3384 3385 if (skip < SKIP_FINI && zone->uz_fini) 3386 zone->uz_fini(item, zone->uz_size); 3387 3388 zone->uz_release(zone->uz_arg, &item, 1); 3389 3390 if (skip & SKIP_CNT) 3391 return; 3392 3393 counter_u64_add(zone->uz_frees, 1); 3394 3395 if (zone->uz_max_items > 0) { 3396 ZONE_LOCK(zone); 3397 zone->uz_items--; 3398 if (zone->uz_sleepers > 0 && 3399 zone->uz_items < zone->uz_max_items) 3400 wakeup_one(zone); 3401 ZONE_UNLOCK(zone); 3402 } 3403 } 3404 3405 /* See uma.h */ 3406 int 3407 uma_zone_set_max(uma_zone_t zone, int nitems) 3408 { 3409 struct uma_bucket_zone *ubz; 3410 3411 /* 3412 * If limit is very low we may need to limit how 3413 * much items are allowed in CPU caches. 
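 *
 * The loop below walks bucket_zones from smallest to largest and keeps
 * the largest bucket size whose worst case of two full buckets on
 * every CPU still fits under the requested limit; if even the smallest
 * size does not fit, the limit itself is raised to cover it.  As a
 * purely hypothetical example, with 4 CPUs and a smallest bucket of 2
 * entries, any limit below 2 * 2 * 4 = 16 items is rounded up to 16.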
3414 */ 3415 ubz = &bucket_zones[0]; 3416 for (; ubz->ubz_entries != 0; ubz++) 3417 if (ubz->ubz_entries * 2 * mp_ncpus > nitems) 3418 break; 3419 if (ubz == &bucket_zones[0]) 3420 nitems = ubz->ubz_entries * 2 * mp_ncpus; 3421 else 3422 ubz--; 3423 3424 ZONE_LOCK(zone); 3425 zone->uz_count_max = zone->uz_count = ubz->ubz_entries; 3426 if (zone->uz_count_min > zone->uz_count_max) 3427 zone->uz_count_min = zone->uz_count_max; 3428 zone->uz_max_items = nitems; 3429 ZONE_UNLOCK(zone); 3430 3431 return (nitems); 3432 } 3433 3434 /* See uma.h */ 3435 int 3436 uma_zone_set_maxcache(uma_zone_t zone, int nitems) 3437 { 3438 3439 ZONE_LOCK(zone); 3440 zone->uz_bkt_max = nitems; 3441 ZONE_UNLOCK(zone); 3442 3443 return (nitems); 3444 } 3445 3446 /* See uma.h */ 3447 int 3448 uma_zone_get_max(uma_zone_t zone) 3449 { 3450 int nitems; 3451 3452 ZONE_LOCK(zone); 3453 nitems = zone->uz_max_items; 3454 ZONE_UNLOCK(zone); 3455 3456 return (nitems); 3457 } 3458 3459 /* See uma.h */ 3460 void 3461 uma_zone_set_warning(uma_zone_t zone, const char *warning) 3462 { 3463 3464 ZONE_LOCK(zone); 3465 zone->uz_warning = warning; 3466 ZONE_UNLOCK(zone); 3467 } 3468 3469 /* See uma.h */ 3470 void 3471 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction) 3472 { 3473 3474 ZONE_LOCK(zone); 3475 TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone); 3476 ZONE_UNLOCK(zone); 3477 } 3478 3479 /* See uma.h */ 3480 int 3481 uma_zone_get_cur(uma_zone_t zone) 3482 { 3483 int64_t nitems; 3484 u_int i; 3485 3486 ZONE_LOCK(zone); 3487 nitems = counter_u64_fetch(zone->uz_allocs) - 3488 counter_u64_fetch(zone->uz_frees); 3489 CPU_FOREACH(i) { 3490 /* 3491 * See the comment in uma_vm_zone_stats() regarding the 3492 * safety of accessing the per-cpu caches. With the zone lock 3493 * held, it is safe, but can potentially result in stale data. 3494 */ 3495 nitems += zone->uz_cpu[i].uc_allocs - 3496 zone->uz_cpu[i].uc_frees; 3497 } 3498 ZONE_UNLOCK(zone); 3499 3500 return (nitems < 0 ? 
0 : nitems); 3501 } 3502 3503 /* See uma.h */ 3504 void 3505 uma_zone_set_init(uma_zone_t zone, uma_init uminit) 3506 { 3507 uma_keg_t keg; 3508 3509 KEG_GET(zone, keg); 3510 KEG_LOCK(keg); 3511 KASSERT(keg->uk_pages == 0, 3512 ("uma_zone_set_init on non-empty keg")); 3513 keg->uk_init = uminit; 3514 KEG_UNLOCK(keg); 3515 } 3516 3517 /* See uma.h */ 3518 void 3519 uma_zone_set_fini(uma_zone_t zone, uma_fini fini) 3520 { 3521 uma_keg_t keg; 3522 3523 KEG_GET(zone, keg); 3524 KEG_LOCK(keg); 3525 KASSERT(keg->uk_pages == 0, 3526 ("uma_zone_set_fini on non-empty keg")); 3527 keg->uk_fini = fini; 3528 KEG_UNLOCK(keg); 3529 } 3530 3531 /* See uma.h */ 3532 void 3533 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) 3534 { 3535 3536 ZONE_LOCK(zone); 3537 KASSERT(zone->uz_keg->uk_pages == 0, 3538 ("uma_zone_set_zinit on non-empty keg")); 3539 zone->uz_init = zinit; 3540 ZONE_UNLOCK(zone); 3541 } 3542 3543 /* See uma.h */ 3544 void 3545 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) 3546 { 3547 3548 ZONE_LOCK(zone); 3549 KASSERT(zone->uz_keg->uk_pages == 0, 3550 ("uma_zone_set_zfini on non-empty keg")); 3551 zone->uz_fini = zfini; 3552 ZONE_UNLOCK(zone); 3553 } 3554 3555 /* See uma.h */ 3556 /* XXX uk_freef is not actually used with the zone locked */ 3557 void 3558 uma_zone_set_freef(uma_zone_t zone, uma_free freef) 3559 { 3560 uma_keg_t keg; 3561 3562 KEG_GET(zone, keg); 3563 KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type")); 3564 KEG_LOCK(keg); 3565 keg->uk_freef = freef; 3566 KEG_UNLOCK(keg); 3567 } 3568 3569 /* See uma.h */ 3570 /* XXX uk_allocf is not actually used with the zone locked */ 3571 void 3572 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) 3573 { 3574 uma_keg_t keg; 3575 3576 KEG_GET(zone, keg); 3577 KEG_LOCK(keg); 3578 keg->uk_allocf = allocf; 3579 KEG_UNLOCK(keg); 3580 } 3581 3582 /* See uma.h */ 3583 void 3584 uma_zone_reserve(uma_zone_t zone, int items) 3585 { 3586 uma_keg_t keg; 3587 3588 KEG_GET(zone, keg); 3589 KEG_LOCK(keg); 3590 keg->uk_reserve = items; 3591 KEG_UNLOCK(keg); 3592 } 3593 3594 /* See uma.h */ 3595 int 3596 uma_zone_reserve_kva(uma_zone_t zone, int count) 3597 { 3598 uma_keg_t keg; 3599 vm_offset_t kva; 3600 u_int pages; 3601 3602 KEG_GET(zone, keg); 3603 3604 pages = count / keg->uk_ipers; 3605 if (pages * keg->uk_ipers < count) 3606 pages++; 3607 pages *= keg->uk_ppera; 3608 3609 #ifdef UMA_MD_SMALL_ALLOC 3610 if (keg->uk_ppera > 1) { 3611 #else 3612 if (1) { 3613 #endif 3614 kva = kva_alloc((vm_size_t)pages * PAGE_SIZE); 3615 if (kva == 0) 3616 return (0); 3617 } else 3618 kva = 0; 3619 3620 ZONE_LOCK(zone); 3621 MPASS(keg->uk_kva == 0); 3622 keg->uk_kva = kva; 3623 keg->uk_offset = 0; 3624 zone->uz_max_items = pages * keg->uk_ipers; 3625 #ifdef UMA_MD_SMALL_ALLOC 3626 keg->uk_allocf = (keg->uk_ppera > 1) ? 
noobj_alloc : uma_small_alloc; 3627 #else 3628 keg->uk_allocf = noobj_alloc; 3629 #endif 3630 keg->uk_flags |= UMA_ZONE_NOFREE; 3631 ZONE_UNLOCK(zone); 3632 3633 return (1); 3634 } 3635 3636 /* See uma.h */ 3637 void 3638 uma_prealloc(uma_zone_t zone, int items) 3639 { 3640 struct vm_domainset_iter di; 3641 uma_domain_t dom; 3642 uma_slab_t slab; 3643 uma_keg_t keg; 3644 int aflags, domain, slabs; 3645 3646 KEG_GET(zone, keg); 3647 KEG_LOCK(keg); 3648 slabs = items / keg->uk_ipers; 3649 if (slabs * keg->uk_ipers < items) 3650 slabs++; 3651 while (slabs-- > 0) { 3652 aflags = M_NOWAIT; 3653 vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain, 3654 &aflags); 3655 for (;;) { 3656 slab = keg_alloc_slab(keg, zone, domain, M_WAITOK, 3657 aflags); 3658 if (slab != NULL) { 3659 MPASS(slab->us_keg == keg); 3660 dom = &keg->uk_domain[slab->us_domain]; 3661 LIST_INSERT_HEAD(&dom->ud_free_slab, slab, 3662 us_link); 3663 break; 3664 } 3665 KEG_LOCK(keg); 3666 if (vm_domainset_iter_policy(&di, &domain) != 0) { 3667 KEG_UNLOCK(keg); 3668 vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask); 3669 KEG_LOCK(keg); 3670 } 3671 } 3672 } 3673 KEG_UNLOCK(keg); 3674 } 3675 3676 /* See uma.h */ 3677 static void 3678 uma_reclaim_locked(bool kmem_danger) 3679 { 3680 3681 CTR0(KTR_UMA, "UMA: vm asked us to release pages!"); 3682 sx_assert(&uma_drain_lock, SA_XLOCKED); 3683 bucket_enable(); 3684 zone_foreach(zone_drain); 3685 if (vm_page_count_min() || kmem_danger) { 3686 cache_drain_safe(NULL); 3687 zone_foreach(zone_drain); 3688 } 3689 3690 /* 3691 * Some slabs may have been freed but this zone will be visited early 3692 * we visit again so that we can free pages that are empty once other 3693 * zones are drained. We have to do the same for buckets. 3694 */ 3695 zone_drain(slabzone); 3696 bucket_zone_drain(); 3697 } 3698 3699 void 3700 uma_reclaim(void) 3701 { 3702 3703 sx_xlock(&uma_drain_lock); 3704 uma_reclaim_locked(false); 3705 sx_xunlock(&uma_drain_lock); 3706 } 3707 3708 static volatile int uma_reclaim_needed; 3709 3710 void 3711 uma_reclaim_wakeup(void) 3712 { 3713 3714 if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0) 3715 wakeup(uma_reclaim); 3716 } 3717 3718 void 3719 uma_reclaim_worker(void *arg __unused) 3720 { 3721 3722 for (;;) { 3723 sx_xlock(&uma_drain_lock); 3724 while (atomic_load_int(&uma_reclaim_needed) == 0) 3725 sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl", 3726 hz); 3727 sx_xunlock(&uma_drain_lock); 3728 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM); 3729 sx_xlock(&uma_drain_lock); 3730 uma_reclaim_locked(true); 3731 atomic_store_int(&uma_reclaim_needed, 0); 3732 sx_xunlock(&uma_drain_lock); 3733 /* Don't fire more than once per-second. */ 3734 pause("umarclslp", hz); 3735 } 3736 } 3737 3738 /* See uma.h */ 3739 int 3740 uma_zone_exhausted(uma_zone_t zone) 3741 { 3742 int full; 3743 3744 ZONE_LOCK(zone); 3745 full = zone->uz_sleepers > 0; 3746 ZONE_UNLOCK(zone); 3747 return (full); 3748 } 3749 3750 int 3751 uma_zone_exhausted_nolock(uma_zone_t zone) 3752 { 3753 return (zone->uz_sleepers > 0); 3754 } 3755 3756 void * 3757 uma_large_malloc_domain(vm_size_t size, int domain, int wait) 3758 { 3759 struct domainset *policy; 3760 vm_offset_t addr; 3761 uma_slab_t slab; 3762 3763 if (domain != UMA_ANYDOMAIN) { 3764 /* avoid allocs targeting empty domains */ 3765 if (VM_DOMAIN_EMPTY(domain)) 3766 domain = UMA_ANYDOMAIN; 3767 } 3768 slab = zone_alloc_item(slabzone, NULL, domain, wait); 3769 if (slab == NULL) 3770 return (NULL); 3771 policy = (domain == UMA_ANYDOMAIN) ? 
DOMAINSET_RR() : 3772 DOMAINSET_FIXED(domain); 3773 addr = kmem_malloc_domainset(policy, size, wait); 3774 if (addr != 0) { 3775 vsetslab(addr, slab); 3776 slab->us_data = (void *)addr; 3777 slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC; 3778 slab->us_size = size; 3779 slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE( 3780 pmap_kextract(addr))); 3781 uma_total_inc(size); 3782 } else { 3783 zone_free_item(slabzone, slab, NULL, SKIP_NONE); 3784 } 3785 3786 return ((void *)addr); 3787 } 3788 3789 void * 3790 uma_large_malloc(vm_size_t size, int wait) 3791 { 3792 3793 return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait); 3794 } 3795 3796 void 3797 uma_large_free(uma_slab_t slab) 3798 { 3799 3800 KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0, 3801 ("uma_large_free: Memory not allocated with uma_large_malloc.")); 3802 kmem_free((vm_offset_t)slab->us_data, slab->us_size); 3803 uma_total_dec(slab->us_size); 3804 zone_free_item(slabzone, slab, NULL, SKIP_NONE); 3805 } 3806 3807 static void 3808 uma_zero_item(void *item, uma_zone_t zone) 3809 { 3810 3811 bzero(item, zone->uz_size); 3812 } 3813 3814 unsigned long 3815 uma_limit(void) 3816 { 3817 3818 return (uma_kmem_limit); 3819 } 3820 3821 void 3822 uma_set_limit(unsigned long limit) 3823 { 3824 3825 uma_kmem_limit = limit; 3826 } 3827 3828 unsigned long 3829 uma_size(void) 3830 { 3831 3832 return (atomic_load_long(&uma_kmem_total)); 3833 } 3834 3835 long 3836 uma_avail(void) 3837 { 3838 3839 return (uma_kmem_limit - uma_size()); 3840 } 3841 3842 void 3843 uma_print_stats(void) 3844 { 3845 zone_foreach(uma_print_zone); 3846 } 3847 3848 static void 3849 slab_print(uma_slab_t slab) 3850 { 3851 printf("slab: keg %p, data %p, freecount %d\n", 3852 slab->us_keg, slab->us_data, slab->us_freecount); 3853 } 3854 3855 static void 3856 cache_print(uma_cache_t cache) 3857 { 3858 printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)j\n", 3859 cache->uc_allocbucket, 3860 cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, 3861 cache->uc_freebucket, 3862 cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0, 3863 cache->uc_crossbucket, 3864 cache->uc_crossbucket?cache->uc_crossbucket->ub_cnt:0); 3865 } 3866 3867 static void 3868 uma_print_keg(uma_keg_t keg) 3869 { 3870 uma_domain_t dom; 3871 uma_slab_t slab; 3872 int i; 3873 3874 printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d " 3875 "out %d free %d\n", 3876 keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags, 3877 keg->uk_ipers, keg->uk_ppera, 3878 (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free, 3879 keg->uk_free); 3880 for (i = 0; i < vm_ndomains; i++) { 3881 dom = &keg->uk_domain[i]; 3882 printf("Part slabs:\n"); 3883 LIST_FOREACH(slab, &dom->ud_part_slab, us_link) 3884 slab_print(slab); 3885 printf("Free slabs:\n"); 3886 LIST_FOREACH(slab, &dom->ud_free_slab, us_link) 3887 slab_print(slab); 3888 printf("Full slabs:\n"); 3889 LIST_FOREACH(slab, &dom->ud_full_slab, us_link) 3890 slab_print(slab); 3891 } 3892 } 3893 3894 void 3895 uma_print_zone(uma_zone_t zone) 3896 { 3897 uma_cache_t cache; 3898 int i; 3899 3900 printf("zone: %s(%p) size %d maxitems %ju flags %#x\n", 3901 zone->uz_name, zone, zone->uz_size, (uintmax_t)zone->uz_max_items, 3902 zone->uz_flags); 3903 if (zone->uz_lockptr != &zone->uz_lock) 3904 uma_print_keg(zone->uz_keg); 3905 CPU_FOREACH(i) { 3906 cache = &zone->uz_cpu[i]; 3907 printf("CPU %d Cache:\n", i); 3908 cache_print(cache); 3909 } 3910 } 3911 3912 #ifdef DDB 3913 /* 3914 * Generate statistics across both the zone and its per-cpu cache's. 
Return 3915 * desired statistics if the pointer is non-NULL for that statistic. 3916 * 3917 * Note: does not update the zone statistics, as it can't safely clear the 3918 * per-CPU cache statistic. 3919 * 3920 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't 3921 * safe from off-CPU; we should modify the caches to track this information 3922 * directly so that we don't have to. 3923 */ 3924 static void 3925 uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp, 3926 uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp) 3927 { 3928 uma_cache_t cache; 3929 uint64_t allocs, frees, sleeps, xdomain; 3930 int cachefree, cpu; 3931 3932 allocs = frees = sleeps = xdomain = 0; 3933 cachefree = 0; 3934 CPU_FOREACH(cpu) { 3935 cache = &z->uz_cpu[cpu]; 3936 if (cache->uc_allocbucket != NULL) 3937 cachefree += cache->uc_allocbucket->ub_cnt; 3938 if (cache->uc_freebucket != NULL) 3939 cachefree += cache->uc_freebucket->ub_cnt; 3940 if (cache->uc_crossbucket != NULL) { 3941 xdomain += cache->uc_crossbucket->ub_cnt; 3942 cachefree += cache->uc_crossbucket->ub_cnt; 3943 } 3944 allocs += cache->uc_allocs; 3945 frees += cache->uc_frees; 3946 } 3947 allocs += counter_u64_fetch(z->uz_allocs); 3948 frees += counter_u64_fetch(z->uz_frees); 3949 sleeps += z->uz_sleeps; 3950 xdomain += z->uz_xdomain; 3951 if (cachefreep != NULL) 3952 *cachefreep = cachefree; 3953 if (allocsp != NULL) 3954 *allocsp = allocs; 3955 if (freesp != NULL) 3956 *freesp = frees; 3957 if (sleepsp != NULL) 3958 *sleepsp = sleeps; 3959 if (xdomainp != NULL) 3960 *xdomainp = xdomain; 3961 } 3962 #endif /* DDB */ 3963 3964 static int 3965 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) 3966 { 3967 uma_keg_t kz; 3968 uma_zone_t z; 3969 int count; 3970 3971 count = 0; 3972 rw_rlock(&uma_rwlock); 3973 LIST_FOREACH(kz, &uma_kegs, uk_link) { 3974 LIST_FOREACH(z, &kz->uk_zones, uz_link) 3975 count++; 3976 } 3977 LIST_FOREACH(z, &uma_cachezones, uz_link) 3978 count++; 3979 3980 rw_runlock(&uma_rwlock); 3981 return (sysctl_handle_int(oidp, &count, 0, req)); 3982 } 3983 3984 static void 3985 uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf, 3986 struct uma_percpu_stat *ups, bool internal) 3987 { 3988 uma_zone_domain_t zdom; 3989 uma_cache_t cache; 3990 int i; 3991 3992 3993 for (i = 0; i < vm_ndomains; i++) { 3994 zdom = &z->uz_domain[i]; 3995 uth->uth_zone_free += zdom->uzd_nitems; 3996 } 3997 uth->uth_allocs = counter_u64_fetch(z->uz_allocs); 3998 uth->uth_frees = counter_u64_fetch(z->uz_frees); 3999 uth->uth_fails = counter_u64_fetch(z->uz_fails); 4000 uth->uth_sleeps = z->uz_sleeps; 4001 uth->uth_xdomain = z->uz_xdomain; 4002 /* 4003 * While it is not normally safe to access the cache 4004 * bucket pointers while not on the CPU that owns the 4005 * cache, we only allow the pointers to be exchanged 4006 * without the zone lock held, not invalidated, so 4007 * accept the possible race associated with bucket 4008 * exchange during monitoring. 
4009 */ 4010 for (i = 0; i < mp_maxid + 1; i++) { 4011 bzero(&ups[i], sizeof(*ups)); 4012 if (internal || CPU_ABSENT(i)) 4013 continue; 4014 cache = &z->uz_cpu[i]; 4015 if (cache->uc_allocbucket != NULL) 4016 ups[i].ups_cache_free += 4017 cache->uc_allocbucket->ub_cnt; 4018 if (cache->uc_freebucket != NULL) 4019 ups[i].ups_cache_free += 4020 cache->uc_freebucket->ub_cnt; 4021 if (cache->uc_crossbucket != NULL) 4022 ups[i].ups_cache_free += 4023 cache->uc_crossbucket->ub_cnt; 4024 ups[i].ups_allocs = cache->uc_allocs; 4025 ups[i].ups_frees = cache->uc_frees; 4026 } 4027 } 4028 4029 static int 4030 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) 4031 { 4032 struct uma_stream_header ush; 4033 struct uma_type_header uth; 4034 struct uma_percpu_stat *ups; 4035 struct sbuf sbuf; 4036 uma_keg_t kz; 4037 uma_zone_t z; 4038 int count, error, i; 4039 4040 error = sysctl_wire_old_buffer(req, 0); 4041 if (error != 0) 4042 return (error); 4043 sbuf_new_for_sysctl(&sbuf, NULL, 128, req); 4044 sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL); 4045 ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK); 4046 4047 count = 0; 4048 rw_rlock(&uma_rwlock); 4049 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4050 LIST_FOREACH(z, &kz->uk_zones, uz_link) 4051 count++; 4052 } 4053 4054 LIST_FOREACH(z, &uma_cachezones, uz_link) 4055 count++; 4056 4057 /* 4058 * Insert stream header. 4059 */ 4060 bzero(&ush, sizeof(ush)); 4061 ush.ush_version = UMA_STREAM_VERSION; 4062 ush.ush_maxcpus = (mp_maxid + 1); 4063 ush.ush_count = count; 4064 (void)sbuf_bcat(&sbuf, &ush, sizeof(ush)); 4065 4066 LIST_FOREACH(kz, &uma_kegs, uk_link) { 4067 LIST_FOREACH(z, &kz->uk_zones, uz_link) { 4068 bzero(&uth, sizeof(uth)); 4069 ZONE_LOCK(z); 4070 strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); 4071 uth.uth_align = kz->uk_align; 4072 uth.uth_size = kz->uk_size; 4073 uth.uth_rsize = kz->uk_rsize; 4074 if (z->uz_max_items > 0) 4075 uth.uth_pages = (z->uz_items / kz->uk_ipers) * 4076 kz->uk_ppera; 4077 else 4078 uth.uth_pages = kz->uk_pages; 4079 uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) * 4080 kz->uk_ppera; 4081 uth.uth_limit = z->uz_max_items; 4082 uth.uth_keg_free = z->uz_keg->uk_free; 4083 4084 /* 4085 * A zone is secondary if it is not the first entry 4086 * on the keg's zone list.

int
sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
{
        uma_zone_t zone = *(uma_zone_t *)arg1;
        int error, max;

        max = uma_zone_get_max(zone);
        error = sysctl_handle_int(oidp, &max, 0, req);
        if (error || !req->newptr)
                return (error);

        uma_zone_set_max(zone, max);

        return (0);
}

int
sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
{
        uma_zone_t zone = *(uma_zone_t *)arg1;
        int cur;

        cur = uma_zone_get_cur(zone);
        return (sysctl_handle_int(oidp, &cur, 0, req));
}
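
/*
 * Illustrative sketch, not part of the original code: callers elsewhere in
 * the kernel hook these handlers up with SYSCTL_PROC(), passing the address
 * of their uma_zone_t variable as arg1 so the handlers can dereference it.
 * The zone, node names and descriptions below are hypothetical.
 *
 *      static uma_zone_t foo_zone;
 *
 *      SYSCTL_PROC(_kern, OID_AUTO, foo_max, CTLTYPE_INT | CTLFLAG_RW,
 *          &foo_zone, 0, sysctl_handle_uma_zone_max, "I",
 *          "Maximum number of foo items");
 *      SYSCTL_PROC(_kern, OID_AUTO, foo_inuse, CTLTYPE_INT | CTLFLAG_RD,
 *          &foo_zone, 0, sysctl_handle_uma_zone_cur, "I",
 *          "Current number of foo items");
 */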

#ifdef INVARIANTS
static uma_slab_t
uma_dbg_getslab(uma_zone_t zone, void *item)
{
        uma_slab_t slab;
        uma_keg_t keg;
        uint8_t *mem;

        mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
        if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
                slab = vtoslab((vm_offset_t)mem);
        } else {
                /*
                 * It is safe to return the slab here even though the
                 * zone is unlocked because the item's allocation state
                 * essentially holds a reference.
                 */
                if (zone->uz_lockptr == &zone->uz_lock)
                        return (NULL);
                ZONE_LOCK(zone);
                keg = zone->uz_keg;
                if (keg->uk_flags & UMA_ZONE_HASH)
                        slab = hash_sfind(&keg->uk_hash, mem);
                else
                        slab = (uma_slab_t)(mem + keg->uk_pgoff);
                ZONE_UNLOCK(zone);
        }

        return (slab);
}

/*
 * Returns true if the debug checks should be skipped for this zone's items.
 * Zones without a keg (cache zones) are always skipped.
 */
static bool
uma_dbg_zskip(uma_zone_t zone, void *mem)
{

        if (zone->uz_lockptr == &zone->uz_lock)
                return (true);

        return (uma_dbg_kskip(zone->uz_keg, mem));
}

/*
 * Returns true if the debug checks should be skipped for this item.  Only
 * one out of every dbg_divisor items is checked; a divisor of 0 disables the
 * checks entirely and a divisor of 1 checks every item.
 */
static bool
uma_dbg_kskip(uma_keg_t keg, void *mem)
{
        uintptr_t idx;

        if (dbg_divisor == 0)
                return (true);

        if (dbg_divisor == 1)
                return (false);

        idx = (uintptr_t)mem >> PAGE_SHIFT;
        if (keg->uk_ipers > 1) {
                idx *= keg->uk_ipers;
                idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
        }

        if ((idx / dbg_divisor) * dbg_divisor != idx) {
                counter_u64_add(uma_skip_cnt, 1);
                return (true);
        }
        counter_u64_add(uma_dbg_cnt, 1);

        return (false);
}

/*
 * Set up the slab's freei data such that uma_dbg_free can function.
 */
static void
uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
{
        uma_keg_t keg;
        int freei;

        if (slab == NULL) {
                slab = uma_dbg_getslab(zone, item);
                if (slab == NULL)
                        panic("uma: item %p did not belong to zone %s\n",
                            item, zone->uz_name);
        }
        keg = slab->us_keg;
        freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;

        if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
                panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
                    item, zone, zone->uz_name, slab, freei);
        BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
}

/*
 * Verifies freed addresses.  Checks for alignment, valid slab membership
 * and duplicate frees.
 */
static void
uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
{
        uma_keg_t keg;
        int freei;

        if (slab == NULL) {
                slab = uma_dbg_getslab(zone, item);
                if (slab == NULL)
                        panic("uma: Freed item %p did not belong to zone %s\n",
                            item, zone->uz_name);
        }
        keg = slab->us_keg;
        freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;

        if (freei >= keg->uk_ipers)
                panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
                    item, zone, zone->uz_name, slab, freei);

        if (((freei * keg->uk_rsize) + slab->us_data) != item)
                panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
                    item, zone, zone->uz_name, slab, freei);

        if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
                panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
                    item, zone, zone->uz_name, slab, freei);

        BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
}
#endif /* INVARIANTS */
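
/*
 * Worked example for the sampling in uma_dbg_kskip() above (the numbers are
 * made up for illustration): with uk_ipers = 4 and uk_rsize = 1024, an item
 * at page index 10 and page offset 2048 maps to
 * idx = 10 * 4 + 2048 / 1024 = 42.  With dbg_divisor = 8,
 * (42 / 8) * 8 = 40 != 42, so the item is skipped and uma_skip_cnt is
 * bumped; with dbg_divisor = 6, (42 / 6) * 6 = 42, so the item is checked
 * and uma_dbg_cnt is bumped.
 */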

#ifdef DDB
DB_SHOW_COMMAND(uma, db_show_uma)
{
        uma_keg_t kz;
        uma_zone_t z;
        uint64_t allocs, frees, sleeps, xdomain;
        long cachefree;
        int i;

        db_printf("%18s %8s %8s %8s %12s %8s %8s %8s\n", "Zone", "Size", "Used",
            "Free", "Requests", "Sleeps", "Bucket", "XFree");
        LIST_FOREACH(kz, &uma_kegs, uk_link) {
                LIST_FOREACH(z, &kz->uk_zones, uz_link) {
                        if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
                                allocs = counter_u64_fetch(z->uz_allocs);
                                frees = counter_u64_fetch(z->uz_frees);
                                sleeps = z->uz_sleeps;
                                cachefree = 0;
                                /* Don't print an uninitialized value below. */
                                xdomain = 0;
                        } else
                                uma_zone_sumstat(z, &cachefree, &allocs,
                                    &frees, &sleeps, &xdomain);
                        if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
                            (LIST_FIRST(&kz->uk_zones) != z)))
                                cachefree += kz->uk_free;
                        for (i = 0; i < vm_ndomains; i++)
                                cachefree += z->uz_domain[i].uzd_nitems;

                        db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u %8ju\n",
                            z->uz_name, (uintmax_t)kz->uk_size,
                            (intmax_t)(allocs - frees), cachefree,
                            (uintmax_t)allocs, sleeps, z->uz_count, xdomain);
                        if (db_pager_quit)
                                return;
                }
        }
}

DB_SHOW_COMMAND(umacache, db_show_umacache)
{
        uma_zone_t z;
        uint64_t allocs, frees;
        long cachefree;
        int i;

        db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
            "Requests", "Bucket");
        LIST_FOREACH(z, &uma_cachezones, uz_link) {
                uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
                for (i = 0; i < vm_ndomains; i++)
                        cachefree += z->uz_domain[i].uzd_nitems;
                db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
                    z->uz_name, (uintmax_t)z->uz_size,
                    (intmax_t)(allocs - frees), cachefree,
                    (uintmax_t)allocs, z->uz_count);
                if (db_pager_quit)
                        return;
        }
}
#endif /* DDB */
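
/*
 * Usage note, not part of the original code: the commands registered above
 * are reached from the in-kernel debugger prompt, e.g.:
 *
 *      db> show uma
 *      db> show umacache
 *
 * Both honor db_pager_quit, so a long listing can be aborted from the DDB
 * pager.
 */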