/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019, Joyent, Inc.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*
 * For a more complete description of the main ideas, see:
 *
 *	Jeff Bonwick and Jonathan Adams,
 *
 *	Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 *	Arbitrary Resources.
 *
 *	Proceedings of the 2001 Usenix Conference.
 *	Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 *
 * For the "Big Theory Statement", see usr/src/uts/common/os/vmem.c
 *
 * 1. Overview of changes
 * ------------------------------
 * There have been a few changes to vmem in order to support umem.  The
 * main areas are:
 *
 *	* VM_SLEEP unsupported
 *
 *	* Reaping changes
 *
 *	* initialization changes
 *
 *	* _vmem_extend_alloc
 *
 *
 * 2. VM_SLEEP Removed
 * -------------------
 * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for
 * possibly infinite amounts of time, they are not supported in this
 * version of vmem.  Sleep-like behavior can be achieved through
 * UMEM_NOFAIL umem allocations.
 *
 *
 * 3. Reaping changes
 * ------------------
 * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap()
 * can do allocations and frees synchronously.  This is a problem if it
 * occurs during a vmem_populate() allocation.
 *
 * Instead, we delay reaps while populates are active.
 *
 *
 * 4. Initialization changes
 * -------------------------
 * In the kernel, vmem_init() allows you to create a single, top-level arena,
 * which has vmem_internal_arena as a child.  For umem, we want to be able
 * to extend arenas dynamically.  It is much easier to support this if we
 * allow a two-level "heap" arena:
 *
 *	+----------+
 *	|  "fake"  |
 *	+----------+
 *	      |
 *	+----------+
 *	|  "heap"  |
 *	+----------+
 *	  |    \ \
 *	  |     +-+-- ... <other children>
 *	  |
 *	+---------------+
 *	| vmem_internal |
 *	+---------------+
 *	  | | | |
 *	 <children>
 *
 * The new vmem_init() allows you to specify a "parent" of the heap, along
 * with allocation functions.
 *
 *
 * 5. _vmem_extend_alloc
 * ---------------------
 * The other part of extending is _vmem_extend_alloc.  This function allows
 * you to extend (expand current spans, if possible) an arena and allocate
 * a chunk of the newly extended span atomically.  This is needed to support
 * extending the heap while vmem_populate()ing it.
 *
 * In order to increase the usefulness of extending, non-imported spans are
 * sorted in address order.
 */
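/*
 * Illustration only (not part of the implementation): a minimal sketch of
 * how the pieces described above might be wired together by a hypothetical
 * consumer.  The "parent"/"heap" names, pagesize, initial_base/initial_size,
 * and my_parent_alloc()/my_parent_free() are invented for the example; the
 * real wiring is done by vmem_init() and vmem_create() below.
 *
 *	vmem_t *heap = vmem_init("parent", pagesize,
 *	    my_parent_alloc, my_parent_free,
 *	    "heap", initial_base, initial_size, pagesize,
 *	    vmem_alloc, vmem_free);
 *
 *	// VM_SLEEP is not supported here; callers pass VM_NOSLEEP and
 *	// handle NULL, or rely on UMEM_NOFAIL at the umem layer.
 *	void *buf = vmem_alloc(heap, 8192, VM_NOSLEEP);
 *	if (buf != NULL)
 *		vmem_free(heap, buf, 8192);
 */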

#include <sys/vmem_impl_user.h>
#include <alloca.h>
#include <sys/sysmacros.h>
#include <stdio.h>
#include <strings.h>
#include <atomic.h>

#include "vmem_base.h"
#include "umem_base.h"

#define	VMEM_INITIAL		6	/* early vmem arenas */
#define	VMEM_SEG_INITIAL	100	/* early segments */

/*
 * Adding a new span to an arena requires two segment structures: one to
 * represent the span, and one to represent the free segment it contains.
 */
#define	VMEM_SEGS_PER_SPAN_CREATE	2

/*
 * Allocating a piece of an existing segment requires 0-2 segment structures
 * depending on how much of the segment we're allocating.
 *
 * To allocate the entire segment, no new segment structures are needed; we
 * simply move the existing segment structure from the freelist to the
 * allocation hash table.
 *
 * To allocate a piece from the left or right end of the segment, we must
 * split the segment into two pieces (allocated part and remainder), so we
 * need one new segment structure to represent the remainder.
 *
 * To allocate from the middle of a segment, we need two new segment
 * structures to represent the remainders on either side of the allocated
 * part.
 */
#define	VMEM_SEGS_PER_EXACT_ALLOC	0
#define	VMEM_SEGS_PER_LEFT_ALLOC	1
#define	VMEM_SEGS_PER_RIGHT_ALLOC	1
#define	VMEM_SEGS_PER_MIDDLE_ALLOC	2

/*
 * vmem_populate() preallocates segment structures for vmem to do its work.
 * It must preallocate enough for the worst case, which is when we must import
 * a new span and then allocate from the middle of it.
 */
#define	VMEM_SEGS_PER_ALLOC_MAX	\
	(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)

/*
 * The segment structures themselves are allocated from vmem_seg_arena, so
 * we have a recursion problem when vmem_seg_arena needs to populate itself.
 * We address this by working out the maximum number of segment structures
 * this act will require, and multiplying by the maximum number of threads
 * that we'll allow to do it simultaneously.
 *
 * The worst-case segment consumption to populate vmem_seg_arena is as
 * follows (depicted as a stack trace to indicate why events are occurring):
 *
 * vmem_alloc(vmem_seg_arena)		-> 2 segs (span create + exact alloc)
 *  vmem_alloc(vmem_internal_arena)	-> 2 segs (span create + exact alloc)
 *   heap_alloc(heap_arena)
 *    vmem_alloc(heap_arena)		-> 4 seg (span create + alloc)
 *     parent_alloc(parent_arena)
 *	_vmem_extend_alloc(parent_arena) -> 3 seg (span create + left alloc)
 *
 * Note:  The reservation for heap_arena must be 4, since vmem_xalloc()
 * is overly pessimistic on allocations where parent_arena has a stricter
 * alignment than heap_arena.
 *
 * The worst-case consumption for any arena is 4 segment structures.
 * For now, we only support VM_NOSLEEP allocations, so as long as we
 * serialize all vmem_populates, a 4-seg reserve is sufficient.
 */
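/*
 * Worked numbers (illustrative only, not new policy): with a single populate
 * lock, the reserve below works out to 4 * 1 = 4 segment structures per
 * populator arena, and VMEM_MINFREE is that reserve plus the worst-case
 * single allocation (2 for a span create + 2 for a middle alloc), i.e.
 * 4 + 4 = 8 preallocated vmem_seg_t structures per arena.
 */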
#define	VMEM_POPULATE_SEGS_PER_ARENA	4
#define	VMEM_POPULATE_LOCKS		1

#define	VMEM_POPULATE_RESERVE	\
	(VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS)

/*
 * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 * so that it can satisfy the worst-case allocation *and* participate in
 * worst-case allocation from vmem_seg_arena.
 */
#define	VMEM_MINFREE	(VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX)

/* Don't assume new statics are zeroed - see vmem_startup() */
static vmem_t vmem0[VMEM_INITIAL];
static vmem_t *vmem_populator[VMEM_INITIAL];
static uint32_t vmem_id;
static uint32_t vmem_populators;
static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL];
static vmem_seg_t *vmem_segfree;
static mutex_t vmem_list_lock;
static mutex_t vmem_segfree_lock;
static vmem_populate_lock_t vmem_nosleep_lock;
#define	IN_POPULATE()	(vmem_nosleep_lock.vmpl_thr == thr_self())
static vmem_t *vmem_list;
static vmem_t *vmem_internal_arena;
static vmem_t *vmem_seg_arena;
static vmem_t *vmem_hash_arena;
static vmem_t *vmem_vmem_arena;

vmem_t *vmem_heap;
vmem_alloc_t *vmem_heap_alloc;
vmem_free_t *vmem_heap_free;

uint32_t vmem_mtbf;	/* mean time between failures [default: off] */
size_t vmem_seg_size = sizeof (vmem_seg_t);

/*
 * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k').
 */
#define	VMEM_INSERT(vprev, vsp, type)				\
{								\
	vmem_seg_t *vnext = (vprev)->vs_##type##next;		\
	(vsp)->vs_##type##next = (vnext);			\
	(vsp)->vs_##type##prev = (vprev);			\
	(vprev)->vs_##type##next = (vsp);			\
	(vnext)->vs_##type##prev = (vsp);			\
}

#define	VMEM_DELETE(vsp, type)					\
{								\
	vmem_seg_t *vprev = (vsp)->vs_##type##prev;		\
	vmem_seg_t *vnext = (vsp)->vs_##type##next;		\
	(vprev)->vs_##type##next = (vnext);			\
	(vnext)->vs_##type##prev = (vprev);			\
}

/*
 * Get a vmem_seg_t from the global segfree list.
 */
static vmem_seg_t *
vmem_getseg_global(void)
{
	vmem_seg_t *vsp;

	(void) mutex_lock(&vmem_segfree_lock);
	if ((vsp = vmem_segfree) != NULL)
		vmem_segfree = vsp->vs_knext;
	(void) mutex_unlock(&vmem_segfree_lock);

	return (vsp);
}

/*
 * Put a vmem_seg_t on the global segfree list.
 */
static void
vmem_putseg_global(vmem_seg_t *vsp)
{
	(void) mutex_lock(&vmem_segfree_lock);
	vsp->vs_knext = vmem_segfree;
	vmem_segfree = vsp;
	(void) mutex_unlock(&vmem_segfree_lock);
}

/*
 * Get a vmem_seg_t from vmp's segfree list.
 */
static vmem_seg_t *
vmem_getseg(vmem_t *vmp)
{
	vmem_seg_t *vsp;

	ASSERT(vmp->vm_nsegfree > 0);

	vsp = vmp->vm_segfree;
	vmp->vm_segfree = vsp->vs_knext;
	vmp->vm_nsegfree--;

	return (vsp);
}

/*
 * Put a vmem_seg_t on vmp's segfree list.
 */
static void
vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
{
	vsp->vs_knext = vmp->vm_segfree;
	vmp->vm_segfree = vsp;
	vmp->vm_nsegfree++;
}

/*
 * Add vsp to the appropriate freelist.
 */
static void
vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t *vprev;

	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);

	vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1];
	vsp->vs_type = VMEM_FREE;
	vmp->vm_freemap |= VS_SIZE(vprev);
	VMEM_INSERT(vprev, vsp, k);

	(void) cond_broadcast(&vmp->vm_cv);
}

/*
 * Take vsp from the freelist.
 */
static void
vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp)
{
	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
	ASSERT(vsp->vs_type == VMEM_FREE);

	if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) {
		/*
		 * The segments on both sides of 'vsp' are freelist heads,
		 * so taking vsp leaves the freelist at vsp->vs_kprev empty.
		 */
		ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev));
		vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev);
	}
	VMEM_DELETE(vsp, k);
}

/*
 * Add vsp to the allocated-segment hash table and update kstats.
 */
static void
vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t **bucket;

	vsp->vs_type = VMEM_ALLOC;
	bucket = VMEM_HASH(vmp, vsp->vs_start);
	vsp->vs_knext = *bucket;
	*bucket = vsp;

	if (vmem_seg_size == sizeof (vmem_seg_t)) {
		vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack,
		    VMEM_STACK_DEPTH, 0);
		vsp->vs_thread = thr_self();
		vsp->vs_timestamp = gethrtime();
	} else {
		vsp->vs_depth = 0;
	}

	vmp->vm_kstat.vk_alloc++;
	vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp);
}

/*
 * Remove vsp from the allocated-segment hash table and update kstats.
 */
static vmem_seg_t *
vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size)
{
	vmem_seg_t *vsp, **prev_vspp;

	prev_vspp = VMEM_HASH(vmp, addr);
	while ((vsp = *prev_vspp) != NULL) {
		if (vsp->vs_start == addr) {
			*prev_vspp = vsp->vs_knext;
			break;
		}
		vmp->vm_kstat.vk_lookup++;
		prev_vspp = &vsp->vs_knext;
	}

	if (vsp == NULL) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free",
		    vmp, addr, size);
	}
	if (VS_SIZE(vsp) != size) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size "
		    "(expect %lu)", vmp, addr, size, VS_SIZE(vsp));
	}

	vmp->vm_kstat.vk_free++;
	vmp->vm_kstat.vk_mem_inuse -= size;

	return (vsp);
}

/*
 * Create a segment spanning the range [start, end) and add it to the arena.
 */
static vmem_seg_t *
vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end)
{
	vmem_seg_t *newseg = vmem_getseg(vmp);

	newseg->vs_start = start;
	newseg->vs_end = end;
	newseg->vs_type = 0;
	newseg->vs_import = 0;

	VMEM_INSERT(vprev, newseg, a);

	return (newseg);
}

/*
 * Remove segment vsp from the arena.
 */
static void
vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp)
{
	ASSERT(vsp->vs_type != VMEM_ROTOR);
	VMEM_DELETE(vsp, a);

	vmem_putseg(vmp, vsp);
}

/*
 * Add the span [vaddr, vaddr + size) to vmp and update kstats.
 */
static vmem_seg_t *
vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import)
{
	vmem_seg_t *knext;
	vmem_seg_t *newseg, *span;
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;

	knext = &vmp->vm_seg0;
	if (!import && vmp->vm_source_alloc == NULL) {
		vmem_seg_t *kend, *kprev;
		/*
		 * non-imported spans are sorted in address order.  This
		 * makes vmem_extend_unlocked() much more effective.
		 *
		 * We search in reverse order, since new spans are
		 * generally at higher addresses.
		 */
		kend = &vmp->vm_seg0;
		for (kprev = kend->vs_kprev; kprev != kend;
		    kprev = kprev->vs_kprev) {
			if (!kprev->vs_import && (kprev->vs_end - 1) < start)
				break;
		}
		knext = kprev->vs_knext;
	}

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	if ((start | end) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_span_create(%p, %p, %lu): misaligned",
		    vmp, vaddr, size);
	}

	span = vmem_seg_create(vmp, knext->vs_aprev, start, end);
	span->vs_type = VMEM_SPAN;
	span->vs_import = import;
	VMEM_INSERT(knext->vs_kprev, span, k);

	newseg = vmem_seg_create(vmp, span, start, end);
	vmem_freelist_insert(vmp, newseg);

	if (import)
		vmp->vm_kstat.vk_mem_import += size;
	vmp->vm_kstat.vk_mem_total += size;

	return (newseg);
}

/*
 * Remove span vsp from vmp and update kstats.
 */
static void
vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t *span = vsp->vs_aprev;
	size_t size = VS_SIZE(vsp);

	ASSERT(MUTEX_HELD(&vmp->vm_lock));
	ASSERT(span->vs_type == VMEM_SPAN);

	if (span->vs_import)
		vmp->vm_kstat.vk_mem_import -= size;
	vmp->vm_kstat.vk_mem_total -= size;

	VMEM_DELETE(span, k);

	vmem_seg_destroy(vmp, vsp);
	vmem_seg_destroy(vmp, span);
}

/*
 * Allocate the subrange [addr, addr + size) from segment vsp.
 * If there are leftovers on either side, place them on the freelist.
 * Returns a pointer to the segment representing [addr, addr + size).
 */
static vmem_seg_t *
vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size)
{
	uintptr_t vs_start = vsp->vs_start;
	uintptr_t vs_end = vsp->vs_end;
	size_t vs_size = vs_end - vs_start;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	uintptr_t addr_end = addr + realsize;

	ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0);
	ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0);
	ASSERT(vsp->vs_type == VMEM_FREE);
	ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1);
	ASSERT(addr - 1 <= addr_end - 1);

	/*
	 * If we're allocating from the start of the segment, and the
	 * remainder will be on the same freelist, we can save quite
	 * a bit of work.
	 */
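	/*
	 * Illustrative numbers (not taken from the code): with a 4K quantum,
	 * a 48K free segment and an 8K request leave a 40K remainder; both
	 * 48K and 40K share the same high bit (the [32K, 64K) freelist), so
	 * the test below succeeds and we only trim the segment's start.
	 */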
	if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		vsp->vs_start = addr_end;
		vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size);
		vmem_hash_insert(vmp, vsp);
		return (vsp);
	}

	vmem_freelist_delete(vmp, vsp);

	if (vs_end != addr_end)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp, addr_end, vs_end));

	if (vs_start != addr)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr));

	vsp->vs_start = addr;
	vsp->vs_end = addr + size;

	vmem_hash_insert(vmp, vsp);
	return (vsp);
}

/*
 * We cannot reap if we are in the middle of a vmem_populate().
 */
void
vmem_reap(void)
{
	if (!IN_POPULATE())
		umem_reap();
}

/*
 * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures.
 */
static int
vmem_populate(vmem_t *vmp, int vmflag)
{
	char *p;
	vmem_seg_t *vsp;
	ssize_t nseg;
	size_t size;
	vmem_populate_lock_t *lp;
	int i;

	while (vmp->vm_nsegfree < VMEM_MINFREE &&
	    (vsp = vmem_getseg_global()) != NULL)
		vmem_putseg(vmp, vsp);

	if (vmp->vm_nsegfree >= VMEM_MINFREE)
		return (1);

	/*
	 * If we're already populating, tap the reserve.
	 */
	if (vmem_nosleep_lock.vmpl_thr == thr_self()) {
		ASSERT(vmp->vm_cflags & VMC_POPULATOR);
		return (1);
	}

	(void) mutex_unlock(&vmp->vm_lock);

	ASSERT(vmflag & VM_NOSLEEP);	/* we do not allow sleep allocations */
	lp = &vmem_nosleep_lock;

	/*
	 * Cannot be just a mutex_lock(), since that has no effect if
	 * libthread is not linked.
	 */
	(void) mutex_lock(&lp->vmpl_mutex);
	ASSERT(lp->vmpl_thr == 0);
	lp->vmpl_thr = thr_self();

	nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE;
	size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum);
	nseg = size / vmem_seg_size;

	/*
	 * The following vmem_alloc() may need to populate vmem_seg_arena
	 * and all the things it imports from.  When doing so, it will tap
	 * each arena's reserve to prevent recursion (see the block comment
	 * above the definition of VMEM_POPULATE_RESERVE).
	 *
	 * During this allocation, vmem_reap() is a no-op.  If the allocation
	 * fails, we call vmem_reap() after dropping the population lock.
	 */
	p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS);
	if (p == NULL) {
		lp->vmpl_thr = 0;
		(void) mutex_unlock(&lp->vmpl_mutex);
		vmem_reap();

		(void) mutex_lock(&vmp->vm_lock);
		vmp->vm_kstat.vk_populate_fail++;
		return (0);
	}
	/*
	 * Restock the arenas that may have been depleted during population.
	 */
	for (i = 0; i < vmem_populators; i++) {
		(void) mutex_lock(&vmem_populator[i]->vm_lock);
		while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE)
			vmem_putseg(vmem_populator[i],
			    (vmem_seg_t *)(p + --nseg * vmem_seg_size));
		(void) mutex_unlock(&vmem_populator[i]->vm_lock);
	}

	lp->vmpl_thr = 0;
	(void) mutex_unlock(&lp->vmpl_mutex);
	(void) mutex_lock(&vmp->vm_lock);

	/*
	 * Now take our own segments.
	 */
	ASSERT(nseg >= VMEM_MINFREE);
	while (vmp->vm_nsegfree < VMEM_MINFREE)
		vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size));

	/*
	 * Give the remainder to charity.
	 */
	while (nseg > 0)
		vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size));

	return (1);
}

/*
 * Advance a walker from its previous position to 'afterme'.
 * Note: may drop and reacquire vmp->vm_lock.
 */
static void
vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme)
{
	vmem_seg_t *vprev = walker->vs_aprev;
	vmem_seg_t *vnext = walker->vs_anext;
	vmem_seg_t *vsp = NULL;

	VMEM_DELETE(walker, a);

	if (afterme != NULL)
		VMEM_INSERT(afterme, walker, a);

	/*
	 * The walker segment's presence may have prevented its neighbors
	 * from coalescing.  If so, coalesce them now.
	 */
	if (vprev->vs_type == VMEM_FREE) {
		if (vnext->vs_type == VMEM_FREE) {
			ASSERT(vprev->vs_end == vnext->vs_start);
			vmem_freelist_delete(vmp, vnext);
			vmem_freelist_delete(vmp, vprev);
			vprev->vs_end = vnext->vs_end;
			vmem_freelist_insert(vmp, vprev);
			vmem_seg_destroy(vmp, vnext);
		}
		vsp = vprev;
	} else if (vnext->vs_type == VMEM_FREE) {
		vsp = vnext;
	}

	/*
	 * vsp could represent a complete imported span,
	 * in which case we must return it to the source.
	 */
	if (vsp != NULL && vsp->vs_aprev->vs_import &&
	    vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		void *vaddr = (void *)vsp->vs_start;
		size_t size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_freelist_delete(vmp, vsp);
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
		(void) mutex_lock(&vmp->vm_lock);
	}
}

/*
 * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 * in an arena, so that we avoid reusing addresses for as long as possible.
 * This helps to catch use-after-free bugs.  It's also the perfect policy
 * for allocating things like process IDs, where we want to cycle through
 * all values in order.
 */
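/*
 * A minimal, hypothetical sketch of that ID-style use (the "id_arena" name
 * and the [1, 1000) range are invented for illustration):
 *
 *	vmem_t *ids = vmem_create("id_arena", (void *)1, 999, 1,
 *	    NULL, NULL, NULL, 0, VM_NOSLEEP);
 *	uintptr_t id = (uintptr_t)vmem_alloc(ids, 1, VM_NOSLEEP | VM_NEXTFIT);
 *	...
 *	vmem_free(ids, (void *)id, 1);
 */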
static void *
vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp, *rotor;
	uintptr_t addr;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	size_t vs_size;

	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}

	/*
	 * The common case is that the segment right after the rotor is free,
	 * and large enough that extracting 'size' bytes won't change which
	 * freelist it's on.  In this case we can avoid a *lot* of work.
	 * Instead of the normal vmem_seg_alloc(), we just advance the start
	 * address of the victim segment.  Instead of moving the rotor, we
	 * create the new segment structure *behind the rotor*, which has
	 * the same effect.  And finally, we know we don't have to coalesce
	 * the rotor's neighbors because the new segment lies between them.
	 */
	rotor = &vmp->vm_rotor;
	vsp = rotor->vs_anext;
	if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
	    P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		addr = vsp->vs_start;
		vsp->vs_start = addr + realsize;
		vmem_hash_insert(vmp,
		    vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
		(void) mutex_unlock(&vmp->vm_lock);
		return ((void *)addr);
	}

	/*
	 * Starting at the rotor, look for a segment large enough to
	 * satisfy the allocation.
	 */
	for (;;) {
		vmp->vm_kstat.vk_search++;
		if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
			break;
		vsp = vsp->vs_anext;
		if (vsp == rotor) {
			int cancel_state;

			/*
			 * We've come full circle.  One possibility is that
			 * there's actually enough space, but the rotor itself
			 * is preventing the allocation from succeeding because
			 * it's sitting between two free segments.  Therefore,
			 * we advance the rotor and see if that liberates a
			 * suitable segment.
			 */
			vmem_advance(vmp, rotor, rotor->vs_anext);
			vsp = rotor->vs_aprev;
			if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
				break;
			/*
			 * If there's a lower arena we can import from, or it's
			 * a VM_NOSLEEP allocation, let vmem_xalloc() handle it.
			 * Otherwise, wait until another thread frees something.
			 */
			if (vmp->vm_source_alloc != NULL ||
			    (vmflag & VM_NOSLEEP)) {
				(void) mutex_unlock(&vmp->vm_lock);
				return (vmem_xalloc(vmp, size, vmp->vm_quantum,
				    0, 0, NULL, NULL, vmflag & VM_UMFLAGS));
			}
			vmp->vm_kstat.vk_wait++;
			(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,
			    &cancel_state);
			(void) cond_wait(&vmp->vm_cv, &vmp->vm_lock);
			(void) pthread_setcancelstate(cancel_state, NULL);
			vsp = rotor->vs_anext;
		}
	}

	/*
	 * We found a segment.  Extract enough space to satisfy the allocation.
	 */
	addr = vsp->vs_start;
	vsp = vmem_seg_alloc(vmp, vsp, addr, size);
	ASSERT(vsp->vs_type == VMEM_ALLOC &&
	    vsp->vs_start == addr && vsp->vs_end == addr + size);

	/*
	 * Advance the rotor to right after the newly-allocated segment.
	 * That's where the next VM_NEXTFIT allocation will begin searching.
	 */
	vmem_advance(vmp, rotor, vsp);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
}

/*
 * Allocate size bytes at offset phase from an align boundary such that the
 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr)
 * that does not straddle a nocross-aligned boundary.
 */
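/*
 * For example (illustrative values only; "meta" is a hypothetical arena with
 * a page-sized quantum), a caller needing a 16K piece that is 64K-aligned
 * and must not cross a 1M boundary could ask for:
 *
 *	void *p = vmem_xalloc(meta, 16 * 1024, 64 * 1024, 0, 1024 * 1024,
 *	    NULL, NULL, VM_NOSLEEP | VM_BESTFIT);
 */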
void *
vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase,
    size_t nocross, void *minaddr, void *maxaddr, int vmflag)
{
	vmem_seg_t *vsp;
	vmem_seg_t *vbest = NULL;
	uintptr_t addr = 0, taddr, start, end;
	void *vaddr;
	int hb, flist, resv;
	uint32_t mtbf;

	if (phase > 0 && phase >= align)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "invalid phase",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);

	if (align == 0)
		align = vmp->vm_quantum;

	if ((align | phase | nocross) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "parameters not vm_quantum aligned",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	}

	if (nocross != 0 &&
	    (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "overconstrained allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	}

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
		return (NULL);

	(void) mutex_lock(&vmp->vm_lock);
	for (;;) {
		int cancel_state;

		if (vmp->vm_nsegfree < VMEM_MINFREE &&
		    !vmem_populate(vmp, vmflag))
			break;

		/*
		 * highbit() returns the highest bit + 1, which is exactly
		 * what we want:  we want to search the first freelist whose
		 * members are *definitely* large enough to satisfy our
		 * allocation.  However, there are certain cases in which we
		 * want to look at the next-smallest freelist (which *might*
		 * be able to satisfy the allocation):
		 *
		 * (1)	The size is exactly a power of 2, in which case
		 *	the smaller freelist is always big enough;
		 *
		 * (2)	All other freelists are empty;
		 *
		 * (3)	We're in the highest possible freelist, which is
		 *	always empty (e.g. the 4GB freelist on 32-bit systems);
		 *
		 * (4)	We're doing a best-fit or first-fit allocation.
		 */
		if ((size & (size - 1)) == 0) {
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		} else {
			hb = highbit(size);
			if ((vmp->vm_freemap >> hb) == 0 ||
			    hb == VMEM_FREELISTS ||
			    (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
				hb--;
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
		}

		for (vbest = NULL, vsp = (flist == 0) ? NULL :
		    vmp->vm_freelist[flist - 1].vs_knext;
		    vsp != NULL; vsp = vsp->vs_knext) {
			vmp->vm_kstat.vk_search++;
			if (vsp->vs_start == 0) {
				/*
				 * We're moving up to a larger freelist,
				 * so if we've already found a candidate,
				 * the fit can't possibly get any better.
				 */
				if (vbest != NULL)
					break;
				/*
				 * Find the next non-empty freelist.
				 */
				flist = lowbit(P2ALIGN(vmp->vm_freemap,
				    VS_SIZE(vsp)));
				if (flist-- == 0)
					break;
				vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
				ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);
				continue;
			}
			if (vsp->vs_end - 1 < (uintptr_t)minaddr)
				continue;
			if (vsp->vs_start > (uintptr_t)maxaddr - 1)
				continue;
			start = MAX(vsp->vs_start, (uintptr_t)minaddr);
			end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
			taddr = P2PHASEUP(start, align, phase);
			if (P2BOUNDARY(taddr, size, nocross))
				taddr +=
				    P2ROUNDUP(P2NPHASE(taddr, nocross), align);
			if ((taddr - start) + size > end - start ||
			    (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
				continue;
			vbest = vsp;
			addr = taddr;
			if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)
				break;
		}
		if (vbest != NULL)
			break;
		if (size == 0)
			umem_panic("vmem_xalloc(): size == 0");
		if (vmp->vm_source_alloc != NULL && nocross == 0 &&
		    minaddr == NULL && maxaddr == NULL) {
			size_t asize = P2ROUNDUP(size + phase,
			    MAX(align, vmp->vm_source->vm_quantum));
			if (asize < size) {	/* overflow */
				(void) mutex_unlock(&vmp->vm_lock);
				if (vmflag & VM_NOSLEEP)
					return (NULL);

				umem_panic("vmem_xalloc(): "
				    "overflow on VM_SLEEP allocation");
			}
			/*
			 * Determine how many segment structures we'll consume.
			 * The calculation must be precise because if we're
			 * here on behalf of vmem_populate(), we are taking
			 * segments from a very limited reserve.
			 */
			resv = (size == asize) ?
			    VMEM_SEGS_PER_SPAN_CREATE +
			    VMEM_SEGS_PER_EXACT_ALLOC :
			    VMEM_SEGS_PER_ALLOC_MAX;
			ASSERT(vmp->vm_nsegfree >= resv);
			vmp->vm_nsegfree -= resv;	/* reserve our segs */
			(void) mutex_unlock(&vmp->vm_lock);
			vaddr = vmp->vm_source_alloc(vmp->vm_source, asize,
			    vmflag & VM_UMFLAGS);
			(void) mutex_lock(&vmp->vm_lock);
			vmp->vm_nsegfree += resv;	/* claim reservation */
			if (vaddr != NULL) {
				vbest = vmem_span_create(vmp, vaddr, asize, 1);
				addr = P2PHASEUP(vbest->vs_start, align, phase);
				break;
			}
		}
		(void) mutex_unlock(&vmp->vm_lock);
		vmem_reap();
		(void) mutex_lock(&vmp->vm_lock);
		if (vmflag & VM_NOSLEEP)
			break;
		vmp->vm_kstat.vk_wait++;
		(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,
		    &cancel_state);
		(void) cond_wait(&vmp->vm_cv, &vmp->vm_lock);
		(void) pthread_setcancelstate(cancel_state, NULL);
	}
	if (vbest != NULL) {
		if (addr == 0) {
			umem_panic("vmem_xalloc(): addr == 0");
		}
		ASSERT(vbest->vs_type == VMEM_FREE);
		ASSERT(vbest->vs_knext != vbest);
		(void) vmem_seg_alloc(vmp, vbest, addr, size);
		(void) mutex_unlock(&vmp->vm_lock);
		ASSERT(P2PHASE(addr, align) == phase);
		ASSERT(!P2BOUNDARY(addr, size, nocross));
		ASSERT(addr >= (uintptr_t)minaddr);
		ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
		return ((void *)addr);
	}
	vmp->vm_kstat.vk_fail++;
	(void) mutex_unlock(&vmp->vm_lock);
	if (vmflag & VM_PANIC)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "cannot satisfy mandatory allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	return (NULL);
}

/*
 * Free the segment [vaddr, vaddr + size), where vaddr was a constrained
 * allocation.  vmem_xalloc() and vmem_xfree() must always be paired because
 * both routines bypass the quantum caches.
 */
void
vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
{
	vmem_seg_t *vsp, *vnext, *vprev;

	(void) mutex_lock(&vmp->vm_lock);

	vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size);
	vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum);

	/*
	 * Attempt to coalesce with the next segment.
	 */
	vnext = vsp->vs_anext;
	if (vnext->vs_type == VMEM_FREE) {
		ASSERT(vsp->vs_end == vnext->vs_start);
		vmem_freelist_delete(vmp, vnext);
		vsp->vs_end = vnext->vs_end;
		vmem_seg_destroy(vmp, vnext);
	}

	/*
	 * Attempt to coalesce with the previous segment.
	 */
	vprev = vsp->vs_aprev;
	if (vprev->vs_type == VMEM_FREE) {
		ASSERT(vprev->vs_end == vsp->vs_start);
		vmem_freelist_delete(vmp, vprev);
		vprev->vs_end = vsp->vs_end;
		vmem_seg_destroy(vmp, vsp);
		vsp = vprev;
	}

	/*
	 * If the entire span is free, return it to the source.
	 */
	if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		vaddr = (void *)vsp->vs_start;
		size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
	} else {
		vmem_freelist_insert(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
	}
}

/*
 * Allocate size bytes from arena vmp.  Returns the allocated address
 * on success, NULL on failure.  vmflag must specify VM_NOSLEEP (VM_SLEEP
 * is not supported in this version of vmem; see section 2 of the block
 * comment at the top of this file), and may also specify best-fit,
 * first-fit, or next-fit allocation policy instead of the default
 * instant-fit policy.
 */
void *
vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp;
	uintptr_t addr;
	int hb;
	int flist = 0;
	uint32_t mtbf;
	vmflag |= vmem_allocator;

	if (size - 1 < vmp->vm_qcache_max) {
		ASSERT(vmflag & VM_NOSLEEP);
		return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >>
		    vmp->vm_qshift], UMEM_DEFAULT));
	}

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
		return (NULL);

	if (vmflag & VM_NEXTFIT)
		return (vmem_nextfit_alloc(vmp, size, vmflag));

	if (vmflag & (VM_BESTFIT | VM_FIRSTFIT))
		return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0,
		    NULL, NULL, vmflag));

	/*
	 * Unconstrained instant-fit allocation from the segment list.
	 */
	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) {
		if ((size & (size - 1)) == 0)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		else if ((hb = highbit(size)) < VMEM_FREELISTS)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
	}

	if (flist-- == 0) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (vmem_xalloc(vmp, size, vmp->vm_quantum,
		    0, 0, NULL, NULL, vmflag));
	}

	ASSERT(size <= (1UL << flist));
	vsp = vmp->vm_freelist[flist].vs_knext;
	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, size);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
}

/*
 * Free the segment [vaddr, vaddr + size).
 */
void
vmem_free(vmem_t *vmp, void *vaddr, size_t size)
{
	if (size - 1 < vmp->vm_qcache_max)
		_umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift],
		    vaddr);
	else
		vmem_xfree(vmp, vaddr, size);
}

/*
 * Determine whether arena vmp contains the segment [vaddr, vaddr + size).
 */
int
vmem_contains(vmem_t *vmp, void *vaddr, size_t size)
{
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;
	vmem_seg_t *vsp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;

	(void) mutex_lock(&vmp->vm_lock);
	vmp->vm_kstat.vk_contains++;
	for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) {
		vmp->vm_kstat.vk_contains_search++;
		ASSERT(vsp->vs_type == VMEM_SPAN);
		if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1)
			break;
	}
	(void) mutex_unlock(&vmp->vm_lock);
	return (vsp != seg0);
}

/*
 * Add the span [vaddr, vaddr + size) to arena vmp.
 */
void *
vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag)
{
	if (vaddr == NULL || size == 0) {
		umem_panic("vmem_add(%p, %p, %lu): bad arguments",
		    vmp, vaddr, size);
	}

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (vmem_populate(vmp, vmflag))
		(void) vmem_span_create(vmp, vaddr, size, 0);
	else
		vaddr = NULL;
	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);
	return (vaddr);
}

/*
 * Adds the address range [addr, endaddr) to arena vmp, by either:
 *   1. joining two existing spans, [x, addr), and [endaddr, y) (which
 *	are in that order) into a single [x, y) span,
 *   2. expanding an existing [x, addr) span to [x, endaddr),
 *   3. expanding an existing [endaddr, x) span to [addr, x), or
 *   4. creating a new [addr, endaddr) span.
 *
 * Called with vmp->vm_lock held, and a successful vmem_populate() completed.
 * Cannot fail.  Returns the new segment.
 *
 * NOTE: this algorithm is linear-time in the number of spans, but is
 *	constant-time when you are extending the last (highest-addressed)
 *	span.
 */
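/*
 * A concrete illustration of case 1 (the addresses are made up): if the
 * arena already has non-imported spans [0x10000, 0x20000) and
 * [0x30000, 0x40000), then adding [0x20000, 0x30000) joins them into the
 * single span [0x10000, 0x40000), with the added range becoming a free
 * segment that is coalesced with any free neighbors.
 */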
static vmem_seg_t *
vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr)
{
	vmem_seg_t *span;
	vmem_seg_t *vsp;

	vmem_seg_t *end = &vmp->vm_seg0;

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	/*
	 * the second "if" clause below relies on the direction of this search
	 */
	for (span = end->vs_kprev; span != end; span = span->vs_kprev) {
		if (span->vs_end == addr || span->vs_start == endaddr)
			break;
	}

	if (span == end)
		return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0));
	if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) {
		vmem_seg_t *prevspan = span->vs_kprev;
		vmem_seg_t *nextseg = span->vs_anext;
		vmem_seg_t *prevseg = span->vs_aprev;

		/*
		 * prevspan becomes the span marker for the full range
		 */
		prevspan->vs_end = span->vs_end;

		/*
		 * Notionally, span becomes a free segment representing
		 * [addr, endaddr).
		 *
		 * However, if either of its neighbors are free, we coalesce
		 * by destroying span and changing the free segment.
		 */
		if (prevseg->vs_type == VMEM_FREE &&
		    nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce both ways
			 */
			ASSERT(prevseg->vs_end == addr &&
			    nextseg->vs_start == endaddr);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = nextseg->vs_end;

			vmem_freelist_delete(vmp, nextseg);
			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, nextseg);
			vmem_seg_destroy(vmp, span);

			vsp = prevseg;
		} else if (prevseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce left
			 */
			ASSERT(prevseg->vs_end == addr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = endaddr;

			vsp = prevseg;
		} else if (nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce right
			 */
			ASSERT(nextseg->vs_start == endaddr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, nextseg);
			nextseg->vs_start = addr;

			vsp = nextseg;
		} else {
			/*
			 * cannot coalesce
			 */
			VMEM_DELETE(span, k);
			span->vs_start = addr;
			span->vs_end = endaddr;

			vsp = span;
		}
	} else if (span->vs_end == addr) {
		vmem_seg_t *oldseg = span->vs_knext->vs_aprev;
		span->vs_end = endaddr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_end == addr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_end = endaddr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, oldseg, addr, endaddr);
	} else {
		vmem_seg_t *oldseg = span->vs_anext;
		ASSERT(span->vs_start == endaddr);
		span->vs_start = addr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_start == endaddr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_start = addr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, span, addr, endaddr);
	}
	vmem_freelist_insert(vmp, vsp);
	vmp->vm_kstat.vk_mem_total += (endaddr - addr);
	return (vsp);
}

/*
 * Does some error checking, calls vmem_extend_unlocked to add
 * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the
 * newly merged segment.
 */
void *
_vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc,
    int vmflag)
{
	uintptr_t addr = (uintptr_t)vaddr;
	uintptr_t endaddr = addr + size;
	vmem_seg_t *vsp;

	ASSERT(vaddr != NULL && size != 0 && endaddr > addr);
	ASSERT(alloc <= size && alloc != 0);
	ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0);

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (!vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}
	/*
	 * if there is a source, we can't mess with the spans
	 */
	if (vmp->vm_source_alloc != NULL)
		vsp = vmem_span_create(vmp, vaddr, size, 0);
	else
		vsp = vmem_extend_unlocked(vmp, addr, endaddr);

	ASSERT(VS_SIZE(vsp) >= alloc);

	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, alloc);
	vaddr = (void *)addr;

	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);

	return (vaddr);
}

/*
 * Walk the vmp arena, applying func to each segment matching typemask.
 * If VMEM_REENTRANT is specified, the arena lock is dropped across each
 * call to func(); otherwise, it is held for the duration of vmem_walk()
 * to ensure a consistent snapshot.  Note that VMEM_REENTRANT callbacks
 * are *not* necessarily consistent, so they may only be used when a hint
 * is adequate.
 */
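/*
 * For instance (hypothetical callback, for illustration only), a caller
 * could total up an arena's in-use memory like this:
 *
 *	static void
 *	count_alloc(void *arg, void *start, size_t size)
 *	{
 *		*(size_t *)arg += size;
 *	}
 *
 *	size_t inuse = 0;
 *	vmem_walk(vmp, VMEM_ALLOC, count_alloc, &inuse);
 */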
void
vmem_walk(vmem_t *vmp, int typemask,
    void (*func)(void *, void *, size_t), void *arg)
{
	vmem_seg_t *vsp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;
	vmem_seg_t walker;

	if (typemask & VMEM_WALKER)
		return;

	bzero(&walker, sizeof (walker));
	walker.vs_type = VMEM_WALKER;

	(void) mutex_lock(&vmp->vm_lock);
	VMEM_INSERT(seg0, &walker, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) {
		if (vsp->vs_type & typemask) {
			void *start = (void *)vsp->vs_start;
			size_t size = VS_SIZE(vsp);
			if (typemask & VMEM_REENTRANT) {
				vmem_advance(vmp, &walker, vsp);
				(void) mutex_unlock(&vmp->vm_lock);
				func(arg, start, size);
				(void) mutex_lock(&vmp->vm_lock);
				vsp = &walker;
			} else {
				func(arg, start, size);
			}
		}
	}
	vmem_advance(vmp, &walker, NULL);
	(void) mutex_unlock(&vmp->vm_lock);
}

/*
 * Return the total amount of memory whose type matches typemask.  Thus:
 *
 *	typemask VMEM_ALLOC yields total memory allocated (in use).
 *	typemask VMEM_FREE yields total memory free (available).
 *	typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size.
 */
size_t
vmem_size(vmem_t *vmp, int typemask)
{
	uint64_t size = 0;

	if (typemask & VMEM_ALLOC)
		size += vmp->vm_kstat.vk_mem_inuse;
	if (typemask & VMEM_FREE)
		size += vmp->vm_kstat.vk_mem_total -
		    vmp->vm_kstat.vk_mem_inuse;
	return ((size_t)size);
}

/*
 * Create an arena called name whose initial span is [base, base + size).
 * The arena's natural unit of currency is quantum, so vmem_alloc()
 * guarantees quantum-aligned results.  The arena may import new spans
 * by invoking afunc() on source, and may return those spans by invoking
 * ffunc() on source.  To make small allocations fast and scalable,
 * the arena offers high-performance caching for each integer multiple
 * of quantum up to qcache_max.
 */
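/*
 * A minimal, hypothetical example (the "metadata" arena, parent_arena, and
 * pagesize are invented): carve a sub-arena out of an existing arena so
 * that allocations up to 8 * quantum are served from quantum caches.
 * Remember that only VM_NOSLEEP is supported in this version of vmem:
 *
 *	vmem_t *meta = vmem_create("metadata", NULL, 0, pagesize,
 *	    vmem_alloc, vmem_free, parent_arena, 8 * pagesize, VM_NOSLEEP);
 *	void *p = vmem_alloc(meta, pagesize, VM_NOSLEEP);
 *	if (p != NULL)
 *		vmem_free(meta, p, pagesize);
 */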
vmem_t *
vmem_create(const char *name, void *base, size_t size, size_t quantum,
    vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source,
    size_t qcache_max, int vmflag)
{
	int i;
	size_t nqcache;
	vmem_t *vmp, *cur, **vmpp;
	vmem_seg_t *vsp;
	vmem_freelist_t *vfp;
	uint32_t id = atomic_add_32_nv(&vmem_id, 1);

	if (vmem_vmem_arena != NULL) {
		vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t),
		    vmflag & VM_UMFLAGS);
	} else {
		ASSERT(id <= VMEM_INITIAL);
		vmp = &vmem0[id - 1];
	}

	if (vmp == NULL)
		return (NULL);
	bzero(vmp, sizeof (vmem_t));

	(void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name);
	(void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL);
	(void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL);
	vmp->vm_cflags = vmflag;
	vmflag &= VM_UMFLAGS;

	vmp->vm_quantum = quantum;
	vmp->vm_qshift = highbit(quantum) - 1;
	nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX);

	for (i = 0; i <= VMEM_FREELISTS; i++) {
		vfp = &vmp->vm_freelist[i];
		vfp->vs_end = 1UL << i;
		vfp->vs_knext = (vmem_seg_t *)(vfp + 1);
		vfp->vs_kprev = (vmem_seg_t *)(vfp - 1);
	}

	vmp->vm_freelist[0].vs_kprev = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0;
	vmp->vm_hash_table = vmp->vm_hash0;
	vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	vsp = &vmp->vm_seg0;
	vsp->vs_anext = vsp;
	vsp->vs_aprev = vsp;
	vsp->vs_knext = vsp;
	vsp->vs_kprev = vsp;
	vsp->vs_type = VMEM_SPAN;

	vsp = &vmp->vm_rotor;
	vsp->vs_type = VMEM_ROTOR;
	VMEM_INSERT(&vmp->vm_seg0, vsp, a);

	vmp->vm_id = id;
	if (source != NULL)
		vmp->vm_kstat.vk_source_id = source->vm_id;
	vmp->vm_source = source;
	vmp->vm_source_alloc = afunc;
	vmp->vm_source_free = ffunc;

	if (nqcache != 0) {
		vmp->vm_qcache_max = nqcache << vmp->vm_qshift;
		for (i = 0; i < nqcache; i++) {
			char buf[VMEM_NAMELEN + 21];
			(void) snprintf(buf, sizeof (buf), "%s_%lu",
			    vmp->vm_name, (long)((i + 1) * quantum));
			vmp->vm_qcache[i] = umem_cache_create(buf,
			    (i + 1) * quantum, quantum, NULL, NULL, NULL,
			    NULL, vmp, UMC_QCACHE | UMC_NOTOUCH);
			if (vmp->vm_qcache[i] == NULL) {
				vmp->vm_qcache_max = i * quantum;
				break;
			}
		}
	}

	(void) mutex_lock(&vmem_list_lock);
	vmpp = &vmem_list;
	while ((cur = *vmpp) != NULL)
		vmpp = &cur->vm_next;
	*vmpp = vmp;
	(void) mutex_unlock(&vmem_list_lock);

	if (vmp->vm_cflags & VMC_POPULATOR) {
		uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1);
		ASSERT(pop_id <= VMEM_INITIAL);
		vmem_populator[pop_id - 1] = vmp;
		(void) mutex_lock(&vmp->vm_lock);
		(void) vmem_populate(vmp, vmflag | VM_PANIC);
		(void) mutex_unlock(&vmp->vm_lock);
	}

	if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) {
		vmem_destroy(vmp);
		return (NULL);
	}

	return (vmp);
}

/*
 * Destroy arena vmp.
 */
void
vmem_destroy(vmem_t *vmp)
{
	vmem_t *cur, **vmpp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;
	vmem_seg_t *vsp;
	size_t leaked;
	int i;

	(void) mutex_lock(&vmem_list_lock);
	vmpp = &vmem_list;
	while ((cur = *vmpp) != vmp)
		vmpp = &cur->vm_next;
	*vmpp = vmp->vm_next;
	(void) mutex_unlock(&vmem_list_lock);

	for (i = 0; i < VMEM_NQCACHE_MAX; i++)
		if (vmp->vm_qcache[i])
			umem_cache_destroy(vmp->vm_qcache[i]);

	leaked = vmem_size(vmp, VMEM_ALLOC);
	if (leaked != 0)
		umem_printf("vmem_destroy('%s'): leaked %lu bytes",
		    vmp->vm_name, leaked);

	if (vmp->vm_hash_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, vmp->vm_hash_table,
		    (vmp->vm_hash_mask + 1) * sizeof (void *));

	/*
	 * Give back the segment structures for anything that's left in the
	 * arena, e.g. the primary spans and their free segments.
	 */
	VMEM_DELETE(&vmp->vm_rotor, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext)
		vmem_putseg_global(vsp);

	while (vmp->vm_nsegfree > 0)
		vmem_putseg_global(vmem_getseg(vmp));

	(void) mutex_destroy(&vmp->vm_lock);
	(void) cond_destroy(&vmp->vm_cv);
	vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
}

/*
 * Resize vmp's hash table to keep the average lookup depth near 1.0.
 */
static void
vmem_hash_rescale(vmem_t *vmp)
{
	vmem_seg_t **old_table, **new_table, *vsp;
	size_t old_size, new_size, h, nseg;

	nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free);

	new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
	old_size = vmp->vm_hash_mask + 1;

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
		return;

	new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
	    VM_NOSLEEP);
	if (new_table == NULL)
		return;
	bzero(new_table, new_size * sizeof (void *));

	(void) mutex_lock(&vmp->vm_lock);

	old_size = vmp->vm_hash_mask + 1;
	old_table = vmp->vm_hash_table;

	vmp->vm_hash_mask = new_size - 1;
	vmp->vm_hash_table = new_table;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	for (h = 0; h < old_size; h++) {
		vsp = old_table[h];
		while (vsp != NULL) {
			uintptr_t addr = vsp->vs_start;
			vmem_seg_t *next_vsp = vsp->vs_knext;
			vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
			vsp->vs_knext = *hash_bucket;
			*hash_bucket = vsp;
			vsp = next_vsp;
		}
	}

	(void) mutex_unlock(&vmp->vm_lock);

	if (old_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, old_table,
		    old_size * sizeof (void *));
}

/*
 * Perform periodic maintenance on all vmem arenas.
 */
/*ARGSUSED*/
void
vmem_update(void *dummy)
{
	vmem_t *vmp;

	(void) mutex_lock(&vmem_list_lock);
	for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) {
		/*
		 * If threads are waiting for resources, wake them up
		 * periodically so they can issue another vmem_reap()
		 * to reclaim resources cached by the slab allocator.
		 */
		(void) cond_broadcast(&vmp->vm_cv);

		/*
		 * Rescale the hash table to keep the hash chains short.
		 */
		vmem_hash_rescale(vmp);
	}
	(void) mutex_unlock(&vmem_list_lock);
}

/*
 * If vmem_init is called again, we need to be able to reset the world.
 * That includes resetting the statics back to their original values.
 */
void
vmem_startup(void)
{
#ifdef UMEM_STANDALONE
	vmem_id = 0;
	vmem_populators = 0;
	vmem_segfree = NULL;
	vmem_list = NULL;
	vmem_internal_arena = NULL;
	vmem_seg_arena = NULL;
	vmem_hash_arena = NULL;
	vmem_vmem_arena = NULL;
	vmem_heap = NULL;
	vmem_heap_alloc = NULL;
	vmem_heap_free = NULL;

	bzero(vmem0, sizeof (vmem0));
	bzero(vmem_populator, sizeof (vmem_populator));
	bzero(vmem_seg0, sizeof (vmem_seg0));
#endif
}

/*
 * Prepare vmem for use.
 */
vmem_t *
vmem_init(const char *parent_name, size_t parent_quantum,
    vmem_alloc_t *parent_alloc, vmem_free_t *parent_free,
    const char *heap_name, void *heap_start, size_t heap_size,
    size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free)
{
	uint32_t id;
	int nseg = VMEM_SEG_INITIAL;
	vmem_t *parent, *heap;

	ASSERT(vmem_internal_arena == NULL);

	while (--nseg >= 0)
		vmem_putseg_global(&vmem_seg0[nseg]);

	if (parent_name != NULL) {
		parent = vmem_create(parent_name,
		    heap_start, heap_size, parent_quantum,
		    NULL, NULL, NULL, 0,
		    VM_SLEEP | VMC_POPULATOR);
		heap_start = NULL;
		heap_size = 0;
	} else {
		ASSERT(parent_alloc == NULL && parent_free == NULL);
		parent = NULL;
	}

	heap = vmem_create(heap_name,
	    heap_start, heap_size, heap_quantum,
	    parent_alloc, parent_free, parent, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_heap = heap;
	vmem_heap_alloc = heap_alloc;
	vmem_heap_free = heap_free;

	vmem_internal_arena = vmem_create("vmem_internal",
	    NULL, 0, heap_quantum,
	    heap_alloc, heap_free, heap, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_seg_arena = vmem_create("vmem_seg",
	    NULL, 0, heap_quantum,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_hash_arena = vmem_create("vmem_hash",
	    NULL, 0, 8,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP);

	vmem_vmem_arena = vmem_create("vmem_vmem",
	    vmem0, sizeof (vmem0), 1,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP);

	for (id = 0; id < vmem_id; id++)
		(void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t),
		    1, 0, 0, &vmem0[id], &vmem0[id + 1],
		    VM_NOSLEEP | VM_BESTFIT | VM_PANIC);

	return (heap);
}

void
vmem_no_debug(void)
{
	/*
	 * This size must be a multiple of the minimum required alignment,
	 * since vmem_populate allocates them compactly.
	 */
	vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread),
	    sizeof (hrtime_t));
}

/*
 * Lockup and release, for fork1(2) handling.
 */
void
vmem_lockup(void)
{
	vmem_t *cur;

	(void) mutex_lock(&vmem_list_lock);
	(void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex);

	/*
	 * Lock up and broadcast all arenas.
	 */
	for (cur = vmem_list; cur != NULL; cur = cur->vm_next) {
		(void) mutex_lock(&cur->vm_lock);
		(void) cond_broadcast(&cur->vm_cv);
	}

	(void) mutex_lock(&vmem_segfree_lock);
}

void
vmem_release(void)
{
	vmem_t *cur;

	(void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex);

	for (cur = vmem_list; cur != NULL; cur = cur->vm_next)
		(void) mutex_unlock(&cur->vm_lock);

	(void) mutex_unlock(&vmem_segfree_lock);
	(void) mutex_unlock(&vmem_list_lock);
}