/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Joyent, Inc.  All rights reserved.
 */

/*
 * For a more complete description of the main ideas, see:
 *
 *	Jeff Bonwick and Jonathan Adams,
 *
 *	Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 *	Arbitrary Resources.
 *
 *	Proceedings of the 2001 Usenix Conference.
 *	Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 *
 * For the "Big Theory Statement", see usr/src/uts/common/os/vmem.c
 *
 * 1. Overview of changes
 * ----------------------
 * There have been a few changes to vmem in order to support umem.  The
 * main areas are:
 *
 *	* VM_SLEEP unsupported
 *
 *	* Reaping changes
 *
 *	* Initialization changes
 *
 *	* _vmem_extend_alloc
 *
 *
 * 2. VM_SLEEP Removed
 * -------------------
 * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for
 * possibly infinite amounts of time, they are not supported in this
 * version of vmem.  Sleep-like behavior can be achieved through
 * UMEM_NOFAIL umem allocations.
 *
 *
 * 3. Reaping changes
 * ------------------
 * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap()
 * can do allocations and frees synchronously.  This is a problem if it
 * occurs during a vmem_populate() allocation.
 *
 * Instead, we delay reaps while populates are active.
 *
 *
 * 4. Initialization changes
 * -------------------------
 * In the kernel, vmem_init() allows you to create a single, top-level arena,
 * which has vmem_internal_arena as a child.  For umem, we want to be able
 * to extend arenas dynamically.  It is much easier to support this if we
 * allow a two-level "heap" arena:
 *
 *	+----------+
 *	|  "fake"  |
 *	+----------+
 *	      |
 *	+----------+
 *	|  "heap"  |
 *	+----------+
 *	  |    \ \
 *	  |     +-+-- ... <other children>
 *	  |
 *	+---------------+
 *	| vmem_internal |
 *	+---------------+
 *	    | | | |
 *	   <children>
 *
 * The new vmem_init() allows you to specify a "parent" of the heap, along
 * with allocation functions.
 *
 *
 * 5. _vmem_extend_alloc
 * ---------------------
 * The other part of extending is _vmem_extend_alloc.  This function allows
 * you to extend (expand current spans, if possible) an arena and allocate
 * a chunk of the newly extended span atomically.  This is needed to support
 * extending the heap while vmem_populate()ing it.
 *
 * In order to increase the usefulness of extending, non-imported spans are
 * sorted in address order.
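 *
 * As an illustration of the initialization model in section 4 (a sketch
 * only; every identifier in the call below is hypothetical, not part of
 * this file), a caller could set up the two-level heap pictured above with:
 *
 *	vmem_t *heap = vmem_init("fake", pagesize,
 *	    my_parent_alloc, my_parent_free,
 *	    "heap", initial_base, initial_size, pagesize,
 *	    my_heap_alloc, my_heap_free);
 *
 * vmem_init() gives the initial span to the "fake" parent arena and makes
 * "heap" import from it through my_parent_alloc/my_parent_free.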
 */

#include <sys/vmem_impl_user.h>
#include <alloca.h>
#include <sys/sysmacros.h>
#include <stdio.h>
#include <strings.h>
#include <atomic.h>

#include "vmem_base.h"
#include "umem_base.h"

#define	VMEM_INITIAL		6	/* early vmem arenas */
#define	VMEM_SEG_INITIAL	100	/* early segments */

/*
 * Adding a new span to an arena requires two segment structures: one to
 * represent the span, and one to represent the free segment it contains.
 */
#define	VMEM_SEGS_PER_SPAN_CREATE	2

/*
 * Allocating a piece of an existing segment requires 0-2 segment structures
 * depending on how much of the segment we're allocating.
 *
 * To allocate the entire segment, no new segment structures are needed; we
 * simply move the existing segment structure from the freelist to the
 * allocation hash table.
 *
 * To allocate a piece from the left or right end of the segment, we must
 * split the segment into two pieces (allocated part and remainder), so we
 * need one new segment structure to represent the remainder.
 *
 * To allocate from the middle of a segment, we need two new segment
 * structures to represent the remainders on either side of the allocated
 * part.
 */
#define	VMEM_SEGS_PER_EXACT_ALLOC	0
#define	VMEM_SEGS_PER_LEFT_ALLOC	1
#define	VMEM_SEGS_PER_RIGHT_ALLOC	1
#define	VMEM_SEGS_PER_MIDDLE_ALLOC	2

/*
 * vmem_populate() preallocates segment structures for vmem to do its work.
 * It must preallocate enough for the worst case, which is when we must import
 * a new span and then allocate from the middle of it.
 */
#define	VMEM_SEGS_PER_ALLOC_MAX		\
	(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)

/*
 * The segment structures themselves are allocated from vmem_seg_arena, so
 * we have a recursion problem when vmem_seg_arena needs to populate itself.
 * We address this by working out the maximum number of segment structures
 * this act will require, and multiplying by the maximum number of threads
 * that we'll allow to do it simultaneously.
 *
 * The worst-case segment consumption to populate vmem_seg_arena is as
 * follows (depicted as a stack trace to indicate why events are occurring):
 *
 * vmem_alloc(vmem_seg_arena)		-> 2 segs (span create + exact alloc)
 *  vmem_alloc(vmem_internal_arena)	-> 2 segs (span create + exact alloc)
 *   heap_alloc(heap_arena)
 *    vmem_alloc(heap_arena)		-> 4 segs (span create + alloc)
 *     parent_alloc(parent_arena)
 *      _vmem_extend_alloc(parent_arena) -> 3 segs (span create + left alloc)
 *
 * Note:  The reservation for heap_arena must be 4, since vmem_xalloc()
 * is overly pessimistic on allocations where parent_arena has a stricter
 * alignment than heap_arena.
 *
 * The worst-case consumption for any arena is 4 segment structures.
 * For now, we only support VM_NOSLEEP allocations, so as long as we
 * serialize all vmem_populates, a 4-seg reserve is sufficient.
 */
#define	VMEM_POPULATE_SEGS_PER_ARENA	4
#define	VMEM_POPULATE_LOCKS		1

#define	VMEM_POPULATE_RESERVE		\
	(VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS)

/*
 * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 * so that it can satisfy the worst-case allocation *and* participate in
 * worst-case allocation from vmem_seg_arena.
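 *
 * With the constants above, this works out to VMEM_POPULATE_RESERVE +
 * VMEM_SEGS_PER_ALLOC_MAX = (4 * 1) + (2 + 2) = 8 segment structures
 * per arena.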
 */
#define	VMEM_MINFREE	(VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX)

/* Don't assume new statics are zeroed - see vmem_startup() */
static vmem_t vmem0[VMEM_INITIAL];
static vmem_t *vmem_populator[VMEM_INITIAL];
static uint32_t vmem_id;
static uint32_t vmem_populators;
static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL];
static vmem_seg_t *vmem_segfree;
static mutex_t vmem_list_lock;
static mutex_t vmem_segfree_lock;
static vmem_populate_lock_t vmem_nosleep_lock;
#define	IN_POPULATE()	(vmem_nosleep_lock.vmpl_thr == thr_self())
static vmem_t *vmem_list;
static vmem_t *vmem_internal_arena;
static vmem_t *vmem_seg_arena;
static vmem_t *vmem_hash_arena;
static vmem_t *vmem_vmem_arena;

vmem_t *vmem_heap;
vmem_alloc_t *vmem_heap_alloc;
vmem_free_t *vmem_heap_free;

uint32_t vmem_mtbf;	/* mean time between failures [default: off] */
size_t vmem_seg_size = sizeof (vmem_seg_t);

/*
 * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k').
 */
#define	VMEM_INSERT(vprev, vsp, type)					\
{									\
	vmem_seg_t *vnext = (vprev)->vs_##type##next;			\
	(vsp)->vs_##type##next = (vnext);				\
	(vsp)->vs_##type##prev = (vprev);				\
	(vprev)->vs_##type##next = (vsp);				\
	(vnext)->vs_##type##prev = (vsp);				\
}

#define	VMEM_DELETE(vsp, type)						\
{									\
	vmem_seg_t *vprev = (vsp)->vs_##type##prev;			\
	vmem_seg_t *vnext = (vsp)->vs_##type##next;			\
	(vprev)->vs_##type##next = (vnext);				\
	(vnext)->vs_##type##prev = (vprev);				\
}

/*
 * Get a vmem_seg_t from the global segfree list.
 */
static vmem_seg_t *
vmem_getseg_global(void)
{
	vmem_seg_t *vsp;

	(void) mutex_lock(&vmem_segfree_lock);
	if ((vsp = vmem_segfree) != NULL)
		vmem_segfree = vsp->vs_knext;
	(void) mutex_unlock(&vmem_segfree_lock);

	return (vsp);
}

/*
 * Put a vmem_seg_t on the global segfree list.
 */
static void
vmem_putseg_global(vmem_seg_t *vsp)
{
	(void) mutex_lock(&vmem_segfree_lock);
	vsp->vs_knext = vmem_segfree;
	vmem_segfree = vsp;
	(void) mutex_unlock(&vmem_segfree_lock);
}

/*
 * Get a vmem_seg_t from vmp's segfree list.
 */
static vmem_seg_t *
vmem_getseg(vmem_t *vmp)
{
	vmem_seg_t *vsp;

	ASSERT(vmp->vm_nsegfree > 0);

	vsp = vmp->vm_segfree;
	vmp->vm_segfree = vsp->vs_knext;
	vmp->vm_nsegfree--;

	return (vsp);
}

/*
 * Put a vmem_seg_t on vmp's segfree list.
 */
static void
vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
{
	vsp->vs_knext = vmp->vm_segfree;
	vmp->vm_segfree = vsp;
	vmp->vm_nsegfree++;
}

/*
 * Add vsp to the appropriate freelist.
 */
static void
vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t *vprev;

	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);

	vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1];
	vsp->vs_type = VMEM_FREE;
	vmp->vm_freemap |= VS_SIZE(vprev);
	VMEM_INSERT(vprev, vsp, k);

	(void) cond_broadcast(&vmp->vm_cv);
}

/*
 * Take vsp from the freelist.
 */
static void
vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp)
{
	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
	ASSERT(vsp->vs_type == VMEM_FREE);

	if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) {
		/*
		 * The segments on both sides of 'vsp' are freelist heads,
		 * so taking vsp leaves the freelist at vsp->vs_kprev empty.
		 */
		ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev));
		vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev);
	}
	VMEM_DELETE(vsp, k);
}

/*
 * Add vsp to the allocated-segment hash table and update kstats.
 */
static void
vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t **bucket;

	vsp->vs_type = VMEM_ALLOC;
	bucket = VMEM_HASH(vmp, vsp->vs_start);
	vsp->vs_knext = *bucket;
	*bucket = vsp;

	if (vmem_seg_size == sizeof (vmem_seg_t)) {
		vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack,
		    VMEM_STACK_DEPTH, 0);
		vsp->vs_thread = thr_self();
		vsp->vs_timestamp = gethrtime();
	} else {
		vsp->vs_depth = 0;
	}

	vmp->vm_kstat.vk_alloc++;
	vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp);
}

/*
 * Remove vsp from the allocated-segment hash table and update kstats.
 */
static vmem_seg_t *
vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size)
{
	vmem_seg_t *vsp, **prev_vspp;

	prev_vspp = VMEM_HASH(vmp, addr);
	while ((vsp = *prev_vspp) != NULL) {
		if (vsp->vs_start == addr) {
			*prev_vspp = vsp->vs_knext;
			break;
		}
		vmp->vm_kstat.vk_lookup++;
		prev_vspp = &vsp->vs_knext;
	}

	if (vsp == NULL) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free",
		    vmp, addr, size);
	}
	if (VS_SIZE(vsp) != size) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size "
		    "(expect %lu)", vmp, addr, size, VS_SIZE(vsp));
	}

	vmp->vm_kstat.vk_free++;
	vmp->vm_kstat.vk_mem_inuse -= size;

	return (vsp);
}

/*
 * Create a segment spanning the range [start, end) and add it to the arena.
 */
static vmem_seg_t *
vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end)
{
	vmem_seg_t *newseg = vmem_getseg(vmp);

	newseg->vs_start = start;
	newseg->vs_end = end;
	newseg->vs_type = 0;
	newseg->vs_import = 0;

	VMEM_INSERT(vprev, newseg, a);

	return (newseg);
}

/*
 * Remove segment vsp from the arena.
 */
static void
vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp)
{
	ASSERT(vsp->vs_type != VMEM_ROTOR);
	VMEM_DELETE(vsp, a);

	vmem_putseg(vmp, vsp);
}

/*
 * Add the span [vaddr, vaddr + size) to vmp and update kstats.
 */
static vmem_seg_t *
vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import)
{
	vmem_seg_t *knext;
	vmem_seg_t *newseg, *span;
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;

	knext = &vmp->vm_seg0;
	if (!import && vmp->vm_source_alloc == NULL) {
		vmem_seg_t *kend, *kprev;
		/*
		 * non-imported spans are sorted in address order.  This
		 * makes vmem_extend_unlocked() much more effective.
		 *
		 * We search in reverse order, since new spans are
		 * generally at higher addresses.
		 */
		kend = &vmp->vm_seg0;
		for (kprev = kend->vs_kprev; kprev != kend;
		    kprev = kprev->vs_kprev) {
			if (!kprev->vs_import && (kprev->vs_end - 1) < start)
				break;
		}
		knext = kprev->vs_knext;
	}

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	if ((start | end) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_span_create(%p, %p, %lu): misaligned",
		    vmp, vaddr, size);
	}

	span = vmem_seg_create(vmp, knext->vs_aprev, start, end);
	span->vs_type = VMEM_SPAN;
	VMEM_INSERT(knext->vs_kprev, span, k);

	newseg = vmem_seg_create(vmp, span, start, end);
	vmem_freelist_insert(vmp, newseg);

	newseg->vs_import = import;
	if (import)
		vmp->vm_kstat.vk_mem_import += size;
	vmp->vm_kstat.vk_mem_total += size;

	return (newseg);
}

/*
 * Remove span vsp from vmp and update kstats.
 */
static void
vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t *span = vsp->vs_aprev;
	size_t size = VS_SIZE(vsp);

	ASSERT(MUTEX_HELD(&vmp->vm_lock));
	ASSERT(span->vs_type == VMEM_SPAN);

	if (vsp->vs_import)
		vmp->vm_kstat.vk_mem_import -= size;
	vmp->vm_kstat.vk_mem_total -= size;

	VMEM_DELETE(span, k);

	vmem_seg_destroy(vmp, vsp);
	vmem_seg_destroy(vmp, span);
}

/*
 * Allocate the subrange [addr, addr + size) from segment vsp.
 * If there are leftovers on either side, place them on the freelist.
 * Returns a pointer to the segment representing [addr, addr + size).
 */
static vmem_seg_t *
vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size)
{
	uintptr_t vs_start = vsp->vs_start;
	uintptr_t vs_end = vsp->vs_end;
	size_t vs_size = vs_end - vs_start;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	uintptr_t addr_end = addr + realsize;

	ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0);
	ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0);
	ASSERT(vsp->vs_type == VMEM_FREE);
	ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1);
	ASSERT(addr - 1 <= addr_end - 1);

	/*
	 * If we're allocating from the start of the segment, and the
	 * remainder will be on the same freelist, we can save quite
	 * a bit of work.
	 */
	if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		vsp->vs_start = addr_end;
		vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size);
		vmem_hash_insert(vmp, vsp);
		return (vsp);
	}

	vmem_freelist_delete(vmp, vsp);

	if (vs_end != addr_end)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp, addr_end, vs_end));

	if (vs_start != addr)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr));

	vsp->vs_start = addr;
	vsp->vs_end = addr + size;

	vmem_hash_insert(vmp, vsp);
	return (vsp);
}

/*
 * We cannot reap if we are in the middle of a vmem_populate().
 */
void
vmem_reap(void)
{
	if (!IN_POPULATE())
		umem_reap();
}

/*
 * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures.
 */
static int
vmem_populate(vmem_t *vmp, int vmflag)
{
	char *p;
	vmem_seg_t *vsp;
	ssize_t nseg;
	size_t size;
	vmem_populate_lock_t *lp;
	int i;

	while (vmp->vm_nsegfree < VMEM_MINFREE &&
	    (vsp = vmem_getseg_global()) != NULL)
		vmem_putseg(vmp, vsp);

	if (vmp->vm_nsegfree >= VMEM_MINFREE)
		return (1);

	/*
	 * If we're already populating, tap the reserve.
	 */
	if (vmem_nosleep_lock.vmpl_thr == thr_self()) {
		ASSERT(vmp->vm_cflags & VMC_POPULATOR);
		return (1);
	}

	(void) mutex_unlock(&vmp->vm_lock);

	ASSERT(vmflag & VM_NOSLEEP);	/* we do not allow sleep allocations */
	lp = &vmem_nosleep_lock;

	/*
	 * Cannot be just a mutex_lock(), since that has no effect if
	 * libthread is not linked.
	 */
	(void) mutex_lock(&lp->vmpl_mutex);
	ASSERT(lp->vmpl_thr == 0);
	lp->vmpl_thr = thr_self();

	nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE;
	size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum);
	nseg = size / vmem_seg_size;

	/*
	 * The following vmem_alloc() may need to populate vmem_seg_arena
	 * and all the things it imports from.  When doing so, it will tap
	 * each arena's reserve to prevent recursion (see the block comment
	 * above the definition of VMEM_POPULATE_RESERVE).
	 *
	 * During this allocation, vmem_reap() is a no-op.  If the allocation
	 * fails, we call vmem_reap() after dropping the population lock.
	 */
	p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS);
	if (p == NULL) {
		lp->vmpl_thr = 0;
		(void) mutex_unlock(&lp->vmpl_mutex);
		vmem_reap();

		(void) mutex_lock(&vmp->vm_lock);
		vmp->vm_kstat.vk_populate_fail++;
		return (0);
	}

	/*
	 * Restock the arenas that may have been depleted during population.
	 */
	for (i = 0; i < vmem_populators; i++) {
		(void) mutex_lock(&vmem_populator[i]->vm_lock);
		while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE)
			vmem_putseg(vmem_populator[i],
			    (vmem_seg_t *)(p + --nseg * vmem_seg_size));
		(void) mutex_unlock(&vmem_populator[i]->vm_lock);
	}

	lp->vmpl_thr = 0;
	(void) mutex_unlock(&lp->vmpl_mutex);
	(void) mutex_lock(&vmp->vm_lock);

	/*
	 * Now take our own segments.
	 */
	ASSERT(nseg >= VMEM_MINFREE);
	while (vmp->vm_nsegfree < VMEM_MINFREE)
		vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size));

	/*
	 * Give the remainder to charity.
	 */
	while (nseg > 0)
		vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size));

	return (1);
}

/*
 * Advance a walker from its previous position to 'afterme'.
 * Note: may drop and reacquire vmp->vm_lock.
 */
static void
vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme)
{
	vmem_seg_t *vprev = walker->vs_aprev;
	vmem_seg_t *vnext = walker->vs_anext;
	vmem_seg_t *vsp = NULL;

	VMEM_DELETE(walker, a);

	if (afterme != NULL)
		VMEM_INSERT(afterme, walker, a);

	/*
	 * The walker segment's presence may have prevented its neighbors
	 * from coalescing.  If so, coalesce them now.
	 */
	if (vprev->vs_type == VMEM_FREE) {
		if (vnext->vs_type == VMEM_FREE) {
			ASSERT(vprev->vs_end == vnext->vs_start);
			vmem_freelist_delete(vmp, vnext);
			vmem_freelist_delete(vmp, vprev);
			vprev->vs_end = vnext->vs_end;
			vmem_freelist_insert(vmp, vprev);
			vmem_seg_destroy(vmp, vnext);
		}
		vsp = vprev;
	} else if (vnext->vs_type == VMEM_FREE) {
		vsp = vnext;
	}

	/*
	 * vsp could represent a complete imported span,
	 * in which case we must return it to the source.
	 */
	if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		void *vaddr = (void *)vsp->vs_start;
		size_t size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_freelist_delete(vmp, vsp);
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
		(void) mutex_lock(&vmp->vm_lock);
	}
}

/*
 * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 * in an arena, so that we avoid reusing addresses for as long as possible.
 * This helps to catch use-after-free bugs.  It's also the perfect policy
 * for allocating things like process IDs, where we want to cycle through
 * all values in order.
 */
static void *
vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp, *rotor;
	uintptr_t addr;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	size_t vs_size;

	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}

	/*
	 * The common case is that the segment right after the rotor is free,
	 * and large enough that extracting 'size' bytes won't change which
	 * freelist it's on.  In this case we can avoid a *lot* of work.
	 * Instead of the normal vmem_seg_alloc(), we just advance the start
	 * address of the victim segment.  Instead of moving the rotor, we
	 * create the new segment structure *behind the rotor*, which has
	 * the same effect.  And finally, we know we don't have to coalesce
	 * the rotor's neighbors because the new segment lies between them.
	 */
	rotor = &vmp->vm_rotor;
	vsp = rotor->vs_anext;
	if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
	    P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		addr = vsp->vs_start;
		vsp->vs_start = addr + realsize;
		vmem_hash_insert(vmp,
		    vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
		(void) mutex_unlock(&vmp->vm_lock);
		return ((void *)addr);
	}

	/*
	 * Starting at the rotor, look for a segment large enough to
	 * satisfy the allocation.
	 */
	for (;;) {
		vmp->vm_kstat.vk_search++;
		if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
			break;
		vsp = vsp->vs_anext;
		if (vsp == rotor) {
			int cancel_state;

			/*
			 * We've come full circle.  One possibility is that
			 * there's actually enough space, but the rotor itself
			 * is preventing the allocation from succeeding because
			 * it's sitting between two free segments.  Therefore,
			 * we advance the rotor and see if that liberates a
			 * suitable segment.
			 */
			vmem_advance(vmp, rotor, rotor->vs_anext);
			vsp = rotor->vs_aprev;
			if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
				break;
			/*
			 * If there's a lower arena we can import from, or it's
			 * a VM_NOSLEEP allocation, let vmem_xalloc() handle it.
			 * Otherwise, wait until another thread frees something.
			 */
			if (vmp->vm_source_alloc != NULL ||
			    (vmflag & VM_NOSLEEP)) {
				(void) mutex_unlock(&vmp->vm_lock);
				return (vmem_xalloc(vmp, size, vmp->vm_quantum,
				    0, 0, NULL, NULL, vmflag & VM_UMFLAGS));
			}
			vmp->vm_kstat.vk_wait++;
			(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,
			    &cancel_state);
			(void) cond_wait(&vmp->vm_cv, &vmp->vm_lock);
			(void) pthread_setcancelstate(cancel_state, NULL);
			vsp = rotor->vs_anext;
		}
	}

	/*
	 * We found a segment.  Extract enough space to satisfy the allocation.
	 */
	addr = vsp->vs_start;
	vsp = vmem_seg_alloc(vmp, vsp, addr, size);
	ASSERT(vsp->vs_type == VMEM_ALLOC &&
	    vsp->vs_start == addr && vsp->vs_end == addr + size);

	/*
	 * Advance the rotor to right after the newly-allocated segment.
	 * That's where the next VM_NEXTFIT allocation will begin searching.
	 */
	vmem_advance(vmp, rotor, vsp);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
}

/*
 * Allocate size bytes at offset phase from an align boundary such that the
 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr)
 * that does not straddle a nocross-aligned boundary.
 */
void *
vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase,
    size_t nocross, void *minaddr, void *maxaddr, int vmflag)
{
	vmem_seg_t *vsp;
	vmem_seg_t *vbest = NULL;
	uintptr_t addr, taddr, start, end;
	void *vaddr;
	int hb, flist, resv;
	uint32_t mtbf;

	if (phase > 0 && phase >= align)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "invalid phase",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);

	if (align == 0)
		align = vmp->vm_quantum;

	if ((align | phase | nocross) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "parameters not vm_quantum aligned",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	}

	if (nocross != 0 &&
	    (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "overconstrained allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	}

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
		return (NULL);

	(void) mutex_lock(&vmp->vm_lock);
	for (;;) {
		int cancel_state;

		if (vmp->vm_nsegfree < VMEM_MINFREE &&
		    !vmem_populate(vmp, vmflag))
			break;

		/*
		 * highbit() returns the highest bit + 1, which is exactly
		 * what we want: we want to search the first freelist whose
		 * members are *definitely* large enough to satisfy our
		 * allocation.  However, there are certain cases in which we
		 * want to look at the next-smallest freelist (which *might*
		 * be able to satisfy the allocation):
		 *
		 * (1)	The size is exactly a power of 2, in which case
		 *	the smaller freelist is always big enough;
		 *
		 * (2)	All other freelists are empty;
		 *
		 * (3)	We're in the highest possible freelist, which is
		 *	always empty (e.g. the 4GB freelist on 32-bit systems);
		 *
		 * (4)	We're doing a best-fit or first-fit allocation.
		 */
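		/*
		 * For example: with size = 96, highbit(size) is 7, so only
		 * freelists holding segments of at least 1 << 7 = 128 bytes
		 * (certainly large enough) are considered; under cases
		 * (2)-(4), hb drops to 6 so that the [64, 128) freelist,
		 * which *might* contain a large enough segment, is searched
		 * as well.  A power-of-2 size such as 64 takes the first
		 * branch below, where the equal-sized freelist is always
		 * sufficient.
		 */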
		if ((size & (size - 1)) == 0) {
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		} else {
			hb = highbit(size);
			if ((vmp->vm_freemap >> hb) == 0 ||
			    hb == VMEM_FREELISTS ||
			    (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
				hb--;
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
		}

		for (vbest = NULL, vsp = (flist == 0) ? NULL :
		    vmp->vm_freelist[flist - 1].vs_knext;
		    vsp != NULL; vsp = vsp->vs_knext) {
			vmp->vm_kstat.vk_search++;
			if (vsp->vs_start == 0) {
				/*
				 * We're moving up to a larger freelist,
				 * so if we've already found a candidate,
				 * the fit can't possibly get any better.
				 */
				if (vbest != NULL)
					break;
				/*
				 * Find the next non-empty freelist.
				 */
				flist = lowbit(P2ALIGN(vmp->vm_freemap,
				    VS_SIZE(vsp)));
				if (flist-- == 0)
					break;
				vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
				ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);
				continue;
			}
			if (vsp->vs_end - 1 < (uintptr_t)minaddr)
				continue;
			if (vsp->vs_start > (uintptr_t)maxaddr - 1)
				continue;
			start = MAX(vsp->vs_start, (uintptr_t)minaddr);
			end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
			taddr = P2PHASEUP(start, align, phase);
			if (P2BOUNDARY(taddr, size, nocross))
				taddr +=
				    P2ROUNDUP(P2NPHASE(taddr, nocross), align);
			if ((taddr - start) + size > end - start ||
			    (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
				continue;
			vbest = vsp;
			addr = taddr;
			if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)
				break;
		}
		if (vbest != NULL)
			break;
		if (size == 0)
			umem_panic("vmem_xalloc(): size == 0");
		if (vmp->vm_source_alloc != NULL && nocross == 0 &&
		    minaddr == NULL && maxaddr == NULL) {
			size_t asize = P2ROUNDUP(size + phase,
			    MAX(align, vmp->vm_source->vm_quantum));
			if (asize < size) {	/* overflow */
				(void) mutex_unlock(&vmp->vm_lock);
				if (vmflag & VM_NOSLEEP)
					return (NULL);

				umem_panic("vmem_xalloc(): "
				    "overflow on VM_SLEEP allocation");
			}
			/*
			 * Determine how many segment structures we'll consume.
			 * The calculation must be precise because if we're
			 * here on behalf of vmem_populate(), we are taking
			 * segments from a very limited reserve.
			 */
			resv = (size == asize) ?
			    VMEM_SEGS_PER_SPAN_CREATE +
			    VMEM_SEGS_PER_EXACT_ALLOC :
			    VMEM_SEGS_PER_ALLOC_MAX;
			ASSERT(vmp->vm_nsegfree >= resv);
			vmp->vm_nsegfree -= resv;	/* reserve our segs */
			(void) mutex_unlock(&vmp->vm_lock);
			vaddr = vmp->vm_source_alloc(vmp->vm_source, asize,
			    vmflag & VM_UMFLAGS);
			(void) mutex_lock(&vmp->vm_lock);
			vmp->vm_nsegfree += resv;	/* claim reservation */
			if (vaddr != NULL) {
				vbest = vmem_span_create(vmp, vaddr, asize, 1);
				addr = P2PHASEUP(vbest->vs_start, align, phase);
				break;
			}
		}
		(void) mutex_unlock(&vmp->vm_lock);
		vmem_reap();
		(void) mutex_lock(&vmp->vm_lock);
		if (vmflag & VM_NOSLEEP)
			break;
		vmp->vm_kstat.vk_wait++;
		(void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,
		    &cancel_state);
		(void) cond_wait(&vmp->vm_cv, &vmp->vm_lock);
		(void) pthread_setcancelstate(cancel_state, NULL);
	}
	if (vbest != NULL) {
		ASSERT(vbest->vs_type == VMEM_FREE);
		ASSERT(vbest->vs_knext != vbest);
		(void) vmem_seg_alloc(vmp, vbest, addr, size);
		(void) mutex_unlock(&vmp->vm_lock);
		ASSERT(P2PHASE(addr, align) == phase);
		ASSERT(!P2BOUNDARY(addr, size, nocross));
		ASSERT(addr >= (uintptr_t)minaddr);
		ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
		return ((void *)addr);
	}
	vmp->vm_kstat.vk_fail++;
	(void) mutex_unlock(&vmp->vm_lock);
	if (vmflag & VM_PANIC)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "cannot satisfy mandatory allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	return (NULL);
}

/*
 * Free the segment [vaddr, vaddr + size), where vaddr was a constrained
 * allocation.  vmem_xalloc() and vmem_xfree() must always be paired because
 * both routines bypass the quantum caches.
 */
void
vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
{
	vmem_seg_t *vsp, *vnext, *vprev;

	(void) mutex_lock(&vmp->vm_lock);

	vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size);
	vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum);

	/*
	 * Attempt to coalesce with the next segment.
	 */
	vnext = vsp->vs_anext;
	if (vnext->vs_type == VMEM_FREE) {
		ASSERT(vsp->vs_end == vnext->vs_start);
		vmem_freelist_delete(vmp, vnext);
		vsp->vs_end = vnext->vs_end;
		vmem_seg_destroy(vmp, vnext);
	}

	/*
	 * Attempt to coalesce with the previous segment.
	 */
	vprev = vsp->vs_aprev;
	if (vprev->vs_type == VMEM_FREE) {
		ASSERT(vprev->vs_end == vsp->vs_start);
		vmem_freelist_delete(vmp, vprev);
		vprev->vs_end = vsp->vs_end;
		vmem_seg_destroy(vmp, vsp);
		vsp = vprev;
	}

	/*
	 * If the entire span is free, return it to the source.
	 */
	if (vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		vaddr = (void *)vsp->vs_start;
		size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
	} else {
		vmem_freelist_insert(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
	}
}
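/*
 * For example (an illustrative sketch; 'my_arena' is hypothetical, not part
 * of this file), a caller needing a page-aligned allocation pairs the two
 * calls and passes vmem_xfree() the same size it gave vmem_xalloc():
 *
 *	void *buf = vmem_xalloc(my_arena, 8192, 4096, 0, 0,
 *	    NULL, NULL, VM_NOSLEEP);
 *	if (buf != NULL) {
 *		...
 *		vmem_xfree(my_arena, buf, 8192);
 *	}
 */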
/*
 * Allocate size bytes from arena vmp.  Returns the allocated address
 * on success, NULL on failure.  vmflag specifies VM_SLEEP or VM_NOSLEEP,
 * and may also specify best-fit, first-fit, or next-fit allocation policy
 * instead of the default instant-fit policy.  VM_SLEEP allocations are
 * guaranteed to succeed.
 */
void *
vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp;
	uintptr_t addr;
	int hb;
	int flist = 0;
	uint32_t mtbf;

	vmflag |= vmem_allocator;

	if (size - 1 < vmp->vm_qcache_max) {
		ASSERT(vmflag & VM_NOSLEEP);
		return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >>
		    vmp->vm_qshift], UMEM_DEFAULT));
	}

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
		return (NULL);

	if (vmflag & VM_NEXTFIT)
		return (vmem_nextfit_alloc(vmp, size, vmflag));

	if (vmflag & (VM_BESTFIT | VM_FIRSTFIT))
		return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0,
		    NULL, NULL, vmflag));

	/*
	 * Unconstrained instant-fit allocation from the segment list.
	 */
	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) {
		if ((size & (size - 1)) == 0)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		else if ((hb = highbit(size)) < VMEM_FREELISTS)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
	}

	if (flist-- == 0) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (vmem_xalloc(vmp, size, vmp->vm_quantum,
		    0, 0, NULL, NULL, vmflag));
	}

	ASSERT(size <= (1UL << flist));
	vsp = vmp->vm_freelist[flist].vs_knext;
	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, size);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
}

/*
 * Free the segment [vaddr, vaddr + size).
 */
void
vmem_free(vmem_t *vmp, void *vaddr, size_t size)
{
	if (size - 1 < vmp->vm_qcache_max)
		_umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift],
		    vaddr);
	else
		vmem_xfree(vmp, vaddr, size);
}

/*
 * Determine whether arena vmp contains the segment [vaddr, vaddr + size).
 */
int
vmem_contains(vmem_t *vmp, void *vaddr, size_t size)
{
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;
	vmem_seg_t *vsp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;

	(void) mutex_lock(&vmp->vm_lock);
	vmp->vm_kstat.vk_contains++;
	for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) {
		vmp->vm_kstat.vk_contains_search++;
		ASSERT(vsp->vs_type == VMEM_SPAN);
		if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1)
			break;
	}
	(void) mutex_unlock(&vmp->vm_lock);
	return (vsp != seg0);
}

/*
 * Add the span [vaddr, vaddr + size) to arena vmp.
 */
void *
vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag)
{
	if (vaddr == NULL || size == 0) {
		umem_panic("vmem_add(%p, %p, %lu): bad arguments",
		    vmp, vaddr, size);
	}

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (vmem_populate(vmp, vmflag))
		(void) vmem_span_create(vmp, vaddr, size, 0);
	else
		vaddr = NULL;

	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);
	return (vaddr);
}

/*
 * Adds the address range [addr, endaddr) to arena vmp, by either:
 *   1. joining two existing spans, [x, addr), and [endaddr, y) (which
 *      are in that order) into a single [x, y) span,
 *   2. expanding an existing [x, addr) span to [x, endaddr),
 *   3. expanding an existing [endaddr, x) span to [addr, x), or
 *   4. creating a new [addr, endaddr) span.
 *
 * Called with vmp->vm_lock held, and a successful vmem_populate() completed.
 * Cannot fail.  Returns the new segment.
 *
 * NOTE: this algorithm is linear-time in the number of spans, but is
 *       constant-time when you are extending the last (highest-addressed)
 *       span.
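 *
 * For example (illustrative addresses only): given existing non-imported
 * spans [0x1000, 0x2000) and [0x3000, 0x4000), adding [0x2000, 0x3000)
 * joins them into a single [0x1000, 0x4000) span (case 1); adding
 * [0x4000, 0x5000) expands the second span to [0x3000, 0x5000) (case 2);
 * adding [0x800, 0x1000) expands the first span to [0x800, 0x2000)
 * (case 3); and adding [0x6000, 0x7000) creates a new span (case 4).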
 */
static vmem_seg_t *
vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr)
{
	vmem_seg_t *span;
	vmem_seg_t *vsp;

	vmem_seg_t *end = &vmp->vm_seg0;

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	/*
	 * the second "if" clause below relies on the direction of this search
	 */
	for (span = end->vs_kprev; span != end; span = span->vs_kprev) {
		if (span->vs_end == addr || span->vs_start == endaddr)
			break;
	}

	if (span == end)
		return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0));

	if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) {
		vmem_seg_t *prevspan = span->vs_kprev;
		vmem_seg_t *nextseg = span->vs_anext;
		vmem_seg_t *prevseg = span->vs_aprev;

		/*
		 * prevspan becomes the span marker for the full range
		 */
		prevspan->vs_end = span->vs_end;

		/*
		 * Notionally, span becomes a free segment representing
		 * [addr, endaddr).
		 *
		 * However, if either of its neighbors are free, we coalesce
		 * by destroying span and changing the free segment.
		 */
		if (prevseg->vs_type == VMEM_FREE &&
		    nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce both ways
			 */
			ASSERT(prevseg->vs_end == addr &&
			    nextseg->vs_start == endaddr);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = nextseg->vs_end;

			vmem_freelist_delete(vmp, nextseg);
			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, nextseg);
			vmem_seg_destroy(vmp, span);

			vsp = prevseg;
		} else if (prevseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce left
			 */
			ASSERT(prevseg->vs_end == addr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = endaddr;

			vsp = prevseg;
		} else if (nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce right
			 */
			ASSERT(nextseg->vs_start == endaddr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, nextseg);
			nextseg->vs_start = addr;

			vsp = nextseg;
		} else {
			/*
			 * cannot coalesce
			 */
			VMEM_DELETE(span, k);
			span->vs_start = addr;
			span->vs_end = endaddr;

			vsp = span;
		}
	} else if (span->vs_end == addr) {
		vmem_seg_t *oldseg = span->vs_knext->vs_aprev;
		span->vs_end = endaddr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_end == addr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_end = endaddr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, oldseg, addr, endaddr);
	} else {
		vmem_seg_t *oldseg = span->vs_anext;
		ASSERT(span->vs_start == endaddr);
		span->vs_start = addr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_start == endaddr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_start = addr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, span, addr, endaddr);
	}
	vmem_freelist_insert(vmp, vsp);
	vmp->vm_kstat.vk_mem_total += (endaddr - addr);
	return (vsp);
}

/*
 * Does some error checking, calls vmem_extend_unlocked to add
 * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the
 * newly merged segment.
 */
void *
_vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc,
    int vmflag)
{
	uintptr_t addr = (uintptr_t)vaddr;
	uintptr_t endaddr = addr + size;
	vmem_seg_t *vsp;

	ASSERT(vaddr != NULL && size != 0 && endaddr > addr);
	ASSERT(alloc <= size && alloc != 0);
	ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0);

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (!vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}
	/*
	 * if there is a source, we can't mess with the spans
	 */
	if (vmp->vm_source_alloc != NULL)
		vsp = vmem_span_create(vmp, vaddr, size, 0);
	else
		vsp = vmem_extend_unlocked(vmp, addr, endaddr);

	ASSERT(VS_SIZE(vsp) >= alloc);

	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, alloc);
	vaddr = (void *)addr;

	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);

	return (vaddr);
}

/*
 * Walk the vmp arena, applying func to each segment matching typemask.
 * If VMEM_REENTRANT is specified, the arena lock is dropped across each
 * call to func(); otherwise, it is held for the duration of vmem_walk()
 * to ensure a consistent snapshot.  Note that VMEM_REENTRANT callbacks
 * are *not* necessarily consistent, so they may only be used when a hint
 * is adequate.
 */
void
vmem_walk(vmem_t *vmp, int typemask,
    void (*func)(void *, void *, size_t), void *arg)
{
	vmem_seg_t *vsp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;
	vmem_seg_t walker;

	if (typemask & VMEM_WALKER)
		return;

	bzero(&walker, sizeof (walker));
	walker.vs_type = VMEM_WALKER;

	(void) mutex_lock(&vmp->vm_lock);
	VMEM_INSERT(seg0, &walker, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) {
		if (vsp->vs_type & typemask) {
			void *start = (void *)vsp->vs_start;
			size_t size = VS_SIZE(vsp);
			if (typemask & VMEM_REENTRANT) {
				vmem_advance(vmp, &walker, vsp);
				(void) mutex_unlock(&vmp->vm_lock);
				func(arg, start, size);
				(void) mutex_lock(&vmp->vm_lock);
				vsp = &walker;
			} else {
				func(arg, start, size);
			}
		}
	}
	vmem_advance(vmp, &walker, NULL);
	(void) mutex_unlock(&vmp->vm_lock);
}
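/*
 * For example (an illustrative sketch; my_arena, count_alloc() and the
 * running total are hypothetical, not part of this file), the allocated
 * segments of an arena can be totalled with:
 *
 *	static void
 *	count_alloc(void *arg, void *start, size_t size)
 *	{
 *		*(size_t *)arg += size;
 *	}
 *
 *	size_t total = 0;
 *	vmem_walk(my_arena, VMEM_ALLOC, count_alloc, &total);
 *
 * (vmem_size(my_arena, VMEM_ALLOC) reports the same total from the kstats.)
 */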
/*
 * Return the total amount of memory whose type matches typemask.  Thus:
 *
 *	typemask VMEM_ALLOC yields total memory allocated (in use).
 *	typemask VMEM_FREE yields total memory free (available).
 *	typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size.
 */
size_t
vmem_size(vmem_t *vmp, int typemask)
{
	uint64_t size = 0;

	if (typemask & VMEM_ALLOC)
		size += vmp->vm_kstat.vk_mem_inuse;
	if (typemask & VMEM_FREE)
		size += vmp->vm_kstat.vk_mem_total -
		    vmp->vm_kstat.vk_mem_inuse;
	return ((size_t)size);
}

/*
 * Create an arena called name whose initial span is [base, base + size).
 * The arena's natural unit of currency is quantum, so vmem_alloc()
 * guarantees quantum-aligned results.  The arena may import new spans
 * by invoking afunc() on source, and may return those spans by invoking
 * ffunc() on source.  To make small allocations fast and scalable,
 * the arena offers high-performance caching for each integer multiple
 * of quantum up to qcache_max.
 */
vmem_t *
vmem_create(const char *name, void *base, size_t size, size_t quantum,
    vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source,
    size_t qcache_max, int vmflag)
{
	int i;
	size_t nqcache;
	vmem_t *vmp, *cur, **vmpp;
	vmem_seg_t *vsp;
	vmem_freelist_t *vfp;
	uint32_t id = atomic_add_32_nv(&vmem_id, 1);

	if (vmem_vmem_arena != NULL) {
		vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t),
		    vmflag & VM_UMFLAGS);
	} else {
		ASSERT(id <= VMEM_INITIAL);
		vmp = &vmem0[id - 1];
	}

	if (vmp == NULL)
		return (NULL);
	bzero(vmp, sizeof (vmem_t));

	(void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name);
	(void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL);
	(void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL);
	vmp->vm_cflags = vmflag;
	vmflag &= VM_UMFLAGS;

	vmp->vm_quantum = quantum;
	vmp->vm_qshift = highbit(quantum) - 1;
	nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX);

	for (i = 0; i <= VMEM_FREELISTS; i++) {
		vfp = &vmp->vm_freelist[i];
		vfp->vs_end = 1UL << i;
		vfp->vs_knext = (vmem_seg_t *)(vfp + 1);
		vfp->vs_kprev = (vmem_seg_t *)(vfp - 1);
	}

	vmp->vm_freelist[0].vs_kprev = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0;
	vmp->vm_hash_table = vmp->vm_hash0;
	vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	vsp = &vmp->vm_seg0;
	vsp->vs_anext = vsp;
	vsp->vs_aprev = vsp;
	vsp->vs_knext = vsp;
	vsp->vs_kprev = vsp;
	vsp->vs_type = VMEM_SPAN;

	vsp = &vmp->vm_rotor;
	vsp->vs_type = VMEM_ROTOR;
	VMEM_INSERT(&vmp->vm_seg0, vsp, a);

	vmp->vm_id = id;
	if (source != NULL)
		vmp->vm_kstat.vk_source_id = source->vm_id;
	vmp->vm_source = source;
	vmp->vm_source_alloc = afunc;
	vmp->vm_source_free = ffunc;

	if (nqcache != 0) {
		vmp->vm_qcache_max = nqcache << vmp->vm_qshift;
		for (i = 0; i < nqcache; i++) {
			char buf[VMEM_NAMELEN + 21];
			(void) snprintf(buf, sizeof (buf), "%s_%lu",
			    vmp->vm_name, (long)((i + 1) * quantum));
			vmp->vm_qcache[i] = umem_cache_create(buf,
			    (i + 1) * quantum, quantum, NULL, NULL, NULL,
			    NULL, vmp, UMC_QCACHE | UMC_NOTOUCH);
			if (vmp->vm_qcache[i] == NULL) {
				vmp->vm_qcache_max = i * quantum;
				break;
			}
		}
	}

	(void) mutex_lock(&vmem_list_lock);
	vmpp = &vmem_list;
	while ((cur = *vmpp) != NULL)
		vmpp = &cur->vm_next;
	*vmpp = vmp;
	(void) mutex_unlock(&vmem_list_lock);

	if (vmp->vm_cflags & VMC_POPULATOR) {
		uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1);
		ASSERT(pop_id <= VMEM_INITIAL);
		vmem_populator[pop_id - 1] = vmp;
		(void) mutex_lock(&vmp->vm_lock);
		(void) vmem_populate(vmp, vmflag | VM_PANIC);
		(void) mutex_unlock(&vmp->vm_lock);
	}

	if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) {
		vmem_destroy(vmp);
		return (NULL);
	}

	return (vmp);
}
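/*
 * For example (an illustrative sketch; 'region' and the sizes are made up),
 * a self-contained arena managing a preexisting 64K region in 4K quanta,
 * with no import source and no quantum caches, could be used like this:
 *
 *	vmem_t *vmp = vmem_create("example", region, 65536, 4096,
 *	    NULL, NULL, NULL, 0, VM_NOSLEEP);
 *	void *p = vmem_alloc(vmp, 8192, VM_NOSLEEP);
 *	...
 *	vmem_free(vmp, p, 8192);
 *	vmem_destroy(vmp);
 *
 * VM_NOSLEEP is used throughout since, as noted at the top of this file,
 * VM_SLEEP is not supported in this version of vmem.
 */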
/*
 * Destroy arena vmp.
 */
void
vmem_destroy(vmem_t *vmp)
{
	vmem_t *cur, **vmpp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;
	vmem_seg_t *vsp;
	size_t leaked;
	int i;

	(void) mutex_lock(&vmem_list_lock);
	vmpp = &vmem_list;
	while ((cur = *vmpp) != vmp)
		vmpp = &cur->vm_next;
	*vmpp = vmp->vm_next;
	(void) mutex_unlock(&vmem_list_lock);

	for (i = 0; i < VMEM_NQCACHE_MAX; i++)
		if (vmp->vm_qcache[i])
			umem_cache_destroy(vmp->vm_qcache[i]);

	leaked = vmem_size(vmp, VMEM_ALLOC);
	if (leaked != 0)
		umem_printf("vmem_destroy('%s'): leaked %lu bytes",
		    vmp->vm_name, leaked);

	if (vmp->vm_hash_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, vmp->vm_hash_table,
		    (vmp->vm_hash_mask + 1) * sizeof (void *));

	/*
	 * Give back the segment structures for anything that's left in the
	 * arena, e.g. the primary spans and their free segments.
	 */
	VMEM_DELETE(&vmp->vm_rotor, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext)
		vmem_putseg_global(vsp);

	while (vmp->vm_nsegfree > 0)
		vmem_putseg_global(vmem_getseg(vmp));

	(void) mutex_destroy(&vmp->vm_lock);
	(void) cond_destroy(&vmp->vm_cv);
	vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
}

/*
 * Resize vmp's hash table to keep the average lookup depth near 1.0.
 */
static void
vmem_hash_rescale(vmem_t *vmp)
{
	vmem_seg_t **old_table, **new_table, *vsp;
	size_t old_size, new_size, h, nseg;

	nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free);

	new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
	old_size = vmp->vm_hash_mask + 1;

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
		return;

	new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
	    VM_NOSLEEP);
	if (new_table == NULL)
		return;
	bzero(new_table, new_size * sizeof (void *));

	(void) mutex_lock(&vmp->vm_lock);

	old_size = vmp->vm_hash_mask + 1;
	old_table = vmp->vm_hash_table;

	vmp->vm_hash_mask = new_size - 1;
	vmp->vm_hash_table = new_table;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	for (h = 0; h < old_size; h++) {
		vsp = old_table[h];
		while (vsp != NULL) {
			uintptr_t addr = vsp->vs_start;
			vmem_seg_t *next_vsp = vsp->vs_knext;
			vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
			vsp->vs_knext = *hash_bucket;
			*hash_bucket = vsp;
			vsp = next_vsp;
		}
	}

	(void) mutex_unlock(&vmp->vm_lock);

	if (old_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, old_table,
		    old_size * sizeof (void *));
}

/*
 * Perform periodic maintenance on all vmem arenas.
 */
/*ARGSUSED*/
void
vmem_update(void *dummy)
{
	vmem_t *vmp;

	(void) mutex_lock(&vmem_list_lock);
	for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) {
		/*
		 * If threads are waiting for resources, wake them up
		 * periodically so they can issue another vmem_reap()
		 * to reclaim resources cached by the slab allocator.
		 */
		(void) cond_broadcast(&vmp->vm_cv);

		/*
		 * Rescale the hash table to keep the hash chains short.
		 */
		vmem_hash_rescale(vmp);
	}
	(void) mutex_unlock(&vmem_list_lock);
}

/*
 * If vmem_init is called again, we need to be able to reset the world.
 * That includes resetting the statics back to their original values.
 */
void
vmem_startup(void)
{
#ifdef UMEM_STANDALONE
	vmem_id = 0;
	vmem_populators = 0;
	vmem_segfree = NULL;
	vmem_list = NULL;
	vmem_internal_arena = NULL;
	vmem_seg_arena = NULL;
	vmem_hash_arena = NULL;
	vmem_vmem_arena = NULL;
	vmem_heap = NULL;
	vmem_heap_alloc = NULL;
	vmem_heap_free = NULL;

	bzero(vmem0, sizeof (vmem0));
	bzero(vmem_populator, sizeof (vmem_populator));
	bzero(vmem_seg0, sizeof (vmem_seg0));
#endif
}

/*
 * Prepare vmem for use.
 */
vmem_t *
vmem_init(const char *parent_name, size_t parent_quantum,
    vmem_alloc_t *parent_alloc, vmem_free_t *parent_free,
    const char *heap_name, void *heap_start, size_t heap_size,
    size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free)
{
	uint32_t id;
	int nseg = VMEM_SEG_INITIAL;
	vmem_t *parent, *heap;

	ASSERT(vmem_internal_arena == NULL);

	while (--nseg >= 0)
		vmem_putseg_global(&vmem_seg0[nseg]);

	if (parent_name != NULL) {
		parent = vmem_create(parent_name,
		    heap_start, heap_size, parent_quantum,
		    NULL, NULL, NULL, 0,
		    VM_SLEEP | VMC_POPULATOR);
		heap_start = NULL;
		heap_size = 0;
	} else {
		ASSERT(parent_alloc == NULL && parent_free == NULL);
		parent = NULL;
	}

	heap = vmem_create(heap_name,
	    heap_start, heap_size, heap_quantum,
	    parent_alloc, parent_free, parent, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_heap = heap;
	vmem_heap_alloc = heap_alloc;
	vmem_heap_free = heap_free;

	vmem_internal_arena = vmem_create("vmem_internal",
	    NULL, 0, heap_quantum,
	    heap_alloc, heap_free, heap, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_seg_arena = vmem_create("vmem_seg",
	    NULL, 0, heap_quantum,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_hash_arena = vmem_create("vmem_hash",
	    NULL, 0, 8,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP);

	vmem_vmem_arena = vmem_create("vmem_vmem",
	    vmem0, sizeof (vmem0), 1,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP);

	for (id = 0; id < vmem_id; id++)
		(void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t),
		    1, 0, 0, &vmem0[id], &vmem0[id + 1],
		    VM_NOSLEEP | VM_BESTFIT | VM_PANIC);

	return (heap);
}

void
vmem_no_debug(void)
{
	/*
	 * This size must be a multiple of the minimum required alignment,
	 * since vmem_populate allocates them compactly.
	 */
	vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread),
	    sizeof (hrtime_t));
}

/*
 * Lockup and release, for fork1(2) handling.
 */
void
vmem_lockup(void)
{
	vmem_t *cur;

	(void) mutex_lock(&vmem_list_lock);
	(void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex);

	/*
	 * Lock up and broadcast all arenas.
	 */
	for (cur = vmem_list; cur != NULL; cur = cur->vm_next) {
		(void) mutex_lock(&cur->vm_lock);
		(void) cond_broadcast(&cur->vm_cv);
	}

	(void) mutex_lock(&vmem_segfree_lock);
}

void
vmem_release(void)
{
	vmem_t *cur;

	(void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex);

	for (cur = vmem_list; cur != NULL; cur = cur->vm_next)
		(void) mutex_unlock(&cur->vm_lock);

	(void) mutex_unlock(&vmem_segfree_lock);
	(void) mutex_unlock(&vmem_list_lock);
}