1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * For a more complete description of the main ideas, see: 31 * 32 * Jeff Bonwick and Jonathan Adams, 33 * 34 * Magazines and vmem: Extending the Slab Allocator to Many CPUs and 35 * Arbitrary Resources. 36 * 37 * Proceedings of the 2001 Usenix Conference. 38 * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf. 39 * 40 * For the "Big Theory Statement", see usr/src/uts/common/os/vmem.c 41 * 42 * 1. Overview of changes 43 * ------------------------------ 44 * There have been a few changes to vmem in order to support umem. The 45 * main areas are: 46 * 47 * * VM_SLEEP unsupported 48 * 49 * * Reaping changes 50 * 51 * * initialization changes 52 * 53 * * _vmem_extend_alloc 54 * 55 * 56 * 2. VM_SLEEP Removed 57 * ------------------- 58 * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for 59 * possibly infinite amounts of time, they are not supported in this 60 * version of vmem. Sleep-like behavior can be achieved through 61 * UMEM_NOFAIL umem allocations. 62 * 63 * 64 * 3. Reaping changes 65 * ------------------ 66 * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap() 67 * can do allocations and frees synchronously. This is a problem if it 68 * occurs during a vmem_populate() allocation. 69 * 70 * Instead, we delay reaps while populates are active. 71 * 72 * 73 * 4. Initialization changes 74 * ------------------------- 75 * In the kernel, vmem_init() allows you to create a single, top-level arena, 76 * which has vmem_internal_arena as a child. For umem, we want to be able 77 * to extend arenas dynamically. It is much easier to support this if we 78 * allow a two-level "heap" arena: 79 * 80 * +----------+ 81 * | "fake" | 82 * +----------+ 83 * | 84 * +----------+ 85 * | "heap" | 86 * +----------+ 87 * | \ \ 88 * | +-+-- ... <other children> 89 * | 90 * +---------------+ 91 * | vmem_internal | 92 * +---------------+ 93 * | | | | 94 * <children> 95 * 96 * The new vmem_init() allows you to specify a "parent" of the heap, along 97 * with allocation functions. 98 * 99 * 100 * 5. _vmem_extend_alloc 101 * --------------------- 102 * The other part of extending is _vmem_extend_alloc. This function allows 103 * you to extend (expand current spans, if possible) an arena and allocate 104 * a chunk of the newly extended span atomically. This is needed to support 105 * extending the heap while vmem_populate()ing it. 106 * 107 * In order to increase the usefulness of extending, non-imported spans are 108 * sorted in address order.
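 *
 * As an illustrative sketch only (the "parent"/"heap" names and the
 * parent_alloc_func/parent_free_func callbacks below are hypothetical,
 * not defined in this file), a client of the new vmem_init() creates
 * both levels in one call and gets the heap arena back:
 *
 *	heap = vmem_init("parent", parent_quantum,
 *	    parent_alloc_func, parent_free_func,
 *	    "heap", NULL, 0, heap_quantum,
 *	    vmem_alloc, vmem_free);
 *
 * The parent arena can later be grown (and allocated from in the same
 * step) with _vmem_extend_alloc(), as described above.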
109 */ 110 111 #include "c_synonyms.h" 112 #include <sys/vmem_impl_user.h> 113 #include <alloca.h> 114 #include <sys/sysmacros.h> 115 #include <stdio.h> 116 #include <strings.h> 117 #include <atomic.h> 118 119 #include "vmem_base.h" 120 #include "umem_base.h" 121 122 #define VMEM_INITIAL 6 /* early vmem arenas */ 123 #define VMEM_SEG_INITIAL 100 /* early segments */ 124 125 /* 126 * Adding a new span to an arena requires two segment structures: one to 127 * represent the span, and one to represent the free segment it contains. 128 */ 129 #define VMEM_SEGS_PER_SPAN_CREATE 2 130 131 /* 132 * Allocating a piece of an existing segment requires 0-2 segment structures 133 * depending on how much of the segment we're allocating. 134 * 135 * To allocate the entire segment, no new segment structures are needed; we 136 * simply move the existing segment structure from the freelist to the 137 * allocation hash table. 138 * 139 * To allocate a piece from the left or right end of the segment, we must 140 * split the segment into two pieces (allocated part and remainder), so we 141 * need one new segment structure to represent the remainder. 142 * 143 * To allocate from the middle of a segment, we need two new segment structures 144 * to represent the remainders on either side of the allocated part. 145 */ 146 #define VMEM_SEGS_PER_EXACT_ALLOC 0 147 #define VMEM_SEGS_PER_LEFT_ALLOC 1 148 #define VMEM_SEGS_PER_RIGHT_ALLOC 1 149 #define VMEM_SEGS_PER_MIDDLE_ALLOC 2 150 151 /* 152 * vmem_populate() preallocates segment structures for vmem to do its work. 153 * It must preallocate enough for the worst case, which is when we must import 154 * a new span and then allocate from the middle of it. 155 */ 156 #define VMEM_SEGS_PER_ALLOC_MAX \ 157 (VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC) 158 159 /* 160 * The segment structures themselves are allocated from vmem_seg_arena, so 161 * we have a recursion problem when vmem_seg_arena needs to populate itself. 162 * We address this by working out the maximum number of segment structures 163 * this act will require, and multiplying by the maximum number of threads 164 * that we'll allow to do it simultaneously. 165 * 166 * The worst-case segment consumption to populate vmem_seg_arena is as 167 * follows (depicted as a stack trace to indicate why events are occurring): 168 * 169 * vmem_alloc(vmem_seg_arena) -> 2 segs (span create + exact alloc) 170 * vmem_alloc(vmem_internal_arena) -> 2 segs (span create + exact alloc) 171 * heap_alloc(heap_arena) 172 * vmem_alloc(heap_arena) -> 4 seg (span create + alloc) 173 * parent_alloc(parent_arena) 174 * _vmem_extend_alloc(parent_arena) -> 3 seg (span create + left alloc) 175 * 176 * Note: The reservation for heap_arena must be 4, since vmem_xalloc() 177 * is overly pessimistic on allocations where parent_arena has a stricter 178 * alignment than heap_arena. 179 * 180 * The worst-case consumption for any arena is 4 segment structures. 181 * For now, we only support VM_NOSLEEP allocations, so as long as we 182 * serialize all vmem_populates, a 4-seg reserve is sufficient. 183 */ 184 #define VMEM_POPULATE_SEGS_PER_ARENA 4 185 #define VMEM_POPULATE_LOCKS 1 186 187 #define VMEM_POPULATE_RESERVE \ 188 (VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS) 189 190 /* 191 * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures 192 * so that it can satisfy the worst-case allocation *and* participate in 193 * worst-case allocation from vmem_seg_arena.
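 *
 * With the values defined above, this works out to
 * VMEM_POPULATE_RESERVE = 4 * 1 = 4 and VMEM_SEGS_PER_ALLOC_MAX = 2 + 2 = 4,
 * so VMEM_MINFREE is 8 preallocated vmem_seg_t structures per arena.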
194 */ 195 #define VMEM_MINFREE (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX) 196 197 /* Don't assume new statics are zeroed - see vmem_startup() */ 198 static vmem_t vmem0[VMEM_INITIAL]; 199 static vmem_t *vmem_populator[VMEM_INITIAL]; 200 static uint32_t vmem_id; 201 static uint32_t vmem_populators; 202 static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL]; 203 static vmem_seg_t *vmem_segfree; 204 static mutex_t vmem_list_lock; 205 static mutex_t vmem_segfree_lock; 206 static vmem_populate_lock_t vmem_nosleep_lock; 207 #define IN_POPULATE() (vmem_nosleep_lock.vmpl_thr == thr_self()) 208 static vmem_t *vmem_list; 209 static vmem_t *vmem_internal_arena; 210 static vmem_t *vmem_seg_arena; 211 static vmem_t *vmem_hash_arena; 212 static vmem_t *vmem_vmem_arena; 213 214 vmem_t *vmem_heap; 215 vmem_alloc_t *vmem_heap_alloc; 216 vmem_free_t *vmem_heap_free; 217 218 uint32_t vmem_mtbf; /* mean time between failures [default: off] */ 219 size_t vmem_seg_size = sizeof (vmem_seg_t); 220 221 /* 222 * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k'). 223 */ 224 #define VMEM_INSERT(vprev, vsp, type) \ 225 { \ 226 vmem_seg_t *vnext = (vprev)->vs_##type##next; \ 227 (vsp)->vs_##type##next = (vnext); \ 228 (vsp)->vs_##type##prev = (vprev); \ 229 (vprev)->vs_##type##next = (vsp); \ 230 (vnext)->vs_##type##prev = (vsp); \ 231 } 232 233 #define VMEM_DELETE(vsp, type) \ 234 { \ 235 vmem_seg_t *vprev = (vsp)->vs_##type##prev; \ 236 vmem_seg_t *vnext = (vsp)->vs_##type##next; \ 237 (vprev)->vs_##type##next = (vnext); \ 238 (vnext)->vs_##type##prev = (vprev); \ 239 } 240 241 /* 242 * Get a vmem_seg_t from the global segfree list. 243 */ 244 static vmem_seg_t * 245 vmem_getseg_global(void) 246 { 247 vmem_seg_t *vsp; 248 249 (void) mutex_lock(&vmem_segfree_lock); 250 if ((vsp = vmem_segfree) != NULL) 251 vmem_segfree = vsp->vs_knext; 252 (void) mutex_unlock(&vmem_segfree_lock); 253 254 return (vsp); 255 } 256 257 /* 258 * Put a vmem_seg_t on the global segfree list. 259 */ 260 static void 261 vmem_putseg_global(vmem_seg_t *vsp) 262 { 263 (void) mutex_lock(&vmem_segfree_lock); 264 vsp->vs_knext = vmem_segfree; 265 vmem_segfree = vsp; 266 (void) mutex_unlock(&vmem_segfree_lock); 267 } 268 269 /* 270 * Get a vmem_seg_t from vmp's segfree list. 271 */ 272 static vmem_seg_t * 273 vmem_getseg(vmem_t *vmp) 274 { 275 vmem_seg_t *vsp; 276 277 ASSERT(vmp->vm_nsegfree > 0); 278 279 vsp = vmp->vm_segfree; 280 vmp->vm_segfree = vsp->vs_knext; 281 vmp->vm_nsegfree--; 282 283 return (vsp); 284 } 285 286 /* 287 * Put a vmem_seg_t on vmp's segfree list. 288 */ 289 static void 290 vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp) 291 { 292 vsp->vs_knext = vmp->vm_segfree; 293 vmp->vm_segfree = vsp; 294 vmp->vm_nsegfree++; 295 } 296 297 /* 298 * Add vsp to the appropriate freelist. 299 */ 300 static void 301 vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp) 302 { 303 vmem_seg_t *vprev; 304 305 ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); 306 307 vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; 308 vsp->vs_type = VMEM_FREE; 309 vmp->vm_freemap |= VS_SIZE(vprev); 310 VMEM_INSERT(vprev, vsp, k); 311 312 (void) cond_broadcast(&vmp->vm_cv); 313 } 314 315 /* 316 * Take vsp from the freelist. 
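 *
 * As a concrete illustration of the power-of-two freelists (the numbers
 * are examples only): a 5120-byte free segment has highbit(5120) == 13,
 * so vmem_freelist_insert() put it on vm_freelist[12] and set the 1 << 12
 * bit of vm_freemap; deleting the last segment on that list below clears
 * the same bit again.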
317 */ 318 static void 319 vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp) 320 { 321 ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); 322 ASSERT(vsp->vs_type == VMEM_FREE); 323 324 if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) { 325 /* 326 * The segments on both sides of 'vsp' are freelist heads, 327 * so taking vsp leaves the freelist at vsp->vs_kprev empty. 328 */ 329 ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev)); 330 vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev); 331 } 332 VMEM_DELETE(vsp, k); 333 } 334 335 /* 336 * Add vsp to the allocated-segment hash table and update kstats. 337 */ 338 static void 339 vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp) 340 { 341 vmem_seg_t **bucket; 342 343 vsp->vs_type = VMEM_ALLOC; 344 bucket = VMEM_HASH(vmp, vsp->vs_start); 345 vsp->vs_knext = *bucket; 346 *bucket = vsp; 347 348 if (vmem_seg_size == sizeof (vmem_seg_t)) { 349 vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack, 350 VMEM_STACK_DEPTH, 0); 351 vsp->vs_thread = thr_self(); 352 vsp->vs_timestamp = gethrtime(); 353 } else { 354 vsp->vs_depth = 0; 355 } 356 357 vmp->vm_kstat.vk_alloc++; 358 vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp); 359 } 360 361 /* 362 * Remove vsp from the allocated-segment hash table and update kstats. 363 */ 364 static vmem_seg_t * 365 vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size) 366 { 367 vmem_seg_t *vsp, **prev_vspp; 368 369 prev_vspp = VMEM_HASH(vmp, addr); 370 while ((vsp = *prev_vspp) != NULL) { 371 if (vsp->vs_start == addr) { 372 *prev_vspp = vsp->vs_knext; 373 break; 374 } 375 vmp->vm_kstat.vk_lookup++; 376 prev_vspp = &vsp->vs_knext; 377 } 378 379 if (vsp == NULL) { 380 umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free", 381 vmp, addr, size); 382 } 383 if (VS_SIZE(vsp) != size) { 384 umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size " 385 "(expect %lu)", vmp, addr, size, VS_SIZE(vsp)); 386 } 387 388 vmp->vm_kstat.vk_free++; 389 vmp->vm_kstat.vk_mem_inuse -= size; 390 391 return (vsp); 392 } 393 394 /* 395 * Create a segment spanning the range [start, end) and add it to the arena. 396 */ 397 static vmem_seg_t * 398 vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) 399 { 400 vmem_seg_t *newseg = vmem_getseg(vmp); 401 402 newseg->vs_start = start; 403 newseg->vs_end = end; 404 newseg->vs_type = 0; 405 newseg->vs_import = 0; 406 407 VMEM_INSERT(vprev, newseg, a); 408 409 return (newseg); 410 } 411 412 /* 413 * Remove segment vsp from the arena. 414 */ 415 static void 416 vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) 417 { 418 ASSERT(vsp->vs_type != VMEM_ROTOR); 419 VMEM_DELETE(vsp, a); 420 421 vmem_putseg(vmp, vsp); 422 } 423 424 /* 425 * Add the span [vaddr, vaddr + size) to vmp and update kstats. 426 */ 427 static vmem_seg_t * 428 vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import) 429 { 430 vmem_seg_t *knext; 431 vmem_seg_t *newseg, *span; 432 uintptr_t start = (uintptr_t)vaddr; 433 uintptr_t end = start + size; 434 435 knext = &vmp->vm_seg0; 436 if (!import && vmp->vm_source_alloc == NULL) { 437 vmem_seg_t *kend, *kprev; 438 /* 439 * non-imported spans are sorted in address order. This 440 * makes vmem_extend_unlocked() much more effective. 441 * 442 * We search in reverse order, since new spans are 443 * generally at higher addresses. 
444 */ 445 kend = &vmp->vm_seg0; 446 for (kprev = kend->vs_kprev; kprev != kend; 447 kprev = kprev->vs_kprev) { 448 if (!kprev->vs_import && (kprev->vs_end - 1) < start) 449 break; 450 } 451 knext = kprev->vs_knext; 452 } 453 454 ASSERT(MUTEX_HELD(&vmp->vm_lock)); 455 456 if ((start | end) & (vmp->vm_quantum - 1)) { 457 umem_panic("vmem_span_create(%p, %p, %lu): misaligned", 458 vmp, vaddr, size); 459 } 460 461 span = vmem_seg_create(vmp, knext->vs_aprev, start, end); 462 span->vs_type = VMEM_SPAN; 463 VMEM_INSERT(knext->vs_kprev, span, k); 464 465 newseg = vmem_seg_create(vmp, span, start, end); 466 vmem_freelist_insert(vmp, newseg); 467 468 newseg->vs_import = import; 469 if (import) 470 vmp->vm_kstat.vk_mem_import += size; 471 vmp->vm_kstat.vk_mem_total += size; 472 473 return (newseg); 474 } 475 476 /* 477 * Remove span vsp from vmp and update kstats. 478 */ 479 static void 480 vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp) 481 { 482 vmem_seg_t *span = vsp->vs_aprev; 483 size_t size = VS_SIZE(vsp); 484 485 ASSERT(MUTEX_HELD(&vmp->vm_lock)); 486 ASSERT(span->vs_type == VMEM_SPAN); 487 488 if (vsp->vs_import) 489 vmp->vm_kstat.vk_mem_import -= size; 490 vmp->vm_kstat.vk_mem_total -= size; 491 492 VMEM_DELETE(span, k); 493 494 vmem_seg_destroy(vmp, vsp); 495 vmem_seg_destroy(vmp, span); 496 } 497 498 /* 499 * Allocate the subrange [addr, addr + size) from segment vsp. 500 * If there are leftovers on either side, place them on the freelist. 501 * Returns a pointer to the segment representing [addr, addr + size). 502 */ 503 static vmem_seg_t * 504 vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) 505 { 506 uintptr_t vs_start = vsp->vs_start; 507 uintptr_t vs_end = vsp->vs_end; 508 size_t vs_size = vs_end - vs_start; 509 size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); 510 uintptr_t addr_end = addr + realsize; 511 512 ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0); 513 ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0); 514 ASSERT(vsp->vs_type == VMEM_FREE); 515 ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1); 516 ASSERT(addr - 1 <= addr_end - 1); 517 518 /* 519 * If we're allocating from the start of the segment, and the 520 * remainder will be on the same freelist, we can save quite 521 * a bit of work. 522 */ 523 if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) { 524 ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); 525 vsp->vs_start = addr_end; 526 vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size); 527 vmem_hash_insert(vmp, vsp); 528 return (vsp); 529 } 530 531 vmem_freelist_delete(vmp, vsp); 532 533 if (vs_end != addr_end) 534 vmem_freelist_insert(vmp, 535 vmem_seg_create(vmp, vsp, addr_end, vs_end)); 536 537 if (vs_start != addr) 538 vmem_freelist_insert(vmp, 539 vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr)); 540 541 vsp->vs_start = addr; 542 vsp->vs_end = addr + size; 543 544 vmem_hash_insert(vmp, vsp); 545 return (vsp); 546 } 547 548 /* 549 * We cannot reap if we are in the middle of a vmem_populate(). 550 */ 551 void 552 vmem_reap(void) 553 { 554 if (!IN_POPULATE()) 555 umem_reap(); 556 } 557 558 /* 559 * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures. 
560 */ 561 static int 562 vmem_populate(vmem_t *vmp, int vmflag) 563 { 564 char *p; 565 vmem_seg_t *vsp; 566 ssize_t nseg; 567 size_t size; 568 vmem_populate_lock_t *lp; 569 int i; 570 571 while (vmp->vm_nsegfree < VMEM_MINFREE && 572 (vsp = vmem_getseg_global()) != NULL) 573 vmem_putseg(vmp, vsp); 574 575 if (vmp->vm_nsegfree >= VMEM_MINFREE) 576 return (1); 577 578 /* 579 * If we're already populating, tap the reserve. 580 */ 581 if (vmem_nosleep_lock.vmpl_thr == thr_self()) { 582 ASSERT(vmp->vm_cflags & VMC_POPULATOR); 583 return (1); 584 } 585 586 (void) mutex_unlock(&vmp->vm_lock); 587 588 ASSERT(vmflag & VM_NOSLEEP); /* we do not allow sleep allocations */ 589 lp = &vmem_nosleep_lock; 590 591 /* 592 * Cannot be just a mutex_lock(), since that has no effect if 593 * libthread is not linked. 594 */ 595 (void) mutex_lock(&lp->vmpl_mutex); 596 ASSERT(lp->vmpl_thr == 0); 597 lp->vmpl_thr = thr_self(); 598 599 nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE; 600 size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum); 601 nseg = size / vmem_seg_size; 602 603 /* 604 * The following vmem_alloc() may need to populate vmem_seg_arena 605 * and all the things it imports from. When doing so, it will tap 606 * each arena's reserve to prevent recursion (see the block comment 607 * above the definition of VMEM_POPULATE_RESERVE). 608 * 609 * During this allocation, vmem_reap() is a no-op. If the allocation 610 * fails, we call vmem_reap() after dropping the population lock. 611 */ 612 p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS); 613 if (p == NULL) { 614 lp->vmpl_thr = 0; 615 (void) mutex_unlock(&lp->vmpl_mutex); 616 vmem_reap(); 617 618 (void) mutex_lock(&vmp->vm_lock); 619 vmp->vm_kstat.vk_populate_fail++; 620 return (0); 621 } 622 /* 623 * Restock the arenas that may have been depleted during population. 624 */ 625 for (i = 0; i < vmem_populators; i++) { 626 (void) mutex_lock(&vmem_populator[i]->vm_lock); 627 while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE) 628 vmem_putseg(vmem_populator[i], 629 (vmem_seg_t *)(p + --nseg * vmem_seg_size)); 630 (void) mutex_unlock(&vmem_populator[i]->vm_lock); 631 } 632 633 lp->vmpl_thr = 0; 634 (void) mutex_unlock(&lp->vmpl_mutex); 635 (void) mutex_lock(&vmp->vm_lock); 636 637 /* 638 * Now take our own segments. 639 */ 640 ASSERT(nseg >= VMEM_MINFREE); 641 while (vmp->vm_nsegfree < VMEM_MINFREE) 642 vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size)); 643 644 /* 645 * Give the remainder to charity. 646 */ 647 while (nseg > 0) 648 vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size)); 649 650 return (1); 651 } 652 653 /* 654 * Advance a walker from its previous position to 'afterme'. 655 * Note: may drop and reacquire vmp->vm_lock. 656 */ 657 static void 658 vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme) 659 { 660 vmem_seg_t *vprev = walker->vs_aprev; 661 vmem_seg_t *vnext = walker->vs_anext; 662 vmem_seg_t *vsp = NULL; 663 664 VMEM_DELETE(walker, a); 665 666 if (afterme != NULL) 667 VMEM_INSERT(afterme, walker, a); 668 669 /* 670 * The walker segment's presence may have prevented its neighbors 671 * from coalescing. If so, coalesce them now. 
672 */ 673 if (vprev->vs_type == VMEM_FREE) { 674 if (vnext->vs_type == VMEM_FREE) { 675 ASSERT(vprev->vs_end == vnext->vs_start); 676 vmem_freelist_delete(vmp, vnext); 677 vmem_freelist_delete(vmp, vprev); 678 vprev->vs_end = vnext->vs_end; 679 vmem_freelist_insert(vmp, vprev); 680 vmem_seg_destroy(vmp, vnext); 681 } 682 vsp = vprev; 683 } else if (vnext->vs_type == VMEM_FREE) { 684 vsp = vnext; 685 } 686 687 /* 688 * vsp could represent a complete imported span, 689 * in which case we must return it to the source. 690 */ 691 if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL && 692 vsp->vs_aprev->vs_type == VMEM_SPAN && 693 vsp->vs_anext->vs_type == VMEM_SPAN) { 694 void *vaddr = (void *)vsp->vs_start; 695 size_t size = VS_SIZE(vsp); 696 ASSERT(size == VS_SIZE(vsp->vs_aprev)); 697 vmem_freelist_delete(vmp, vsp); 698 vmem_span_destroy(vmp, vsp); 699 (void) mutex_unlock(&vmp->vm_lock); 700 vmp->vm_source_free(vmp->vm_source, vaddr, size); 701 (void) mutex_lock(&vmp->vm_lock); 702 } 703 } 704 705 /* 706 * VM_NEXTFIT allocations deliberately cycle through all virtual addresses 707 * in an arena, so that we avoid reusing addresses for as long as possible. 708 * This helps to catch use-after-free bugs. It's also the perfect policy 709 * for allocating things like process IDs, where we want to cycle through 710 * all values in order. 711 */ 712 static void * 713 vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag) 714 { 715 vmem_seg_t *vsp, *rotor; 716 uintptr_t addr; 717 size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); 718 size_t vs_size; 719 720 (void) mutex_lock(&vmp->vm_lock); 721 722 if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) { 723 (void) mutex_unlock(&vmp->vm_lock); 724 return (NULL); 725 } 726 727 /* 728 * The common case is that the segment right after the rotor is free, 729 * and large enough that extracting 'size' bytes won't change which 730 * freelist it's on. In this case we can avoid a *lot* of work. 731 * Instead of the normal vmem_seg_alloc(), we just advance the start 732 * address of the victim segment. Instead of moving the rotor, we 733 * create the new segment structure *behind the rotor*, which has 734 * the same effect. And finally, we know we don't have to coalesce 735 * the rotor's neighbors because the new segment lies between them. 736 */ 737 rotor = &vmp->vm_rotor; 738 vsp = rotor->vs_anext; 739 if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize && 740 P2SAMEHIGHBIT(vs_size, vs_size - realsize)) { 741 ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); 742 addr = vsp->vs_start; 743 vsp->vs_start = addr + realsize; 744 vmem_hash_insert(vmp, 745 vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size)); 746 (void) mutex_unlock(&vmp->vm_lock); 747 return ((void *)addr); 748 } 749 750 /* 751 * Starting at the rotor, look for a segment large enough to 752 * satisfy the allocation. 753 */ 754 for (;;) { 755 vmp->vm_kstat.vk_search++; 756 if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) 757 break; 758 vsp = vsp->vs_anext; 759 if (vsp == rotor) { 760 int cancel_state; 761 762 /* 763 * We've come full circle. One possibility is that 764 * there's actually enough space, but the rotor itself 765 * is preventing the allocation from succeeding because 766 * it's sitting between two free segments. Therefore, 767 * we advance the rotor and see if that liberates a 768 * suitable segment.
769 */ 770 vmem_advance(vmp, rotor, rotor->vs_anext); 771 vsp = rotor->vs_aprev; 772 if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) 773 break; 774 /* 775 * If there's a lower arena we can import from, or it's 776 * a VM_NOSLEEP allocation, let vmem_xalloc() handle it. 777 * Otherwise, wait until another thread frees something. 778 */ 779 if (vmp->vm_source_alloc != NULL || 780 (vmflag & VM_NOSLEEP)) { 781 (void) mutex_unlock(&vmp->vm_lock); 782 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 783 0, 0, NULL, NULL, vmflag & VM_UMFLAGS)); 784 } 785 vmp->vm_kstat.vk_wait++; 786 (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 787 &cancel_state); 788 (void) cond_wait(&vmp->vm_cv, &vmp->vm_lock); 789 (void) pthread_setcancelstate(cancel_state, NULL); 790 vsp = rotor->vs_anext; 791 } 792 } 793 794 /* 795 * We found a segment. Extract enough space to satisfy the allocation. 796 */ 797 addr = vsp->vs_start; 798 vsp = vmem_seg_alloc(vmp, vsp, addr, size); 799 ASSERT(vsp->vs_type == VMEM_ALLOC && 800 vsp->vs_start == addr && vsp->vs_end == addr + size); 801 802 /* 803 * Advance the rotor to right after the newly-allocated segment. 804 * That's where the next VM_NEXTFIT allocation will begin searching. 805 */ 806 vmem_advance(vmp, rotor, vsp); 807 (void) mutex_unlock(&vmp->vm_lock); 808 return ((void *)addr); 809 } 810 811 /* 812 * Allocate size bytes at offset phase from an align boundary such that the 813 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) 814 * that does not straddle a nocross-aligned boundary. 815 */ 816 void * 817 vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase, 818 size_t nocross, void *minaddr, void *maxaddr, int vmflag) 819 { 820 vmem_seg_t *vsp; 821 vmem_seg_t *vbest = NULL; 822 uintptr_t addr, taddr, start, end; 823 void *vaddr; 824 int hb, flist, resv; 825 uint32_t mtbf; 826 827 if (phase > 0 && phase >= align) 828 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 829 "invalid phase", 830 (void *)vmp, size, align, phase, nocross, 831 minaddr, maxaddr, vmflag); 832 833 if (align == 0) 834 align = vmp->vm_quantum; 835 836 if ((align | phase | nocross) & (vmp->vm_quantum - 1)) { 837 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 838 "parameters not vm_quantum aligned", 839 (void *)vmp, size, align, phase, nocross, 840 minaddr, maxaddr, vmflag); 841 } 842 843 if (nocross != 0 && 844 (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) { 845 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 846 "overconstrained allocation", 847 (void *)vmp, size, align, phase, nocross, 848 minaddr, maxaddr, vmflag); 849 } 850 851 if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && 852 (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) 853 return (NULL); 854 855 (void) mutex_lock(&vmp->vm_lock); 856 for (;;) { 857 int cancel_state; 858 859 if (vmp->vm_nsegfree < VMEM_MINFREE && 860 !vmem_populate(vmp, vmflag)) 861 break; 862 863 /* 864 * highbit() returns the highest bit + 1, which is exactly 865 * what we want: we want to search the first freelist whose 866 * members are *definitely* large enough to satisfy our 867 * allocation. 
However, there are certain cases in which we 868 * want to look at the next-smallest freelist (which *might* 869 * be able to satisfy the allocation): 870 * 871 * (1) The size is exactly a power of 2, in which case 872 * the smaller freelist is always big enough; 873 * 874 * (2) All other freelists are empty; 875 * 876 * (3) We're in the highest possible freelist, which is 877 * always empty (e.g. the 4GB freelist on 32-bit systems); 878 * 879 * (4) We're doing a best-fit or first-fit allocation. 880 */ 881 if ((size & (size - 1)) == 0) { 882 flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); 883 } else { 884 hb = highbit(size); 885 if ((vmp->vm_freemap >> hb) == 0 || 886 hb == VMEM_FREELISTS || 887 (vmflag & (VM_BESTFIT | VM_FIRSTFIT))) 888 hb--; 889 flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); 890 } 891 892 for (vbest = NULL, vsp = (flist == 0) ? NULL : 893 vmp->vm_freelist[flist - 1].vs_knext; 894 vsp != NULL; vsp = vsp->vs_knext) { 895 vmp->vm_kstat.vk_search++; 896 if (vsp->vs_start == 0) { 897 /* 898 * We're moving up to a larger freelist, 899 * so if we've already found a candidate, 900 * the fit can't possibly get any better. 901 */ 902 if (vbest != NULL) 903 break; 904 /* 905 * Find the next non-empty freelist. 906 */ 907 flist = lowbit(P2ALIGN(vmp->vm_freemap, 908 VS_SIZE(vsp))); 909 if (flist-- == 0) 910 break; 911 vsp = (vmem_seg_t *)&vmp->vm_freelist[flist]; 912 ASSERT(vsp->vs_knext->vs_type == VMEM_FREE); 913 continue; 914 } 915 if (vsp->vs_end - 1 < (uintptr_t)minaddr) 916 continue; 917 if (vsp->vs_start > (uintptr_t)maxaddr - 1) 918 continue; 919 start = MAX(vsp->vs_start, (uintptr_t)minaddr); 920 end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1; 921 taddr = P2PHASEUP(start, align, phase); 922 if (P2CROSS(taddr, taddr + size - 1, nocross)) 923 taddr += 924 P2ROUNDUP(P2NPHASE(taddr, nocross), align); 925 if ((taddr - start) + size > end - start || 926 (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest))) 927 continue; 928 vbest = vsp; 929 addr = taddr; 930 if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size) 931 break; 932 } 933 if (vbest != NULL) 934 break; 935 if (size == 0) 936 umem_panic("vmem_xalloc(): size == 0"); 937 if (vmp->vm_source_alloc != NULL && nocross == 0 && 938 minaddr == NULL && maxaddr == NULL) { 939 size_t asize = P2ROUNDUP(size + phase, 940 MAX(align, vmp->vm_source->vm_quantum)); 941 if (asize < size) { /* overflow */ 942 (void) mutex_unlock(&vmp->vm_lock); 943 if (vmflag & VM_NOSLEEP) 944 return (NULL); 945 946 umem_panic("vmem_xalloc(): " 947 "overflow on VM_SLEEP allocation"); 948 } 949 /* 950 * Determine how many segment structures we'll consume. 951 * The calculation must be precise because if we're 952 * here on behalf of vmem_populate(), we are taking 953 * segments from a very limited reserve. 954 */ 955 resv = (size == asize) ?
956 VMEM_SEGS_PER_SPAN_CREATE + 957 VMEM_SEGS_PER_EXACT_ALLOC : 958 VMEM_SEGS_PER_ALLOC_MAX; 959 ASSERT(vmp->vm_nsegfree >= resv); 960 vmp->vm_nsegfree -= resv; /* reserve our segs */ 961 (void) mutex_unlock(&vmp->vm_lock); 962 vaddr = vmp->vm_source_alloc(vmp->vm_source, asize, 963 vmflag & VM_UMFLAGS); 964 (void) mutex_lock(&vmp->vm_lock); 965 vmp->vm_nsegfree += resv; /* claim reservation */ 966 if (vaddr != NULL) { 967 vbest = vmem_span_create(vmp, vaddr, asize, 1); 968 addr = P2PHASEUP(vbest->vs_start, align, phase); 969 break; 970 } 971 } 972 (void) mutex_unlock(&vmp->vm_lock); 973 vmem_reap(); 974 (void) mutex_lock(&vmp->vm_lock); 975 if (vmflag & VM_NOSLEEP) 976 break; 977 vmp->vm_kstat.vk_wait++; 978 (void) pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 979 &cancel_state); 980 (void) cond_wait(&vmp->vm_cv, &vmp->vm_lock); 981 (void) pthread_setcancelstate(cancel_state, NULL); 982 } 983 if (vbest != NULL) { 984 ASSERT(vbest->vs_type == VMEM_FREE); 985 ASSERT(vbest->vs_knext != vbest); 986 (void) vmem_seg_alloc(vmp, vbest, addr, size); 987 (void) mutex_unlock(&vmp->vm_lock); 988 ASSERT(P2PHASE(addr, align) == phase); 989 ASSERT(!P2CROSS(addr, addr + size - 1, nocross)); 990 ASSERT(addr >= (uintptr_t)minaddr); 991 ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); 992 return ((void *)addr); 993 } 994 vmp->vm_kstat.vk_fail++; 995 (void) mutex_unlock(&vmp->vm_lock); 996 if (vmflag & VM_PANIC) 997 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 998 "cannot satisfy mandatory allocation", 999 (void *)vmp, size, align, phase, nocross, 1000 minaddr, maxaddr, vmflag); 1001 return (NULL); 1002 } 1003 1004 /* 1005 * Free the segment [vaddr, vaddr + size), where vaddr was a constrained 1006 * allocation. vmem_xalloc() and vmem_xfree() must always be paired because 1007 * both routines bypass the quantum caches. 1008 */ 1009 void 1010 vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) 1011 { 1012 vmem_seg_t *vsp, *vnext, *vprev; 1013 1014 (void) mutex_lock(&vmp->vm_lock); 1015 1016 vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size); 1017 vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum); 1018 1019 /* 1020 * Attempt to coalesce with the next segment. 1021 */ 1022 vnext = vsp->vs_anext; 1023 if (vnext->vs_type == VMEM_FREE) { 1024 ASSERT(vsp->vs_end == vnext->vs_start); 1025 vmem_freelist_delete(vmp, vnext); 1026 vsp->vs_end = vnext->vs_end; 1027 vmem_seg_destroy(vmp, vnext); 1028 } 1029 1030 /* 1031 * Attempt to coalesce with the previous segment. 1032 */ 1033 vprev = vsp->vs_aprev; 1034 if (vprev->vs_type == VMEM_FREE) { 1035 ASSERT(vprev->vs_end == vsp->vs_start); 1036 vmem_freelist_delete(vmp, vprev); 1037 vprev->vs_end = vsp->vs_end; 1038 vmem_seg_destroy(vmp, vsp); 1039 vsp = vprev; 1040 } 1041 1042 /* 1043 * If the entire span is free, return it to the source. 1044 */ 1045 if (vsp->vs_import && vmp->vm_source_free != NULL && 1046 vsp->vs_aprev->vs_type == VMEM_SPAN && 1047 vsp->vs_anext->vs_type == VMEM_SPAN) { 1048 vaddr = (void *)vsp->vs_start; 1049 size = VS_SIZE(vsp); 1050 ASSERT(size == VS_SIZE(vsp->vs_aprev)); 1051 vmem_span_destroy(vmp, vsp); 1052 (void) mutex_unlock(&vmp->vm_lock); 1053 vmp->vm_source_free(vmp->vm_source, vaddr, size); 1054 } else { 1055 vmem_freelist_insert(vmp, vsp); 1056 (void) mutex_unlock(&vmp->vm_lock); 1057 } 1058 } 1059 1060 /* 1061 * Allocate size bytes from arena vmp. Returns the allocated address 1062 * on success, NULL on failure. 
vmflag specifies VM_SLEEP or VM_NOSLEEP, 1063 * and may also specify best-fit, first-fit, or next-fit allocation policy 1064 * instead of the default instant-fit policy. VM_SLEEP allocations are 1065 * guaranteed to succeed. 1066 */ 1067 void * 1068 vmem_alloc(vmem_t *vmp, size_t size, int vmflag) 1069 { 1070 vmem_seg_t *vsp; 1071 uintptr_t addr; 1072 int hb; 1073 int flist = 0; 1074 uint32_t mtbf; 1075 1076 if (size - 1 < vmp->vm_qcache_max) { 1077 ASSERT(vmflag & VM_NOSLEEP); 1078 return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >> 1079 vmp->vm_qshift], UMEM_DEFAULT)); 1080 } 1081 1082 if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && 1083 (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) 1084 return (NULL); 1085 1086 if (vmflag & VM_NEXTFIT) 1087 return (vmem_nextfit_alloc(vmp, size, vmflag)); 1088 1089 if (vmflag & (VM_BESTFIT | VM_FIRSTFIT)) 1090 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0, 1091 NULL, NULL, vmflag)); 1092 1093 /* 1094 * Unconstrained instant-fit allocation from the segment list. 1095 */ 1096 (void) mutex_lock(&vmp->vm_lock); 1097 1098 if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) { 1099 if ((size & (size - 1)) == 0) 1100 flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); 1101 else if ((hb = highbit(size)) < VMEM_FREELISTS) 1102 flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); 1103 } 1104 1105 if (flist-- == 0) { 1106 (void) mutex_unlock(&vmp->vm_lock); 1107 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 1108 0, 0, NULL, NULL, vmflag)); 1109 } 1110 1111 ASSERT(size <= (1UL << flist)); 1112 vsp = vmp->vm_freelist[flist].vs_knext; 1113 addr = vsp->vs_start; 1114 (void) vmem_seg_alloc(vmp, vsp, addr, size); 1115 (void) mutex_unlock(&vmp->vm_lock); 1116 return ((void *)addr); 1117 } 1118 1119 /* 1120 * Free the segment [vaddr, vaddr + size). 1121 */ 1122 void 1123 vmem_free(vmem_t *vmp, void *vaddr, size_t size) 1124 { 1125 if (size - 1 < vmp->vm_qcache_max) 1126 _umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], 1127 vaddr); 1128 else 1129 vmem_xfree(vmp, vaddr, size); 1130 } 1131 1132 /* 1133 * Determine whether arena vmp contains the segment [vaddr, vaddr + size). 1134 */ 1135 int 1136 vmem_contains(vmem_t *vmp, void *vaddr, size_t size) 1137 { 1138 uintptr_t start = (uintptr_t)vaddr; 1139 uintptr_t end = start + size; 1140 vmem_seg_t *vsp; 1141 vmem_seg_t *seg0 = &vmp->vm_seg0; 1142 1143 (void) mutex_lock(&vmp->vm_lock); 1144 vmp->vm_kstat.vk_contains++; 1145 for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) { 1146 vmp->vm_kstat.vk_contains_search++; 1147 ASSERT(vsp->vs_type == VMEM_SPAN); 1148 if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1) 1149 break; 1150 } 1151 (void) mutex_unlock(&vmp->vm_lock); 1152 return (vsp != seg0); 1153 } 1154 1155 /* 1156 * Add the span [vaddr, vaddr + size) to arena vmp. 1157 */ 1158 void * 1159 vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag) 1160 { 1161 if (vaddr == NULL || size == 0) { 1162 umem_panic("vmem_add(%p, %p, %lu): bad arguments", 1163 vmp, vaddr, size); 1164 } 1165 1166 ASSERT(!vmem_contains(vmp, vaddr, size)); 1167 1168 (void) mutex_lock(&vmp->vm_lock); 1169 if (vmem_populate(vmp, vmflag)) 1170 (void) vmem_span_create(vmp, vaddr, size, 0); 1171 else 1172 vaddr = NULL; 1173 (void) cond_broadcast(&vmp->vm_cv); 1174 (void) mutex_unlock(&vmp->vm_lock); 1175 return (vaddr); 1176 } 1177 1178 /* 1179 * Adds the address range [addr, endaddr) to arena vmp, by either: 1180 * 1. 
joining two existing spans, [x, addr), and [endaddr, y) (which 1181 * are in that order) into a single [x, y) span, 1182 * 2. expanding an existing [x, addr) span to [x, endaddr), 1183 * 3. expanding an existing [endaddr, x) span to [addr, x), or 1184 * 4. creating a new [addr, endaddr) span. 1185 * 1186 * Called with vmp->vm_lock held, and a successful vmem_populate() completed. 1187 * Cannot fail. Returns the new segment. 1188 * 1189 * NOTE: this algorithm is linear-time in the number of spans, but is 1190 * constant-time when you are extending the last (highest-addressed) 1191 * span. 1192 */ 1193 static vmem_seg_t * 1194 vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr) 1195 { 1196 vmem_seg_t *span; 1197 vmem_seg_t *vsp; 1198 1199 vmem_seg_t *end = &vmp->vm_seg0; 1200 1201 ASSERT(MUTEX_HELD(&vmp->vm_lock)); 1202 1203 /* 1204 * the second "if" clause below relies on the direction of this search 1205 */ 1206 for (span = end->vs_kprev; span != end; span = span->vs_kprev) { 1207 if (span->vs_end == addr || span->vs_start == endaddr) 1208 break; 1209 } 1210 1211 if (span == end) 1212 return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0)); 1213 if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) { 1214 vmem_seg_t *prevspan = span->vs_kprev; 1215 vmem_seg_t *nextseg = span->vs_anext; 1216 vmem_seg_t *prevseg = span->vs_aprev; 1217 1218 /* 1219 * prevspan becomes the span marker for the full range 1220 */ 1221 prevspan->vs_end = span->vs_end; 1222 1223 /* 1224 * Notionally, span becomes a free segment representing 1225 * [addr, endaddr). 1226 * 1227 * However, if either of its neighbors are free, we coalesce 1228 * by destroying span and changing the free segment. 1229 */ 1230 if (prevseg->vs_type == VMEM_FREE && 1231 nextseg->vs_type == VMEM_FREE) { 1232 /* 1233 * coalesce both ways 1234 */ 1235 ASSERT(prevseg->vs_end == addr && 1236 nextseg->vs_start == endaddr); 1237 1238 vmem_freelist_delete(vmp, prevseg); 1239 prevseg->vs_end = nextseg->vs_end; 1240 1241 vmem_freelist_delete(vmp, nextseg); 1242 VMEM_DELETE(span, k); 1243 vmem_seg_destroy(vmp, nextseg); 1244 vmem_seg_destroy(vmp, span); 1245 1246 vsp = prevseg; 1247 } else if (prevseg->vs_type == VMEM_FREE) { 1248 /* 1249 * coalesce left 1250 */ 1251 ASSERT(prevseg->vs_end == addr); 1252 1253 VMEM_DELETE(span, k); 1254 vmem_seg_destroy(vmp, span); 1255 1256 vmem_freelist_delete(vmp, prevseg); 1257 prevseg->vs_end = endaddr; 1258 1259 vsp = prevseg; 1260 } else if (nextseg->vs_type == VMEM_FREE) { 1261 /* 1262 * coalesce right 1263 */ 1264 ASSERT(nextseg->vs_start == endaddr); 1265 1266 VMEM_DELETE(span, k); 1267 vmem_seg_destroy(vmp, span); 1268 1269 vmem_freelist_delete(vmp, nextseg); 1270 nextseg->vs_start = addr; 1271 1272 vsp = nextseg; 1273 } else { 1274 /* 1275 * cannot coalesce 1276 */ 1277 VMEM_DELETE(span, k); 1278 span->vs_start = addr; 1279 span->vs_end = endaddr; 1280 1281 vsp = span; 1282 } 1283 } else if (span->vs_end == addr) { 1284 vmem_seg_t *oldseg = span->vs_knext->vs_aprev; 1285 span->vs_end = endaddr; 1286 1287 ASSERT(oldseg->vs_type != VMEM_SPAN); 1288 if (oldseg->vs_type == VMEM_FREE) { 1289 ASSERT(oldseg->vs_end == addr); 1290 vmem_freelist_delete(vmp, oldseg); 1291 oldseg->vs_end = endaddr; 1292 vsp = oldseg; 1293 } else 1294 vsp = vmem_seg_create(vmp, oldseg, addr, endaddr); 1295 } else { 1296 vmem_seg_t *oldseg = span->vs_anext; 1297 ASSERT(span->vs_start == endaddr); 1298 span->vs_start = addr; 1299 1300 ASSERT(oldseg->vs_type != VMEM_SPAN); 1301 if (oldseg->vs_type ==
VMEM_FREE) { 1302 ASSERT(oldseg->vs_start == endaddr); 1303 vmem_freelist_delete(vmp, oldseg); 1304 oldseg->vs_start = addr; 1305 vsp = oldseg; 1306 } else 1307 vsp = vmem_seg_create(vmp, span, addr, endaddr); 1308 } 1309 vmem_freelist_insert(vmp, vsp); 1310 vmp->vm_kstat.vk_mem_total += (endaddr - addr); 1311 return (vsp); 1312 } 1313 1314 /* 1315 * Does some error checking, calls vmem_extend_unlocked to add 1316 * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the 1317 * newly merged segment. 1318 */ 1319 void * 1320 _vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc, 1321 int vmflag) 1322 { 1323 uintptr_t addr = (uintptr_t)vaddr; 1324 uintptr_t endaddr = addr + size; 1325 vmem_seg_t *vsp; 1326 1327 ASSERT(vaddr != NULL && size != 0 && endaddr > addr); 1328 ASSERT(alloc <= size && alloc != 0); 1329 ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0); 1330 1331 ASSERT(!vmem_contains(vmp, vaddr, size)); 1332 1333 (void) mutex_lock(&vmp->vm_lock); 1334 if (!vmem_populate(vmp, vmflag)) { 1335 (void) mutex_unlock(&vmp->vm_lock); 1336 return (NULL); 1337 } 1338 /* 1339 * if there is a source, we can't mess with the spans 1340 */ 1341 if (vmp->vm_source_alloc != NULL) 1342 vsp = vmem_span_create(vmp, vaddr, size, 0); 1343 else 1344 vsp = vmem_extend_unlocked(vmp, addr, endaddr); 1345 1346 ASSERT(VS_SIZE(vsp) >= alloc); 1347 1348 addr = vsp->vs_start; 1349 (void) vmem_seg_alloc(vmp, vsp, addr, alloc); 1350 vaddr = (void *)addr; 1351 1352 (void) cond_broadcast(&vmp->vm_cv); 1353 (void) mutex_unlock(&vmp->vm_lock); 1354 1355 return (vaddr); 1356 } 1357 1358 /* 1359 * Walk the vmp arena, applying func to each segment matching typemask. 1360 * If VMEM_REENTRANT is specified, the arena lock is dropped across each 1361 * call to func(); otherwise, it is held for the duration of vmem_walk() 1362 * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks 1363 * are *not* necessarily consistent, so they may only be used when a hint 1364 * is adequate. 1365 */ 1366 void 1367 vmem_walk(vmem_t *vmp, int typemask, 1368 void (*func)(void *, void *, size_t), void *arg) 1369 { 1370 vmem_seg_t *vsp; 1371 vmem_seg_t *seg0 = &vmp->vm_seg0; 1372 vmem_seg_t walker; 1373 1374 if (typemask & VMEM_WALKER) 1375 return; 1376 1377 bzero(&walker, sizeof (walker)); 1378 walker.vs_type = VMEM_WALKER; 1379 1380 (void) mutex_lock(&vmp->vm_lock); 1381 VMEM_INSERT(seg0, &walker, a); 1382 for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) { 1383 if (vsp->vs_type & typemask) { 1384 void *start = (void *)vsp->vs_start; 1385 size_t size = VS_SIZE(vsp); 1386 if (typemask & VMEM_REENTRANT) { 1387 vmem_advance(vmp, &walker, vsp); 1388 (void) mutex_unlock(&vmp->vm_lock); 1389 func(arg, start, size); 1390 (void) mutex_lock(&vmp->vm_lock); 1391 vsp = &walker; 1392 } else { 1393 func(arg, start, size); 1394 } 1395 } 1396 } 1397 vmem_advance(vmp, &walker, NULL); 1398 (void) mutex_unlock(&vmp->vm_lock); 1399 } 1400 1401 /* 1402 * Return the total amount of memory whose type matches typemask. Thus: 1403 * 1404 * typemask VMEM_ALLOC yields total memory allocated (in use). 1405 * typemask VMEM_FREE yields total memory free (available). 1406 * typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size. 
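 *
 * For example (illustrative only), a caller could query the space still
 * available in an arena as:
 *
 *	size_t avail = vmem_size(vmp, VMEM_FREE);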
1407 */ 1408 size_t 1409 vmem_size(vmem_t *vmp, int typemask) 1410 { 1411 uint64_t size = 0; 1412 1413 if (typemask & VMEM_ALLOC) 1414 size += vmp->vm_kstat.vk_mem_inuse; 1415 if (typemask & VMEM_FREE) 1416 size += vmp->vm_kstat.vk_mem_total - 1417 vmp->vm_kstat.vk_mem_inuse; 1418 return ((size_t)size); 1419 } 1420 1421 /* 1422 * Create an arena called name whose initial span is [base, base + size). 1423 * The arena's natural unit of currency is quantum, so vmem_alloc() 1424 * guarantees quantum-aligned results. The arena may import new spans 1425 * by invoking afunc() on source, and may return those spans by invoking 1426 * ffunc() on source. To make small allocations fast and scalable, 1427 * the arena offers high-performance caching for each integer multiple 1428 * of quantum up to qcache_max. 1429 */ 1430 vmem_t * 1431 vmem_create(const char *name, void *base, size_t size, size_t quantum, 1432 vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source, 1433 size_t qcache_max, int vmflag) 1434 { 1435 int i; 1436 size_t nqcache; 1437 vmem_t *vmp, *cur, **vmpp; 1438 vmem_seg_t *vsp; 1439 vmem_freelist_t *vfp; 1440 uint32_t id = atomic_add_32_nv(&vmem_id, 1); 1441 1442 if (vmem_vmem_arena != NULL) { 1443 vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t), 1444 vmflag & VM_UMFLAGS); 1445 } else { 1446 ASSERT(id <= VMEM_INITIAL); 1447 vmp = &vmem0[id - 1]; 1448 } 1449 1450 if (vmp == NULL) 1451 return (NULL); 1452 bzero(vmp, sizeof (vmem_t)); 1453 1454 (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name); 1455 (void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL); 1456 (void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL); 1457 vmp->vm_cflags = vmflag; 1458 vmflag &= VM_UMFLAGS; 1459 1460 vmp->vm_quantum = quantum; 1461 vmp->vm_qshift = highbit(quantum) - 1; 1462 nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX); 1463 1464 for (i = 0; i <= VMEM_FREELISTS; i++) { 1465 vfp = &vmp->vm_freelist[i]; 1466 vfp->vs_end = 1UL << i; 1467 vfp->vs_knext = (vmem_seg_t *)(vfp + 1); 1468 vfp->vs_kprev = (vmem_seg_t *)(vfp - 1); 1469 } 1470 1471 vmp->vm_freelist[0].vs_kprev = NULL; 1472 vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL; 1473 vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0; 1474 vmp->vm_hash_table = vmp->vm_hash0; 1475 vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1; 1476 vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); 1477 1478 vsp = &vmp->vm_seg0; 1479 vsp->vs_anext = vsp; 1480 vsp->vs_aprev = vsp; 1481 vsp->vs_knext = vsp; 1482 vsp->vs_kprev = vsp; 1483 vsp->vs_type = VMEM_SPAN; 1484 1485 vsp = &vmp->vm_rotor; 1486 vsp->vs_type = VMEM_ROTOR; 1487 VMEM_INSERT(&vmp->vm_seg0, vsp, a); 1488 1489 vmp->vm_id = id; 1490 if (source != NULL) 1491 vmp->vm_kstat.vk_source_id = source->vm_id; 1492 vmp->vm_source = source; 1493 vmp->vm_source_alloc = afunc; 1494 vmp->vm_source_free = ffunc; 1495 1496 if (nqcache != 0) { 1497 vmp->vm_qcache_max = nqcache << vmp->vm_qshift; 1498 for (i = 0; i < nqcache; i++) { 1499 char buf[VMEM_NAMELEN + 21]; 1500 (void) snprintf(buf, sizeof (buf), "%s_%lu", 1501 vmp->vm_name, (long)((i + 1) * quantum)); 1502 vmp->vm_qcache[i] = umem_cache_create(buf, 1503 (i + 1) * quantum, quantum, NULL, NULL, NULL, 1504 NULL, vmp, UMC_QCACHE | UMC_NOTOUCH); 1505 if (vmp->vm_qcache[i] == NULL) { 1506 vmp->vm_qcache_max = i * quantum; 1507 break; 1508 } 1509 } 1510 } 1511 1512 (void) mutex_lock(&vmem_list_lock); 1513 vmpp = &vmem_list; 1514 while ((cur = *vmpp) != NULL) 1515 vmpp = &cur->vm_next; 1516 *vmpp = vmp; 1517 (void) mutex_unlock(&vmem_list_lock); 1518 1519 if (vmp->vm_cflags & 
VMC_POPULATOR) { 1520 uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1); 1521 ASSERT(pop_id <= VMEM_INITIAL); 1522 vmem_populator[pop_id - 1] = vmp; 1523 (void) mutex_lock(&vmp->vm_lock); 1524 (void) vmem_populate(vmp, vmflag | VM_PANIC); 1525 (void) mutex_unlock(&vmp->vm_lock); 1526 } 1527 1528 if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) { 1529 vmem_destroy(vmp); 1530 return (NULL); 1531 } 1532 1533 return (vmp); 1534 } 1535 1536 /* 1537 * Destroy arena vmp. 1538 */ 1539 void 1540 vmem_destroy(vmem_t *vmp) 1541 { 1542 vmem_t *cur, **vmpp; 1543 vmem_seg_t *seg0 = &vmp->vm_seg0; 1544 vmem_seg_t *vsp; 1545 size_t leaked; 1546 int i; 1547 1548 (void) mutex_lock(&vmem_list_lock); 1549 vmpp = &vmem_list; 1550 while ((cur = *vmpp) != vmp) 1551 vmpp = &cur->vm_next; 1552 *vmpp = vmp->vm_next; 1553 (void) mutex_unlock(&vmem_list_lock); 1554 1555 for (i = 0; i < VMEM_NQCACHE_MAX; i++) 1556 if (vmp->vm_qcache[i]) 1557 umem_cache_destroy(vmp->vm_qcache[i]); 1558 1559 leaked = vmem_size(vmp, VMEM_ALLOC); 1560 if (leaked != 0) 1561 umem_printf("vmem_destroy('%s'): leaked %lu bytes", 1562 vmp->vm_name, leaked); 1563 1564 if (vmp->vm_hash_table != vmp->vm_hash0) 1565 vmem_free(vmem_hash_arena, vmp->vm_hash_table, 1566 (vmp->vm_hash_mask + 1) * sizeof (void *)); 1567 1568 /* 1569 * Give back the segment structures for anything that's left in the 1570 * arena, e.g. the primary spans and their free segments. 1571 */ 1572 VMEM_DELETE(&vmp->vm_rotor, a); 1573 for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) 1574 vmem_putseg_global(vsp); 1575 1576 while (vmp->vm_nsegfree > 0) 1577 vmem_putseg_global(vmem_getseg(vmp)); 1578 1579 (void) mutex_destroy(&vmp->vm_lock); 1580 (void) cond_destroy(&vmp->vm_cv); 1581 vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t)); 1582 } 1583 1584 /* 1585 * Resize vmp's hash table to keep the average lookup depth near 1.0. 1586 */ 1587 static void 1588 vmem_hash_rescale(vmem_t *vmp) 1589 { 1590 vmem_seg_t **old_table, **new_table, *vsp; 1591 size_t old_size, new_size, h, nseg; 1592 1593 nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free); 1594 1595 new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2)); 1596 old_size = vmp->vm_hash_mask + 1; 1597 1598 if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) 1599 return; 1600 1601 new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *), 1602 VM_NOSLEEP); 1603 if (new_table == NULL) 1604 return; 1605 bzero(new_table, new_size * sizeof (void *)); 1606 1607 (void) mutex_lock(&vmp->vm_lock); 1608 1609 old_size = vmp->vm_hash_mask + 1; 1610 old_table = vmp->vm_hash_table; 1611 1612 vmp->vm_hash_mask = new_size - 1; 1613 vmp->vm_hash_table = new_table; 1614 vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); 1615 1616 for (h = 0; h < old_size; h++) { 1617 vsp = old_table[h]; 1618 while (vsp != NULL) { 1619 uintptr_t addr = vsp->vs_start; 1620 vmem_seg_t *next_vsp = vsp->vs_knext; 1621 vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr); 1622 vsp->vs_knext = *hash_bucket; 1623 *hash_bucket = vsp; 1624 vsp = next_vsp; 1625 } 1626 } 1627 1628 (void) mutex_unlock(&vmp->vm_lock); 1629 1630 if (old_table != vmp->vm_hash0) 1631 vmem_free(vmem_hash_arena, old_table, 1632 old_size * sizeof (void *)); 1633 } 1634 1635 /* 1636 * Perform periodic maintenance on all vmem arenas. 
1637 */ 1638 /*ARGSUSED*/ 1639 void 1640 vmem_update(void *dummy) 1641 { 1642 vmem_t *vmp; 1643 1644 (void) mutex_lock(&vmem_list_lock); 1645 for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) { 1646 /* 1647 * If threads are waiting for resources, wake them up 1648 * periodically so they can issue another vmem_reap() 1649 * to reclaim resources cached by the slab allocator. 1650 */ 1651 (void) cond_broadcast(&vmp->vm_cv); 1652 1653 /* 1654 * Rescale the hash table to keep the hash chains short. 1655 */ 1656 vmem_hash_rescale(vmp); 1657 } 1658 (void) mutex_unlock(&vmem_list_lock); 1659 } 1660 1661 /* 1662 * If vmem_init is called again, we need to be able to reset the world. 1663 * That includes resetting the statics back to their original values. 1664 */ 1665 void 1666 vmem_startup(void) 1667 { 1668 #ifdef UMEM_STANDALONE 1669 vmem_id = 0; 1670 vmem_populators = 0; 1671 vmem_segfree = NULL; 1672 vmem_list = NULL; 1673 vmem_internal_arena = NULL; 1674 vmem_seg_arena = NULL; 1675 vmem_hash_arena = NULL; 1676 vmem_vmem_arena = NULL; 1677 vmem_heap = NULL; 1678 vmem_heap_alloc = NULL; 1679 vmem_heap_free = NULL; 1680 1681 bzero(vmem0, sizeof (vmem0)); 1682 bzero(vmem_populator, sizeof (vmem_populator)); 1683 bzero(vmem_seg0, sizeof (vmem_seg0)); 1684 #endif 1685 } 1686 1687 /* 1688 * Prepare vmem for use. 1689 */ 1690 vmem_t * 1691 vmem_init(const char *parent_name, size_t parent_quantum, 1692 vmem_alloc_t *parent_alloc, vmem_free_t *parent_free, 1693 const char *heap_name, void *heap_start, size_t heap_size, 1694 size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free) 1695 { 1696 uint32_t id; 1697 int nseg = VMEM_SEG_INITIAL; 1698 vmem_t *parent, *heap; 1699 1700 ASSERT(vmem_internal_arena == NULL); 1701 1702 while (--nseg >= 0) 1703 vmem_putseg_global(&vmem_seg0[nseg]); 1704 1705 if (parent_name != NULL) { 1706 parent = vmem_create(parent_name, 1707 heap_start, heap_size, parent_quantum, 1708 NULL, NULL, NULL, 0, 1709 VM_SLEEP | VMC_POPULATOR); 1710 heap_start = NULL; 1711 heap_size = 0; 1712 } else { 1713 ASSERT(parent_alloc == NULL && parent_free == NULL); 1714 parent = NULL; 1715 } 1716 1717 heap = vmem_create(heap_name, 1718 heap_start, heap_size, heap_quantum, 1719 parent_alloc, parent_free, parent, 0, 1720 VM_SLEEP | VMC_POPULATOR); 1721 1722 vmem_heap = heap; 1723 vmem_heap_alloc = heap_alloc; 1724 vmem_heap_free = heap_free; 1725 1726 vmem_internal_arena = vmem_create("vmem_internal", 1727 NULL, 0, heap_quantum, 1728 heap_alloc, heap_free, heap, 0, 1729 VM_SLEEP | VMC_POPULATOR); 1730 1731 vmem_seg_arena = vmem_create("vmem_seg", 1732 NULL, 0, heap_quantum, 1733 vmem_alloc, vmem_free, vmem_internal_arena, 0, 1734 VM_SLEEP | VMC_POPULATOR); 1735 1736 vmem_hash_arena = vmem_create("vmem_hash", 1737 NULL, 0, 8, 1738 vmem_alloc, vmem_free, vmem_internal_arena, 0, 1739 VM_SLEEP); 1740 1741 vmem_vmem_arena = vmem_create("vmem_vmem", 1742 vmem0, sizeof (vmem0), 1, 1743 vmem_alloc, vmem_free, vmem_internal_arena, 0, 1744 VM_SLEEP); 1745 1746 for (id = 0; id < vmem_id; id++) 1747 (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t), 1748 1, 0, 0, &vmem0[id], &vmem0[id + 1], 1749 VM_NOSLEEP | VM_BESTFIT | VM_PANIC); 1750 1751 return (heap); 1752 } 1753 1754 void 1755 vmem_no_debug(void) 1756 { 1757 /* 1758 * This size must be a multiple of the minimum required alignment, 1759 * since vmem_populate allocates them compactly. 
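 *
 * Once vmem_seg_size has been shrunk this way, the
 * "vmem_seg_size == sizeof (vmem_seg_t)" test in vmem_hash_insert()
 * no longer matches, so the per-allocation stack trace, thread, and
 * timestamp debug information is not recorded.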
1760 */ 1761 vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread), 1762 sizeof (hrtime_t)); 1763 } 1764 1765 /* 1766 * Lockup and release, for fork1(2) handling. 1767 */ 1768 void 1769 vmem_lockup(void) 1770 { 1771 vmem_t *cur; 1772 1773 (void) mutex_lock(&vmem_list_lock); 1774 (void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex); 1775 1776 /* 1777 * Lock up and broadcast all arenas. 1778 */ 1779 for (cur = vmem_list; cur != NULL; cur = cur->vm_next) { 1780 (void) mutex_lock(&cur->vm_lock); 1781 (void) cond_broadcast(&cur->vm_cv); 1782 } 1783 1784 (void) mutex_lock(&vmem_segfree_lock); 1785 } 1786 1787 void 1788 vmem_release(void) 1789 { 1790 vmem_t *cur; 1791 1792 (void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex); 1793 1794 for (cur = vmem_list; cur != NULL; cur = cur->vm_next) 1795 (void) mutex_unlock(&cur->vm_lock); 1796 1797 (void) mutex_unlock(&vmem_segfree_lock); 1798 (void) mutex_unlock(&vmem_list_lock); 1799 } 1800
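/*
 * Editorial usage sketch (not part of libumem itself; the arena name,
 * base, and sizes below are hypothetical): a caller managing a private
 * address range with the API in this file might do:
 *
 *	vmem_t *vmp = vmem_create("example", NULL, 0, 64,
 *	    NULL, NULL, NULL, 0, VM_NOSLEEP);
 *	(void) vmem_add(vmp, example_base, example_size, VM_NOSLEEP);
 *	void *obj = vmem_alloc(vmp, 256, VM_NOSLEEP | VM_BESTFIT);
 *	...
 *	vmem_free(vmp, obj, 256);
 *	vmem_destroy(vmp);
 */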