/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * For a more complete description of the main ideas, see:
 *
 *	Jeff Bonwick and Jonathan Adams,
 *
 *	Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 *	Arbitrary Resources.
 *
 *	Proceedings of the 2001 Usenix Conference.
 *	Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 *
 * For the "Big Theory Statement", see usr/src/common/os/vmem.c
 *
 * 1. Overview of changes
 * ----------------------
 * There have been a few changes to vmem in order to support umem. The
 * main areas are:
 *
 *	* VM_SLEEP unsupported
 *
 *	* Reaping changes
 *
 *	* initialization changes
 *
 *	* _vmem_extend_alloc
 *
 *
 * 2. VM_SLEEP Removed
 * -------------------
 * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for
 * possibly infinite amounts of time, they are not supported in this
 * version of vmem. Sleep-like behavior can be achieved through
 * UMEM_NOFAIL umem allocations.
 *
 *
 * 3. Reaping changes
 * ------------------
 * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap()
 * can do allocations and frees synchronously. This is a problem if it
 * occurs during a vmem_populate() allocation.
 *
 * Instead, we delay reaps while populates are active.
 *
 *
 * 4. Initialization changes
 * -------------------------
 * In the kernel, vmem_init() allows you to create a single, top-level arena,
 * which has vmem_internal_arena as a child. For umem, we want to be able
 * to extend arenas dynamically. It is much easier to support this if we
 * allow a two-level "heap" arena:
 *
 *	+----------+
 *	|  "fake"  |
 *	+----------+
 *	      |
 *	+----------+
 *	|  "heap"  |
 *	+----------+
 *	  |    \ \
 *	  |     +-+-- ... <other children>
 *	  |
 *	+---------------+
 *	| vmem_internal |
 *	+---------------+
 *	    | | | |
 *	   <children>
 *
 * The new vmem_init() allows you to specify a "parent" of the heap, along
 * with allocation functions.
 *
 *
 * 5. _vmem_extend_alloc
 * ---------------------
 * The other part of extending is _vmem_extend_alloc. This function allows
 * you to extend (expand current spans, if possible) an arena and allocate
 * a chunk of the newly extended span atomically. This is needed to support
 * extending the heap while vmem_populate()ing it.
 *
 * In order to increase the usefulness of extending, non-imported spans are
 * sorted in address order.
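 *
 * As an illustrative sketch of how a consumer could wire up the two-level
 * heap described in section 4 and then grow it (the names vmem_sbrk_alloc,
 * vmem_sbrk_free, pgsize, newspan, spansize, and size are placeholders,
 * not definitions from this file):
 *
 *	vmem_t *heap = vmem_init("parent", pgsize,
 *	    vmem_sbrk_alloc, vmem_sbrk_free,	(how the heap imports
 *						from the parent)
 *	    "heap", NULL, 0, pgsize,
 *	    vmem_alloc, vmem_free);		(how internal arenas import
 *						from the heap)
 *
 * and a parent import function can grow its arena and carve out the
 * requested chunk in a single step, roughly:
 *
 *	return (_vmem_extend_alloc(src, newspan, spansize, size, vmflags));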
 */

#include "c_synonyms.h"
#include <sys/vmem_impl_user.h>
#include <alloca.h>
#include <sys/sysmacros.h>
#include <stdio.h>
#include <strings.h>
#include <atomic.h>

#include "vmem_base.h"
#include "umem_base.h"

#define	VMEM_INITIAL		6	/* early vmem arenas */
#define	VMEM_SEG_INITIAL	100	/* early segments */

/*
 * Adding a new span to an arena requires two segment structures: one to
 * represent the span, and one to represent the free segment it contains.
 */
#define	VMEM_SEGS_PER_SPAN_CREATE	2

/*
 * Allocating a piece of an existing segment requires 0-2 segment structures
 * depending on how much of the segment we're allocating.
 *
 * To allocate the entire segment, no new segment structures are needed; we
 * simply move the existing segment structure from the freelist to the
 * allocation hash table.
 *
 * To allocate a piece from the left or right end of the segment, we must
 * split the segment into two pieces (allocated part and remainder), so we
 * need one new segment structure to represent the remainder.
 *
 * To allocate from the middle of a segment, we need two new segment
 * structures to represent the remainders on either side of the allocated
 * part.
 */
#define	VMEM_SEGS_PER_EXACT_ALLOC	0
#define	VMEM_SEGS_PER_LEFT_ALLOC	1
#define	VMEM_SEGS_PER_RIGHT_ALLOC	1
#define	VMEM_SEGS_PER_MIDDLE_ALLOC	2

/*
 * vmem_populate() preallocates segment structures for vmem to do its work.
 * It must preallocate enough for the worst case, which is when we must import
 * a new span and then allocate from the middle of it.
 */
#define	VMEM_SEGS_PER_ALLOC_MAX		\
	(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)

/*
 * The segment structures themselves are allocated from vmem_seg_arena, so
 * we have a recursion problem when vmem_seg_arena needs to populate itself.
 * We address this by working out the maximum number of segment structures
 * this act will require, and multiplying by the maximum number of threads
 * that we'll allow to do it simultaneously.
 *
 * The worst-case segment consumption to populate vmem_seg_arena is as
 * follows (depicted as a stack trace to indicate why events are occurring):
 *
 * vmem_alloc(vmem_seg_arena)		-> 2 segs (span create + exact alloc)
 *  vmem_alloc(vmem_internal_arena)	-> 2 segs (span create + exact alloc)
 *   heap_alloc(heap_arena)
 *    vmem_alloc(heap_arena)		-> 4 segs (span create + alloc)
 *     parent_alloc(parent_arena)
 *      _vmem_extend_alloc(parent_arena) -> 3 segs (span create + left alloc)
 *
 * Note: The reservation for heap_arena must be 4, since vmem_xalloc()
 * is overly pessimistic on allocations where parent_arena has a stricter
 * alignment than heap_arena.
 *
 * The worst-case consumption for any arena is 4 segment structures.
 * For now, we only support VM_NOSLEEP allocations, so as long as we
 * serialize all vmem_populates, a 4-seg reserve is sufficient.
 */
#define	VMEM_POPULATE_SEGS_PER_ARENA	4
#define	VMEM_POPULATE_LOCKS		1

#define	VMEM_POPULATE_RESERVE		\
	(VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS)

/*
 * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 * so that it can satisfy the worst-case allocation *and* participate in
 * worst-case allocation from vmem_seg_arena.
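 *
 * Concretely (a worked example from the constants above):
 *
 *	VMEM_POPULATE_RESERVE	= 4 * 1	= 4
 *	VMEM_SEGS_PER_ALLOC_MAX	= 2 + 2	= 4
 *	VMEM_MINFREE		= 4 + 4	= 8
 *
 * so each arena keeps at least 8 preallocated vmem_seg_t structures on hand.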
 */
#define	VMEM_MINFREE	(VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX)

/* Don't assume new statics are zeroed - see vmem_startup() */
static vmem_t vmem0[VMEM_INITIAL];
static vmem_t *vmem_populator[VMEM_INITIAL];
static uint32_t vmem_id;
static uint32_t vmem_populators;
static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL];
static vmem_seg_t *vmem_segfree;
static mutex_t vmem_list_lock;
static mutex_t vmem_segfree_lock;
static vmem_populate_lock_t vmem_nosleep_lock;
#define	IN_POPULATE()	(vmem_nosleep_lock.vmpl_thr == thr_self())
static vmem_t *vmem_list;
static vmem_t *vmem_internal_arena;
static vmem_t *vmem_seg_arena;
static vmem_t *vmem_hash_arena;
static vmem_t *vmem_vmem_arena;

vmem_t *vmem_heap;
vmem_alloc_t *vmem_heap_alloc;
vmem_free_t *vmem_heap_free;

uint32_t vmem_mtbf;		/* mean time between failures [default: off] */
size_t vmem_seg_size = sizeof (vmem_seg_t);

/*
 * we use the _ version, since we don't want to be cancelled.
 * Actually, this is automatically taken care of by including "mtlib.h".
 */
extern int _cond_wait(cond_t *cv, mutex_t *mutex);

/*
 * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k').
 */
#define	VMEM_INSERT(vprev, vsp, type)					\
{									\
	vmem_seg_t *vnext = (vprev)->vs_##type##next;			\
	(vsp)->vs_##type##next = (vnext);				\
	(vsp)->vs_##type##prev = (vprev);				\
	(vprev)->vs_##type##next = (vsp);				\
	(vnext)->vs_##type##prev = (vsp);				\
}

#define	VMEM_DELETE(vsp, type)						\
{									\
	vmem_seg_t *vprev = (vsp)->vs_##type##prev;			\
	vmem_seg_t *vnext = (vsp)->vs_##type##next;			\
	(vprev)->vs_##type##next = (vnext);				\
	(vnext)->vs_##type##prev = (vprev);				\
}

/*
 * Get a vmem_seg_t from the global segfree list.
 */
static vmem_seg_t *
vmem_getseg_global(void)
{
	vmem_seg_t *vsp;

	(void) mutex_lock(&vmem_segfree_lock);
	if ((vsp = vmem_segfree) != NULL)
		vmem_segfree = vsp->vs_knext;
	(void) mutex_unlock(&vmem_segfree_lock);

	return (vsp);
}

/*
 * Put a vmem_seg_t on the global segfree list.
 */
static void
vmem_putseg_global(vmem_seg_t *vsp)
{
	(void) mutex_lock(&vmem_segfree_lock);
	vsp->vs_knext = vmem_segfree;
	vmem_segfree = vsp;
	(void) mutex_unlock(&vmem_segfree_lock);
}

/*
 * Get a vmem_seg_t from vmp's segfree list.
 */
static vmem_seg_t *
vmem_getseg(vmem_t *vmp)
{
	vmem_seg_t *vsp;

	ASSERT(vmp->vm_nsegfree > 0);

	vsp = vmp->vm_segfree;
	vmp->vm_segfree = vsp->vs_knext;
	vmp->vm_nsegfree--;

	return (vsp);
}

/*
 * Put a vmem_seg_t on vmp's segfree list.
 */
static void
vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
{
	vsp->vs_knext = vmp->vm_segfree;
	vmp->vm_segfree = vsp;
	vmp->vm_nsegfree++;
}

/*
 * Add vsp to the appropriate freelist.
 */
static void
vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t *vprev;

	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);

	vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1];
	vsp->vs_type = VMEM_FREE;
	vmp->vm_freemap |= VS_SIZE(vprev);
	VMEM_INSERT(vprev, vsp, k);

	(void) cond_broadcast(&vmp->vm_cv);
}

/*
 * Take vsp from the freelist.
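 * (vsp lives on vm_freelist[highbit(VS_SIZE(vsp)) - 1]; for example, a 12K
 * free segment has highbit(12K) == 14 and so sits on vm_freelist[13], the
 * list of free segments in [8K, 16K), with the corresponding 8K bit set in
 * vm_freemap.)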
 */
static void
vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp)
{
	ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
	ASSERT(vsp->vs_type == VMEM_FREE);

	if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) {
		/*
		 * The segments on both sides of 'vsp' are freelist heads,
		 * so taking vsp leaves the freelist at vsp->vs_kprev empty.
		 */
		ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev));
		vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev);
	}
	VMEM_DELETE(vsp, k);
}

/*
 * Add vsp to the allocated-segment hash table and update kstats.
 */
static void
vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t **bucket;

	vsp->vs_type = VMEM_ALLOC;
	bucket = VMEM_HASH(vmp, vsp->vs_start);
	vsp->vs_knext = *bucket;
	*bucket = vsp;

	if (vmem_seg_size == sizeof (vmem_seg_t)) {
		vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack,
		    VMEM_STACK_DEPTH, 0);
		vsp->vs_thread = thr_self();
		vsp->vs_timestamp = gethrtime();
	} else {
		vsp->vs_depth = 0;
	}

	vmp->vm_kstat.vk_alloc++;
	vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp);
}

/*
 * Remove vsp from the allocated-segment hash table and update kstats.
 */
static vmem_seg_t *
vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size)
{
	vmem_seg_t *vsp, **prev_vspp;

	prev_vspp = VMEM_HASH(vmp, addr);
	while ((vsp = *prev_vspp) != NULL) {
		if (vsp->vs_start == addr) {
			*prev_vspp = vsp->vs_knext;
			break;
		}
		vmp->vm_kstat.vk_lookup++;
		prev_vspp = &vsp->vs_knext;
	}

	if (vsp == NULL) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free",
		    vmp, addr, size);
	}
	if (VS_SIZE(vsp) != size) {
		umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size "
		    "(expect %lu)", vmp, addr, size, VS_SIZE(vsp));
	}

	vmp->vm_kstat.vk_free++;
	vmp->vm_kstat.vk_mem_inuse -= size;

	return (vsp);
}

/*
 * Create a segment spanning the range [start, end) and add it to the arena.
 */
static vmem_seg_t *
vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end)
{
	vmem_seg_t *newseg = vmem_getseg(vmp);

	newseg->vs_start = start;
	newseg->vs_end = end;
	newseg->vs_type = 0;
	newseg->vs_import = 0;

	VMEM_INSERT(vprev, newseg, a);

	return (newseg);
}

/*
 * Remove segment vsp from the arena.
 */
static void
vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp)
{
	ASSERT(vsp->vs_type != VMEM_ROTOR);
	VMEM_DELETE(vsp, a);

	vmem_putseg(vmp, vsp);
}

/*
 * Add the span [vaddr, vaddr + size) to vmp and update kstats.
 */
static vmem_seg_t *
vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import)
{
	vmem_seg_t *knext;
	vmem_seg_t *newseg, *span;
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;

	knext = &vmp->vm_seg0;
	if (!import && vmp->vm_source_alloc == NULL) {
		vmem_seg_t *kend, *kprev;
		/*
		 * non-imported spans are sorted in address order. This
		 * makes vmem_extend_unlocked() much more effective.
		 *
		 * We search in reverse order, since new spans are
		 * generally at higher addresses.
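		 * (Growth usually happens at the top of the address range,
		 * so this reverse search, like the matching one in
		 * vmem_extend_unlocked(), typically terminates after a
		 * step or two.)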
		 */
		kend = &vmp->vm_seg0;
		for (kprev = kend->vs_kprev; kprev != kend;
		    kprev = kprev->vs_kprev) {
			if (!kprev->vs_import && (kprev->vs_end - 1) < start)
				break;
		}
		knext = kprev->vs_knext;
	}

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	if ((start | end) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_span_create(%p, %p, %lu): misaligned",
		    vmp, vaddr, size);
	}

	span = vmem_seg_create(vmp, knext->vs_aprev, start, end);
	span->vs_type = VMEM_SPAN;
	VMEM_INSERT(knext->vs_kprev, span, k);

	newseg = vmem_seg_create(vmp, span, start, end);
	vmem_freelist_insert(vmp, newseg);

	newseg->vs_import = import;
	if (import)
		vmp->vm_kstat.vk_mem_import += size;
	vmp->vm_kstat.vk_mem_total += size;

	return (newseg);
}

/*
 * Remove span vsp from vmp and update kstats.
 */
static void
vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp)
{
	vmem_seg_t *span = vsp->vs_aprev;
	size_t size = VS_SIZE(vsp);

	ASSERT(MUTEX_HELD(&vmp->vm_lock));
	ASSERT(span->vs_type == VMEM_SPAN);

	if (vsp->vs_import)
		vmp->vm_kstat.vk_mem_import -= size;
	vmp->vm_kstat.vk_mem_total -= size;

	VMEM_DELETE(span, k);

	vmem_seg_destroy(vmp, vsp);
	vmem_seg_destroy(vmp, span);
}

/*
 * Allocate the subrange [addr, addr + size) from segment vsp.
 * If there are leftovers on either side, place them on the freelist.
 * Returns a pointer to the segment representing [addr, addr + size).
 */
static vmem_seg_t *
vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size)
{
	uintptr_t vs_start = vsp->vs_start;
	uintptr_t vs_end = vsp->vs_end;
	size_t vs_size = vs_end - vs_start;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	uintptr_t addr_end = addr + realsize;

	ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0);
	ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0);
	ASSERT(vsp->vs_type == VMEM_FREE);
	ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1);
	ASSERT(addr - 1 <= addr_end - 1);

	/*
	 * If we're allocating from the start of the segment, and the
	 * remainder will be on the same freelist, we can save quite
	 * a bit of work.
	 */
	if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		vsp->vs_start = addr_end;
		vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size);
		vmem_hash_insert(vmp, vsp);
		return (vsp);
	}

	vmem_freelist_delete(vmp, vsp);

	if (vs_end != addr_end)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp, addr_end, vs_end));

	if (vs_start != addr)
		vmem_freelist_insert(vmp,
		    vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr));

	vsp->vs_start = addr;
	vsp->vs_end = addr + size;

	vmem_hash_insert(vmp, vsp);
	return (vsp);
}

/*
 * We cannot reap if we are in the middle of a vmem_populate().
 */
void
vmem_reap(void)
{
	if (!IN_POPULATE())
		umem_reap();
}

/*
 * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures.
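 * Returns nonzero on success.  A zero return (possible only for VM_NOSLEEP
 * callers, which is all this vmem supports) means the caller's allocation
 * should fail; vk_populate_fail is bumped in that case.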
 */
static int
vmem_populate(vmem_t *vmp, int vmflag)
{
	char *p;
	vmem_seg_t *vsp;
	ssize_t nseg;
	size_t size;
	vmem_populate_lock_t *lp;
	int i;

	while (vmp->vm_nsegfree < VMEM_MINFREE &&
	    (vsp = vmem_getseg_global()) != NULL)
		vmem_putseg(vmp, vsp);

	if (vmp->vm_nsegfree >= VMEM_MINFREE)
		return (1);

	/*
	 * If we're already populating, tap the reserve.
	 */
	if (vmem_nosleep_lock.vmpl_thr == thr_self()) {
		ASSERT(vmp->vm_cflags & VMC_POPULATOR);
		return (1);
	}

	(void) mutex_unlock(&vmp->vm_lock);

	ASSERT(vmflag & VM_NOSLEEP);	/* we do not allow sleep allocations */
	lp = &vmem_nosleep_lock;

	/*
	 * Cannot be just a mutex_lock(), since that has no effect if
	 * libthread is not linked.
	 */
	(void) mutex_lock(&lp->vmpl_mutex);
	ASSERT(lp->vmpl_thr == 0);
	lp->vmpl_thr = thr_self();

	nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE;
	size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum);
	nseg = size / vmem_seg_size;

	/*
	 * The following vmem_alloc() may need to populate vmem_seg_arena
	 * and all the things it imports from. When doing so, it will tap
	 * each arena's reserve to prevent recursion (see the block comment
	 * above the definition of VMEM_POPULATE_RESERVE).
	 *
	 * During this allocation, vmem_reap() is a no-op. If the allocation
	 * fails, we call vmem_reap() after dropping the population lock.
	 */
	p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS);
	if (p == NULL) {
		lp->vmpl_thr = 0;
		(void) mutex_unlock(&lp->vmpl_mutex);
		vmem_reap();

		(void) mutex_lock(&vmp->vm_lock);
		vmp->vm_kstat.vk_populate_fail++;
		return (0);
	}
	/*
	 * Restock the arenas that may have been depleted during population.
	 */
	for (i = 0; i < vmem_populators; i++) {
		(void) mutex_lock(&vmem_populator[i]->vm_lock);
		while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE)
			vmem_putseg(vmem_populator[i],
			    (vmem_seg_t *)(p + --nseg * vmem_seg_size));
		(void) mutex_unlock(&vmem_populator[i]->vm_lock);
	}

	lp->vmpl_thr = 0;
	(void) mutex_unlock(&lp->vmpl_mutex);
	(void) mutex_lock(&vmp->vm_lock);

	/*
	 * Now take our own segments.
	 */
	ASSERT(nseg >= VMEM_MINFREE);
	while (vmp->vm_nsegfree < VMEM_MINFREE)
		vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size));

	/*
	 * Give the remainder to charity.
	 */
	while (nseg > 0)
		vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size));

	return (1);
}

/*
 * Advance a walker from its previous position to 'afterme'.
 * Note: may drop and reacquire vmp->vm_lock.
 */
static void
vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme)
{
	vmem_seg_t *vprev = walker->vs_aprev;
	vmem_seg_t *vnext = walker->vs_anext;
	vmem_seg_t *vsp = NULL;

	VMEM_DELETE(walker, a);

	if (afterme != NULL)
		VMEM_INSERT(afterme, walker, a);

	/*
	 * The walker segment's presence may have prevented its neighbors
	 * from coalescing. If so, coalesce them now.
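	 * (Both the VM_NEXTFIT rotor and the temporary VMEM_WALKER segment
	 * used by vmem_walk() are moved with this routine, so either one
	 * can end up sitting between two otherwise-mergeable free segments.)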
	 */
	if (vprev->vs_type == VMEM_FREE) {
		if (vnext->vs_type == VMEM_FREE) {
			ASSERT(vprev->vs_end == vnext->vs_start);
			vmem_freelist_delete(vmp, vnext);
			vmem_freelist_delete(vmp, vprev);
			vprev->vs_end = vnext->vs_end;
			vmem_freelist_insert(vmp, vprev);
			vmem_seg_destroy(vmp, vnext);
		}
		vsp = vprev;
	} else if (vnext->vs_type == VMEM_FREE) {
		vsp = vnext;
	}

	/*
	 * vsp could represent a complete imported span,
	 * in which case we must return it to the source.
	 */
	if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		void *vaddr = (void *)vsp->vs_start;
		size_t size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_freelist_delete(vmp, vsp);
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
		(void) mutex_lock(&vmp->vm_lock);
	}
}

/*
 * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 * in an arena, so that we avoid reusing addresses for as long as possible.
 * This helps to catch use-after-free bugs. It's also the perfect policy
 * for allocating things like process IDs, where we want to cycle through
 * all values in order.
 */
static void *
vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp, *rotor;
	uintptr_t addr;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	size_t vs_size;

	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}

	/*
	 * The common case is that the segment right after the rotor is free,
	 * and large enough that extracting 'size' bytes won't change which
	 * freelist it's on. In this case we can avoid a *lot* of work.
	 * Instead of the normal vmem_seg_alloc(), we just advance the start
	 * address of the victim segment. Instead of moving the rotor, we
	 * create the new segment structure *behind the rotor*, which has
	 * the same effect. And finally, we know we don't have to coalesce
	 * the rotor's neighbors because the new segment lies between them.
	 */
	rotor = &vmp->vm_rotor;
	vsp = rotor->vs_anext;
	if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
	    P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		addr = vsp->vs_start;
		vsp->vs_start = addr + realsize;
		vmem_hash_insert(vmp,
		    vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
		(void) mutex_unlock(&vmp->vm_lock);
		return ((void *)addr);
	}

	/*
	 * Starting at the rotor, look for a segment large enough to
	 * satisfy the allocation.
	 */
	for (;;) {
		vmp->vm_kstat.vk_search++;
		if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
			break;
		vsp = vsp->vs_anext;
		if (vsp == rotor) {
			/*
			 * We've come full circle. One possibility is that
			 * there's actually enough space, but the rotor itself
			 * is preventing the allocation from succeeding because
			 * it's sitting between two free segments. Therefore,
			 * we advance the rotor and see if that liberates a
			 * suitable segment.
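			 * (vmem_advance() coalesces the rotor's former
			 * neighbors if they were both free, so re-checking
			 * the segment now behind the rotor can succeed even
			 * though every individual segment was too small.)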
			 */
			vmem_advance(vmp, rotor, rotor->vs_anext);
			vsp = rotor->vs_aprev;
			if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
				break;
			/*
			 * If there's a lower arena we can import from, or it's
			 * a VM_NOSLEEP allocation, let vmem_xalloc() handle it.
			 * Otherwise, wait until another thread frees something.
			 */
			if (vmp->vm_source_alloc != NULL ||
			    (vmflag & VM_NOSLEEP)) {
				(void) mutex_unlock(&vmp->vm_lock);
				return (vmem_xalloc(vmp, size, vmp->vm_quantum,
				    0, 0, NULL, NULL, vmflag & VM_UMFLAGS));
			}
			vmp->vm_kstat.vk_wait++;
			(void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock);
			vsp = rotor->vs_anext;
		}
	}

	/*
	 * We found a segment. Extract enough space to satisfy the allocation.
	 */
	addr = vsp->vs_start;
	vsp = vmem_seg_alloc(vmp, vsp, addr, size);
	ASSERT(vsp->vs_type == VMEM_ALLOC &&
	    vsp->vs_start == addr && vsp->vs_end == addr + size);

	/*
	 * Advance the rotor to right after the newly-allocated segment.
	 * That's where the next VM_NEXTFIT allocation will begin searching.
	 */
	vmem_advance(vmp, rotor, vsp);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
}

/*
 * Allocate size bytes at offset phase from an align boundary such that the
 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr)
 * that does not straddle a nocross-aligned boundary.
 */
void *
vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase,
    size_t nocross, void *minaddr, void *maxaddr, int vmflag)
{
	vmem_seg_t *vsp;
	vmem_seg_t *vbest = NULL;
	uintptr_t addr, taddr, start, end;
	void *vaddr;
	int hb, flist, resv;
	uint32_t mtbf;

	if (phase > 0 && phase >= align)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "invalid phase",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);

	if (align == 0)
		align = vmp->vm_quantum;

	if ((align | phase | nocross) & (vmp->vm_quantum - 1)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "parameters not vm_quantum aligned",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	}

	if (nocross != 0 &&
	    (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) {
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "overconstrained allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	}

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
		return (NULL);

	(void) mutex_lock(&vmp->vm_lock);
	for (;;) {
		if (vmp->vm_nsegfree < VMEM_MINFREE &&
		    !vmem_populate(vmp, vmflag))
			break;

		/*
		 * highbit() returns the highest bit + 1, which is exactly
		 * what we want: we want to search the first freelist whose
		 * members are *definitely* large enough to satisfy our
		 * allocation. However, there are certain cases in which we
		 * want to look at the next-smallest freelist (which *might*
		 * be able to satisfy the allocation):
		 *
		 * (1)	The size is exactly a power of 2, in which case
		 *	the smaller freelist is always big enough;
		 *
		 * (2)	All other freelists are empty;
		 *
		 * (3)	We're in the highest possible freelist, which is
		 *	always empty (e.g. the 4GB freelist on 32-bit systems);
		 *
		 * (4)	We're doing a best-fit or first-fit allocation.
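		 *
		 * For example, with size = 5K, highbit(5K) is 13, so the
		 * normal path searches the lowest nonempty freelist at or
		 * above 8K -- every segment there is certainly >= 5K.  In
		 * cases (2)-(4) we drop down one list and also consider the
		 * [4K, 8K) freelist, whose segments merely *might* be >= 5K.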
		 */
		if ((size & (size - 1)) == 0) {
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		} else {
			hb = highbit(size);
			if ((vmp->vm_freemap >> hb) == 0 ||
			    hb == VMEM_FREELISTS ||
			    (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
				hb--;
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
		}

		for (vbest = NULL, vsp = (flist == 0) ? NULL :
		    vmp->vm_freelist[flist - 1].vs_knext;
		    vsp != NULL; vsp = vsp->vs_knext) {
			vmp->vm_kstat.vk_search++;
			if (vsp->vs_start == 0) {
				/*
				 * We're moving up to a larger freelist,
				 * so if we've already found a candidate,
				 * the fit can't possibly get any better.
				 */
				if (vbest != NULL)
					break;
				/*
				 * Find the next non-empty freelist.
				 */
				flist = lowbit(P2ALIGN(vmp->vm_freemap,
				    VS_SIZE(vsp)));
				if (flist-- == 0)
					break;
				vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
				ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);
				continue;
			}
			if (vsp->vs_end - 1 < (uintptr_t)minaddr)
				continue;
			if (vsp->vs_start > (uintptr_t)maxaddr - 1)
				continue;
			start = MAX(vsp->vs_start, (uintptr_t)minaddr);
			end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
			taddr = P2PHASEUP(start, align, phase);
			if (P2CROSS(taddr, taddr + size - 1, nocross))
				taddr +=
				    P2ROUNDUP(P2NPHASE(taddr, nocross), align);
			if ((taddr - start) + size > end - start ||
			    (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
				continue;
			vbest = vsp;
			addr = taddr;
			if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)
				break;
		}
		if (vbest != NULL)
			break;
		if (size == 0)
			umem_panic("vmem_xalloc(): size == 0");
		if (vmp->vm_source_alloc != NULL && nocross == 0 &&
		    minaddr == NULL && maxaddr == NULL) {
			size_t asize = P2ROUNDUP(size + phase,
			    MAX(align, vmp->vm_source->vm_quantum));
			if (asize < size) {		/* overflow */
				(void) mutex_unlock(&vmp->vm_lock);
				if (vmflag & VM_NOSLEEP)
					return (NULL);

				umem_panic("vmem_xalloc(): "
				    "overflow on VM_SLEEP allocation");
			}
			/*
			 * Determine how many segment structures we'll consume.
			 * The calculation must be precise because if we're
			 * here on behalf of vmem_populate(), we are taking
			 * segments from a very limited reserve.
			 */
			resv = (size == asize) ?
			    VMEM_SEGS_PER_SPAN_CREATE +
			    VMEM_SEGS_PER_EXACT_ALLOC :
			    VMEM_SEGS_PER_ALLOC_MAX;
			ASSERT(vmp->vm_nsegfree >= resv);
			vmp->vm_nsegfree -= resv;	/* reserve our segs */
			(void) mutex_unlock(&vmp->vm_lock);
			vaddr = vmp->vm_source_alloc(vmp->vm_source, asize,
			    vmflag & VM_UMFLAGS);
			(void) mutex_lock(&vmp->vm_lock);
			vmp->vm_nsegfree += resv;	/* claim reservation */
			if (vaddr != NULL) {
				vbest = vmem_span_create(vmp, vaddr, asize, 1);
				addr = P2PHASEUP(vbest->vs_start, align, phase);
				break;
			}
		}
		(void) mutex_unlock(&vmp->vm_lock);
		vmem_reap();
		(void) mutex_lock(&vmp->vm_lock);
		if (vmflag & VM_NOSLEEP)
			break;
		vmp->vm_kstat.vk_wait++;
		(void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock);
	}
	if (vbest != NULL) {
		ASSERT(vbest->vs_type == VMEM_FREE);
		ASSERT(vbest->vs_knext != vbest);
		(void) vmem_seg_alloc(vmp, vbest, addr, size);
		(void) mutex_unlock(&vmp->vm_lock);
		ASSERT(P2PHASE(addr, align) == phase);
		ASSERT(!P2CROSS(addr, addr + size - 1, nocross));
		ASSERT(addr >= (uintptr_t)minaddr);
		ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
		return ((void *)addr);
	}
	vmp->vm_kstat.vk_fail++;
	(void) mutex_unlock(&vmp->vm_lock);
	if (vmflag & VM_PANIC)
		umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): "
		    "cannot satisfy mandatory allocation",
		    (void *)vmp, size, align, phase, nocross,
		    minaddr, maxaddr, vmflag);
	return (NULL);
}

/*
 * Free the segment [vaddr, vaddr + size), where vaddr was a constrained
 * allocation. vmem_xalloc() and vmem_xfree() must always be paired because
 * both routines bypass the quantum caches.
 */
void
vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
{
	vmem_seg_t *vsp, *vnext, *vprev;

	(void) mutex_lock(&vmp->vm_lock);

	vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size);
	vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum);

	/*
	 * Attempt to coalesce with the next segment.
	 */
	vnext = vsp->vs_anext;
	if (vnext->vs_type == VMEM_FREE) {
		ASSERT(vsp->vs_end == vnext->vs_start);
		vmem_freelist_delete(vmp, vnext);
		vsp->vs_end = vnext->vs_end;
		vmem_seg_destroy(vmp, vnext);
	}

	/*
	 * Attempt to coalesce with the previous segment.
	 */
	vprev = vsp->vs_aprev;
	if (vprev->vs_type == VMEM_FREE) {
		ASSERT(vprev->vs_end == vsp->vs_start);
		vmem_freelist_delete(vmp, vprev);
		vprev->vs_end = vsp->vs_end;
		vmem_seg_destroy(vmp, vsp);
		vsp = vprev;
	}

	/*
	 * If the entire span is free, return it to the source.
	 */
	if (vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		vaddr = (void *)vsp->vs_start;
		size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
	} else {
		vmem_freelist_insert(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
	}
}

/*
 * Allocate size bytes from arena vmp. Returns the allocated address
 * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP,
 * and may also specify best-fit, first-fit, or next-fit allocation policy
 * instead of the default instant-fit policy. VM_SLEEP allocations are
 * guaranteed to succeed.
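 *
 * A typical caller pairs this with vmem_free() of the same size; for
 * example (illustrative, with a hypothetical arena created elsewhere via
 * vmem_create()):
 *
 *	void *buf = vmem_alloc(my_arena, 8192, VM_NOSLEEP);
 *	if (buf != NULL) {
 *		(use the 8K allocation)
 *		vmem_free(my_arena, buf, 8192);
 *	}
 *
 * Passing VM_NEXTFIT instead cycles through the arena's address space,
 * which (per the comment above vmem_nextfit_alloc()) suits identifier
 * arenas where each value should be reused as late as possible.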
 */
void *
vmem_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp;
	uintptr_t addr;
	int hb;
	int flist = 0;
	uint32_t mtbf;

	if (size - 1 < vmp->vm_qcache_max) {
		ASSERT(vmflag & VM_NOSLEEP);
		return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >>
		    vmp->vm_qshift], UMEM_DEFAULT));
	}

	if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 &&
	    (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP)
		return (NULL);

	if (vmflag & VM_NEXTFIT)
		return (vmem_nextfit_alloc(vmp, size, vmflag));

	if (vmflag & (VM_BESTFIT | VM_FIRSTFIT))
		return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0,
		    NULL, NULL, vmflag));

	/*
	 * Unconstrained instant-fit allocation from the segment list.
	 */
	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) {
		if ((size & (size - 1)) == 0)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		else if ((hb = highbit(size)) < VMEM_FREELISTS)
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
	}

	if (flist-- == 0) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (vmem_xalloc(vmp, size, vmp->vm_quantum,
		    0, 0, NULL, NULL, vmflag));
	}

	ASSERT(size <= (1UL << flist));
	vsp = vmp->vm_freelist[flist].vs_knext;
	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, size);
	(void) mutex_unlock(&vmp->vm_lock);
	return ((void *)addr);
}

/*
 * Free the segment [vaddr, vaddr + size).
 */
void
vmem_free(vmem_t *vmp, void *vaddr, size_t size)
{
	if (size - 1 < vmp->vm_qcache_max)
		_umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift],
		    vaddr);
	else
		vmem_xfree(vmp, vaddr, size);
}

/*
 * Determine whether arena vmp contains the segment [vaddr, vaddr + size).
 */
int
vmem_contains(vmem_t *vmp, void *vaddr, size_t size)
{
	uintptr_t start = (uintptr_t)vaddr;
	uintptr_t end = start + size;
	vmem_seg_t *vsp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;

	(void) mutex_lock(&vmp->vm_lock);
	vmp->vm_kstat.vk_contains++;
	for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) {
		vmp->vm_kstat.vk_contains_search++;
		ASSERT(vsp->vs_type == VMEM_SPAN);
		if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1)
			break;
	}
	(void) mutex_unlock(&vmp->vm_lock);
	return (vsp != seg0);
}

/*
 * Add the span [vaddr, vaddr + size) to arena vmp.
 */
void *
vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag)
{
	if (vaddr == NULL || size == 0) {
		umem_panic("vmem_add(%p, %p, %lu): bad arguments",
		    vmp, vaddr, size);
	}

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (vmem_populate(vmp, vmflag))
		(void) vmem_span_create(vmp, vaddr, size, 0);
	else
		vaddr = NULL;
	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);
	return (vaddr);
}

/*
 * Adds the address range [addr, endaddr) to arena vmp, by either:
 *   1. joining two existing spans, [x, addr), and [endaddr, y) (which
 *	are in that order) into a single [x, y) span,
 *   2. expanding an existing [x, addr) span to [x, endaddr),
 *   3. expanding an existing [endaddr, x) span to [addr, x), or
 *   4. creating a new [addr, endaddr) span.
 *
 * Called with vmp->vm_lock held, and a successful vmem_populate() completed.
 * Cannot fail. Returns the new segment.
 *
 * NOTE: this algorithm is linear-time in the number of spans, but is
 *	constant-time when you are extending the last (highest-addressed)
 *	span.
 */
static vmem_seg_t *
vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr)
{
	vmem_seg_t *span;
	vmem_seg_t *vsp;

	vmem_seg_t *end = &vmp->vm_seg0;

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	/*
	 * the second "if" clause below relies on the direction of this search
	 */
	for (span = end->vs_kprev; span != end; span = span->vs_kprev) {
		if (span->vs_end == addr || span->vs_start == endaddr)
			break;
	}

	if (span == end)
		return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0));
	if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) {
		vmem_seg_t *prevspan = span->vs_kprev;
		vmem_seg_t *nextseg = span->vs_anext;
		vmem_seg_t *prevseg = span->vs_aprev;

		/*
		 * prevspan becomes the span marker for the full range
		 */
		prevspan->vs_end = span->vs_end;

		/*
		 * Notionally, span becomes a free segment representing
		 * [addr, endaddr).
		 *
		 * However, if either of its neighbors are free, we coalesce
		 * by destroying span and changing the free segment.
		 */
		if (prevseg->vs_type == VMEM_FREE &&
		    nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce both ways
			 */
			ASSERT(prevseg->vs_end == addr &&
			    nextseg->vs_start == endaddr);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = nextseg->vs_end;

			vmem_freelist_delete(vmp, nextseg);
			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, nextseg);
			vmem_seg_destroy(vmp, span);

			vsp = prevseg;
		} else if (prevseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce left
			 */
			ASSERT(prevseg->vs_end == addr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = endaddr;

			vsp = prevseg;
		} else if (nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce right
			 */
			ASSERT(nextseg->vs_start == endaddr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, nextseg);
			nextseg->vs_start = addr;

			vsp = nextseg;
		} else {
			/*
			 * cannot coalesce
			 */
			VMEM_DELETE(span, k);
			span->vs_start = addr;
			span->vs_end = endaddr;

			vsp = span;
		}
	} else if (span->vs_end == addr) {
		vmem_seg_t *oldseg = span->vs_knext->vs_aprev;
		span->vs_end = endaddr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_end == addr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_end = endaddr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, oldseg, addr, endaddr);
	} else {
		vmem_seg_t *oldseg = span->vs_anext;
		ASSERT(span->vs_start == endaddr);
		span->vs_start = addr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_start == endaddr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_start = addr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, span, addr, endaddr);
	}
	vmem_freelist_insert(vmp, vsp);
	vmp->vm_kstat.vk_mem_total += (endaddr - addr);
	return (vsp);
}

/*
 * Does some error checking, calls vmem_extend_unlocked to add
 * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the
 * newly merged segment.
 */
void *
_vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc,
    int vmflag)
{
	uintptr_t addr = (uintptr_t)vaddr;
	uintptr_t endaddr = addr + size;
	vmem_seg_t *vsp;

	ASSERT(vaddr != NULL && size != 0 && endaddr > addr);
	ASSERT(alloc <= size && alloc != 0);
	ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0);

	ASSERT(!vmem_contains(vmp, vaddr, size));

	(void) mutex_lock(&vmp->vm_lock);
	if (!vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}
	/*
	 * if there is a source, we can't mess with the spans
	 */
	if (vmp->vm_source_alloc != NULL)
		vsp = vmem_span_create(vmp, vaddr, size, 0);
	else
		vsp = vmem_extend_unlocked(vmp, addr, endaddr);

	ASSERT(VS_SIZE(vsp) >= alloc);

	addr = vsp->vs_start;
	(void) vmem_seg_alloc(vmp, vsp, addr, alloc);
	vaddr = (void *)addr;

	(void) cond_broadcast(&vmp->vm_cv);
	(void) mutex_unlock(&vmp->vm_lock);

	return (vaddr);
}

/*
 * Walk the vmp arena, applying func to each segment matching typemask.
 * If VMEM_REENTRANT is specified, the arena lock is dropped across each
 * call to func(); otherwise, it is held for the duration of vmem_walk()
 * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks
 * are *not* necessarily consistent, so they may only be used when a hint
 * is adequate.
 */
void
vmem_walk(vmem_t *vmp, int typemask,
    void (*func)(void *, void *, size_t), void *arg)
{
	vmem_seg_t *vsp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;
	vmem_seg_t walker;

	if (typemask & VMEM_WALKER)
		return;

	bzero(&walker, sizeof (walker));
	walker.vs_type = VMEM_WALKER;

	(void) mutex_lock(&vmp->vm_lock);
	VMEM_INSERT(seg0, &walker, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) {
		if (vsp->vs_type & typemask) {
			void *start = (void *)vsp->vs_start;
			size_t size = VS_SIZE(vsp);
			if (typemask & VMEM_REENTRANT) {
				vmem_advance(vmp, &walker, vsp);
				(void) mutex_unlock(&vmp->vm_lock);
				func(arg, start, size);
				(void) mutex_lock(&vmp->vm_lock);
				vsp = &walker;
			} else {
				func(arg, start, size);
			}
		}
	}
	vmem_advance(vmp, &walker, NULL);
	(void) mutex_unlock(&vmp->vm_lock);
}

/*
 * Return the total amount of memory whose type matches typemask. Thus:
 *
 *	typemask VMEM_ALLOC yields total memory allocated (in use).
 *	typemask VMEM_FREE yields total memory free (available).
 *	typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size.
 */
size_t
vmem_size(vmem_t *vmp, int typemask)
{
	uint64_t size = 0;

	if (typemask & VMEM_ALLOC)
		size += vmp->vm_kstat.vk_mem_inuse;
	if (typemask & VMEM_FREE)
		size += vmp->vm_kstat.vk_mem_total -
		    vmp->vm_kstat.vk_mem_inuse;
	return ((size_t)size);
}

/*
 * Create an arena called name whose initial span is [base, base + size).
 * The arena's natural unit of currency is quantum, so vmem_alloc()
 * guarantees quantum-aligned results.
 * The arena may import new spans by invoking afunc() on source, and may
 * return those spans by invoking ffunc() on source. To make small
 * allocations fast and scalable, the arena offers high-performance caching
 * for each integer multiple of quantum up to qcache_max.
 */
vmem_t *
vmem_create(const char *name, void *base, size_t size, size_t quantum,
    vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source,
    size_t qcache_max, int vmflag)
{
	int i;
	size_t nqcache;
	vmem_t *vmp, *cur, **vmpp;
	vmem_seg_t *vsp;
	vmem_freelist_t *vfp;
	uint32_t id = atomic_add_32_nv(&vmem_id, 1);

	if (vmem_vmem_arena != NULL) {
		vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t),
		    vmflag & VM_UMFLAGS);
	} else {
		ASSERT(id <= VMEM_INITIAL);
		vmp = &vmem0[id - 1];
	}

	if (vmp == NULL)
		return (NULL);
	bzero(vmp, sizeof (vmem_t));

	(void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name);
	(void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL);
	(void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL);
	vmp->vm_cflags = vmflag;
	vmflag &= VM_UMFLAGS;

	vmp->vm_quantum = quantum;
	vmp->vm_qshift = highbit(quantum) - 1;
	nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX);

	for (i = 0; i <= VMEM_FREELISTS; i++) {
		vfp = &vmp->vm_freelist[i];
		vfp->vs_end = 1UL << i;
		vfp->vs_knext = (vmem_seg_t *)(vfp + 1);
		vfp->vs_kprev = (vmem_seg_t *)(vfp - 1);
	}

	vmp->vm_freelist[0].vs_kprev = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL;
	vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0;
	vmp->vm_hash_table = vmp->vm_hash0;
	vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	vsp = &vmp->vm_seg0;
	vsp->vs_anext = vsp;
	vsp->vs_aprev = vsp;
	vsp->vs_knext = vsp;
	vsp->vs_kprev = vsp;
	vsp->vs_type = VMEM_SPAN;

	vsp = &vmp->vm_rotor;
	vsp->vs_type = VMEM_ROTOR;
	VMEM_INSERT(&vmp->vm_seg0, vsp, a);

	vmp->vm_id = id;
	if (source != NULL)
		vmp->vm_kstat.vk_source_id = source->vm_id;
	vmp->vm_source = source;
	vmp->vm_source_alloc = afunc;
	vmp->vm_source_free = ffunc;

	if (nqcache != 0) {
		vmp->vm_qcache_max = nqcache << vmp->vm_qshift;
		for (i = 0; i < nqcache; i++) {
			char buf[VMEM_NAMELEN + 21];
			(void) snprintf(buf, sizeof (buf), "%s_%lu",
			    vmp->vm_name, (long)((i + 1) * quantum));
			vmp->vm_qcache[i] = umem_cache_create(buf,
			    (i + 1) * quantum, quantum, NULL, NULL, NULL,
			    NULL, vmp, UMC_QCACHE | UMC_NOTOUCH);
			if (vmp->vm_qcache[i] == NULL) {
				vmp->vm_qcache_max = i * quantum;
				break;
			}
		}
	}

	(void) mutex_lock(&vmem_list_lock);
	vmpp = &vmem_list;
	while ((cur = *vmpp) != NULL)
		vmpp = &cur->vm_next;
	*vmpp = vmp;
	(void) mutex_unlock(&vmem_list_lock);

	if (vmp->vm_cflags & VMC_POPULATOR) {
		uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1);
		ASSERT(pop_id <= VMEM_INITIAL);
		vmem_populator[pop_id - 1] = vmp;
		(void) mutex_lock(&vmp->vm_lock);
		(void) vmem_populate(vmp, vmflag | VM_PANIC);
		(void) mutex_unlock(&vmp->vm_lock);
	}

	if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) {
		vmem_destroy(vmp);
		return (NULL);
	}

	return (vmp);
}

/*
 * Destroy arena vmp.
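 * An arena created with vmem_create() should eventually be destroyed with
 * this routine once all of its allocations have been freed; anything still
 * allocated is reported as leaked. For example (illustrative; the arena
 * name, span, and quantum are placeholders):
 *
 *	vmem_t *vmp = vmem_create("example", somebase, 64 * 1024, 4096,
 *	    NULL, NULL, NULL, 0, VM_NOSLEEP);
 *	(... allocate and free from vmp ...)
 *	vmem_destroy(vmp);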
 */
void
vmem_destroy(vmem_t *vmp)
{
	vmem_t *cur, **vmpp;
	vmem_seg_t *seg0 = &vmp->vm_seg0;
	vmem_seg_t *vsp;
	size_t leaked;
	int i;

	(void) mutex_lock(&vmem_list_lock);
	vmpp = &vmem_list;
	while ((cur = *vmpp) != vmp)
		vmpp = &cur->vm_next;
	*vmpp = vmp->vm_next;
	(void) mutex_unlock(&vmem_list_lock);

	for (i = 0; i < VMEM_NQCACHE_MAX; i++)
		if (vmp->vm_qcache[i])
			umem_cache_destroy(vmp->vm_qcache[i]);

	leaked = vmem_size(vmp, VMEM_ALLOC);
	if (leaked != 0)
		umem_printf("vmem_destroy('%s'): leaked %lu bytes",
		    vmp->vm_name, leaked);

	if (vmp->vm_hash_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, vmp->vm_hash_table,
		    (vmp->vm_hash_mask + 1) * sizeof (void *));

	/*
	 * Give back the segment structures for anything that's left in the
	 * arena, e.g. the primary spans and their free segments.
	 */
	VMEM_DELETE(&vmp->vm_rotor, a);
	for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext)
		vmem_putseg_global(vsp);

	while (vmp->vm_nsegfree > 0)
		vmem_putseg_global(vmem_getseg(vmp));

	(void) mutex_destroy(&vmp->vm_lock);
	(void) cond_destroy(&vmp->vm_cv);
	vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
}

/*
 * Resize vmp's hash table to keep the average lookup depth near 1.0.
 */
static void
vmem_hash_rescale(vmem_t *vmp)
{
	vmem_seg_t **old_table, **new_table, *vsp;
	size_t old_size, new_size, h, nseg;

	nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free);

	new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
	old_size = vmp->vm_hash_mask + 1;

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
		return;

	new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
	    VM_NOSLEEP);
	if (new_table == NULL)
		return;
	bzero(new_table, new_size * sizeof (void *));

	(void) mutex_lock(&vmp->vm_lock);

	old_size = vmp->vm_hash_mask + 1;
	old_table = vmp->vm_hash_table;

	vmp->vm_hash_mask = new_size - 1;
	vmp->vm_hash_table = new_table;
	vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);

	for (h = 0; h < old_size; h++) {
		vsp = old_table[h];
		while (vsp != NULL) {
			uintptr_t addr = vsp->vs_start;
			vmem_seg_t *next_vsp = vsp->vs_knext;
			vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
			vsp->vs_knext = *hash_bucket;
			*hash_bucket = vsp;
			vsp = next_vsp;
		}
	}

	(void) mutex_unlock(&vmp->vm_lock);

	if (old_table != vmp->vm_hash0)
		vmem_free(vmem_hash_arena, old_table,
		    old_size * sizeof (void *));
}

/*
 * Perform periodic maintenance on all vmem arenas.
 */
/*ARGSUSED*/
void
vmem_update(void *dummy)
{
	vmem_t *vmp;

	(void) mutex_lock(&vmem_list_lock);
	for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) {
		/*
		 * If threads are waiting for resources, wake them up
		 * periodically so they can issue another vmem_reap()
		 * to reclaim resources cached by the slab allocator.
		 */
		(void) cond_broadcast(&vmp->vm_cv);

		/*
		 * Rescale the hash table to keep the hash chains short.
		 */
		vmem_hash_rescale(vmp);
	}
	(void) mutex_unlock(&vmem_list_lock);
}

/*
 * If vmem_init is called again, we need to be able to reset the world.
 * That includes resetting the statics back to their original values.
 */
void
vmem_startup(void)
{
#ifdef UMEM_STANDALONE
	vmem_id = 0;
	vmem_populators = 0;
	vmem_segfree = NULL;
	vmem_list = NULL;
	vmem_internal_arena = NULL;
	vmem_seg_arena = NULL;
	vmem_hash_arena = NULL;
	vmem_vmem_arena = NULL;
	vmem_heap = NULL;
	vmem_heap_alloc = NULL;
	vmem_heap_free = NULL;

	bzero(vmem0, sizeof (vmem0));
	bzero(vmem_populator, sizeof (vmem_populator));
	bzero(vmem_seg0, sizeof (vmem_seg0));
#endif
}

/*
 * Prepare vmem for use.
 */
vmem_t *
vmem_init(const char *parent_name, size_t parent_quantum,
    vmem_alloc_t *parent_alloc, vmem_free_t *parent_free,
    const char *heap_name, void *heap_start, size_t heap_size,
    size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free)
{
	uint32_t id;
	int nseg = VMEM_SEG_INITIAL;
	vmem_t *parent, *heap;

	ASSERT(vmem_internal_arena == NULL);

	while (--nseg >= 0)
		vmem_putseg_global(&vmem_seg0[nseg]);

	if (parent_name != NULL) {
		parent = vmem_create(parent_name,
		    heap_start, heap_size, parent_quantum,
		    NULL, NULL, NULL, 0,
		    VM_SLEEP | VMC_POPULATOR);
		heap_start = NULL;
		heap_size = 0;
	} else {
		ASSERT(parent_alloc == NULL && parent_free == NULL);
		parent = NULL;
	}

	heap = vmem_create(heap_name,
	    heap_start, heap_size, heap_quantum,
	    parent_alloc, parent_free, parent, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_heap = heap;
	vmem_heap_alloc = heap_alloc;
	vmem_heap_free = heap_free;

	vmem_internal_arena = vmem_create("vmem_internal",
	    NULL, 0, heap_quantum,
	    heap_alloc, heap_free, heap, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_seg_arena = vmem_create("vmem_seg",
	    NULL, 0, heap_quantum,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP | VMC_POPULATOR);

	vmem_hash_arena = vmem_create("vmem_hash",
	    NULL, 0, 8,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP);

	vmem_vmem_arena = vmem_create("vmem_vmem",
	    vmem0, sizeof (vmem0), 1,
	    vmem_alloc, vmem_free, vmem_internal_arena, 0,
	    VM_SLEEP);

	for (id = 0; id < vmem_id; id++)
		(void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t),
		    1, 0, 0, &vmem0[id], &vmem0[id + 1],
		    VM_NOSLEEP | VM_BESTFIT | VM_PANIC);

	return (heap);
}

void
vmem_no_debug(void)
{
	/*
	 * This size must be a multiple of the minimum required alignment,
	 * since vmem_populate allocates them compactly.
	 */
	vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread),
	    sizeof (hrtime_t));
}

/*
 * Lockup and release, for fork1(2) handling.
 */
void
vmem_lockup(void)
{
	vmem_t *cur;

	(void) mutex_lock(&vmem_list_lock);
	(void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex);

	/*
	 * Lock up and broadcast all arenas.
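	 * (The matching vmem_release() below drops these locks again; the
	 * intent, presumably wired up by umem's fork1() handlers, is that
	 * no vmem lock -- list, nosleep, per-arena, or segfree -- is held
	 * at the instant of the fork.)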
	 */
	for (cur = vmem_list; cur != NULL; cur = cur->vm_next) {
		(void) mutex_lock(&cur->vm_lock);
		(void) cond_broadcast(&cur->vm_cv);
	}

	(void) mutex_lock(&vmem_segfree_lock);
}

void
vmem_release(void)
{
	vmem_t *cur;

	(void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex);

	for (cur = vmem_list; cur != NULL; cur = cur->vm_next)
		(void) mutex_unlock(&cur->vm_lock);

	(void) mutex_unlock(&vmem_segfree_lock);
	(void) mutex_unlock(&vmem_list_lock);
}