/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * For a more complete description of the main ideas, see:
 *
 *	Jeff Bonwick and Jonathan Adams,
 *
 *	Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 *	Arbitrary Resources.
 *
 *	Proceedings of the 2001 Usenix Conference.
 *	Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 *
 * For the "Big Theory Statement", see usr/src/common/os/vmem.c
 *
 * 1. Overview of changes
 * ------------------------------
 * There have been a few changes to vmem in order to support umem. The
 * main areas are:
 *
 *	* VM_SLEEP unsupported
 *
 *	* Reaping changes
 *
 *	* Initialization changes
 *
 *	* _vmem_extend_alloc
 *
 *
 * 2. VM_SLEEP Removed
 * -------------------
 * Since VM_SLEEP allocations can hold locks (in vmem_populate()) for
 * possibly infinite amounts of time, they are not supported in this
 * version of vmem. Sleep-like behavior can be achieved through
 * UMEM_NOFAIL umem allocations.
 *
 *
 * 3. Reaping changes
 * ------------------
 * Unlike kmem_reap(), which just asynchronously schedules work, umem_reap()
 * can do allocations and frees synchronously. This is a problem if it
 * occurs during a vmem_populate() allocation.
 *
 * Instead, we delay reaps while populates are active.
 *
 *
 * 4. Initialization changes
 * -------------------------
 * In the kernel, vmem_init() allows you to create a single, top-level arena,
 * which has vmem_internal_arena as a child. For umem, we want to be able
 * to extend arenas dynamically. It is much easier to support this if we
 * allow a two-level "heap" arena:
 *
 *	+----------+
 *	|  "fake"  |
 *	+----------+
 *	      |
 *	+----------+
 *	|  "heap"  |
 *	+----------+
 *	  |    \ \
 *	  |     +-+-- ... <other children>
 *	  |
 *	+---------------+
 *	| vmem_internal |
 *	+---------------+
 *	    |  |  |  |
 *	   <children>
 *
 * The new vmem_init() allows you to specify a "parent" of the heap, along
 * with allocation functions.
 *
 *
 * 5. _vmem_extend_alloc
 * ---------------------
 * The other part of extending is _vmem_extend_alloc. This function allows
 * you to extend (expand current spans, if possible) an arena and allocate
 * a chunk of the newly extended span atomically. This is needed to support
 * extending the heap while vmem_populate()ing it.
 *
 * In order to increase the usefulness of extending, non-imported spans are
 * sorted in address order.
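 *
 * As a concrete illustration of the initialization described in section 4,
 * the two-level heap is set up through vmem_init() (defined near the end of
 * this file). This is a hedged sketch only -- the parent_* and heap_*
 * identifiers below are placeholders for whatever backend the process
 * supplies, not names defined in this file:
 *
 *	vmem_t *heap = vmem_init("parent", parent_quantum,
 *	    parent_alloc, parent_free,
 *	    "heap", NULL, 0, heap_quantum, heap_alloc, heap_free);
 *
 * vmem_init() then creates vmem_internal_arena, vmem_seg_arena,
 * vmem_hash_arena, and vmem_vmem_arena as descendants of the heap, matching
 * the diagram above.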
 */

#include "mtlib.h"
#include <sys/vmem_impl_user.h>
#include <alloca.h>
#include <sys/sysmacros.h>
#include <stdio.h>
#include <strings.h>
#include <atomic.h>

#include "vmem_base.h"
#include "umem_base.h"

#define	VMEM_INITIAL		6	/* early vmem arenas */
#define	VMEM_SEG_INITIAL	100	/* early segments */

/*
 * Adding a new span to an arena requires two segment structures: one to
 * represent the span, and one to represent the free segment it contains.
 */
#define	VMEM_SEGS_PER_SPAN_CREATE	2

/*
 * Allocating a piece of an existing segment requires 0-2 segment structures
 * depending on how much of the segment we're allocating.
 *
 * To allocate the entire segment, no new segment structures are needed; we
 * simply move the existing segment structure from the freelist to the
 * allocation hash table.
 *
 * To allocate a piece from the left or right end of the segment, we must
 * split the segment into two pieces (allocated part and remainder), so we
 * need one new segment structure to represent the remainder.
 *
 * To allocate from the middle of a segment, we need two new segment
 * structures to represent the remainders on either side of the allocated
 * part.
 */
#define	VMEM_SEGS_PER_EXACT_ALLOC	0
#define	VMEM_SEGS_PER_LEFT_ALLOC	1
#define	VMEM_SEGS_PER_RIGHT_ALLOC	1
#define	VMEM_SEGS_PER_MIDDLE_ALLOC	2

/*
 * vmem_populate() preallocates segment structures for vmem to do its work.
 * It must preallocate enough for the worst case, which is when we must import
 * a new span and then allocate from the middle of it.
 */
#define	VMEM_SEGS_PER_ALLOC_MAX		\
	(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC)

/*
 * The segment structures themselves are allocated from vmem_seg_arena, so
 * we have a recursion problem when vmem_seg_arena needs to populate itself.
 * We address this by working out the maximum number of segment structures
 * this act will require, and multiplying by the maximum number of threads
 * that we'll allow to do it simultaneously.
 *
 * The worst-case segment consumption to populate vmem_seg_arena is as
 * follows (depicted as a stack trace to indicate why events are occurring):
 *
 * vmem_alloc(vmem_seg_arena)		-> 2 segs (span create + exact alloc)
 *  vmem_alloc(vmem_internal_arena)	-> 2 segs (span create + exact alloc)
 *   heap_alloc(heap_arena)
 *    vmem_alloc(heap_arena)		-> 4 segs (span create + alloc)
 *     parent_alloc(parent_arena)
 *	_vmem_extend_alloc(parent_arena) -> 3 segs (span create + left alloc)
 *
 * Note: The reservation for heap_arena must be 4, since vmem_xalloc()
 * is overly pessimistic on allocations where parent_arena has a stricter
 * alignment than heap_arena.
 *
 * The worst-case consumption for any arena is 4 segment structures.
 * For now, we only support VM_NOSLEEP allocations, so as long as we
 * serialize all vmem_populates, a 4-seg reserve is sufficient.
 */
#define	VMEM_POPULATE_SEGS_PER_ARENA	4
#define	VMEM_POPULATE_LOCKS		1

#define	VMEM_POPULATE_RESERVE		\
	(VMEM_POPULATE_SEGS_PER_ARENA * VMEM_POPULATE_LOCKS)

/*
 * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures
 * so that it can satisfy the worst-case allocation *and* participate in
 * worst-case allocation from vmem_seg_arena.
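 *
 * With the constants defined above, this works out to (a worked example of
 * the reserve arithmetic, not additional policy):
 *
 *	VMEM_POPULATE_RESERVE   = 4 * 1 = 4
 *	VMEM_SEGS_PER_ALLOC_MAX = 2 + 2 = 4
 *	VMEM_MINFREE            = 4 + 4 = 8
 *
 * i.e. each arena keeps at least eight preallocated vmem_seg_t structures
 * on its segfree list before proceeding with an allocation.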
194 */ 195 #define VMEM_MINFREE (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX) 196 197 /* Don't assume new statics are zeroed - see vmem_startup() */ 198 static vmem_t vmem0[VMEM_INITIAL]; 199 static vmem_t *vmem_populator[VMEM_INITIAL]; 200 static uint32_t vmem_id; 201 static uint32_t vmem_populators; 202 static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL]; 203 static vmem_seg_t *vmem_segfree; 204 static mutex_t vmem_list_lock; 205 static mutex_t vmem_segfree_lock; 206 static vmem_populate_lock_t vmem_nosleep_lock; 207 #define IN_POPULATE() (vmem_nosleep_lock.vmpl_thr == thr_self()) 208 static vmem_t *vmem_list; 209 static vmem_t *vmem_internal_arena; 210 static vmem_t *vmem_seg_arena; 211 static vmem_t *vmem_hash_arena; 212 static vmem_t *vmem_vmem_arena; 213 214 vmem_t *vmem_heap; 215 vmem_alloc_t *vmem_heap_alloc; 216 vmem_free_t *vmem_heap_free; 217 218 uint32_t vmem_mtbf; /* mean time between failures [default: off] */ 219 size_t vmem_seg_size = sizeof (vmem_seg_t); 220 221 /* 222 * we use the _ version, since we don't want to be cancelled. 223 * Actually, this is automatically taken care of by including "mtlib.h". 224 */ 225 extern int _cond_wait(cond_t *cv, mutex_t *mutex); 226 227 /* 228 * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k'). 229 */ 230 #define VMEM_INSERT(vprev, vsp, type) \ 231 { \ 232 vmem_seg_t *vnext = (vprev)->vs_##type##next; \ 233 (vsp)->vs_##type##next = (vnext); \ 234 (vsp)->vs_##type##prev = (vprev); \ 235 (vprev)->vs_##type##next = (vsp); \ 236 (vnext)->vs_##type##prev = (vsp); \ 237 } 238 239 #define VMEM_DELETE(vsp, type) \ 240 { \ 241 vmem_seg_t *vprev = (vsp)->vs_##type##prev; \ 242 vmem_seg_t *vnext = (vsp)->vs_##type##next; \ 243 (vprev)->vs_##type##next = (vnext); \ 244 (vnext)->vs_##type##prev = (vprev); \ 245 } 246 247 /* 248 * Get a vmem_seg_t from the global segfree list. 249 */ 250 static vmem_seg_t * 251 vmem_getseg_global(void) 252 { 253 vmem_seg_t *vsp; 254 255 (void) mutex_lock(&vmem_segfree_lock); 256 if ((vsp = vmem_segfree) != NULL) 257 vmem_segfree = vsp->vs_knext; 258 (void) mutex_unlock(&vmem_segfree_lock); 259 260 return (vsp); 261 } 262 263 /* 264 * Put a vmem_seg_t on the global segfree list. 265 */ 266 static void 267 vmem_putseg_global(vmem_seg_t *vsp) 268 { 269 (void) mutex_lock(&vmem_segfree_lock); 270 vsp->vs_knext = vmem_segfree; 271 vmem_segfree = vsp; 272 (void) mutex_unlock(&vmem_segfree_lock); 273 } 274 275 /* 276 * Get a vmem_seg_t from vmp's segfree list. 277 */ 278 static vmem_seg_t * 279 vmem_getseg(vmem_t *vmp) 280 { 281 vmem_seg_t *vsp; 282 283 ASSERT(vmp->vm_nsegfree > 0); 284 285 vsp = vmp->vm_segfree; 286 vmp->vm_segfree = vsp->vs_knext; 287 vmp->vm_nsegfree--; 288 289 return (vsp); 290 } 291 292 /* 293 * Put a vmem_seg_t on vmp's segfree list. 294 */ 295 static void 296 vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp) 297 { 298 vsp->vs_knext = vmp->vm_segfree; 299 vmp->vm_segfree = vsp; 300 vmp->vm_nsegfree++; 301 } 302 303 /* 304 * Add vsp to the appropriate freelist. 305 */ 306 static void 307 vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp) 308 { 309 vmem_seg_t *vprev; 310 311 ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); 312 313 vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; 314 vsp->vs_type = VMEM_FREE; 315 vmp->vm_freemap |= VS_SIZE(vprev); 316 VMEM_INSERT(vprev, vsp, k); 317 318 (void) cond_broadcast(&vmp->vm_cv); 319 } 320 321 /* 322 * Take vsp from the freelist. 
323 */ 324 static void 325 vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp) 326 { 327 ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); 328 ASSERT(vsp->vs_type == VMEM_FREE); 329 330 if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) { 331 /* 332 * The segments on both sides of 'vsp' are freelist heads, 333 * so taking vsp leaves the freelist at vsp->vs_kprev empty. 334 */ 335 ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev)); 336 vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev); 337 } 338 VMEM_DELETE(vsp, k); 339 } 340 341 /* 342 * Add vsp to the allocated-segment hash table and update kstats. 343 */ 344 static void 345 vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp) 346 { 347 vmem_seg_t **bucket; 348 349 vsp->vs_type = VMEM_ALLOC; 350 bucket = VMEM_HASH(vmp, vsp->vs_start); 351 vsp->vs_knext = *bucket; 352 *bucket = vsp; 353 354 if (vmem_seg_size == sizeof (vmem_seg_t)) { 355 vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack, 356 VMEM_STACK_DEPTH, 0); 357 vsp->vs_thread = thr_self(); 358 vsp->vs_timestamp = gethrtime(); 359 } else { 360 vsp->vs_depth = 0; 361 } 362 363 vmp->vm_kstat.vk_alloc++; 364 vmp->vm_kstat.vk_mem_inuse += VS_SIZE(vsp); 365 } 366 367 /* 368 * Remove vsp from the allocated-segment hash table and update kstats. 369 */ 370 static vmem_seg_t * 371 vmem_hash_delete(vmem_t *vmp, uintptr_t addr, size_t size) 372 { 373 vmem_seg_t *vsp, **prev_vspp; 374 375 prev_vspp = VMEM_HASH(vmp, addr); 376 while ((vsp = *prev_vspp) != NULL) { 377 if (vsp->vs_start == addr) { 378 *prev_vspp = vsp->vs_knext; 379 break; 380 } 381 vmp->vm_kstat.vk_lookup++; 382 prev_vspp = &vsp->vs_knext; 383 } 384 385 if (vsp == NULL) { 386 umem_panic("vmem_hash_delete(%p, %lx, %lu): bad free", 387 vmp, addr, size); 388 } 389 if (VS_SIZE(vsp) != size) { 390 umem_panic("vmem_hash_delete(%p, %lx, %lu): wrong size " 391 "(expect %lu)", vmp, addr, size, VS_SIZE(vsp)); 392 } 393 394 vmp->vm_kstat.vk_free++; 395 vmp->vm_kstat.vk_mem_inuse -= size; 396 397 return (vsp); 398 } 399 400 /* 401 * Create a segment spanning the range [start, end) and add it to the arena. 402 */ 403 static vmem_seg_t * 404 vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) 405 { 406 vmem_seg_t *newseg = vmem_getseg(vmp); 407 408 newseg->vs_start = start; 409 newseg->vs_end = end; 410 newseg->vs_type = 0; 411 newseg->vs_import = 0; 412 413 VMEM_INSERT(vprev, newseg, a); 414 415 return (newseg); 416 } 417 418 /* 419 * Remove segment vsp from the arena. 420 */ 421 static void 422 vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) 423 { 424 ASSERT(vsp->vs_type != VMEM_ROTOR); 425 VMEM_DELETE(vsp, a); 426 427 vmem_putseg(vmp, vsp); 428 } 429 430 /* 431 * Add the span [vaddr, vaddr + size) to vmp and update kstats. 432 */ 433 static vmem_seg_t * 434 vmem_span_create(vmem_t *vmp, void *vaddr, size_t size, uint8_t import) 435 { 436 vmem_seg_t *knext; 437 vmem_seg_t *newseg, *span; 438 uintptr_t start = (uintptr_t)vaddr; 439 uintptr_t end = start + size; 440 441 knext = &vmp->vm_seg0; 442 if (!import && vmp->vm_source_alloc == NULL) { 443 vmem_seg_t *kend, *kprev; 444 /* 445 * non-imported spans are sorted in address order. This 446 * makes vmem_extend_unlocked() much more effective. 447 * 448 * We search in reverse order, since new spans are 449 * generally at higher addresses. 
450 */ 451 kend = &vmp->vm_seg0; 452 for (kprev = kend->vs_kprev; kprev != kend; 453 kprev = kprev->vs_kprev) { 454 if (!kprev->vs_import && (kprev->vs_end - 1) < start) 455 break; 456 } 457 knext = kprev->vs_knext; 458 } 459 460 ASSERT(MUTEX_HELD(&vmp->vm_lock)); 461 462 if ((start | end) & (vmp->vm_quantum - 1)) { 463 umem_panic("vmem_span_create(%p, %p, %lu): misaligned", 464 vmp, vaddr, size); 465 } 466 467 span = vmem_seg_create(vmp, knext->vs_aprev, start, end); 468 span->vs_type = VMEM_SPAN; 469 VMEM_INSERT(knext->vs_kprev, span, k); 470 471 newseg = vmem_seg_create(vmp, span, start, end); 472 vmem_freelist_insert(vmp, newseg); 473 474 newseg->vs_import = import; 475 if (import) 476 vmp->vm_kstat.vk_mem_import += size; 477 vmp->vm_kstat.vk_mem_total += size; 478 479 return (newseg); 480 } 481 482 /* 483 * Remove span vsp from vmp and update kstats. 484 */ 485 static void 486 vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp) 487 { 488 vmem_seg_t *span = vsp->vs_aprev; 489 size_t size = VS_SIZE(vsp); 490 491 ASSERT(MUTEX_HELD(&vmp->vm_lock)); 492 ASSERT(span->vs_type == VMEM_SPAN); 493 494 if (vsp->vs_import) 495 vmp->vm_kstat.vk_mem_import -= size; 496 vmp->vm_kstat.vk_mem_total -= size; 497 498 VMEM_DELETE(span, k); 499 500 vmem_seg_destroy(vmp, vsp); 501 vmem_seg_destroy(vmp, span); 502 } 503 504 /* 505 * Allocate the subrange [addr, addr + size) from segment vsp. 506 * If there are leftovers on either side, place them on the freelist. 507 * Returns a pointer to the segment representing [addr, addr + size). 508 */ 509 static vmem_seg_t * 510 vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) 511 { 512 uintptr_t vs_start = vsp->vs_start; 513 uintptr_t vs_end = vsp->vs_end; 514 size_t vs_size = vs_end - vs_start; 515 size_t realsize = P2ROUNDUP(size, vmp->vm_quantum); 516 uintptr_t addr_end = addr + realsize; 517 518 ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0); 519 ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0); 520 ASSERT(vsp->vs_type == VMEM_FREE); 521 ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1); 522 ASSERT(addr - 1 <= addr_end - 1); 523 524 /* 525 * If we're allocating from the start of the segment, and the 526 * remainder will be on the same freelist, we can save quite 527 * a bit of work. 528 */ 529 if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) { 530 ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); 531 vsp->vs_start = addr_end; 532 vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size); 533 vmem_hash_insert(vmp, vsp); 534 return (vsp); 535 } 536 537 vmem_freelist_delete(vmp, vsp); 538 539 if (vs_end != addr_end) 540 vmem_freelist_insert(vmp, 541 vmem_seg_create(vmp, vsp, addr_end, vs_end)); 542 543 if (vs_start != addr) 544 vmem_freelist_insert(vmp, 545 vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr)); 546 547 vsp->vs_start = addr; 548 vsp->vs_end = addr + size; 549 550 vmem_hash_insert(vmp, vsp); 551 return (vsp); 552 } 553 554 /* 555 * We cannot reap if we are in the middle of a vmem_populate(). 556 */ 557 void 558 vmem_reap(void) 559 { 560 if (!IN_POPULATE()) 561 umem_reap(); 562 } 563 564 /* 565 * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures. 
566 */ 567 static int 568 vmem_populate(vmem_t *vmp, int vmflag) 569 { 570 char *p; 571 vmem_seg_t *vsp; 572 ssize_t nseg; 573 size_t size; 574 vmem_populate_lock_t *lp; 575 int i; 576 577 while (vmp->vm_nsegfree < VMEM_MINFREE && 578 (vsp = vmem_getseg_global()) != NULL) 579 vmem_putseg(vmp, vsp); 580 581 if (vmp->vm_nsegfree >= VMEM_MINFREE) 582 return (1); 583 584 /* 585 * If we're already populating, tap the reserve. 586 */ 587 if (vmem_nosleep_lock.vmpl_thr == thr_self()) { 588 ASSERT(vmp->vm_cflags & VMC_POPULATOR); 589 return (1); 590 } 591 592 (void) mutex_unlock(&vmp->vm_lock); 593 594 ASSERT(vmflag & VM_NOSLEEP); /* we do not allow sleep allocations */ 595 lp = &vmem_nosleep_lock; 596 597 /* 598 * Cannot be just a mutex_lock(), since that has no effect if 599 * libthread is not linked. 600 */ 601 (void) mutex_lock(&lp->vmpl_mutex); 602 ASSERT(lp->vmpl_thr == 0); 603 lp->vmpl_thr = thr_self(); 604 605 nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE; 606 size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum); 607 nseg = size / vmem_seg_size; 608 609 /* 610 * The following vmem_alloc() may need to populate vmem_seg_arena 611 * and all the things it imports from. When doing so, it will tap 612 * each arena's reserve to prevent recursion (see the block comment 613 * above the definition of VMEM_POPULATE_RESERVE). 614 * 615 * During this allocation, vmem_reap() is a no-op. If the allocation 616 * fails, we call vmem_reap() after dropping the population lock. 617 */ 618 p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_UMFLAGS); 619 if (p == NULL) { 620 lp->vmpl_thr = 0; 621 (void) mutex_unlock(&lp->vmpl_mutex); 622 vmem_reap(); 623 624 (void) mutex_lock(&vmp->vm_lock); 625 vmp->vm_kstat.vk_populate_fail++; 626 return (0); 627 } 628 /* 629 * Restock the arenas that may have been depleted during population. 630 */ 631 for (i = 0; i < vmem_populators; i++) { 632 (void) mutex_lock(&vmem_populator[i]->vm_lock); 633 while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE) 634 vmem_putseg(vmem_populator[i], 635 (vmem_seg_t *)(p + --nseg * vmem_seg_size)); 636 (void) mutex_unlock(&vmem_populator[i]->vm_lock); 637 } 638 639 lp->vmpl_thr = 0; 640 (void) mutex_unlock(&lp->vmpl_mutex); 641 (void) mutex_lock(&vmp->vm_lock); 642 643 /* 644 * Now take our own segments. 645 */ 646 ASSERT(nseg >= VMEM_MINFREE); 647 while (vmp->vm_nsegfree < VMEM_MINFREE) 648 vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size)); 649 650 /* 651 * Give the remainder to charity. 652 */ 653 while (nseg > 0) 654 vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size)); 655 656 return (1); 657 } 658 659 /* 660 * Advance a walker from its previous position to 'afterme'. 661 * Note: may drop and reacquire vmp->vm_lock. 662 */ 663 static void 664 vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme) 665 { 666 vmem_seg_t *vprev = walker->vs_aprev; 667 vmem_seg_t *vnext = walker->vs_anext; 668 vmem_seg_t *vsp = NULL; 669 670 VMEM_DELETE(walker, a); 671 672 if (afterme != NULL) 673 VMEM_INSERT(afterme, walker, a); 674 675 /* 676 * The walker segment's presence may have prevented its neighbors 677 * from coalescing. If so, coalesce them now. 
 */
	if (vprev->vs_type == VMEM_FREE) {
		if (vnext->vs_type == VMEM_FREE) {
			ASSERT(vprev->vs_end == vnext->vs_start);
			vmem_freelist_delete(vmp, vnext);
			vmem_freelist_delete(vmp, vprev);
			vprev->vs_end = vnext->vs_end;
			vmem_freelist_insert(vmp, vprev);
			vmem_seg_destroy(vmp, vnext);
		}
		vsp = vprev;
	} else if (vnext->vs_type == VMEM_FREE) {
		vsp = vnext;
	}

	/*
	 * vsp could represent a complete imported span,
	 * in which case we must return it to the source.
	 */
	if (vsp != NULL && vsp->vs_import && vmp->vm_source_free != NULL &&
	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
	    vsp->vs_anext->vs_type == VMEM_SPAN) {
		void *vaddr = (void *)vsp->vs_start;
		size_t size = VS_SIZE(vsp);
		ASSERT(size == VS_SIZE(vsp->vs_aprev));
		vmem_freelist_delete(vmp, vsp);
		vmem_span_destroy(vmp, vsp);
		(void) mutex_unlock(&vmp->vm_lock);
		vmp->vm_source_free(vmp->vm_source, vaddr, size);
		(void) mutex_lock(&vmp->vm_lock);
	}
}

/*
 * VM_NEXTFIT allocations deliberately cycle through all virtual addresses
 * in an arena, so that we avoid reusing addresses for as long as possible.
 * This helps to catch use-after-free bugs. It's also the perfect policy
 * for allocating things like process IDs, where we want to cycle through
 * all values in order.
 */
static void *
vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag)
{
	vmem_seg_t *vsp, *rotor;
	uintptr_t addr;
	size_t realsize = P2ROUNDUP(size, vmp->vm_quantum);
	size_t vs_size;

	(void) mutex_lock(&vmp->vm_lock);

	if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) {
		(void) mutex_unlock(&vmp->vm_lock);
		return (NULL);
	}

	/*
	 * The common case is that the segment right after the rotor is free,
	 * and large enough that extracting 'size' bytes won't change which
	 * freelist it's on. In this case we can avoid a *lot* of work.
	 * Instead of the normal vmem_seg_alloc(), we just advance the start
	 * address of the victim segment. Instead of moving the rotor, we
	 * create the new segment structure *behind the rotor*, which has
	 * the same effect. And finally, we know we don't have to coalesce
	 * the rotor's neighbors because the new segment lies between them.
	 */
	rotor = &vmp->vm_rotor;
	vsp = rotor->vs_anext;
	if (vsp->vs_type == VMEM_FREE && (vs_size = VS_SIZE(vsp)) > realsize &&
	    P2SAMEHIGHBIT(vs_size, vs_size - realsize)) {
		ASSERT(highbit(vs_size) == highbit(vs_size - realsize));
		addr = vsp->vs_start;
		vsp->vs_start = addr + realsize;
		vmem_hash_insert(vmp,
		    vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size));
		(void) mutex_unlock(&vmp->vm_lock);
		return ((void *)addr);
	}

	/*
	 * Starting at the rotor, look for a segment large enough to
	 * satisfy the allocation.
	 */
	for (;;) {
		vmp->vm_kstat.vk_search++;
		if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size)
			break;
		vsp = vsp->vs_anext;
		if (vsp == rotor) {
			/*
			 * We've come full circle. One possibility is that
			 * there's actually enough space, but the rotor itself
			 * is preventing the allocation from succeeding because
			 * it's sitting between two free segments. Therefore,
			 * we advance the rotor and see if that liberates a
			 * suitable segment.
773 */ 774 vmem_advance(vmp, rotor, rotor->vs_anext); 775 vsp = rotor->vs_aprev; 776 if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) 777 break; 778 /* 779 * If there's a lower arena we can import from, or it's 780 * a VM_NOSLEEP allocation, let vmem_xalloc() handle it. 781 * Otherwise, wait until another thread frees something. 782 */ 783 if (vmp->vm_source_alloc != NULL || 784 (vmflag & VM_NOSLEEP)) { 785 (void) mutex_unlock(&vmp->vm_lock); 786 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 787 0, 0, NULL, NULL, vmflag & VM_UMFLAGS)); 788 } 789 vmp->vm_kstat.vk_wait++; 790 (void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock); 791 vsp = rotor->vs_anext; 792 } 793 } 794 795 /* 796 * We found a segment. Extract enough space to satisfy the allocation. 797 */ 798 addr = vsp->vs_start; 799 vsp = vmem_seg_alloc(vmp, vsp, addr, size); 800 ASSERT(vsp->vs_type == VMEM_ALLOC && 801 vsp->vs_start == addr && vsp->vs_end == addr + size); 802 803 /* 804 * Advance the rotor to right after the newly-allocated segment. 805 * That's where the next VM_NEXTFIT allocation will begin searching. 806 */ 807 vmem_advance(vmp, rotor, vsp); 808 (void) mutex_unlock(&vmp->vm_lock); 809 return ((void *)addr); 810 } 811 812 /* 813 * Allocate size bytes at offset phase from an align boundary such that the 814 * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) 815 * that does not straddle a nocross-aligned boundary. 816 */ 817 void * 818 vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase, 819 size_t nocross, void *minaddr, void *maxaddr, int vmflag) 820 { 821 vmem_seg_t *vsp; 822 vmem_seg_t *vbest = NULL; 823 uintptr_t addr, taddr, start, end; 824 void *vaddr; 825 int hb, flist, resv; 826 uint32_t mtbf; 827 828 if (phase > 0 && phase >= align) 829 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 830 "invalid phase", 831 (void *)vmp, size, align, phase, nocross, 832 minaddr, maxaddr, vmflag); 833 834 if (align == 0) 835 align = vmp->vm_quantum; 836 837 if ((align | phase | nocross) & (vmp->vm_quantum - 1)) { 838 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 839 "parameters not vm_quantum aligned", 840 (void *)vmp, size, align, phase, nocross, 841 minaddr, maxaddr, vmflag); 842 } 843 844 if (nocross != 0 && 845 (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) { 846 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 847 "overconstrained allocation", 848 (void *)vmp, size, align, phase, nocross, 849 minaddr, maxaddr, vmflag); 850 } 851 852 if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && 853 (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) 854 return (NULL); 855 856 (void) mutex_lock(&vmp->vm_lock); 857 for (;;) { 858 if (vmp->vm_nsegfree < VMEM_MINFREE && 859 !vmem_populate(vmp, vmflag)) 860 break; 861 862 /* 863 * highbit() returns the highest bit + 1, which is exactly 864 * what we want: we want to search the first freelist whose 865 * members are *definitely* large enough to satisfy our 866 * allocation. However, there are certain cases in which we 867 * want to look at the next-smallest freelist (which *might* 868 * be able to satisfy the allocation): 869 * 870 * (1) The size is exactly a power of 2, in which case 871 * the smaller freelist is always big enough; 872 * 873 * (2) All other freelists are empty; 874 * 875 * (3) We're in the highest possible freelist, which is 876 * always empty (e.g. the 4GB freelist on 32-bit systems); 877 * 878 * (4) We're doing a best-fit or first-fit allocation. 
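		 *
		 * As an illustrative walk-through (numbers chosen here for
		 * example only): for size = 0x1800, highbit(size) is 13, so
		 * the instant-fit search starts at the [0x2000, 0x4000)
		 * freelist (or the next non-empty one above it), whose
		 * members are guaranteed large enough. With VM_BESTFIT the
		 * search instead drops to the [0x1000, 0x2000) freelist,
		 * which might (or might not) hold a segment of at least
		 * 0x1800 bytes.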
		 */
		if ((size & (size - 1)) == 0) {
			flist = lowbit(P2ALIGN(vmp->vm_freemap, size));
		} else {
			hb = highbit(size);
			if ((vmp->vm_freemap >> hb) == 0 ||
			    hb == VMEM_FREELISTS ||
			    (vmflag & (VM_BESTFIT | VM_FIRSTFIT)))
				hb--;
			flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb));
		}

		for (vbest = NULL, vsp = (flist == 0) ? NULL :
		    vmp->vm_freelist[flist - 1].vs_knext;
		    vsp != NULL; vsp = vsp->vs_knext) {
			vmp->vm_kstat.vk_search++;
			if (vsp->vs_start == 0) {
				/*
				 * We're moving up to a larger freelist,
				 * so if we've already found a candidate,
				 * the fit can't possibly get any better.
				 */
				if (vbest != NULL)
					break;
				/*
				 * Find the next non-empty freelist.
				 */
				flist = lowbit(P2ALIGN(vmp->vm_freemap,
				    VS_SIZE(vsp)));
				if (flist-- == 0)
					break;
				vsp = (vmem_seg_t *)&vmp->vm_freelist[flist];
				ASSERT(vsp->vs_knext->vs_type == VMEM_FREE);
				continue;
			}
			if (vsp->vs_end - 1 < (uintptr_t)minaddr)
				continue;
			if (vsp->vs_start > (uintptr_t)maxaddr - 1)
				continue;
			start = MAX(vsp->vs_start, (uintptr_t)minaddr);
			end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
			taddr = P2PHASEUP(start, align, phase);
			if (P2CROSS(taddr, taddr + size - 1, nocross))
				taddr +=
				    P2ROUNDUP(P2NPHASE(taddr, nocross), align);
			if ((taddr - start) + size > end - start ||
			    (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest)))
				continue;
			vbest = vsp;
			addr = taddr;
			if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size)
				break;
		}
		if (vbest != NULL)
			break;
		if (size == 0)
			umem_panic("vmem_xalloc(): size == 0");
		if (vmp->vm_source_alloc != NULL && nocross == 0 &&
		    minaddr == NULL && maxaddr == NULL) {
			size_t asize = P2ROUNDUP(size + phase,
			    MAX(align, vmp->vm_source->vm_quantum));
			if (asize < size) {	/* overflow */
				(void) mutex_unlock(&vmp->vm_lock);
				if (vmflag & VM_NOSLEEP)
					return (NULL);

				umem_panic("vmem_xalloc(): "
				    "overflow on VM_SLEEP allocation");
			}
			/*
			 * Determine how many segment structures we'll consume.
			 * The calculation must be precise because if we're
			 * here on behalf of vmem_populate(), we are taking
			 * segments from a very limited reserve.
			 */
			resv = (size == asize) ?
955 VMEM_SEGS_PER_SPAN_CREATE + 956 VMEM_SEGS_PER_EXACT_ALLOC : 957 VMEM_SEGS_PER_ALLOC_MAX; 958 ASSERT(vmp->vm_nsegfree >= resv); 959 vmp->vm_nsegfree -= resv; /* reserve our segs */ 960 (void) mutex_unlock(&vmp->vm_lock); 961 vaddr = vmp->vm_source_alloc(vmp->vm_source, asize, 962 vmflag & VM_UMFLAGS); 963 (void) mutex_lock(&vmp->vm_lock); 964 vmp->vm_nsegfree += resv; /* claim reservation */ 965 if (vaddr != NULL) { 966 vbest = vmem_span_create(vmp, vaddr, asize, 1); 967 addr = P2PHASEUP(vbest->vs_start, align, phase); 968 break; 969 } 970 } 971 (void) mutex_unlock(&vmp->vm_lock); 972 vmem_reap(); 973 (void) mutex_lock(&vmp->vm_lock); 974 if (vmflag & VM_NOSLEEP) 975 break; 976 vmp->vm_kstat.vk_wait++; 977 (void) _cond_wait(&vmp->vm_cv, &vmp->vm_lock); 978 } 979 if (vbest != NULL) { 980 ASSERT(vbest->vs_type == VMEM_FREE); 981 ASSERT(vbest->vs_knext != vbest); 982 (void) vmem_seg_alloc(vmp, vbest, addr, size); 983 (void) mutex_unlock(&vmp->vm_lock); 984 ASSERT(P2PHASE(addr, align) == phase); 985 ASSERT(!P2CROSS(addr, addr + size - 1, nocross)); 986 ASSERT(addr >= (uintptr_t)minaddr); 987 ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); 988 return ((void *)addr); 989 } 990 vmp->vm_kstat.vk_fail++; 991 (void) mutex_unlock(&vmp->vm_lock); 992 if (vmflag & VM_PANIC) 993 umem_panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " 994 "cannot satisfy mandatory allocation", 995 (void *)vmp, size, align, phase, nocross, 996 minaddr, maxaddr, vmflag); 997 return (NULL); 998 } 999 1000 /* 1001 * Free the segment [vaddr, vaddr + size), where vaddr was a constrained 1002 * allocation. vmem_xalloc() and vmem_xfree() must always be paired because 1003 * both routines bypass the quantum caches. 1004 */ 1005 void 1006 vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) 1007 { 1008 vmem_seg_t *vsp, *vnext, *vprev; 1009 1010 (void) mutex_lock(&vmp->vm_lock); 1011 1012 vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size); 1013 vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum); 1014 1015 /* 1016 * Attempt to coalesce with the next segment. 1017 */ 1018 vnext = vsp->vs_anext; 1019 if (vnext->vs_type == VMEM_FREE) { 1020 ASSERT(vsp->vs_end == vnext->vs_start); 1021 vmem_freelist_delete(vmp, vnext); 1022 vsp->vs_end = vnext->vs_end; 1023 vmem_seg_destroy(vmp, vnext); 1024 } 1025 1026 /* 1027 * Attempt to coalesce with the previous segment. 1028 */ 1029 vprev = vsp->vs_aprev; 1030 if (vprev->vs_type == VMEM_FREE) { 1031 ASSERT(vprev->vs_end == vsp->vs_start); 1032 vmem_freelist_delete(vmp, vprev); 1033 vprev->vs_end = vsp->vs_end; 1034 vmem_seg_destroy(vmp, vsp); 1035 vsp = vprev; 1036 } 1037 1038 /* 1039 * If the entire span is free, return it to the source. 1040 */ 1041 if (vsp->vs_import && vmp->vm_source_free != NULL && 1042 vsp->vs_aprev->vs_type == VMEM_SPAN && 1043 vsp->vs_anext->vs_type == VMEM_SPAN) { 1044 vaddr = (void *)vsp->vs_start; 1045 size = VS_SIZE(vsp); 1046 ASSERT(size == VS_SIZE(vsp->vs_aprev)); 1047 vmem_span_destroy(vmp, vsp); 1048 (void) mutex_unlock(&vmp->vm_lock); 1049 vmp->vm_source_free(vmp->vm_source, vaddr, size); 1050 } else { 1051 vmem_freelist_insert(vmp, vsp); 1052 (void) mutex_unlock(&vmp->vm_lock); 1053 } 1054 } 1055 1056 /* 1057 * Allocate size bytes from arena vmp. Returns the allocated address 1058 * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, 1059 * and may also specify best-fit, first-fit, or next-fit allocation policy 1060 * instead of the default instant-fit policy. VM_SLEEP allocations are 1061 * guaranteed to succeed. 
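 *
 * An illustrative call pattern (a sketch only; in this library callers use
 * VM_NOSLEEP, since VM_SLEEP is unsupported -- see the overview comment at
 * the top of this file):
 *
 *	void *buf = vmem_alloc(arena, 8192, VM_NOSLEEP);
 *	if (buf != NULL) {
 *		... use the 8192 bytes ...
 *		vmem_free(arena, buf, 8192);
 *	}
 *
 * The size passed to vmem_free() must match the size originally passed to
 * vmem_alloc() for that segment.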
1062 */ 1063 void * 1064 vmem_alloc(vmem_t *vmp, size_t size, int vmflag) 1065 { 1066 vmem_seg_t *vsp; 1067 uintptr_t addr; 1068 int hb; 1069 int flist = 0; 1070 uint32_t mtbf; 1071 1072 if (size - 1 < vmp->vm_qcache_max) { 1073 ASSERT(vmflag & VM_NOSLEEP); 1074 return (_umem_cache_alloc(vmp->vm_qcache[(size - 1) >> 1075 vmp->vm_qshift], UMEM_DEFAULT)); 1076 } 1077 1078 if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && 1079 (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) 1080 return (NULL); 1081 1082 if (vmflag & VM_NEXTFIT) 1083 return (vmem_nextfit_alloc(vmp, size, vmflag)); 1084 1085 if (vmflag & (VM_BESTFIT | VM_FIRSTFIT)) 1086 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0, 1087 NULL, NULL, vmflag)); 1088 1089 /* 1090 * Unconstrained instant-fit allocation from the segment list. 1091 */ 1092 (void) mutex_lock(&vmp->vm_lock); 1093 1094 if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) { 1095 if ((size & (size - 1)) == 0) 1096 flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); 1097 else if ((hb = highbit(size)) < VMEM_FREELISTS) 1098 flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); 1099 } 1100 1101 if (flist-- == 0) { 1102 (void) mutex_unlock(&vmp->vm_lock); 1103 return (vmem_xalloc(vmp, size, vmp->vm_quantum, 1104 0, 0, NULL, NULL, vmflag)); 1105 } 1106 1107 ASSERT(size <= (1UL << flist)); 1108 vsp = vmp->vm_freelist[flist].vs_knext; 1109 addr = vsp->vs_start; 1110 (void) vmem_seg_alloc(vmp, vsp, addr, size); 1111 (void) mutex_unlock(&vmp->vm_lock); 1112 return ((void *)addr); 1113 } 1114 1115 /* 1116 * Free the segment [vaddr, vaddr + size). 1117 */ 1118 void 1119 vmem_free(vmem_t *vmp, void *vaddr, size_t size) 1120 { 1121 if (size - 1 < vmp->vm_qcache_max) 1122 _umem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], 1123 vaddr); 1124 else 1125 vmem_xfree(vmp, vaddr, size); 1126 } 1127 1128 /* 1129 * Determine whether arena vmp contains the segment [vaddr, vaddr + size). 1130 */ 1131 int 1132 vmem_contains(vmem_t *vmp, void *vaddr, size_t size) 1133 { 1134 uintptr_t start = (uintptr_t)vaddr; 1135 uintptr_t end = start + size; 1136 vmem_seg_t *vsp; 1137 vmem_seg_t *seg0 = &vmp->vm_seg0; 1138 1139 (void) mutex_lock(&vmp->vm_lock); 1140 vmp->vm_kstat.vk_contains++; 1141 for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) { 1142 vmp->vm_kstat.vk_contains_search++; 1143 ASSERT(vsp->vs_type == VMEM_SPAN); 1144 if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1) 1145 break; 1146 } 1147 (void) mutex_unlock(&vmp->vm_lock); 1148 return (vsp != seg0); 1149 } 1150 1151 /* 1152 * Add the span [vaddr, vaddr + size) to arena vmp. 1153 */ 1154 void * 1155 vmem_add(vmem_t *vmp, void *vaddr, size_t size, int vmflag) 1156 { 1157 if (vaddr == NULL || size == 0) { 1158 umem_panic("vmem_add(%p, %p, %lu): bad arguments", 1159 vmp, vaddr, size); 1160 } 1161 1162 ASSERT(!vmem_contains(vmp, vaddr, size)); 1163 1164 (void) mutex_lock(&vmp->vm_lock); 1165 if (vmem_populate(vmp, vmflag)) 1166 (void) vmem_span_create(vmp, vaddr, size, 0); 1167 else 1168 vaddr = NULL; 1169 (void) cond_broadcast(&vmp->vm_cv); 1170 (void) mutex_unlock(&vmp->vm_lock); 1171 return (vaddr); 1172 } 1173 1174 /* 1175 * Adds the address range [addr, endaddr) to arena vmp, by either: 1176 * 1. joining two existing spans, [x, addr), and [endaddr, y) (which 1177 * are in that order) into a single [x, y) span, 1178 * 2. expanding an existing [x, addr) span to [x, endaddr), 1179 * 3. expanding an existing [endaddr, x) span to [addr, x), or 1180 * 4. 
creating a new [addr, endaddr) span.
 *
 * Called with vmp->vm_lock held, and a successful vmem_populate() completed.
 * Cannot fail. Returns the new segment.
 *
 * NOTE: this algorithm is linear-time in the number of spans, but is
 * constant-time when you are extending the last (highest-addressed)
 * span.
 */
static vmem_seg_t *
vmem_extend_unlocked(vmem_t *vmp, uintptr_t addr, uintptr_t endaddr)
{
	vmem_seg_t *span;
	vmem_seg_t *vsp;

	vmem_seg_t *end = &vmp->vm_seg0;

	ASSERT(MUTEX_HELD(&vmp->vm_lock));

	/*
	 * The second "if" clause below relies on the direction of this search.
	 */
	for (span = end->vs_kprev; span != end; span = span->vs_kprev) {
		if (span->vs_end == addr || span->vs_start == endaddr)
			break;
	}

	if (span == end)
		return (vmem_span_create(vmp, (void *)addr, endaddr - addr, 0));
	if (span->vs_kprev->vs_end == addr && span->vs_start == endaddr) {
		vmem_seg_t *prevspan = span->vs_kprev;
		vmem_seg_t *nextseg = span->vs_anext;
		vmem_seg_t *prevseg = span->vs_aprev;

		/*
		 * prevspan becomes the span marker for the full range
		 */
		prevspan->vs_end = span->vs_end;

		/*
		 * Notionally, span becomes a free segment representing
		 * [addr, endaddr).
		 *
		 * However, if either of its neighbors is free, we coalesce
		 * by destroying span and changing the free segment.
		 */
		if (prevseg->vs_type == VMEM_FREE &&
		    nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce both ways
			 */
			ASSERT(prevseg->vs_end == addr &&
			    nextseg->vs_start == endaddr);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = nextseg->vs_end;

			vmem_freelist_delete(vmp, nextseg);
			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, nextseg);
			vmem_seg_destroy(vmp, span);

			vsp = prevseg;
		} else if (prevseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce left
			 */
			ASSERT(prevseg->vs_end == addr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, prevseg);
			prevseg->vs_end = endaddr;

			vsp = prevseg;
		} else if (nextseg->vs_type == VMEM_FREE) {
			/*
			 * coalesce right
			 */
			ASSERT(nextseg->vs_start == endaddr);

			VMEM_DELETE(span, k);
			vmem_seg_destroy(vmp, span);

			vmem_freelist_delete(vmp, nextseg);
			nextseg->vs_start = addr;

			vsp = nextseg;
		} else {
			/*
			 * cannot coalesce
			 */
			VMEM_DELETE(span, k);
			span->vs_start = addr;
			span->vs_end = endaddr;

			vsp = span;
		}
	} else if (span->vs_end == addr) {
		vmem_seg_t *oldseg = span->vs_knext->vs_aprev;
		span->vs_end = endaddr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_end == addr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_end = endaddr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, oldseg, addr, endaddr);
	} else {
		vmem_seg_t *oldseg = span->vs_anext;
		ASSERT(span->vs_start == endaddr);
		span->vs_start = addr;

		ASSERT(oldseg->vs_type != VMEM_SPAN);
		if (oldseg->vs_type == VMEM_FREE) {
			ASSERT(oldseg->vs_start == endaddr);
			vmem_freelist_delete(vmp, oldseg);
			oldseg->vs_start = addr;
			vsp = oldseg;
		} else
			vsp = vmem_seg_create(vmp, span, addr, endaddr);
	}
	vmem_freelist_insert(vmp, vsp);
vmp->vm_kstat.vk_mem_total += (endaddr - addr); 1307 return (vsp); 1308 } 1309 1310 /* 1311 * Does some error checking, calls vmem_extend_unlocked to add 1312 * [vaddr, vaddr+size) to vmp, then allocates alloc bytes from the 1313 * newly merged segment. 1314 */ 1315 void * 1316 _vmem_extend_alloc(vmem_t *vmp, void *vaddr, size_t size, size_t alloc, 1317 int vmflag) 1318 { 1319 uintptr_t addr = (uintptr_t)vaddr; 1320 uintptr_t endaddr = addr + size; 1321 vmem_seg_t *vsp; 1322 1323 ASSERT(vaddr != NULL && size != 0 && endaddr > addr); 1324 ASSERT(alloc <= size && alloc != 0); 1325 ASSERT(((addr | size | alloc) & (vmp->vm_quantum - 1)) == 0); 1326 1327 ASSERT(!vmem_contains(vmp, vaddr, size)); 1328 1329 (void) mutex_lock(&vmp->vm_lock); 1330 if (!vmem_populate(vmp, vmflag)) { 1331 (void) mutex_unlock(&vmp->vm_lock); 1332 return (NULL); 1333 } 1334 /* 1335 * if there is a source, we can't mess with the spans 1336 */ 1337 if (vmp->vm_source_alloc != NULL) 1338 vsp = vmem_span_create(vmp, vaddr, size, 0); 1339 else 1340 vsp = vmem_extend_unlocked(vmp, addr, endaddr); 1341 1342 ASSERT(VS_SIZE(vsp) >= alloc); 1343 1344 addr = vsp->vs_start; 1345 (void) vmem_seg_alloc(vmp, vsp, addr, alloc); 1346 vaddr = (void *)addr; 1347 1348 (void) cond_broadcast(&vmp->vm_cv); 1349 (void) mutex_unlock(&vmp->vm_lock); 1350 1351 return (vaddr); 1352 } 1353 1354 /* 1355 * Walk the vmp arena, applying func to each segment matching typemask. 1356 * If VMEM_REENTRANT is specified, the arena lock is dropped across each 1357 * call to func(); otherwise, it is held for the duration of vmem_walk() 1358 * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks 1359 * are *not* necessarily consistent, so they may only be used when a hint 1360 * is adequate. 1361 */ 1362 void 1363 vmem_walk(vmem_t *vmp, int typemask, 1364 void (*func)(void *, void *, size_t), void *arg) 1365 { 1366 vmem_seg_t *vsp; 1367 vmem_seg_t *seg0 = &vmp->vm_seg0; 1368 vmem_seg_t walker; 1369 1370 if (typemask & VMEM_WALKER) 1371 return; 1372 1373 bzero(&walker, sizeof (walker)); 1374 walker.vs_type = VMEM_WALKER; 1375 1376 (void) mutex_lock(&vmp->vm_lock); 1377 VMEM_INSERT(seg0, &walker, a); 1378 for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) { 1379 if (vsp->vs_type & typemask) { 1380 void *start = (void *)vsp->vs_start; 1381 size_t size = VS_SIZE(vsp); 1382 if (typemask & VMEM_REENTRANT) { 1383 vmem_advance(vmp, &walker, vsp); 1384 (void) mutex_unlock(&vmp->vm_lock); 1385 func(arg, start, size); 1386 (void) mutex_lock(&vmp->vm_lock); 1387 vsp = &walker; 1388 } else { 1389 func(arg, start, size); 1390 } 1391 } 1392 } 1393 vmem_advance(vmp, &walker, NULL); 1394 (void) mutex_unlock(&vmp->vm_lock); 1395 } 1396 1397 /* 1398 * Return the total amount of memory whose type matches typemask. Thus: 1399 * 1400 * typemask VMEM_ALLOC yields total memory allocated (in use). 1401 * typemask VMEM_FREE yields total memory free (available). 1402 * typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size. 1403 */ 1404 size_t 1405 vmem_size(vmem_t *vmp, int typemask) 1406 { 1407 uint64_t size = 0; 1408 1409 if (typemask & VMEM_ALLOC) 1410 size += vmp->vm_kstat.vk_mem_inuse; 1411 if (typemask & VMEM_FREE) 1412 size += vmp->vm_kstat.vk_mem_total - 1413 vmp->vm_kstat.vk_mem_inuse; 1414 return ((size_t)size); 1415 } 1416 1417 /* 1418 * Create an arena called name whose initial span is [base, base + size). 1419 * The arena's natural unit of currency is quantum, so vmem_alloc() 1420 * guarantees quantum-aligned results. 
The arena may import new spans 1421 * by invoking afunc() on source, and may return those spans by invoking 1422 * ffunc() on source. To make small allocations fast and scalable, 1423 * the arena offers high-performance caching for each integer multiple 1424 * of quantum up to qcache_max. 1425 */ 1426 vmem_t * 1427 vmem_create(const char *name, void *base, size_t size, size_t quantum, 1428 vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source, 1429 size_t qcache_max, int vmflag) 1430 { 1431 int i; 1432 size_t nqcache; 1433 vmem_t *vmp, *cur, **vmpp; 1434 vmem_seg_t *vsp; 1435 vmem_freelist_t *vfp; 1436 uint32_t id = atomic_add_32_nv(&vmem_id, 1); 1437 1438 if (vmem_vmem_arena != NULL) { 1439 vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t), 1440 vmflag & VM_UMFLAGS); 1441 } else { 1442 ASSERT(id <= VMEM_INITIAL); 1443 vmp = &vmem0[id - 1]; 1444 } 1445 1446 if (vmp == NULL) 1447 return (NULL); 1448 bzero(vmp, sizeof (vmem_t)); 1449 1450 (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name); 1451 (void) mutex_init(&vmp->vm_lock, USYNC_THREAD, NULL); 1452 (void) cond_init(&vmp->vm_cv, USYNC_THREAD, NULL); 1453 vmp->vm_cflags = vmflag; 1454 vmflag &= VM_UMFLAGS; 1455 1456 vmp->vm_quantum = quantum; 1457 vmp->vm_qshift = highbit(quantum) - 1; 1458 nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX); 1459 1460 for (i = 0; i <= VMEM_FREELISTS; i++) { 1461 vfp = &vmp->vm_freelist[i]; 1462 vfp->vs_end = 1UL << i; 1463 vfp->vs_knext = (vmem_seg_t *)(vfp + 1); 1464 vfp->vs_kprev = (vmem_seg_t *)(vfp - 1); 1465 } 1466 1467 vmp->vm_freelist[0].vs_kprev = NULL; 1468 vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL; 1469 vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0; 1470 vmp->vm_hash_table = vmp->vm_hash0; 1471 vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1; 1472 vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); 1473 1474 vsp = &vmp->vm_seg0; 1475 vsp->vs_anext = vsp; 1476 vsp->vs_aprev = vsp; 1477 vsp->vs_knext = vsp; 1478 vsp->vs_kprev = vsp; 1479 vsp->vs_type = VMEM_SPAN; 1480 1481 vsp = &vmp->vm_rotor; 1482 vsp->vs_type = VMEM_ROTOR; 1483 VMEM_INSERT(&vmp->vm_seg0, vsp, a); 1484 1485 vmp->vm_id = id; 1486 if (source != NULL) 1487 vmp->vm_kstat.vk_source_id = source->vm_id; 1488 vmp->vm_source = source; 1489 vmp->vm_source_alloc = afunc; 1490 vmp->vm_source_free = ffunc; 1491 1492 if (nqcache != 0) { 1493 vmp->vm_qcache_max = nqcache << vmp->vm_qshift; 1494 for (i = 0; i < nqcache; i++) { 1495 char buf[VMEM_NAMELEN + 21]; 1496 (void) snprintf(buf, sizeof (buf), "%s_%lu", 1497 vmp->vm_name, (long)((i + 1) * quantum)); 1498 vmp->vm_qcache[i] = umem_cache_create(buf, 1499 (i + 1) * quantum, quantum, NULL, NULL, NULL, 1500 NULL, vmp, UMC_QCACHE | UMC_NOTOUCH); 1501 if (vmp->vm_qcache[i] == NULL) { 1502 vmp->vm_qcache_max = i * quantum; 1503 break; 1504 } 1505 } 1506 } 1507 1508 (void) mutex_lock(&vmem_list_lock); 1509 vmpp = &vmem_list; 1510 while ((cur = *vmpp) != NULL) 1511 vmpp = &cur->vm_next; 1512 *vmpp = vmp; 1513 (void) mutex_unlock(&vmem_list_lock); 1514 1515 if (vmp->vm_cflags & VMC_POPULATOR) { 1516 uint_t pop_id = atomic_add_32_nv(&vmem_populators, 1); 1517 ASSERT(pop_id <= VMEM_INITIAL); 1518 vmem_populator[pop_id - 1] = vmp; 1519 (void) mutex_lock(&vmp->vm_lock); 1520 (void) vmem_populate(vmp, vmflag | VM_PANIC); 1521 (void) mutex_unlock(&vmp->vm_lock); 1522 } 1523 1524 if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) { 1525 vmem_destroy(vmp); 1526 return (NULL); 1527 } 1528 1529 return (vmp); 1530 } 1531 1532 /* 1533 * Destroy arena vmp. 
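 * All outstanding allocations should be freed first; anything still
 * allocated is reported as leaked. A hedged sketch of the create/destroy
 * pairing ("my_va", base, and len below are placeholders, not names defined
 * in this file):
 *
 *	vmem_t *va = vmem_create("my_va", (void *)base, len, 4096,
 *	    NULL, NULL, NULL, 0, VM_NOSLEEP);
 *	...
 *	vmem_destroy(va);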
1534 */ 1535 void 1536 vmem_destroy(vmem_t *vmp) 1537 { 1538 vmem_t *cur, **vmpp; 1539 vmem_seg_t *seg0 = &vmp->vm_seg0; 1540 vmem_seg_t *vsp; 1541 size_t leaked; 1542 int i; 1543 1544 (void) mutex_lock(&vmem_list_lock); 1545 vmpp = &vmem_list; 1546 while ((cur = *vmpp) != vmp) 1547 vmpp = &cur->vm_next; 1548 *vmpp = vmp->vm_next; 1549 (void) mutex_unlock(&vmem_list_lock); 1550 1551 for (i = 0; i < VMEM_NQCACHE_MAX; i++) 1552 if (vmp->vm_qcache[i]) 1553 umem_cache_destroy(vmp->vm_qcache[i]); 1554 1555 leaked = vmem_size(vmp, VMEM_ALLOC); 1556 if (leaked != 0) 1557 umem_printf("vmem_destroy('%s'): leaked %lu bytes", 1558 vmp->vm_name, leaked); 1559 1560 if (vmp->vm_hash_table != vmp->vm_hash0) 1561 vmem_free(vmem_hash_arena, vmp->vm_hash_table, 1562 (vmp->vm_hash_mask + 1) * sizeof (void *)); 1563 1564 /* 1565 * Give back the segment structures for anything that's left in the 1566 * arena, e.g. the primary spans and their free segments. 1567 */ 1568 VMEM_DELETE(&vmp->vm_rotor, a); 1569 for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) 1570 vmem_putseg_global(vsp); 1571 1572 while (vmp->vm_nsegfree > 0) 1573 vmem_putseg_global(vmem_getseg(vmp)); 1574 1575 (void) mutex_destroy(&vmp->vm_lock); 1576 (void) cond_destroy(&vmp->vm_cv); 1577 vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t)); 1578 } 1579 1580 /* 1581 * Resize vmp's hash table to keep the average lookup depth near 1.0. 1582 */ 1583 static void 1584 vmem_hash_rescale(vmem_t *vmp) 1585 { 1586 vmem_seg_t **old_table, **new_table, *vsp; 1587 size_t old_size, new_size, h, nseg; 1588 1589 nseg = (size_t)(vmp->vm_kstat.vk_alloc - vmp->vm_kstat.vk_free); 1590 1591 new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2)); 1592 old_size = vmp->vm_hash_mask + 1; 1593 1594 if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) 1595 return; 1596 1597 new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *), 1598 VM_NOSLEEP); 1599 if (new_table == NULL) 1600 return; 1601 bzero(new_table, new_size * sizeof (void *)); 1602 1603 (void) mutex_lock(&vmp->vm_lock); 1604 1605 old_size = vmp->vm_hash_mask + 1; 1606 old_table = vmp->vm_hash_table; 1607 1608 vmp->vm_hash_mask = new_size - 1; 1609 vmp->vm_hash_table = new_table; 1610 vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); 1611 1612 for (h = 0; h < old_size; h++) { 1613 vsp = old_table[h]; 1614 while (vsp != NULL) { 1615 uintptr_t addr = vsp->vs_start; 1616 vmem_seg_t *next_vsp = vsp->vs_knext; 1617 vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr); 1618 vsp->vs_knext = *hash_bucket; 1619 *hash_bucket = vsp; 1620 vsp = next_vsp; 1621 } 1622 } 1623 1624 (void) mutex_unlock(&vmp->vm_lock); 1625 1626 if (old_table != vmp->vm_hash0) 1627 vmem_free(vmem_hash_arena, old_table, 1628 old_size * sizeof (void *)); 1629 } 1630 1631 /* 1632 * Perform periodic maintenance on all vmem arenas. 1633 */ 1634 /*ARGSUSED*/ 1635 void 1636 vmem_update(void *dummy) 1637 { 1638 vmem_t *vmp; 1639 1640 (void) mutex_lock(&vmem_list_lock); 1641 for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) { 1642 /* 1643 * If threads are waiting for resources, wake them up 1644 * periodically so they can issue another vmem_reap() 1645 * to reclaim resources cached by the slab allocator. 1646 */ 1647 (void) cond_broadcast(&vmp->vm_cv); 1648 1649 /* 1650 * Rescale the hash table to keep the hash chains short. 1651 */ 1652 vmem_hash_rescale(vmp); 1653 } 1654 (void) mutex_unlock(&vmem_list_lock); 1655 } 1656 1657 /* 1658 * If vmem_init is called again, we need to be able to reset the world. 
1659 * That includes resetting the statics back to their original values. 1660 */ 1661 void 1662 vmem_startup(void) 1663 { 1664 #ifdef UMEM_STANDALONE 1665 vmem_id = 0; 1666 vmem_populators = 0; 1667 vmem_segfree = NULL; 1668 vmem_list = NULL; 1669 vmem_internal_arena = NULL; 1670 vmem_seg_arena = NULL; 1671 vmem_hash_arena = NULL; 1672 vmem_vmem_arena = NULL; 1673 vmem_heap = NULL; 1674 vmem_heap_alloc = NULL; 1675 vmem_heap_free = NULL; 1676 1677 bzero(vmem0, sizeof (vmem0)); 1678 bzero(vmem_populator, sizeof (vmem_populator)); 1679 bzero(vmem_seg0, sizeof (vmem_seg0)); 1680 #endif 1681 } 1682 1683 /* 1684 * Prepare vmem for use. 1685 */ 1686 vmem_t * 1687 vmem_init(const char *parent_name, size_t parent_quantum, 1688 vmem_alloc_t *parent_alloc, vmem_free_t *parent_free, 1689 const char *heap_name, void *heap_start, size_t heap_size, 1690 size_t heap_quantum, vmem_alloc_t *heap_alloc, vmem_free_t *heap_free) 1691 { 1692 uint32_t id; 1693 int nseg = VMEM_SEG_INITIAL; 1694 vmem_t *parent, *heap; 1695 1696 ASSERT(vmem_internal_arena == NULL); 1697 1698 while (--nseg >= 0) 1699 vmem_putseg_global(&vmem_seg0[nseg]); 1700 1701 if (parent_name != NULL) { 1702 parent = vmem_create(parent_name, 1703 heap_start, heap_size, parent_quantum, 1704 NULL, NULL, NULL, 0, 1705 VM_SLEEP | VMC_POPULATOR); 1706 heap_start = NULL; 1707 heap_size = 0; 1708 } else { 1709 ASSERT(parent_alloc == NULL && parent_free == NULL); 1710 parent = NULL; 1711 } 1712 1713 heap = vmem_create(heap_name, 1714 heap_start, heap_size, heap_quantum, 1715 parent_alloc, parent_free, parent, 0, 1716 VM_SLEEP | VMC_POPULATOR); 1717 1718 vmem_heap = heap; 1719 vmem_heap_alloc = heap_alloc; 1720 vmem_heap_free = heap_free; 1721 1722 vmem_internal_arena = vmem_create("vmem_internal", 1723 NULL, 0, heap_quantum, 1724 heap_alloc, heap_free, heap, 0, 1725 VM_SLEEP | VMC_POPULATOR); 1726 1727 vmem_seg_arena = vmem_create("vmem_seg", 1728 NULL, 0, heap_quantum, 1729 vmem_alloc, vmem_free, vmem_internal_arena, 0, 1730 VM_SLEEP | VMC_POPULATOR); 1731 1732 vmem_hash_arena = vmem_create("vmem_hash", 1733 NULL, 0, 8, 1734 vmem_alloc, vmem_free, vmem_internal_arena, 0, 1735 VM_SLEEP); 1736 1737 vmem_vmem_arena = vmem_create("vmem_vmem", 1738 vmem0, sizeof (vmem0), 1, 1739 vmem_alloc, vmem_free, vmem_internal_arena, 0, 1740 VM_SLEEP); 1741 1742 for (id = 0; id < vmem_id; id++) 1743 (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t), 1744 1, 0, 0, &vmem0[id], &vmem0[id + 1], 1745 VM_NOSLEEP | VM_BESTFIT | VM_PANIC); 1746 1747 return (heap); 1748 } 1749 1750 void 1751 vmem_no_debug(void) 1752 { 1753 /* 1754 * This size must be a multiple of the minimum required alignment, 1755 * since vmem_populate allocates them compactly. 1756 */ 1757 vmem_seg_size = P2ROUNDUP(offsetof(vmem_seg_t, vs_thread), 1758 sizeof (hrtime_t)); 1759 } 1760 1761 /* 1762 * Lockup and release, for fork1(2) handling. 1763 */ 1764 void 1765 vmem_lockup(void) 1766 { 1767 vmem_t *cur; 1768 1769 (void) mutex_lock(&vmem_list_lock); 1770 (void) mutex_lock(&vmem_nosleep_lock.vmpl_mutex); 1771 1772 /* 1773 * Lock up and broadcast all arenas. 
1774 */ 1775 for (cur = vmem_list; cur != NULL; cur = cur->vm_next) { 1776 (void) mutex_lock(&cur->vm_lock); 1777 (void) cond_broadcast(&cur->vm_cv); 1778 } 1779 1780 (void) mutex_lock(&vmem_segfree_lock); 1781 } 1782 1783 void 1784 vmem_release(void) 1785 { 1786 vmem_t *cur; 1787 1788 (void) mutex_unlock(&vmem_nosleep_lock.vmpl_mutex); 1789 1790 for (cur = vmem_list; cur != NULL; cur = cur->vm_next) 1791 (void) mutex_unlock(&cur->vm_lock); 1792 1793 (void) mutex_unlock(&vmem_segfree_lock); 1794 (void) mutex_unlock(&vmem_list_lock); 1795 } 1796