/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
 * Copyright (c) 2013 EMC Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * From:
 *	$NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
 *	$NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
 */

/*
 * reference:
 * -	Magazines and Vmem: Extending the Slab Allocator
 *	to Many CPUs and Arbitrary Resources
 *	http://www.usenix.org/event/usenix01/bonwick.html
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>

#include "opt_vm.h"

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_phys.h>
#include <vm/vm_pagequeue.h>
#include <vm/uma_int.h>

#define	VMEM_OPTORDER		5
#define	VMEM_OPTVALUE		(1 << VMEM_OPTORDER)
#define	VMEM_MAXORDER						\
	(VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER)

#define	VMEM_HASHSIZE_MIN	16
#define	VMEM_HASHSIZE_MAX	131072

#define	VMEM_QCACHE_IDX_MAX	16

#define	VMEM_FITMASK	(M_BESTFIT | M_FIRSTFIT | M_NEXTFIT)

#define	VMEM_FLAGS	(M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM |	\
    M_BESTFIT | M_FIRSTFIT | M_NEXTFIT)

#define	BT_FLAGS	(M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)

#define	QC_NAME_MAX	16

/*
 * Data structures private to vmem.
 */
MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");

typedef struct vmem_btag bt_t;

TAILQ_HEAD(vmem_seglist, vmem_btag);
LIST_HEAD(vmem_freelist, vmem_btag);
LIST_HEAD(vmem_hashlist, vmem_btag);

struct qcache {
	uma_zone_t	qc_cache;
	vmem_t 		*qc_vmem;
	vmem_size_t	qc_size;
	char		qc_name[QC_NAME_MAX];
};
typedef struct qcache qcache_t;
#define	QC_POOL_TO_QCACHE(pool)	((qcache_t *)(pool->pr_qcache))

#define	VMEM_NAME_MAX	16

/* boundary tag */
struct vmem_btag {
	TAILQ_ENTRY(vmem_btag) bt_seglist;
	union {
		LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
		LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
	} bt_u;
#define	bt_hashlist	bt_u.u_hashlist
#define	bt_freelist	bt_u.u_freelist
	vmem_addr_t	bt_start;
	vmem_size_t	bt_size;
	int		bt_type;
};

/* vmem arena */
struct vmem {
	struct mtx_padalign	vm_lock;
	struct cv		vm_cv;
	char			vm_name[VMEM_NAME_MAX+1];
	LIST_ENTRY(vmem)	vm_alllist;
	struct vmem_hashlist	vm_hash0[VMEM_HASHSIZE_MIN];
	struct vmem_freelist	vm_freelist[VMEM_MAXORDER];
	struct vmem_seglist	vm_seglist;
	struct vmem_hashlist	*vm_hashlist;
	vmem_size_t		vm_hashsize;

	/* Constant after init */
	vmem_size_t		vm_qcache_max;
	vmem_size_t		vm_quantum_mask;
	vmem_size_t		vm_import_quantum;
	int			vm_quantum_shift;

	/* Written on alloc/free */
	LIST_HEAD(, vmem_btag)	vm_freetags;
	int			vm_nfreetags;
	int			vm_nbusytag;
	vmem_size_t		vm_inuse;
	vmem_size_t		vm_size;
	vmem_size_t		vm_limit;
	struct vmem_btag	vm_cursor;

	/* Used on import. */
	vmem_import_t		*vm_importfn;
	vmem_release_t		*vm_releasefn;
	void			*vm_arg;

	/* Space exhaustion callback. */
	vmem_reclaim_t		*vm_reclaimfn;

	/* quantum cache */
	qcache_t		vm_qcache[VMEM_QCACHE_IDX_MAX];
};

#define	BT_TYPE_SPAN		1	/* Allocated from importfn */
#define	BT_TYPE_SPAN_STATIC	2	/* vmem_add() or create. */
#define	BT_TYPE_FREE		3	/* Available space. */
#define	BT_TYPE_BUSY		4	/* Used space. */
#define	BT_TYPE_CURSOR		5	/* Cursor for nextfit allocations. */
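
/*
 * Illustrative (not normative) seglist layout for a single imported span
 * after one allocation of size "a" has been clipped from its start:
 *
 *	SPAN[addr, size] -> BUSY[addr, a] -> FREE[addr + a, size - a]
 *
 * BT_ISSPAN_P() below relies on the two span types having the lowest
 * BT_TYPE_* values.
 */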
#define	BT_ISSPAN_P(bt)	((bt)->bt_type <= BT_TYPE_SPAN_STATIC)

#define	BT_END(bt)	((bt)->bt_start + (bt)->bt_size - 1)

#if defined(DIAGNOSTIC)
static int enable_vmem_check = 0;
SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN,
    &enable_vmem_check, 0, "Enable vmem check");
static void vmem_check(vmem_t *);
#endif

static struct callout	vmem_periodic_ch;
static int		vmem_periodic_interval;
static struct task	vmem_periodic_wk;

static struct mtx_padalign __exclusive_cache_line vmem_list_lock;
static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
static uma_zone_t vmem_zone;

/* ---- misc */
#define	VMEM_CONDVAR_INIT(vm, wchan)	cv_init(&vm->vm_cv, wchan)
#define	VMEM_CONDVAR_DESTROY(vm)	cv_destroy(&vm->vm_cv)
#define	VMEM_CONDVAR_WAIT(vm)		cv_wait(&vm->vm_cv, &vm->vm_lock)
#define	VMEM_CONDVAR_BROADCAST(vm)	cv_broadcast(&vm->vm_cv)

#define	VMEM_LOCK(vm)		mtx_lock(&vm->vm_lock)
#define	VMEM_TRYLOCK(vm)	mtx_trylock(&vm->vm_lock)
#define	VMEM_UNLOCK(vm)		mtx_unlock(&vm->vm_lock)
#define	VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
#define	VMEM_LOCK_DESTROY(vm)	mtx_destroy(&vm->vm_lock)
#define	VMEM_ASSERT_LOCKED(vm)	mtx_assert(&vm->vm_lock, MA_OWNED);

#define	VMEM_ALIGNUP(addr, align)	(-(-(addr) & -(align)))

#define	VMEM_CROSS_P(addr1, addr2, boundary) \
	((((addr1) ^ (addr2)) & -(boundary)) != 0)

#define	ORDER2SIZE(order)	((order) < VMEM_OPTVALUE ? ((order) + 1) : \
	(vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1)))
#define	SIZE2ORDER(size)	((size) <= VMEM_OPTVALUE ? ((size) - 1) : \
	(flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2)))
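
/*
 * Illustrative values for the macros above, assuming the default
 * VMEM_OPTORDER of 5 (so VMEM_OPTVALUE == 32); a sketch for readers,
 * not a specification:
 *
 *	VMEM_ALIGNUP(0x1234, 0x1000) == 0x2000	(round up to the alignment)
 *	VMEM_CROSS_P(0x0fff, 0x1000, 0x1000) != 0 (the range crosses 0x1000)
 *
 *	SIZE2ORDER(1) == 0, SIZE2ORDER(32) == 31,
 *	SIZE2ORDER(40) == 31 (flsl(40) == 6), SIZE2ORDER(64) == 32
 *	ORDER2SIZE(31) == 32, ORDER2SIZE(32) == 64
 *
 * Sizes here are in units of the arena quantum, as used by the freelist
 * code below.
 */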

/*
 * Maximum number of boundary tags that may be required to satisfy an
 * allocation.  Two may be required to import.  Another two may be
 * required to clip edges.
 */
#define	BT_MAXALLOC	4

/*
 * Max free limits the number of locally cached boundary tags.  We
 * just want to avoid hitting the zone allocator for every call.
 */
#define	BT_MAXFREE	(BT_MAXALLOC * 8)

/* Allocator for boundary tags. */
static uma_zone_t vmem_bt_zone;

/* boot time arena storage. */
static struct vmem kernel_arena_storage;
static struct vmem buffer_arena_storage;
static struct vmem transient_arena_storage;
vmem_t *kernel_arena = &kernel_arena_storage;
vmem_t *buffer_arena = &buffer_arena_storage;
vmem_t *transient_arena = &transient_arena_storage;

#ifdef DEBUG_MEMGUARD
static struct vmem memguard_arena_storage;
vmem_t *memguard_arena = &memguard_arena_storage;
#endif

static bool
bt_isbusy(bt_t *bt)
{
	return (bt->bt_type == BT_TYPE_BUSY);
}

static bool
bt_isfree(bt_t *bt)
{
	return (bt->bt_type == BT_TYPE_FREE);
}

/*
 * Fill the vmem's boundary tag cache.  We guarantee that boundary tag
 * allocation will not fail once bt_fill() passes.  To do so we cache
 * at least the maximum possible tag allocations in the arena.
 */
static __noinline int
_bt_fill(vmem_t *vm, int flags)
{
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);

	/*
	 * Only allow the kernel arena and arenas derived from kernel arena to
	 * dip into reserve tags.  They are where new tags come from.
	 */
	flags &= BT_FLAGS;
	if (vm != kernel_arena && vm->vm_arg != kernel_arena)
		flags &= ~M_USE_RESERVE;

	/*
	 * Loop until we meet the reserve.  To minimize the lock shuffle
	 * and prevent simultaneous fills we first try a NOWAIT regardless
	 * of the caller's flags.  Specify M_NOVM so we don't recurse while
	 * holding a vmem lock.
	 */
	while (vm->vm_nfreetags < BT_MAXALLOC) {
		bt = uma_zalloc(vmem_bt_zone,
		    (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
		if (bt == NULL) {
			VMEM_UNLOCK(vm);
			bt = uma_zalloc(vmem_bt_zone, flags);
			VMEM_LOCK(vm);
			if (bt == NULL)
				break;
		}
		LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
		vm->vm_nfreetags++;
	}

	if (vm->vm_nfreetags < BT_MAXALLOC)
		return ENOMEM;

	return 0;
}

static inline int
bt_fill(vmem_t *vm, int flags)
{
	if (vm->vm_nfreetags >= BT_MAXALLOC)
		return (0);
	return (_bt_fill(vm, flags));
}

/*
 * Pop a tag off of the freetag stack.
 */
static bt_t *
bt_alloc(vmem_t *vm)
{
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);
	bt = LIST_FIRST(&vm->vm_freetags);
	MPASS(bt != NULL);
	LIST_REMOVE(bt, bt_freelist);
	vm->vm_nfreetags--;

	return bt;
}

/*
 * Trim the per-vmem free list.  Returns with the lock released to
 * avoid allocator recursions.
 */
static void
bt_freetrim(vmem_t *vm, int freelimit)
{
	LIST_HEAD(, vmem_btag) freetags;
	bt_t *bt;

	LIST_INIT(&freetags);
	VMEM_ASSERT_LOCKED(vm);
	while (vm->vm_nfreetags > freelimit) {
		bt = LIST_FIRST(&vm->vm_freetags);
		LIST_REMOVE(bt, bt_freelist);
		vm->vm_nfreetags--;
		LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
	}
	VMEM_UNLOCK(vm);
	while ((bt = LIST_FIRST(&freetags)) != NULL) {
		LIST_REMOVE(bt, bt_freelist);
		uma_zfree(vmem_bt_zone, bt);
	}
}

static inline void
bt_free(vmem_t *vm, bt_t *bt)
{

	VMEM_ASSERT_LOCKED(vm);
	MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
	LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
	vm->vm_nfreetags++;
}

/*
 * Hide MAXALLOC tags before dropping the arena lock to ensure that a
 * concurrent allocation attempt does not grab them.
 */
static void
bt_save(vmem_t *vm)
{
	KASSERT(vm->vm_nfreetags >= BT_MAXALLOC,
	    ("%s: insufficient free tags %d", __func__, vm->vm_nfreetags));
	vm->vm_nfreetags -= BT_MAXALLOC;
}

static void
bt_restore(vmem_t *vm)
{
	vm->vm_nfreetags += BT_MAXALLOC;
}

/*
 * freelist[0] ... [1, 1]
 * freelist[1] ... [2, 2]
 *  :
 * freelist[29] ... [30, 30]
 * freelist[30] ... [31, 31]
 * freelist[31] ... [32, 63]
 * freelist[32] ... [64, 127]
 *  :
 * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1]
 *  :
 */

static struct vmem_freelist *
bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
{
	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
	const int idx = SIZE2ORDER(qsize);

	MPASS(size != 0 && qsize != 0);
	MPASS((size & vm->vm_quantum_mask) == 0);
	MPASS(idx >= 0);
	MPASS(idx < VMEM_MAXORDER);

	return &vm->vm_freelist[idx];
}

/*
 * bt_freehead_toalloc: return the freelist for the given size and allocation
 * strategy.
 *
 * For M_FIRSTFIT, return the list in which any block is large enough for the
 * requested size.  Otherwise, return the list which may contain blocks large
 * enough for the requested size.
 */
static struct vmem_freelist *
bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
{
	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
	int idx = SIZE2ORDER(qsize);

	MPASS(size != 0 && qsize != 0);
	MPASS((size & vm->vm_quantum_mask) == 0);

	if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) {
		idx++;
		/* check too large request? */
	}
	MPASS(idx >= 0);
	MPASS(idx < VMEM_MAXORDER);

	return &vm->vm_freelist[idx];
}
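
/*
 * An illustrative example of the distinction made by bt_freehead_toalloc()
 * above: a request of 40 quanta maps to order 31, and ORDER2SIZE(31) == 32
 * != 40.  M_BESTFIT therefore starts scanning at freelist[31], whose blocks
 * may still be smaller than the request, while M_FIRSTFIT starts at
 * freelist[32], where every block holds at least 64 quanta and is guaranteed
 * to fit.
 */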

/* ---- boundary tag hash */

static struct vmem_hashlist *
bt_hashhead(vmem_t *vm, vmem_addr_t addr)
{
	struct vmem_hashlist *list;
	unsigned int hash;

	hash = hash32_buf(&addr, sizeof(addr), 0);
	list = &vm->vm_hashlist[hash % vm->vm_hashsize];

	return list;
}

static bt_t *
bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
{
	struct vmem_hashlist *list;
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);
	list = bt_hashhead(vm, addr);
	LIST_FOREACH(bt, list, bt_hashlist) {
		if (bt->bt_start == addr) {
			break;
		}
	}

	return bt;
}

static void
bt_rembusy(vmem_t *vm, bt_t *bt)
{

	VMEM_ASSERT_LOCKED(vm);
	MPASS(vm->vm_nbusytag > 0);
	vm->vm_inuse -= bt->bt_size;
	vm->vm_nbusytag--;
	LIST_REMOVE(bt, bt_hashlist);
}

static void
bt_insbusy(vmem_t *vm, bt_t *bt)
{
	struct vmem_hashlist *list;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(bt->bt_type == BT_TYPE_BUSY);

	list = bt_hashhead(vm, bt->bt_start);
	LIST_INSERT_HEAD(list, bt, bt_hashlist);
	vm->vm_nbusytag++;
	vm->vm_inuse += bt->bt_size;
}

/* ---- boundary tag list */

static void
bt_remseg(vmem_t *vm, bt_t *bt)
{

	MPASS(bt->bt_type != BT_TYPE_CURSOR);
	TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
	bt_free(vm, bt);
}

static void
bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
{

	TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
}

static void
bt_insseg_tail(vmem_t *vm, bt_t *bt)
{

	TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
}

static void
bt_remfree(vmem_t *vm __unused, bt_t *bt)
{

	MPASS(bt->bt_type == BT_TYPE_FREE);

	LIST_REMOVE(bt, bt_freelist);
}

static void
bt_insfree(vmem_t *vm, bt_t *bt)
{
	struct vmem_freelist *list;

	list = bt_freehead_tofree(vm, bt->bt_size);
	LIST_INSERT_HEAD(list, bt, bt_freelist);
}

/* ---- vmem internal functions */

/*
 * Import from the arena into the quantum cache in UMA.
 *
 * We use VMEM_ADDR_QCACHE_MIN instead of 0: uma_zalloc() returns 0 to indicate
 * failure, so UMA can't be used to cache a resource with value 0.
 */
static int
qc_import(void *arg, void **store, int cnt, int domain, int flags)
{
	qcache_t *qc;
	vmem_addr_t addr;
	int i;

	KASSERT((flags & M_WAITOK) == 0, ("blocking allocation"));

	qc = arg;
	for (i = 0; i < cnt; i++) {
		if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
		    VMEM_ADDR_QCACHE_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
			break;
		store[i] = (void *)addr;
	}
	return (i);
}

/*
 * Release memory from the UMA cache to the arena.
 */
static void
qc_release(void *arg, void **store, int cnt)
{
	qcache_t *qc;
	int i;

	qc = arg;
	for (i = 0; i < cnt; i++)
		vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
}

static void
qc_init(vmem_t *vm, vmem_size_t qcache_max)
{
	qcache_t *qc;
	vmem_size_t size;
	int qcache_idx_max;
	int i;

	MPASS((qcache_max & vm->vm_quantum_mask) == 0);
	qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
	    VMEM_QCACHE_IDX_MAX);
	vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++) {
		qc = &vm->vm_qcache[i];
		size = (i + 1) << vm->vm_quantum_shift;
		snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
		    vm->vm_name, size);
		qc->qc_vmem = vm;
		qc->qc_size = size;
		qc->qc_cache = uma_zcache_create(qc->qc_name, size,
		    NULL, NULL, NULL, NULL, qc_import, qc_release, qc, 0);
		MPASS(qc->qc_cache);
	}
}
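
/*
 * As an illustration (assuming a 4 KB quantum and a qcache_max of 64 KB,
 * neither of which is mandated by this file): qcache_idx_max becomes 16 and
 * qc_init() above creates caches named "<arena>-4096" through
 * "<arena>-65536", one per multiple of the quantum up to vm_qcache_max.
 */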

static void
qc_destroy(vmem_t *vm)
{
	int qcache_idx_max;
	int i;

	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++)
		uma_zdestroy(vm->vm_qcache[i].qc_cache);
}

static void
qc_drain(vmem_t *vm)
{
	int qcache_idx_max;
	int i;

	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++)
		uma_zone_reclaim(vm->vm_qcache[i].qc_cache, UMA_RECLAIM_DRAIN);
}

#ifndef UMA_USE_DMAP

static struct mtx_padalign __exclusive_cache_line vmem_bt_lock;

/*
 * vmem_bt_alloc:  Allocate a new page of boundary tags.
 *
 * On architectures with UMA_USE_DMAP there is no recursion; no address
 * space need be allocated to allocate boundary tags.  For the others, we
 * must handle recursion.  Boundary tags are necessary to allocate new
 * boundary tags.
 *
 * UMA guarantees that enough tags are held in reserve to allocate a new
 * page of kva.  We dip into this reserve by specifying M_USE_RESERVE only
 * when allocating the page to hold new boundary tags.  In this way the
 * reserve is automatically filled by the allocation that uses the reserve.
 *
 * We still have to guarantee that the new tags are allocated atomically since
 * many threads may try concurrently.  The bt_lock provides this guarantee.
 * We convert WAITOK allocations to NOWAIT and then handle the blocking here
 * on failure.  It's ok to return NULL for a WAITOK allocation as UMA will
 * loop again after checking to see if we lost the race to allocate.
 *
 * There is a small race between vmem_bt_alloc() returning the page and the
 * zone lock being acquired to add the page to the zone.  For WAITOK
 * allocations we just pause briefly.  NOWAIT may experience a transient
 * failure.  To alleviate this we permit a small number of simultaneous
 * fills to proceed concurrently so NOWAIT is less likely to fail unless
 * we are really out of KVA.
 */
static void *
vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
    int wait)
{
	vmem_addr_t addr;

	*pflag = UMA_SLAB_KERNEL;

	/*
	 * Single thread boundary tag allocation so that the address space
	 * and memory are added in one atomic operation.
	 */
	mtx_lock(&vmem_bt_lock);
	if (vmem_xalloc(vm_dom[domain].vmd_kernel_arena, bytes, 0, 0, 0,
	    VMEM_ADDR_MIN, VMEM_ADDR_MAX,
	    M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT, &addr) == 0) {
		if (kmem_back_domain(domain, kernel_object, addr, bytes,
		    M_NOWAIT | M_USE_RESERVE) == 0) {
			mtx_unlock(&vmem_bt_lock);
			return ((void *)addr);
		}
		vmem_xfree(vm_dom[domain].vmd_kernel_arena, addr, bytes);
		mtx_unlock(&vmem_bt_lock);
		/*
		 * Out of memory, not address space.  This may not even be
		 * possible due to M_USE_RESERVE page allocation.
		 */
		if (wait & M_WAITOK)
			vm_wait_domain(domain);
		return (NULL);
	}
	mtx_unlock(&vmem_bt_lock);
	/*
	 * We're either out of address space or lost a fill race.
	 */
	if (wait & M_WAITOK)
		pause("btalloc", 1);

	return (NULL);
}
#endif

void
vmem_startup(void)
{

	mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
	vmem_zone = uma_zcreate("vmem",
	    sizeof(struct vmem), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	vmem_bt_zone = uma_zcreate("vmem btag",
	    sizeof(struct vmem_btag), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_VM);
#ifndef UMA_USE_DMAP
	mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF);
	uma_prealloc(vmem_bt_zone, BT_MAXALLOC);
	/*
	 * Reserve enough tags to allocate new tags.  We allow multiple
	 * CPUs to attempt to allocate new tags concurrently to limit
	 * false restarts in UMA.  vmem_bt_alloc() allocates from a per-domain
	 * arena, which may involve importing a range from the kernel arena,
	 * so we need to keep at least 2 * BT_MAXALLOC tags reserved.
	 */
	uma_zone_reserve(vmem_bt_zone, 2 * BT_MAXALLOC * mp_ncpus);
	uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc);
#endif
}

/* ---- rehash */

static int
vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
{
	bt_t *bt;
	struct vmem_hashlist *newhashlist;
	struct vmem_hashlist *oldhashlist;
	vmem_size_t i, oldhashsize;

	MPASS(newhashsize > 0);

	newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
	    M_VMEM, M_NOWAIT);
	if (newhashlist == NULL)
		return ENOMEM;
	for (i = 0; i < newhashsize; i++) {
		LIST_INIT(&newhashlist[i]);
	}

	VMEM_LOCK(vm);
	oldhashlist = vm->vm_hashlist;
	oldhashsize = vm->vm_hashsize;
	vm->vm_hashlist = newhashlist;
	vm->vm_hashsize = newhashsize;
	if (oldhashlist == NULL) {
		VMEM_UNLOCK(vm);
		return 0;
	}
	for (i = 0; i < oldhashsize; i++) {
		while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
			bt_rembusy(vm, bt);
			bt_insbusy(vm, bt);
		}
	}
	VMEM_UNLOCK(vm);

	if (oldhashlist != vm->vm_hash0)
		free(oldhashlist, M_VMEM);

	return 0;
}

static void
vmem_periodic_kick(void *dummy)
{

	taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
}

static void
vmem_periodic(void *unused, int pending)
{
	vmem_t *vm;
	vmem_size_t desired;
	vmem_size_t current;

	mtx_lock(&vmem_list_lock);
	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
#ifdef DIAGNOSTIC
		/* Convenient time to verify vmem state. */
		if (enable_vmem_check == 1) {
			VMEM_LOCK(vm);
			vmem_check(vm);
			VMEM_UNLOCK(vm);
		}
#endif
		desired = 1 << flsl(vm->vm_nbusytag);
		desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
		    VMEM_HASHSIZE_MAX);
		current = vm->vm_hashsize;

		/* Grow in powers of two.  Shrink less aggressively. */
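		/*
		 * For instance (illustrative numbers only): with 1000 busy
		 * tags, flsl(1000) == 10 and desired == 1024, so the table
		 * grows when the current size is 512 or less and shrinks
		 * only once it reaches 4096 or more.
		 */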
		if (desired >= current * 2 || desired * 4 <= current)
			vmem_rehash(vm, desired);

		/*
		 * Periodically wake up threads waiting for resources,
		 * so they can ask for reclamation again.
		 */
		VMEM_CONDVAR_BROADCAST(vm);
	}
	mtx_unlock(&vmem_list_lock);

	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
	    vmem_periodic_kick, NULL);
}

static void
vmem_start_callout(void *unused)
{

	TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
	vmem_periodic_interval = hz * 10;
	callout_init(&vmem_periodic_ch, 1);
	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
	    vmem_periodic_kick, NULL);
}
SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);

static void
vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type)
{
	bt_t *btfree, *btprev, *btspan;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);
	MPASS((size & vm->vm_quantum_mask) == 0);

	if (vm->vm_releasefn == NULL) {
		/*
		 * The new segment will never be released, so see if it is
		 * contiguous with respect to an existing segment.  In this case
		 * a span tag is not needed, and it may be possible now or in
		 * the future to coalesce the new segment with an existing free
		 * segment.
		 */
		btprev = TAILQ_LAST(&vm->vm_seglist, vmem_seglist);
		if ((!bt_isbusy(btprev) && !bt_isfree(btprev)) ||
		    btprev->bt_start + btprev->bt_size != addr)
			btprev = NULL;
	} else {
		btprev = NULL;
	}

	if (btprev == NULL || bt_isbusy(btprev)) {
		if (btprev == NULL) {
			btspan = bt_alloc(vm);
			btspan->bt_type = type;
			btspan->bt_start = addr;
			btspan->bt_size = size;
			bt_insseg_tail(vm, btspan);
		}

		btfree = bt_alloc(vm);
		btfree->bt_type = BT_TYPE_FREE;
		btfree->bt_start = addr;
		btfree->bt_size = size;
		bt_insseg_tail(vm, btfree);
		bt_insfree(vm, btfree);
	} else {
		bt_remfree(vm, btprev);
		btprev->bt_size += size;
		bt_insfree(vm, btprev);
	}

	vm->vm_size += size;
}

static void
vmem_destroy1(vmem_t *vm)
{
	bt_t *bt;

	/*
	 * Drain per-cpu quantum caches.
	 */
	qc_destroy(vm);

	/*
	 * The vmem should now only contain empty segments.
	 */
	VMEM_LOCK(vm);
	MPASS(vm->vm_nbusytag == 0);

	TAILQ_REMOVE(&vm->vm_seglist, &vm->vm_cursor, bt_seglist);
	while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL)
		bt_remseg(vm, bt);

	if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
		free(vm->vm_hashlist, M_VMEM);

	bt_freetrim(vm, 0);

	VMEM_CONDVAR_DESTROY(vm);
	VMEM_LOCK_DESTROY(vm);
	uma_zfree(vmem_zone, vm);
}

static int
vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
{
	vmem_addr_t addr;
	int error;

	if (vm->vm_importfn == NULL)
		return (EINVAL);

	/*
	 * To make sure we get a span that meets the alignment we double it
	 * and add the size to the tail.  This slightly overestimates.
	 */
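	/*
	 * Illustrative numbers only: with a 4 KB quantum, a request for 8 KB
	 * aligned to 64 KB imports 2 * 64 KB + 8 KB = 136 KB (before rounding
	 * up to vm_import_quantum), which is guaranteed to contain a 64 KB
	 * aligned 8 KB range.
	 */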
	if (align != vm->vm_quantum_mask + 1)
		size = (align * 2) + size;
	size = roundup(size, vm->vm_import_quantum);

	if (vm->vm_limit != 0 && vm->vm_limit < vm->vm_size + size)
		return (ENOMEM);

	bt_save(vm);
	VMEM_UNLOCK(vm);
	error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
	VMEM_LOCK(vm);
	bt_restore(vm);
	if (error)
		return (ENOMEM);

	vmem_add1(vm, addr, size, BT_TYPE_SPAN);

	return 0;
}

/*
 * vmem_fit: check if a bt can satisfy the given restrictions.
 *
 * It is the caller's responsibility to ensure the region is big enough
 * before calling us.
 */
static int
vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
    vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
    vmem_addr_t maxaddr, vmem_addr_t *addrp)
{
	vmem_addr_t start;
	vmem_addr_t end;

	MPASS(size > 0);
	MPASS(bt->bt_size >= size); /* caller's responsibility */

	/*
	 * XXX assumption: vmem_addr_t and vmem_size_t are
	 * unsigned integer of the same size.
	 */

	start = bt->bt_start;
	if (start < minaddr) {
		start = minaddr;
	}
	end = BT_END(bt);
	if (end > maxaddr)
		end = maxaddr;
	if (start > end)
		return (ENOMEM);

	start = VMEM_ALIGNUP(start - phase, align) + phase;
	if (start < bt->bt_start)
		start += align;
	if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
		MPASS(align < nocross);
		start = VMEM_ALIGNUP(start - phase, nocross) + phase;
	}
	if (start <= end && end - start >= size - 1) {
		MPASS((start & (align - 1)) == phase);
		MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
		MPASS(minaddr <= start);
		MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
		MPASS(bt->bt_start <= start);
		MPASS(BT_END(bt) - start >= size - 1);
		*addrp = start;

		return (0);
	}
	return (ENOMEM);
}

/*
 * vmem_clip:  Trim the boundary tag edges to the requested start and size.
 */
static void
vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
{
	bt_t *btnew;
	bt_t *btprev;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(bt->bt_type == BT_TYPE_FREE);
	MPASS(bt->bt_size >= size);
	bt_remfree(vm, bt);
	if (bt->bt_start != start) {
		btprev = bt_alloc(vm);
		btprev->bt_type = BT_TYPE_FREE;
		btprev->bt_start = bt->bt_start;
		btprev->bt_size = start - bt->bt_start;
		bt->bt_start = start;
		bt->bt_size -= btprev->bt_size;
		bt_insfree(vm, btprev);
		bt_insseg(vm, btprev,
		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
	}
	MPASS(bt->bt_start == start);
	if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
		/* split */
		btnew = bt_alloc(vm);
		btnew->bt_type = BT_TYPE_BUSY;
		btnew->bt_start = bt->bt_start;
		btnew->bt_size = size;
		bt->bt_start = bt->bt_start + size;
		bt->bt_size -= size;
		bt_insfree(vm, bt);
		bt_insseg(vm, btnew,
		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
		bt_insbusy(vm, btnew);
		bt = btnew;
	} else {
		bt->bt_type = BT_TYPE_BUSY;
		bt_insbusy(vm, bt);
	}
	MPASS(bt->bt_size >= size);
}

static int
vmem_try_fetch(vmem_t *vm, const vmem_size_t size, vmem_size_t align, int flags)
{
	vmem_size_t avail;

	VMEM_ASSERT_LOCKED(vm);

	/*
	 * XXX it is possible to fail to meet xalloc constraints with the
	 * imported region.  It is up to the user to specify the
	 * import quantum such that it can satisfy any allocation.
	 */
	if (vmem_import(vm, size, align, flags) == 0)
		return (1);

	/*
	 * Try to free some space from the quantum cache or reclaim
	 * functions if available.
	 */
	if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
		avail = vm->vm_size - vm->vm_inuse;
		bt_save(vm);
		VMEM_UNLOCK(vm);
		if (vm->vm_qcache_max != 0)
			qc_drain(vm);
		if (vm->vm_reclaimfn != NULL)
			vm->vm_reclaimfn(vm, flags);
		VMEM_LOCK(vm);
		bt_restore(vm);
		/* If we were successful, retry even NOWAIT. */
		if (vm->vm_size - vm->vm_inuse > avail)
			return (1);
	}
	if ((flags & M_NOWAIT) != 0)
		return (0);
	bt_save(vm);
	VMEM_CONDVAR_WAIT(vm);
	bt_restore(vm);
	return (1);
}

static int
vmem_try_release(vmem_t *vm, struct vmem_btag *bt, const bool remfree)
{
	struct vmem_btag *prev;

	MPASS(bt->bt_type == BT_TYPE_FREE);

	if (vm->vm_releasefn == NULL)
		return (0);

	prev = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
	MPASS(prev != NULL);
	MPASS(prev->bt_type != BT_TYPE_FREE);

	if (prev->bt_type == BT_TYPE_SPAN && prev->bt_size == bt->bt_size) {
		vmem_addr_t spanaddr;
		vmem_size_t spansize;

		MPASS(prev->bt_start == bt->bt_start);
		spanaddr = prev->bt_start;
		spansize = prev->bt_size;
		if (remfree)
			bt_remfree(vm, bt);
		bt_remseg(vm, bt);
		bt_remseg(vm, prev);
		vm->vm_size -= spansize;
		VMEM_CONDVAR_BROADCAST(vm);
		bt_freetrim(vm, BT_MAXFREE);
		vm->vm_releasefn(vm->vm_arg, spanaddr, spansize);
		return (1);
	}
	return (0);
}

static int
vmem_xalloc_nextfit(vmem_t *vm, const vmem_size_t size, vmem_size_t align,
    const vmem_size_t phase, const vmem_size_t nocross, int flags,
    vmem_addr_t *addrp)
{
	struct vmem_btag *bt, *cursor, *next, *prev;
	int error;

	error = ENOMEM;
	VMEM_LOCK(vm);

	/*
	 * Make sure we have enough tags to complete the operation.
	 */
	if (bt_fill(vm, flags) != 0)
		goto out;

retry:
	/*
	 * Find the next free tag meeting our constraints.  If one is found,
	 * perform the allocation.
	 */
	for (cursor = &vm->vm_cursor, bt = TAILQ_NEXT(cursor, bt_seglist);
	    bt != cursor; bt = TAILQ_NEXT(bt, bt_seglist)) {
		if (bt == NULL)
			bt = TAILQ_FIRST(&vm->vm_seglist);
		if (bt->bt_type == BT_TYPE_FREE && bt->bt_size >= size &&
		    (error = vmem_fit(bt, size, align, phase, nocross,
		    VMEM_ADDR_MIN, VMEM_ADDR_MAX, addrp)) == 0) {
			vmem_clip(vm, bt, *addrp, size);
			break;
		}
	}

	/*
	 * Try to coalesce free segments around the cursor.  If we succeed, and
	 * have not yet satisfied the allocation request, try again with the
	 * newly coalesced segment.
	 */
	if ((next = TAILQ_NEXT(cursor, bt_seglist)) != NULL &&
	    (prev = TAILQ_PREV(cursor, vmem_seglist, bt_seglist)) != NULL &&
	    next->bt_type == BT_TYPE_FREE && prev->bt_type == BT_TYPE_FREE &&
	    prev->bt_start + prev->bt_size == next->bt_start) {
		prev->bt_size += next->bt_size;
		bt_remfree(vm, next);
		bt_remseg(vm, next);

		/*
		 * The coalesced segment might be able to satisfy our request.
		 * If not, we might need to release it from the arena.
		 */
		if (error == ENOMEM && prev->bt_size >= size &&
		    (error = vmem_fit(prev, size, align, phase, nocross,
		    VMEM_ADDR_MIN, VMEM_ADDR_MAX, addrp)) == 0) {
			vmem_clip(vm, prev, *addrp, size);
			bt = prev;
		} else
			(void)vmem_try_release(vm, prev, true);
	}

	/*
	 * If the allocation was successful, advance the cursor.
	 */
	if (error == 0) {
		TAILQ_REMOVE(&vm->vm_seglist, cursor, bt_seglist);
		for (; bt != NULL && bt->bt_start < *addrp + size;
		    bt = TAILQ_NEXT(bt, bt_seglist))
			;
		if (bt != NULL)
			TAILQ_INSERT_BEFORE(bt, cursor, bt_seglist);
		else
			TAILQ_INSERT_HEAD(&vm->vm_seglist, cursor, bt_seglist);
	}

	/*
	 * Attempt to bring additional resources into the arena.  If that fails
	 * and M_WAITOK is specified, sleep waiting for resources to be freed.
	 */
	if (error == ENOMEM && vmem_try_fetch(vm, size, align, flags))
		goto retry;

out:
	VMEM_UNLOCK(vm);
	return (error);
}

/* ---- vmem API */

void
vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
    vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
{

	VMEM_LOCK(vm);
	KASSERT(vm->vm_size == 0, ("%s: arena is non-empty", __func__));
	vm->vm_importfn = importfn;
	vm->vm_releasefn = releasefn;
	vm->vm_arg = arg;
	vm->vm_import_quantum = import_quantum;
	VMEM_UNLOCK(vm);
}

void
vmem_set_limit(vmem_t *vm, vmem_size_t limit)
{

	VMEM_LOCK(vm);
	vm->vm_limit = limit;
	VMEM_UNLOCK(vm);
}

void
vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
{

	VMEM_LOCK(vm);
	vm->vm_reclaimfn = reclaimfn;
	VMEM_UNLOCK(vm);
}

/*
 * vmem_init: initialize an arena.
 */
vmem_t *
vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
    vmem_size_t quantum, vmem_size_t qcache_max, int flags)
{
	vmem_size_t i;

	MPASS(quantum > 0);
	MPASS((quantum & (quantum - 1)) == 0);

	bzero(vm, sizeof(*vm));

	VMEM_CONDVAR_INIT(vm, name);
	VMEM_LOCK_INIT(vm, name);
	vm->vm_nfreetags = 0;
	LIST_INIT(&vm->vm_freetags);
	strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
	vm->vm_quantum_mask = quantum - 1;
	vm->vm_quantum_shift = flsl(quantum) - 1;
	vm->vm_nbusytag = 0;
	vm->vm_size = 0;
	vm->vm_limit = 0;
	vm->vm_inuse = 0;
	qc_init(vm, qcache_max);

	TAILQ_INIT(&vm->vm_seglist);
	vm->vm_cursor.bt_start = vm->vm_cursor.bt_size = 0;
	vm->vm_cursor.bt_type = BT_TYPE_CURSOR;
	TAILQ_INSERT_TAIL(&vm->vm_seglist, &vm->vm_cursor, bt_seglist);

	for (i = 0; i < VMEM_MAXORDER; i++)
		LIST_INIT(&vm->vm_freelist[i]);

	memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
	vm->vm_hashsize = VMEM_HASHSIZE_MIN;
	vm->vm_hashlist = vm->vm_hash0;

	if (size != 0) {
		if (vmem_add(vm, base, size, flags) != 0) {
			vmem_destroy1(vm);
			return NULL;
		}
	}

	mtx_lock(&vmem_list_lock);
	LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
	mtx_unlock(&vmem_list_lock);

	return vm;
}
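
/*
 * Example usage of the public interface (an illustrative sketch only; the
 * base address and sizes below are made up):
 *
 *	vmem_t *arena;
 *	vmem_addr_t addr;
 *
 *	arena = vmem_create("example", 0x100000, 0x400000, PAGE_SIZE, 0,
 *	    M_WAITOK);
 *	if (vmem_alloc(arena, 4 * PAGE_SIZE, M_BESTFIT | M_WAITOK, &addr) == 0)
 *		vmem_free(arena, addr, 4 * PAGE_SIZE);
 *	vmem_destroy(arena);
 */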

/*
 * vmem_create: create an arena.
 */
vmem_t *
vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
    vmem_size_t quantum, vmem_size_t qcache_max, int flags)
{

	vmem_t *vm;

	vm = uma_zalloc(vmem_zone, flags & (M_WAITOK|M_NOWAIT));
	if (vm == NULL)
		return (NULL);
	if (vmem_init(vm, name, base, size, quantum, qcache_max,
	    flags) == NULL)
		return (NULL);
	return (vm);
}

void
vmem_destroy(vmem_t *vm)
{

	mtx_lock(&vmem_list_lock);
	LIST_REMOVE(vm, vm_alllist);
	mtx_unlock(&vmem_list_lock);

	vmem_destroy1(vm);
}

vmem_size_t
vmem_roundup_size(vmem_t *vm, vmem_size_t size)
{

	return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
}

/*
 * vmem_alloc: allocate resource from the arena.
 */
int
vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
{
	const int strat __unused = flags & VMEM_FITMASK;
	qcache_t *qc;

	flags &= VMEM_FLAGS;
	MPASS(size > 0);
	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT || strat == M_NEXTFIT);
	if ((flags & M_NOWAIT) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");

	if (size <= vm->vm_qcache_max) {
		/*
		 * Resource 0 cannot be cached, so avoid a blocking allocation
		 * in qc_import() and give the vmem_xalloc() call below a chance
		 * to return 0.
		 */
		qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
		*addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache,
		    (flags & ~M_WAITOK) | M_NOWAIT);
		if (__predict_true(*addrp != 0))
			return (0);
	}

	return (vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
	    flags, addrp));
}
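
/*
 * vmem_xalloc() below accepts alignment, phase, boundary-crossing, and
 * address-range constraints.  As an illustration (parameter values are
 * examples only, assuming a quantum of 4 KB or smaller):
 *
 *	vmem_xalloc(vm, 8 * 1024, 4 * 1024, 0, 64 * 1024,
 *	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_WAITOK, &addr);
 *
 * asks for an 8 KB range, aligned to 4 KB, that does not cross a 64 KB
 * boundary, anywhere in the arena.
 */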

int
vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
    const vmem_size_t phase, const vmem_size_t nocross,
    const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
    vmem_addr_t *addrp)
{
	const vmem_size_t size = vmem_roundup_size(vm, size0);
	struct vmem_freelist *list;
	struct vmem_freelist *first;
	struct vmem_freelist *end;
	bt_t *bt;
	int error;
	int strat;

	flags &= VMEM_FLAGS;
	strat = flags & VMEM_FITMASK;
	MPASS(size0 > 0);
	MPASS(size > 0);
	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT || strat == M_NEXTFIT);
	MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
	if ((flags & M_NOWAIT) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
	MPASS((align & vm->vm_quantum_mask) == 0);
	MPASS((align & (align - 1)) == 0);
	MPASS((phase & vm->vm_quantum_mask) == 0);
	MPASS((nocross & vm->vm_quantum_mask) == 0);
	MPASS((nocross & (nocross - 1)) == 0);
	MPASS((align == 0 && phase == 0) || phase < align);
	MPASS(nocross == 0 || nocross >= size);
	MPASS(minaddr <= maxaddr);
	MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
	if (strat == M_NEXTFIT)
		MPASS(minaddr == VMEM_ADDR_MIN && maxaddr == VMEM_ADDR_MAX);

	if (align == 0)
		align = vm->vm_quantum_mask + 1;
	*addrp = 0;

	/*
	 * Next-fit allocations don't use the freelists.
	 */
	if (strat == M_NEXTFIT)
		return (vmem_xalloc_nextfit(vm, size0, align, phase, nocross,
		    flags, addrp));

	end = &vm->vm_freelist[VMEM_MAXORDER];
	/*
	 * Choose a free block from which we allocate.
	 */
	first = bt_freehead_toalloc(vm, size, strat);
	VMEM_LOCK(vm);

	/*
	 * Make sure we have enough tags to complete the operation.
	 */
	error = bt_fill(vm, flags);
	if (error != 0)
		goto out;
	for (;;) {
		/*
		 * Scan freelists looking for a tag that satisfies the
		 * allocation.  If we're doing BESTFIT we may encounter
		 * sizes below the request.  If we're doing FIRSTFIT we
		 * inspect only the first element from each list.
		 */
		for (list = first; list < end; list++) {
			LIST_FOREACH(bt, list, bt_freelist) {
				if (bt->bt_size >= size) {
					error = vmem_fit(bt, size, align, phase,
					    nocross, minaddr, maxaddr, addrp);
					if (error == 0) {
						vmem_clip(vm, bt, *addrp, size);
						goto out;
					}
				}
				/* FIRST skips to the next list. */
				if (strat == M_FIRSTFIT)
					break;
			}
		}

		/*
		 * Retry if the fast algorithm failed.
		 */
		if (strat == M_FIRSTFIT) {
			strat = M_BESTFIT;
			first = bt_freehead_toalloc(vm, size, strat);
			continue;
		}

		/*
		 * Try a few measures to bring additional resources into the
		 * arena.  If all else fails, we will sleep waiting for
		 * resources to be freed.
		 */
		if (!vmem_try_fetch(vm, size, align, flags)) {
			error = ENOMEM;
			break;
		}
	}
out:
	VMEM_UNLOCK(vm);
	if (error != 0 && (flags & M_NOWAIT) == 0)
		panic("failed to allocate waiting allocation\n");

	return (error);
}

/*
 * vmem_free: free the resource to the arena.
 */
void
vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{
	qcache_t *qc;
	MPASS(size > 0);

	if (size <= vm->vm_qcache_max &&
	    __predict_true(addr >= VMEM_ADDR_QCACHE_MIN)) {
		qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
		uma_zfree(qc->qc_cache, (void *)addr);
	} else
		vmem_xfree(vm, addr, size);
}

void
vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size __unused)
{
	bt_t *bt;
	bt_t *t;

	MPASS(size > 0);

	VMEM_LOCK(vm);
	bt = bt_lookupbusy(vm, addr);
	MPASS(bt != NULL);
	MPASS(bt->bt_start == addr);
	MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
	    bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
	MPASS(bt->bt_type == BT_TYPE_BUSY);
	bt_rembusy(vm, bt);
	bt->bt_type = BT_TYPE_FREE;

	/* coalesce */
	t = TAILQ_NEXT(bt, bt_seglist);
	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
		MPASS(BT_END(bt) < t->bt_start);	/* YYY */
		bt->bt_size += t->bt_size;
		bt_remfree(vm, t);
		bt_remseg(vm, t);
	}
	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
		MPASS(BT_END(t) < bt->bt_start);	/* YYY */
		bt->bt_size += t->bt_size;
		bt->bt_start = t->bt_start;
		bt_remfree(vm, t);
		bt_remseg(vm, t);
	}

	if (!vmem_try_release(vm, bt, false)) {
		bt_insfree(vm, bt);
		VMEM_CONDVAR_BROADCAST(vm);
		bt_freetrim(vm, BT_MAXFREE);
	}
}

/*
 * vmem_add: add a span of memory to the arena.
 */
int
vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
{
	int error;

	flags &= VMEM_FLAGS;

	VMEM_LOCK(vm);
	error = bt_fill(vm, flags);
	if (error == 0)
		vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC);
	VMEM_UNLOCK(vm);

	return (error);
}

/*
 * vmem_size: report the arena's size according to typemask.
 */
vmem_size_t
vmem_size(vmem_t *vm, int typemask)
{
	int i;

	switch (typemask) {
	case VMEM_ALLOC:
		return vm->vm_inuse;
	case VMEM_FREE:
		return vm->vm_size - vm->vm_inuse;
	case VMEM_FREE|VMEM_ALLOC:
		return vm->vm_size;
	case VMEM_MAXFREE:
		VMEM_LOCK(vm);
		for (i = VMEM_MAXORDER - 1; i >= 0; i--) {
			if (LIST_EMPTY(&vm->vm_freelist[i]))
				continue;
			VMEM_UNLOCK(vm);
			return ((vmem_size_t)ORDER2SIZE(i) <<
			    vm->vm_quantum_shift);
		}
		VMEM_UNLOCK(vm);
		return (0);
	default:
		panic("vmem_size");
	}
}

/* ---- debug */

#if defined(DDB) || defined(DIAGNOSTIC)

static void bt_dump(const bt_t *, int (*)(const char *, ...)
    __printflike(1, 2));

static const char *
bt_type_string(int type)
{

	switch (type) {
	case BT_TYPE_BUSY:
		return "busy";
	case BT_TYPE_FREE:
		return "free";
	case BT_TYPE_SPAN:
		return "span";
	case BT_TYPE_SPAN_STATIC:
		return "static span";
	case BT_TYPE_CURSOR:
		return "cursor";
	default:
		break;
	}
	return "BOGUS";
}

static void
bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
{

	(*pr)("\t%p: %jx %jx, %d(%s)\n",
	    bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
	    bt->bt_type, bt_type_string(bt->bt_type));
}

static void
vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...) __printflike(1, 2))
{
	const bt_t *bt;
	int i;

	(*pr)("vmem %p '%s'\n", vm, vm->vm_name);
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		bt_dump(bt, pr);
	}

	for (i = 0; i < VMEM_MAXORDER; i++) {
		const struct vmem_freelist *fl = &vm->vm_freelist[i];

		if (LIST_EMPTY(fl)) {
			continue;
		}

		(*pr)("freelist[%d]\n", i);
		LIST_FOREACH(bt, fl, bt_freelist) {
			bt_dump(bt, pr);
		}
	}
}

#endif /* defined(DDB) || defined(DIAGNOSTIC) */

#if defined(DDB)
#include <ddb/ddb.h>

static bt_t *
vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
{
	bt_t *bt;

	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (BT_ISSPAN_P(bt)) {
			continue;
		}
		if (bt->bt_start <= addr && addr <= BT_END(bt)) {
			return bt;
		}
	}

	return NULL;
}

void
vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
{
	vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
		bt_t *bt;

		bt = vmem_whatis_lookup(vm, addr);
		if (bt == NULL) {
			continue;
		}
		(*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
		    (void *)addr, (void *)bt->bt_start,
		    (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
		    (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
	}
}

void
vmem_printall(const char *modif, int (*pr)(const char *, ...))
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
		vmem_dump(vm, pr);
	}
}

void
vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
{
	const vmem_t *vm = (const void *)addr;

	vmem_dump(vm, pr);
}

DB_SHOW_COMMAND(vmemdump, vmemdump)
{

	if (!have_addr) {
		db_printf("usage: show vmemdump <addr>\n");
		return;
	}

	vmem_dump((const vmem_t *)addr, db_printf);
}

DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall)
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist)
		vmem_dump(vm, db_printf);
}

DB_SHOW_COMMAND(vmem, vmem_summ)
{
	const vmem_t *vm = (const void *)addr;
	const bt_t *bt;
	size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER];
	size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER];
	int ord;

	if (!have_addr) {
		db_printf("usage: show vmem <addr>\n");
		return;
	}

	db_printf("vmem %p '%s'\n", vm, vm->vm_name);
	db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1);
	db_printf("\tsize:\t%zu\n", vm->vm_size);
	db_printf("\tinuse:\t%zu\n", vm->vm_inuse);
	db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse);
	db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag);
	db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags);

	memset(&ft, 0, sizeof(ft));
	memset(&ut, 0, sizeof(ut));
	memset(&fs, 0, sizeof(fs));
	memset(&us, 0, sizeof(us));
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift);
		if (bt->bt_type == BT_TYPE_BUSY) {
			ut[ord]++;
			us[ord] += bt->bt_size;
		} else if (bt->bt_type == BT_TYPE_FREE) {
			ft[ord]++;
			fs[ord] += bt->bt_size;
		}
	}
	db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n");
	for (ord = 0; ord < VMEM_MAXORDER; ord++) {
		if (ut[ord] == 0 && ft[ord] == 0)
			continue;
		db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n",
		    ORDER2SIZE(ord) << vm->vm_quantum_shift,
		    ut[ord], us[ord], ft[ord], fs[ord]);
	}
}

DB_SHOW_ALL_COMMAND(vmem, vmem_summall)
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist)
		vmem_summ((db_expr_t)vm, TRUE, count, modif);
}
#endif /* defined(DDB) */

#define vmem_printf printf

#if defined(DIAGNOSTIC)

static bool
vmem_check_sanity(vmem_t *vm)
{
	const bt_t *bt, *bt2;

	MPASS(vm != NULL);

	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (bt->bt_start > BT_END(bt)) {
			printf("corrupted tag\n");
			bt_dump(bt, vmem_printf);
			return false;
		}
	}
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (bt->bt_type == BT_TYPE_CURSOR) {
			if (bt->bt_start != 0 || bt->bt_size != 0) {
				printf("corrupted cursor\n");
				return false;
			}
			continue;
		}
		TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
			if (bt == bt2) {
				continue;
			}
			if (bt2->bt_type == BT_TYPE_CURSOR) {
				continue;
			}
			if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
				continue;
			}
			if (bt->bt_start <= BT_END(bt2) &&
			    bt2->bt_start <= BT_END(bt)) {
				printf("overlapping tags\n");
				bt_dump(bt, vmem_printf);
				bt_dump(bt2, vmem_printf);
				return false;
			}
		}
	}

	return true;
}

static void
vmem_check(vmem_t *vm)
{

	if (!vmem_check_sanity(vm)) {
		panic("insanity vmem %p", vm);
	}
}

#endif /* defined(DIAGNOSTIC) */