/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2021 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory for
 * bhyve VMs easier, we introduce the "VMM Reservoir", where system operators
 * can set aside a substantial portion of system memory exclusively for VMs.
 * This memory is unavailable for general use by the rest of the system.
 * Rather than having to scour the freelist, reap kmem caches, or put pressure
 * on the ARC, bhyve guest memory allocations can quickly determine if there
 * is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, it can be immediately used when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system. In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir. When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may increase, decrease, and query the amount of memory allocated
 * to the reservoir and from it to VMs via ioctls against the vmmctl device.
 * The total amount added to the reservoir is arbitrarily limited at this time
 * by `vmmr_total_limit`, which defaults to 80% of physmem. This is done to
 * prevent the reservoir from inadvertently growing to a size where the system
 * has inadequate memory to make forward progress. Memory may only be removed
 * from the reservoir when it is free (not allocated by any guest VMs).
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object). This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple. It
 * means that spans of pages, either free or allocated, need only to store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span. Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless
 * the operator has chosen to dump all of RAM).
 */

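/*
 * A minimal sketch of the in-kernel consumer flow described above. This is
 * illustrative only; the size and the (elided) error handling are arbitrary
 * choices, not taken from any actual caller:
 *
 *	vmmr_region_t *region;
 *	const size_t sz = 16 * 1024 * 1024;	(page-aligned)
 *
 *	if (vmmr_alloc(sz, false, &region) == 0) {
 *		pfn_t pfn = vmmr_region_pfn_at(region, 0);
 *		...
 *		vmmr_free(region);
 *	}
 *
 * Passing `transient` as true instead allocates the backing pages from the
 * system at large (via a transient addition to the reservoir) and returns
 * them to the system, rather than the reservoir, when the region is freed.
 */
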
#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
	avl_tree_t by_addr;
	avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

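/*
 * All of the size counters above are protected by `vmmr_lock`. vmmr_add()
 * enforces that the overall footprint,
 *
 *	vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
 *	    vmmr_alloc_transient_sz + vmmr_free_transient_sz
 *
 * never exceeds `vmmr_total_limit`, while `vmmr_empty_sz` tracks space in the
 * reservoir vnode which is not (or is no longer) backed by pages.
 */
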
struct vmmr_span {
	uintptr_t	vs_addr;
	size_t		vs_size;
	avl_node_t	vs_by_addr;
	avl_node_t	vs_by_size;
	uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
	size_t		vr_size;
	avl_tree_t	vr_spans;
	list_node_t	vr_node;
	bool		vr_transient;
};

static int
vmmr_cmp_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_addr == sb->vs_addr) {
		return (0);
	} else if (sa->vs_addr < sb->vs_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_size == sb->vs_size) {
		/*
		 * Since discontiguous spans could have the same size in a
		 * by-size tree, differentiate them (as required by AVL) by
		 * address so they can safely coexist while remaining sorted.
		 */
		return (vmmr_cmp_addr(a, b));
	} else if (sa->vs_size < sb->vs_size) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_region_addr == sb->vs_region_addr) {
		return (0);
	} else if (sa->vs_region_addr < sb->vs_region_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_addr));
	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
	void *vcp = NULL;
	vmmr_span_t *span;

	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
		/* Freeing spans will be done when tearing down by-size tree */
	}
	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
		kmem_free(span, sizeof (*span));
	}
	avl_destroy(&tree->by_addr);
	avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with
 * adjacent span(s). Such concatenation could result in the `to_add` span
 * being freed, so the caller cannot use it after this returns.
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *node;
	avl_index_t where;

	/* This addr should not already exist in the treepair */
	node = avl_find(by_addr, to_add, &where);
	ASSERT3P(node, ==, NULL);

	node = avl_nearest(by_addr, where, AVL_BEFORE);
	if (node != NULL &&
	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
		/* concat with preceding item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_size += to_add->vs_size;
		kmem_free(to_add, sizeof (*to_add));

		/*
		 * Since this now-concatenated span could be adjacent to one
		 * trailing it, fall through to perform that check.
		 */
		to_add = node;
	}

	node = avl_nearest(by_addr, where, AVL_AFTER);
	if (node != NULL &&
	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
		/* concat with trailing item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_addr = to_add->vs_addr;
		node->vs_size += to_add->vs_size;
		avl_add(by_addr, node);
		avl_add(by_size, node);

		kmem_free(to_add, sizeof (*to_add));
		return;
	}

	/* simply insert */
	avl_add(by_addr, to_add);
	avl_add(by_size, to_add);
}

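/*
 * For example (with hypothetical offsets): calling vmmr_tp_insert_concat()
 * with a span covering [0x2000, 0x3000) against a treepair already holding
 * [0x1000, 0x2000) and [0x3000, 0x4000) first merges it with the preceding
 * span to form [0x1000, 0x3000), then with the trailing span, leaving a
 * single [0x1000, 0x4000) entry in both the by-addr and by-size trees.
 */
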
/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is. May return a
 * span with a size smaller than the target if splitting is not an option.
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *span;
	avl_index_t where;

	ASSERT3U(target_sz, !=, 0);
	ASSERT(!avl_is_empty(by_addr));
	ASSERT(!avl_is_empty(by_size));

	vmmr_span_t search = { .vs_size = target_sz };
	span = avl_find(by_size, &search, &where);
	if (span == NULL) {
		/* Try for a larger span (instead of exact match) */
		span = avl_nearest(by_size, where, AVL_AFTER);
		if (span == NULL) {
			/*
			 * Caller will need to collect several smaller spans in
			 * order to fulfill their request.
			 */
			span = avl_nearest(by_size, where, AVL_BEFORE);
			ASSERT3P(span, !=, NULL);
		}
	}

	if (span->vs_size <= target_sz) {
		avl_remove(by_size, span);
		avl_remove(by_addr, span);

		return (span);
	} else {
		/* Split off adequate chunk from larger span */
		uintptr_t start = span->vs_addr + span->vs_size - target_sz;

		avl_remove(by_size, span);
		span->vs_size -= target_sz;
		avl_add(by_size, span);

		vmmr_span_t *split_span =
		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
		split_span->vs_addr = start;
		split_span->vs_size = target_sz;

		return (split_span);
	}
}

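/*
 * For example (with hypothetical sizes): given a single free span of 0x8000
 * bytes at offset 0, vmmr_tp_remove_split() with a target of 0x2000 splits
 * off and returns the trailing [0x6000, 0x8000) chunk, leaving [0x0, 0x6000)
 * in the treepair. A target of 0xa000 instead returns the entire 0x8000
 * span, and the caller must loop to gather additional spans to cover the
 * remainder.
 */
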
void
vmmr_init()
{
	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
	 * memory reservoir. It is meant to provide some measure of protection
	 * against an operator pushing the system into unrecoverable memory
	 * starvation through explicit or transient additions to the reservoir.
	 *
	 * There will be many situations where this limit would be inadequate
	 * to prevent kernel memory starvation in the face of certain operator
	 * actions. It is a balance to be struck between safety and allowing
	 * large systems to reach high utilization.
	 *
	 * The value is based off of pages_pp_maximum: "Number of currently
	 * available pages that cannot be 'locked'". It is sized as all of
	 * `physmem` less 120% of `pages_pp_maximum`.
	 */
	vmmr_total_limit =
	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;

	vmmr_empty_last = 0;
	vmmr_free_sz = 0;
	vmmr_alloc_sz = 0;
	vmmr_empty_sz = 0;
	vmmr_adding_sz = 0;
	vmmr_free_transient_sz = 0;
	vmmr_alloc_transient_sz = 0;

	vmmr_tp_init(&vmmr_free_tp);
	vmmr_tp_init(&vmmr_empty_tp);

	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
	    offsetof(vmmr_region_t, vr_node));

	/* Grab a chunk of VA for the reservoir */
	vmmr_va_sz = physmem * PAGESIZE;
	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);
}

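/*
 * As a worked example of the limit calculation above (with hypothetical
 * figures): on a 64-GiB machine with 4-KiB pages, physmem is 16777216 pages.
 * If pages_pp_maximum were 200000 pages (~780 MiB), the limit would be
 *
 *	((16777216 * 10 - 200000 * 12) * 4096) / 10	(~63.1 GiB)
 *
 * capping the reservoir at roughly all of physical memory less 120% of
 * pages_pp_maximum.
 */
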
void
vmmr_fini()
{
	mutex_enter(&vmmr_lock);
	VERIFY3U(vmmr_alloc_sz, ==, 0);
	VERIFY3U(vmmr_free_sz, ==, 0);
	VERIFY3U(vmmr_adding_sz, ==, 0);
	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
	VERIFY3U(vmmr_free_transient_sz, ==, 0);
	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
	VERIFY(list_is_empty(&vmmr_alloc_regions));

	vmmr_tp_destroy(&vmmr_free_tp);
	vmmr_tp_destroy(&vmmr_empty_tp);
	list_destroy(&vmmr_alloc_regions);

	/* Release reservoir VA chunk */
	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
	vmmr_va = 0;
	vmmr_va_sz = 0;
	vmmr_total_limit = 0;
	vmmr_empty_last = 0;

	mutex_exit(&vmmr_lock);
	mutex_destroy(&vmmr_lock);
}

bool
vmmr_is_empty()
{
	mutex_enter(&vmmr_lock);
	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
	mutex_exit(&vmmr_lock);
	return (res);
}

int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	if (!transient) {
		mutex_enter(&vmmr_lock);
		if (sz > vmmr_free_sz) {
			mutex_exit(&vmmr_lock);
			return (ENOSPC);
		}
	} else {
		int err;

		err = vmmr_add(sz, true);
		if (err != 0) {
			return (err);
		}
		mutex_enter(&vmmr_lock);
		VERIFY3U(vmmr_free_transient_sz, >=, sz);
	}

	vmmr_region_t *region;
	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
	region->vr_size = sz;

	size_t remain = sz;
	uintptr_t map_at = 0;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * We have already ensured that adequate free memory is present
		 * in the reservoir for this allocation.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
		map_at += span->vs_size;
		remain -= span->vs_size;
	}

	if (!transient) {
		vmmr_free_sz -= sz;
		vmmr_alloc_sz += sz;
	} else {
		vmmr_free_transient_sz -= sz;
		vmmr_alloc_transient_sz += sz;
		region->vr_transient = true;
	}
	list_insert_tail(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	*resp = region;
	return (0);
}

void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
	/* just use KPM region for now */
	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
	VERIFY3U(off & PAGEOFFSET, ==, 0);
	VERIFY3U(off, <, region->vr_size);

	vmmr_span_t search = {
		.vs_region_addr = off
	};
	avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

	if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
		ASSERT3P(span, !=, NULL);
	}
	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
	VERIFY(pp != NULL);
	return (pp->p_pagenum);
}

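/*
 * For example (with hypothetical spans): in a region assembled from
 *
 *	{ vs_region_addr = 0x0000, vs_addr = 0x10000, vs_size = 0x3000 }
 *	{ vs_region_addr = 0x3000, vs_addr = 0x04000, vs_size = 0x2000 }
 *
 * vmmr_region_pfn_at() resolves region offset 0x4000 through the second span
 * (the nearest span at or before that offset) to vnode offset
 * 0x4000 - 0x3000 + 0x4000 = 0x5000, whose backing page is then found in
 * kvps[KV_VVP].
 */
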
void
vmmr_free(vmmr_region_t *region)
{
	mutex_enter(&vmmr_lock);
	if (!region->vr_transient) {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
	} else {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
	}
	list_remove(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	/* Zero the contents */
	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
	}

	mutex_enter(&vmmr_lock);

	/* Put the contained span(s) back in the free pool */
	void *cookie = NULL;
	vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
		span->vs_region_addr = 0;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}
	avl_destroy(&region->vr_spans);
	if (!region->vr_transient) {
		vmmr_free_sz += region->vr_size;
		vmmr_alloc_sz -= region->vr_size;
	} else {
		vmmr_free_transient_sz += region->vr_size;
		vmmr_alloc_transient_sz -= region->vr_size;
	}
	mutex_exit(&vmmr_lock);

	if (region->vr_transient) {
		/*
		 * Since the transient capacity was previously allocated for
		 * this region, its removal should not fail.
		 */
		VERIFY0(vmmr_remove(region->vr_size, true));
	}
	kmem_free(region, sizeof (*region));
}

static void
vmmr_destroy_pages(vmmr_span_t *span)
{
	const uintptr_t end = span->vs_addr + span->vs_size;
	struct vnode *vp = &kvps[KV_VVP];
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		/* Page-free logic cribbed from segkmem_xfree(): */
		pp = page_find(vp, (u_offset_t)pos);
		VERIFY(pp != NULL);
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
		}

		/*
		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
		 * That will be taken care of later via page_unresv().
		 */
		pp->p_lckcnt = 0;
		page_destroy(pp, 0);
	}
}

static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
	struct seg kseg = {
		.s_as = &kas
	};
	struct vnode *vp = &kvps[KV_VVP];

	const uintptr_t end = span->vs_addr + span->vs_size;
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

		if (pp == NULL) {
			/* Destroy any already-created pages */
			if (pos != span->vs_addr) {
				vmmr_span_t destroy_span = {
					.vs_addr = span->vs_addr,
					.vs_size = pos - span->vs_addr,
				};

				vmmr_destroy_pages(&destroy_span);
			}
			return (ENOMEM);
		}

		/* mimic page state from segkmem */
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		pp->p_lckcnt = 1;
		page_downgrade(pp);

		/* pre-zero the page */
		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
	}

	return (0);
}

static int
vmmr_resv_wait()
{
	if (delay_sig(hz >> 2) != 0) {
		/* bail due to interruption */
		return (0);
	}
	return (1);
}

static void
vmmr_remove_raw(size_t sz)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	size_t remain = sz;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * The caller must ensure that at least `sz` amount is present
		 * in the free treepair.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		/* TODO: perhaps arrange to destroy pages outside the lock? */
		vmmr_destroy_pages(span);

		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
	}

	vmmr_empty_sz += sz;
}

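/*
 * A note on page accounting in the add/remove paths below: vmmr_add()
 * reserves pages up front via page_xresv(), and that reservation is released
 * with page_unresv() if the addition fails or when the memory is later
 * removed via vmmr_remove(). Since vmmr_destroy_pages() clears p_lckcnt
 * before page_destroy(), availrmem is adjusted through that resv/unresv pair
 * rather than per page. The offsets of removed spans are parked in
 * `vmmr_empty_tp` and are reused by later additions before `vmmr_empty_last`
 * is advanced.
 */
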
int
vmmr_add(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	mutex_enter(&vmmr_lock);
	/*
	 * Make sure that the amount added is not going to breach the limits
	 * we've chosen.
	 */
	const size_t current_total =
	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	if ((current_total + sz) < current_total) {
		mutex_exit(&vmmr_lock);
		return (EOVERFLOW);
	}
	if ((current_total + sz) > vmmr_total_limit) {
		mutex_exit(&vmmr_lock);
		return (ENOSPC);
	}
	vmmr_adding_sz += sz;
	mutex_exit(&vmmr_lock);

	/* Wait for enough pages to become available */
	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
		mutex_enter(&vmmr_lock);
		vmmr_adding_sz -= sz;
		mutex_exit(&vmmr_lock);

		return (EINTR);
	}

	mutex_enter(&vmmr_lock);
	size_t added = 0;
	size_t remain = sz;
	while (added < sz) {
		vmmr_span_t *span = NULL;

		if (vmmr_empty_sz > 0) {
			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

			vmmr_empty_sz -= span->vs_size;
		} else {
			/*
			 * No empty space to fill with new pages, so just tack
			 * it on at the end instead.
			 */
			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
			span->vs_addr = vmmr_empty_last;
			span->vs_size = remain;
			vmmr_empty_last += remain;
		}
		VERIFY3P(span, !=, NULL);

		/* Allocate the actual pages to back this span */
		mutex_exit(&vmmr_lock);
		int err = vmmr_alloc_pages(span);
		mutex_enter(&vmmr_lock);

		/*
		 * If an error is encountered during page allocation for the
		 * span, unwind any progress made by the addition request.
		 */
		if (err != 0) {
			/*
			 * Without pages allocated to this span, it is now
			 * tracked as empty.
			 */
			vmmr_empty_sz += span->vs_size;
			vmmr_tp_insert_concat(span, &vmmr_empty_tp);

			if (added != 0) {
				vmmr_remove_raw(added);
			}

			vmmr_adding_sz -= sz;
			mutex_exit(&vmmr_lock);

			page_unresv(sz >> PAGESHIFT);
			return (err);
		}

		/*
		 * The allocated-page-bearing span is placed in the "free"
		 * treepair now, but is not officially exposed for consumption
		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` is updated.
		 *
		 * This allows us to unwind the allocation in case of a failure
		 * without the risk of the freshly added span(s) being snapped
		 * up by a consumer already.
		 */
		added += span->vs_size;
		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}

	/* Make the added memory usable by exposing it to the size accounting */
	if (!transient) {
		vmmr_free_sz += added;
	} else {
		vmmr_free_transient_sz += added;
	}
	ASSERT3U(added, ==, sz);
	vmmr_adding_sz -= added;

	mutex_exit(&vmmr_lock);
	return (0);
}

int
vmmr_remove(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	mutex_enter(&vmmr_lock);
	if ((!transient && sz > vmmr_free_sz) ||
	    (transient && sz > vmmr_free_transient_sz)) {
		mutex_exit(&vmmr_lock);
		return (ENOSPC);
	}

	vmmr_remove_raw(sz);

	if (!transient) {
		vmmr_free_sz -= sz;
	} else {
		vmmr_free_transient_sz -= sz;
	}
	mutex_exit(&vmmr_lock);
	page_unresv(sz >> PAGESHIFT);
	return (0);
}

int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	switch (cmd) {
	case VMM_RESV_QUERY: {
		struct vmm_resv_query res;
		void *datap = (void *)(uintptr_t)arg;

		/* For now, anyone in GZ can query */
		if (crgetzoneid(cr) != GLOBAL_ZONEID) {
			return (EPERM);
		}
		mutex_enter(&vmmr_lock);
		res.vrq_free_sz = vmmr_free_sz;
		res.vrq_alloc_sz = vmmr_alloc_sz;
		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
		res.vrq_limit = vmmr_total_limit;
		mutex_exit(&vmmr_lock);
		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
			return (EFAULT);
		}
		break;
	}
	case VMM_RESV_ADD: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}
		return (vmmr_add((size_t)arg, false));
	}
	case VMM_RESV_REMOVE: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}
		return (vmmr_remove((size_t)arg, false));
	}
	default:
		return (ENOTTY);
	}
	return (0);
}

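/*
 * An illustrative userland sketch of the administration interface handled by
 * vmmr_ioctl() above. The ioctl commands and `struct vmm_resv_query` come
 * from <sys/vmm_dev.h>; the device path and the omission of privilege and
 * error handling are assumptions made for brevity.
 *
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *
 *	(grow the reservoir by 1 GiB; the size is passed directly as the arg)
 *	ioctl(fd, VMM_RESV_ADD, (size_t)(1024 * 1024 * 1024));
 *
 *	(query the current accounting)
 *	struct vmm_resv_query q;
 *	ioctl(fd, VMM_RESV_QUERY, &q);
 *	(q.vrq_free_sz, q.vrq_alloc_sz, q.vrq_alloc_transient_sz, and
 *	 q.vrq_limit now hold the reservoir sizes, in bytes)
 */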