/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs. This memory is unavailable for general use by the rest of the
 * system. Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, they can be used immediately when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system. In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir. When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device. The total amount of memory
 * in the reservoir (free, or allocated to VMs) is limited by
 * `vmmr_total_limit` (see its definition for how this limit is calculated).
 *
 * The limit is in place to prevent the reservoir from inadvertently growing
 * to a size where the system has inadequate memory to make forward progress.
 * Shrinking the reservoir is only possible when it contains free (not
 * allocated by any guest VMs) memory.
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object). This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple. It
 * means that spans of pages, either free or allocated, need only to store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span. Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
 * operator has chosen to dump all of RAM).
 */
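
/*
 * To make the accounting described above concrete, consider a hypothetical
 * 2 GiB transient allocation (the size is illustrative only):
 *
 *	1. vmmr_alloc(sz, true, ...) first calls vmmr_add(sz, true), which
 *	   allocates and pre-zeroes 2 GiB of pages and credits them to
 *	   vmmr_free_transient_sz.
 *	2. The allocation is then carved from the free treepair, moving the
 *	   2 GiB from vmmr_free_transient_sz to vmmr_alloc_transient_sz.
 *	3. When the region is later passed to vmmr_free(), the pages are
 *	   zeroed and returned to the free treepair, the tally moves back to
 *	   vmmr_free_transient_sz, and vmmr_remove(sz, true) releases the
 *	   pages to the system rather than retaining them in the reservoir.
 *
 * A non-transient allocation of the same size skips steps 1 and 3: it simply
 * moves the 2 GiB between vmmr_free_sz and vmmr_alloc_sz.
 */
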

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/kstat.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>

#define	VMMR_TARGET_INACTIVE	SIZE_MAX

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

/*
 * Target size of the reservoir during active vmmr_set_target() operation.
 * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is
 * active.
 */
static size_t vmmr_target_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

static kstat_t *vmmr_kstat;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
	avl_tree_t by_addr;
	avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

struct vmmr_span {
	uintptr_t	vs_addr;
	size_t		vs_size;
	avl_node_t	vs_by_addr;
	avl_node_t	vs_by_size;
	uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
	size_t		vr_size;
	avl_tree_t	vr_spans;
	list_node_t	vr_node;
	bool		vr_transient;
};

typedef struct vmmr_kstats {
	kstat_named_t	vmrks_bytes_free;
	kstat_named_t	vmrks_bytes_alloc;
	kstat_named_t	vmrks_bytes_transient;
	kstat_named_t	vmrks_bytes_limit;
} vmmr_kstats_t;


static int vmmr_add(size_t, bool);
static int vmmr_remove(size_t, bool);
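
/*
 * As an illustration of the treepair bookkeeping (addresses here are
 * hypothetical), three free spans might be tracked as follows:
 *
 *	span A: vs_addr=0x0000, vs_size=0x4000
 *	span B: vs_addr=0x6000, vs_size=0x1000
 *	span C: vs_addr=0x8000, vs_size=0x2000
 *
 *	by_addr ordering: A, B, C
 *	by_size ordering: B, C, A
 *
 * Every span is linked into both trees of its treepair, so a span can be
 * located either by where it sits in the reservoir (by_addr) or by how large
 * it is (by_size), as the allocation and concatenation logic below requires.
 */
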

static int
vmmr_cmp_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_addr == sb->vs_addr) {
		return (0);
	} else if (sa->vs_addr < sb->vs_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_size == sb->vs_size) {
		/*
		 * Since discontiguous spans could have the same size in a
		 * by-size tree, differentiate them (as required by AVL) by
		 * address so they can safely coexist while remaining sorted.
		 */
		return (vmmr_cmp_addr(a, b));
	} else if (sa->vs_size < sb->vs_size) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_region_addr == sb->vs_region_addr) {
		return (0);
	} else if (sa->vs_region_addr < sb->vs_region_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_addr));
	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
	void *vcp = NULL;
	vmmr_span_t *span;

	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
		/* Freeing spans will be done when tearing down by-size tree */
	}

	/* The destruction cookie must be reset before walking a new tree */
	vcp = NULL;
	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
		kmem_free(span, sizeof (*span));
	}
	avl_destroy(&tree->by_addr);
	avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
 * span(s). Such concatenation could result in the `to_add` span being freed,
 * so the caller cannot use it after this returns.
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *node;
	avl_index_t where;

	/* This addr should not already exist in the treepair */
	node = avl_find(by_addr, to_add, &where);
	ASSERT3P(node, ==, NULL);

	node = avl_nearest(by_addr, where, AVL_BEFORE);
	if (node != NULL &&
	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
		/* concat with preceding item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_size += to_add->vs_size;
		kmem_free(to_add, sizeof (*to_add));

		/*
		 * Since this now-concatenated span could be adjacent to one
		 * trailing it, fall through to perform that check.
		 */
		to_add = node;
	}

	node = avl_nearest(by_addr, where, AVL_AFTER);
	if (node != NULL &&
	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
		/* concat with trailing item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_addr = to_add->vs_addr;
		node->vs_size += to_add->vs_size;
		avl_add(by_addr, node);
		avl_add(by_size, node);

		kmem_free(to_add, sizeof (*to_add));
		return;
	}

	/* simply insert */
	avl_add(by_addr, to_add);
	avl_add(by_size, to_add);
}
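
/*
 * For example (with purely illustrative addresses), if the free treepair
 * already holds spans [0x1000, +0x1000) and [0x3000, +0x2000), then inserting
 * a span [0x2000, +0x1000) via vmmr_tp_insert_concat() will first merge it
 * with the preceding span and then with the trailing one, leaving a single
 * free span [0x1000, +0x4000) in both trees.
 */
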

/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is. May return a span
 * with a size smaller than the target if no adequately sized span is present.
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *span;
	avl_index_t where;

	ASSERT3U(target_sz, !=, 0);
	ASSERT(!avl_is_empty(by_addr));
	ASSERT(!avl_is_empty(by_size));

	vmmr_span_t search = { .vs_size = target_sz };
	span = avl_find(by_size, &search, &where);
	if (span == NULL) {
		/* Try for a larger span (instead of exact match) */
		span = avl_nearest(by_size, where, AVL_AFTER);
		if (span == NULL) {
			/*
			 * Caller will need to collect several smaller spans in
			 * order to fulfill their request.
			 */
			span = avl_nearest(by_size, where, AVL_BEFORE);
			ASSERT3P(span, !=, NULL);
		}
	}

	if (span->vs_size <= target_sz) {
		avl_remove(by_size, span);
		avl_remove(by_addr, span);

		return (span);
	} else {
		/* Split off adequate chunk from larger span */
		uintptr_t start = span->vs_addr + span->vs_size - target_sz;

		avl_remove(by_size, span);
		span->vs_size -= target_sz;
		avl_add(by_size, span);

		vmmr_span_t *split_span =
		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
		split_span->vs_addr = start;
		split_span->vs_size = target_sz;

		return (split_span);
	}
}
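
/*
 * As a worked example (sizes illustrative, assuming 4 KiB pages), removing a
 * 3-page span from a treepair whose smallest adequate span is 16 pages at
 * 0x10000 results in:
 *
 *	- the existing span shrinking to 13 pages at 0x10000, and
 *	- a newly allocated 3-page span at 0x1d000 being returned.
 *
 * The split is taken from the tail of the larger span, so the remaining span
 * keeps its original address. If the treepair held nothing larger than, say,
 * a 2-page span, that smaller span would be returned instead and the caller
 * would loop to collect the remainder.
 */
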

static int
vmmr_kstat_update(struct kstat *ksp, int rw)
{
	vmmr_kstats_t *vkp = ksp->ks_data;

	mutex_enter(&vmmr_lock);
	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
	/*
	 * In addition to the memory which is actually allocated to transient
	 * consumers, memory which is considered free-for-transient is also
	 * included in the sizing.
	 */
	vkp->vmrks_bytes_transient.value.ui64 =
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
	mutex_exit(&vmmr_lock);

	return (0);
}

int
vmmr_init()
{
	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
	 * memory reservoir. It is meant to provide some measure of protection
	 * against an operator pushing the system into unrecoverable memory
	 * starvation through explicit or transient additions to the reservoir.
	 *
	 * There will be many situations where this limit would be inadequate
	 * to prevent kernel memory starvation in the face of certain operator
	 * actions. It is a balance to be struck between safety and allowing
	 * large systems to reach high utilization.
	 *
	 * The value is based on pages_pp_maximum: "Number of currently
	 * available pages that cannot be 'locked'". It is sized as all of
	 * `physmem` less 120% of `pages_pp_maximum`.
	 */
	vmmr_total_limit =
	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;

	vmmr_empty_last = 0;
	vmmr_free_sz = 0;
	vmmr_alloc_sz = 0;
	vmmr_empty_sz = 0;
	vmmr_adding_sz = 0;
	vmmr_free_transient_sz = 0;
	vmmr_alloc_transient_sz = 0;
	vmmr_target_sz = VMMR_TARGET_INACTIVE;

	/*
	 * Attempt kstat allocation early, since it is the only part of
	 * reservoir initialization which is fallible.
	 */
	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
	if (ksp == NULL) {
		mutex_destroy(&vmmr_lock);
		return (ENOMEM);
	}

	vmmr_kstats_t *vkp = ksp->ks_data;

	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
	    KSTAT_DATA_UINT64);
	ksp->ks_private = NULL;
	ksp->ks_update = vmmr_kstat_update;
	vmmr_kstat = ksp;

	vmmr_tp_init(&vmmr_free_tp);
	vmmr_tp_init(&vmmr_empty_tp);

	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
	    offsetof(vmmr_region_t, vr_node));

	/* Grab a chunk of VA for the reservoir */
	vmmr_va_sz = physmem * PAGESIZE;
	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);

	kstat_install(vmmr_kstat);

	return (0);
}

void
vmmr_fini()
{
	mutex_enter(&vmmr_lock);
	VERIFY3U(vmmr_alloc_sz, ==, 0);
	VERIFY3U(vmmr_free_sz, ==, 0);
	VERIFY3U(vmmr_adding_sz, ==, 0);
	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
	VERIFY3U(vmmr_free_transient_sz, ==, 0);
	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
	VERIFY(list_is_empty(&vmmr_alloc_regions));

	kstat_delete(vmmr_kstat);
	vmmr_kstat = NULL;

	vmmr_tp_destroy(&vmmr_free_tp);
	vmmr_tp_destroy(&vmmr_empty_tp);
	list_destroy(&vmmr_alloc_regions);

	/* Release reservoir VA chunk */
	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
	vmmr_va = 0;
	vmmr_va_sz = 0;
	vmmr_total_limit = 0;
	vmmr_empty_last = 0;

	mutex_exit(&vmmr_lock);
	mutex_destroy(&vmmr_lock);
}

bool
vmmr_is_empty()
{
	mutex_enter(&vmmr_lock);
	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
	mutex_exit(&vmmr_lock);
	return (res);
}
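
/*
 * A rough sketch of how a kernel consumer (such as the guest memory object
 * code) is expected to use the interfaces below; error handling is elided and
 * the 1 GiB size is illustrative only:
 *
 *	vmmr_region_t *region;
 *	if (vmmr_alloc(1UL << 30, transient, &region) == 0) {
 *		// Look up backing pages (or their KPM mappings) as needed
 *		pfn_t pfn = vmmr_region_pfn_at(region, 0);
 *		void *va = vmmr_region_mem_at(region, PAGESIZE);
 *		...
 *		// When the guest is torn down, return the memory
 *		vmmr_free(region);
 *	}
 *
 * Both the size requested and the offsets passed in must be page-aligned, as
 * the VERIFYs in the functions below enforce.
 */
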

int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	if (!transient) {
		mutex_enter(&vmmr_lock);
		if (sz > vmmr_free_sz) {
			mutex_exit(&vmmr_lock);
			return (ENOSPC);
		}
	} else {
		int err;

		mutex_enter(&vmmr_lock);
		err = vmmr_add(sz, true);
		if (err != 0) {
			mutex_exit(&vmmr_lock);
			return (err);
		}
		VERIFY3U(vmmr_free_transient_sz, >=, sz);
	}

	vmmr_region_t *region;
	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
	region->vr_size = sz;

	size_t remain = sz;
	uintptr_t map_at = 0;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * We have already ensured that adequate free memory is present
		 * in the reservoir for this allocation.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
		map_at += span->vs_size;
		remain -= span->vs_size;
	}

	if (!transient) {
		vmmr_free_sz -= sz;
		vmmr_alloc_sz += sz;
	} else {
		vmmr_free_transient_sz -= sz;
		vmmr_alloc_transient_sz += sz;
		region->vr_transient = true;
	}
	list_insert_tail(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	*resp = region;
	return (0);
}

void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
	/* just use KPM region for now */
	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
	VERIFY3U(off & PAGEOFFSET, ==, 0);
	VERIFY3U(off, <, region->vr_size);

	vmmr_span_t search = {
		.vs_region_addr = off
	};
	avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

	if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
		ASSERT3P(span, !=, NULL);
	}
	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
	VERIFY(pp != NULL);
	return (pp->p_pagenum);
}
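
/*
 * To illustrate the span lookup above (offsets hypothetical): a region built
 * from two spans, one covering region offsets [0, 0x3000) backed by reservoir
 * address 0x10000 and one covering [0x3000, 0x5000) backed by 0x40000, will
 * resolve a request for offset 0x4000 to the second span (the nearest span at
 * or before that offset) and thus to the page at vnode offset 0x41000.
 */
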

void
vmmr_free(vmmr_region_t *region)
{
	mutex_enter(&vmmr_lock);
	if (!region->vr_transient) {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
	} else {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
	}
	list_remove(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	/* Zero the contents (while not monopolizing vmmr_lock) */
	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
	}

	mutex_enter(&vmmr_lock);

	/* Put the contained span(s) back in the free pool */
	void *cookie = NULL;
	vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
		span->vs_region_addr = 0;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}
	avl_destroy(&region->vr_spans);
	if (!region->vr_transient) {
		vmmr_free_sz += region->vr_size;
		vmmr_alloc_sz -= region->vr_size;
	} else {
		vmmr_free_transient_sz += region->vr_size;
		vmmr_alloc_transient_sz -= region->vr_size;
	}

	if (region->vr_transient) {
		/*
		 * Since the transient capacity was previously allocated for
		 * this region, its removal should not fail.
		 */
		VERIFY0(vmmr_remove(region->vr_size, true));
	}
	kmem_free(region, sizeof (*region));
	mutex_exit(&vmmr_lock);
}

static void
vmmr_destroy_pages(vmmr_span_t *span)
{
	const uintptr_t end = span->vs_addr + span->vs_size;
	struct vnode *vp = &kvps[KV_VVP];
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		/* Page-free logic cribbed from segkmem_xfree(): */
		pp = page_find(vp, (u_offset_t)pos);
		VERIFY(pp != NULL);
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
		}

		/*
		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
		 * That will be taken care of later via page_unresv().
		 */
		pp->p_lckcnt = 0;
		page_destroy(pp, 0);
	}
}

static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
	struct seg kseg = {
		.s_as = &kas
	};
	struct vnode *vp = &kvps[KV_VVP];

	const uintptr_t end = span->vs_addr + span->vs_size;
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

		if (pp == NULL) {
			/* Destroy any already-created pages */
			if (pos != span->vs_addr) {
				vmmr_span_t destroy_span = {
					.vs_addr = span->vs_addr,
					.vs_size = pos - span->vs_addr,
				};

				vmmr_destroy_pages(&destroy_span);
			}
			return (ENOMEM);
		}

		/* mimic page state from segkmem */
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		pp->p_lckcnt = 1;
		page_downgrade(pp);

		/* pre-zero the page */
		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
	}

	return (0);
}

static int
vmmr_resv_wait()
{
	if (delay_sig(hz >> 2) != 0) {
		/* bail due to interruption */
		return (0);
	}
	return (1);
}

static void
vmmr_remove_raw(size_t sz)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	size_t remain = sz;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * The caller must ensure that at least `sz` amount is present
		 * in the free treepair.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		/* TODO: perhaps arrange to destroy pages outside the lock? */
		vmmr_destroy_pages(span);

		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
	}

	vmmr_empty_sz += sz;
}
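
/*
 * A note on the page reservation accounting used below: every successful
 * vmmr_add() reserves (sz >> PAGESHIFT) pages up front via page_xresv(), and
 * that reservation is released by page_unresv() either when the addition is
 * unwound on failure or when a matching amount is later removed via
 * vmmr_remove(). This is also why vmmr_destroy_pages() clears p_lckcnt before
 * page_destroy(): the availrmem adjustment is deferred to that later
 * page_unresv() call.
 */
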

/*
 * Add memory to vmm reservoir. Memory may be marked for transient use, where
 * the addition is part of a transient allocation from the reservoir. Otherwise
 * it is placed in the reservoir to be available for non-transient allocations.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * will drop it during portions of the addition.
 */
static int
vmmr_add(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY3U(sz, >, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	/*
	 * Make sure that the amount added is not going to breach the limits
	 * we've chosen
	 */
	const size_t current_total =
	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	if ((current_total + sz) < current_total) {
		return (EOVERFLOW);
	}
	if ((current_total + sz) > vmmr_total_limit) {
		return (ENOSPC);
	}
	vmmr_adding_sz += sz;
	mutex_exit(&vmmr_lock);

	/* Wait for enough pages to become available */
	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
		mutex_enter(&vmmr_lock);
		vmmr_adding_sz -= sz;
		return (EINTR);
	}

	mutex_enter(&vmmr_lock);
	size_t added = 0;
	size_t remain = sz;
	while (added < sz) {
		vmmr_span_t *span = NULL;

		if (vmmr_empty_sz > 0) {
			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

			vmmr_empty_sz -= span->vs_size;
		} else {
			/*
			 * No empty space to fill with new pages, so just tack
			 * it on at the end instead.
			 */
			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
			span->vs_addr = vmmr_empty_last;
			span->vs_size = remain;
			vmmr_empty_last += remain;
		}
		VERIFY3P(span, !=, NULL);

		/* Allocate the actual pages to back this span */
		mutex_exit(&vmmr_lock);
		int err = vmmr_alloc_pages(span);
		mutex_enter(&vmmr_lock);

		/*
		 * If an error is encountered during page allocation for the
		 * span, unwind any progress made by the addition request.
		 */
		if (err != 0) {
			/*
			 * Without pages allocated to this span, it is now
			 * tracked as empty.
			 */
			vmmr_empty_sz += span->vs_size;
			vmmr_tp_insert_concat(span, &vmmr_empty_tp);

			if (added != 0) {
				vmmr_remove_raw(added);
			}

			vmmr_adding_sz -= sz;

			page_unresv(sz >> PAGESHIFT);
			return (err);
		}

		/*
		 * The allocated-page-bearing span is placed in the "free"
		 * treepair now, but is not officially exposed for consumption
		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
		 *
		 * This allows us to unwind the allocation in case of a failure
		 * without the risk of the freshly added span(s) being snapped
		 * up by a consumer already.
		 */
		added += span->vs_size;
		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}

	/* Make the added memory usable by exposing it to the size accounting */
	if (!transient) {
		vmmr_free_sz += added;
	} else {
		vmmr_free_transient_sz += added;
	}
	ASSERT3U(added, ==, sz);
	vmmr_adding_sz -= added;

	return (0);
}

/*
 * Remove memory from vmm reservoir. Normally this will remove memory from the
 * reservoir which was available for non-transient allocations. If the removal
 * is part of a vmmr_free() of a transient allocation, it will act on only that
 * transient region being freed, not the available memory in the reservoir.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * may drop it during portions of the removal.
 */
static int
vmmr_remove(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(sz);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	if ((!transient && sz > vmmr_free_sz) ||
	    (transient && sz > vmmr_free_transient_sz)) {
		return (ENOSPC);
	}

	vmmr_remove_raw(sz);

	if (!transient) {
		vmmr_free_sz -= sz;
	} else {
		vmmr_free_transient_sz -= sz;
	}
	page_unresv(sz >> PAGESHIFT);
	return (0);
}

static int
vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
{
	VERIFY(resp != NULL);

	mutex_enter(&vmmr_lock);

	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;

	/* Be sure to communicate current size in case of an early bail-out */
	*resp = current_sz;

	if ((target_sz & PAGEOFFSET) != 0 ||
	    (chunk_sz & PAGEOFFSET) != 0) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}
	/* Reject sentinel value */
	if (target_sz == VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}

	/* Already at target size */
	if (target_sz == current_sz) {
		mutex_exit(&vmmr_lock);
		return (0);
	}

	/* Reject racing requests */
	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EALREADY);
	}
	/* Record the target now to exclude any racing requests */
	vmmr_target_sz = target_sz;

	int err = 0;
	do {
		/* Be sensitive to signal interruption */
		if (issig(JUSTLOOKING) != 0) {
			mutex_exit(&vmmr_lock);
			const bool sig_bail = issig(FORREAL) != 0;
			mutex_enter(&vmmr_lock);
			if (sig_bail) {
				err = EINTR;
				break;
			}
		}

		if (current_sz > target_sz) {
			/* Shrinking reservoir */

			size_t req_sz = current_sz - target_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_remove(req_sz, false);
		} else {
			/* Growing reservoir */
			ASSERT(current_sz < target_sz);

			size_t req_sz = target_sz - current_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_add(req_sz, false);
		}

		current_sz = vmmr_alloc_sz + vmmr_free_sz;
	} while (err == 0 && current_sz != target_sz);

	/* Clear the target now that we are done (success or not) */
	vmmr_target_sz = VMMR_TARGET_INACTIVE;
	mutex_exit(&vmmr_lock);
	*resp = current_sz;
	return (err);
}
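
/*
 * For reference, a rough sketch of how an operator-facing tool might drive the
 * ioctls handled below (userspace code, not part of this file; the /dev/vmmctl
 * path is assumed and error handling is elided; sizes are illustrative):
 *
 *	struct vmm_resv_target tgt = {
 *		.vrt_target_sz = 8UL * 1024 * 1024 * 1024,
 *		.vrt_chunk_sz = 1UL * 1024 * 1024 * 1024,
 *	};
 *	int fd = open("/dev/vmmctl", O_RDWR);
 *	if (ioctl(fd, VMM_RESV_SET_TARGET, &tgt) != 0) {
 *		// On EINTR, vrt_result_sz still reflects the achieved size
 *	}
 *
 * VMM_RESV_QUERY fills a struct vmm_resv_query with the free/allocated/
 * transient byte counts and the configured limit, and requires no special
 * privilege beyond access to the vmmctl device, while VMM_RESV_SET_TARGET
 * requires sys_config privilege.
 */
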

int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	/*
	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
	 * do not need to duplicate such checks here.
	 */

	switch (cmd) {
	case VMM_RESV_QUERY: {
		struct vmm_resv_query res;
		void *datap = (void *)(uintptr_t)arg;

		/* For now, anyone with access to vmmctl device can query */
		mutex_enter(&vmmr_lock);
		res.vrq_free_sz = vmmr_free_sz;
		res.vrq_alloc_sz = vmmr_alloc_sz;
		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
		res.vrq_limit = vmmr_total_limit;
		mutex_exit(&vmmr_lock);
		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
			return (EFAULT);
		}
		break;
	}
	case VMM_RESV_SET_TARGET: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}

		struct vmm_resv_target tgt;
		void *datap = (void *)(uintptr_t)arg;

		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
			return (EFAULT);
		}

		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
		    &tgt.vrt_result_sz);

		/*
		 * Attempt to communicate the resultant size of the reservoir
		 * if setting it to the target was a success, or if we were
		 * interrupted (by a signal) while doing so.
		 */
		if (err == 0 || err == EINTR) {
			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
				err = EFAULT;
			}
		}

		return (err);
	}
	default:
		return (ENOTTY);
	}
	return (0);
}