/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs. This memory is unavailable for general use by the rest of the
 * system. Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, they can be used immediately when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system. In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir. When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device. The total amount of memory
 * in the reservoir (free, or allocated to VMs) is limited by
 * `vmmr_total_limit` (see its definition for how this limit is calculated).
 *
 * The limit is in place to prevent the reservoir from inadvertently growing
 * to a size where the system has inadequate memory to make forward progress.
 * Shrinking the reservoir is only possible when it contains free (not
 * allocated by any guest VMs) memory.
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object).
 * This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple. It
 * means that spans of pages, either free or allocated, need only to store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span. Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
 * operator has chosen to dump all of RAM).
 */

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/kstat.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>

#define	VMMR_TARGET_INACTIVE	SIZE_MAX

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

/*
 * Target size of the reservoir during an active vmmr_set_target() operation.
 * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is
 * active.
 */
static size_t vmmr_target_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

static kstat_t *vmmr_kstat;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
	avl_tree_t by_addr;
	avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

struct vmmr_span {
	uintptr_t	vs_addr;
	size_t		vs_size;
	avl_node_t	vs_by_addr;
	avl_node_t	vs_by_size;
	uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
	size_t		vr_size;
	avl_tree_t	vr_spans;
	list_node_t	vr_node;
	bool		vr_transient;
};

typedef struct vmmr_kstats {
	kstat_named_t	vmrks_bytes_free;
	kstat_named_t	vmrks_bytes_alloc;
	kstat_named_t	vmrks_bytes_transient;
	kstat_named_t	vmrks_bytes_limit;
} vmmr_kstats_t;


static int vmmr_add(size_t, bool);
static int vmmr_remove(size_t, bool);

static int
vmmr_cmp_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_addr == sb->vs_addr) {
		return (0);
	} else if (sa->vs_addr < sb->vs_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_size == sb->vs_size) {
		/*
		 * Since discontiguous spans could have
		 * the same size in a by-size tree, differentiate them (as
		 * required by AVL) by address so they can safely coexist
		 * while remaining sorted.
		 */
		return (vmmr_cmp_addr(a, b));
	} else if (sa->vs_size < sb->vs_size) {
		return (-1);
	} else {
		return (1);
	}
}

static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
	const vmmr_span_t *sa = a;
	const vmmr_span_t *sb = b;

	if (sa->vs_region_addr == sb->vs_region_addr) {
		return (0);
	} else if (sa->vs_region_addr < sb->vs_region_addr) {
		return (-1);
	} else {
		return (1);
	}
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
	avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_addr));
	avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
	    offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
	void *vcp = NULL;
	vmmr_span_t *span;

	while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
		/* Freeing spans will be done when tearing down by-size tree */
	}
	while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
		kmem_free(span, sizeof (*span));
	}
	avl_destroy(&tree->by_addr);
	avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
 * span(s). Such concatenation could result in the `to_add` span being freed,
 * so the caller cannot use it after this returns.
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *node;
	avl_index_t where;

	/* This addr should not already exist in the treepair */
	node = avl_find(by_addr, to_add, &where);
	ASSERT3P(node, ==, NULL);

	node = avl_nearest(by_addr, where, AVL_BEFORE);
	if (node != NULL &&
	    (node->vs_addr + node->vs_size) == to_add->vs_addr) {
		/* concat with preceding item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_size += to_add->vs_size;
		kmem_free(to_add, sizeof (*to_add));

		/*
		 * Since this now-concatenated span could be adjacent to one
		 * trailing it, fall through to perform that check.
		 */
		to_add = node;
	}

	node = avl_nearest(by_addr, where, AVL_AFTER);
	if (node != NULL &&
	    (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
		/* concat with trailing item */
		avl_remove(by_addr, node);
		avl_remove(by_size, node);
		node->vs_addr = to_add->vs_addr;
		node->vs_size += to_add->vs_size;
		avl_add(by_addr, node);
		avl_add(by_size, node);

		kmem_free(to_add, sizeof (*to_add));
		return;
	}

	/* simply insert */
	avl_add(by_addr, to_add);
	avl_add(by_size, to_add);
}

/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is. May return a span
 * with a size smaller than the target if splitting is not an option.
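 *
 * For example (illustrative numbers, not taken from the code): a request for
 * 4 KiB against a treepair whose best-fit span is 16 KiB at address A returns
 * a new 4 KiB span carved from the tail of that span (at A + 12 KiB), while
 * the original span shrinks to 12 KiB and remains in the treepair.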
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
	avl_tree_t *by_addr = &tree->by_addr;
	avl_tree_t *by_size = &tree->by_size;
	vmmr_span_t *span;
	avl_index_t where;

	ASSERT3U(target_sz, !=, 0);
	ASSERT(!avl_is_empty(by_addr));
	ASSERT(!avl_is_empty(by_size));

	vmmr_span_t search = { .vs_size = target_sz };
	span = avl_find(by_size, &search, &where);
	if (span == NULL) {
		/* Try for a larger span (instead of exact match) */
		span = avl_nearest(by_size, where, AVL_AFTER);
		if (span == NULL) {
			/*
			 * Caller will need to collect several smaller spans in
			 * order to fulfill their request.
			 */
			span = avl_nearest(by_size, where, AVL_BEFORE);
			ASSERT3P(span, !=, NULL);
		}
	}

	if (span->vs_size <= target_sz) {
		avl_remove(by_size, span);
		avl_remove(by_addr, span);

		return (span);
	} else {
		/* Split off adequate chunk from larger span */
		uintptr_t start = span->vs_addr + span->vs_size - target_sz;

		avl_remove(by_size, span);
		span->vs_size -= target_sz;
		avl_add(by_size, span);

		vmmr_span_t *split_span =
		    kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
		split_span->vs_addr = start;
		split_span->vs_size = target_sz;

		return (split_span);
	}
}

static int
vmmr_kstat_update(struct kstat *ksp, int rw)
{
	vmmr_kstats_t *vkp = ksp->ks_data;

	mutex_enter(&vmmr_lock);
	vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
	vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
	/*
	 * In addition to the memory which is actually allocated to transient
	 * consumers, memory which is considered free-for-transient is also
	 * included in the sizing.
	 */
	vkp->vmrks_bytes_transient.value.ui64 =
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
	mutex_exit(&vmmr_lock);

	return (0);
}

int
vmmr_init()
{
	mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * `vmmr_total_limit` represents the absolute maximum size of the VMM
	 * memory reservoir. It is meant to provide some measure of protection
	 * against an operator pushing the system into unrecoverable memory
	 * starvation through explicit or transient additions to the reservoir.
	 *
	 * There will be many situations where this limit would be inadequate
	 * to prevent kernel memory starvation in the face of certain operator
	 * actions. It is a balance to be struck between safety and allowing
	 * large systems to reach high utilization.
	 *
	 * The value is based off of pages_pp_maximum: "Number of currently
	 * available pages that cannot be 'locked'". It is sized as all of
	 * `physmem` less 120% of `pages_pp_maximum`.
	 */
	vmmr_total_limit =
	    (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;

	vmmr_empty_last = 0;
	vmmr_free_sz = 0;
	vmmr_alloc_sz = 0;
	vmmr_empty_sz = 0;
	vmmr_adding_sz = 0;
	vmmr_free_transient_sz = 0;
	vmmr_alloc_transient_sz = 0;
	vmmr_target_sz = VMMR_TARGET_INACTIVE;

	/*
	 * Attempt kstat allocation early, since it is the only part of
	 * reservoir initialization which is fallible.
	 */
	kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
	    VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
	    sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
	if (ksp == NULL) {
		mutex_destroy(&vmmr_lock);
		return (ENOMEM);
	}

	vmmr_kstats_t *vkp = ksp->ks_data;

	kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
	    KSTAT_DATA_UINT64);
	kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
	    KSTAT_DATA_UINT64);
	ksp->ks_private = NULL;
	ksp->ks_update = vmmr_kstat_update;
	vmmr_kstat = ksp;

	vmmr_tp_init(&vmmr_free_tp);
	vmmr_tp_init(&vmmr_empty_tp);

	list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
	    offsetof(vmmr_region_t, vr_node));

	/* Grab a chunk of VA for the reservoir */
	vmmr_va_sz = physmem * PAGESIZE;
	vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);

	kstat_install(vmmr_kstat);

	return (0);
}

void
vmmr_fini()
{
	mutex_enter(&vmmr_lock);
	VERIFY3U(vmmr_alloc_sz, ==, 0);
	VERIFY3U(vmmr_free_sz, ==, 0);
	VERIFY3U(vmmr_adding_sz, ==, 0);
	VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
	VERIFY3U(vmmr_free_transient_sz, ==, 0);
	VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
	VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
	VERIFY(list_is_empty(&vmmr_alloc_regions));

	kstat_delete(vmmr_kstat);
	vmmr_kstat = NULL;

	vmmr_tp_destroy(&vmmr_free_tp);
	vmmr_tp_destroy(&vmmr_empty_tp);
	list_destroy(&vmmr_alloc_regions);

	/* Release reservoir VA chunk */
	vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
	vmmr_va = 0;
	vmmr_va_sz = 0;
	vmmr_total_limit = 0;
	vmmr_empty_last = 0;

	mutex_exit(&vmmr_lock);
	mutex_destroy(&vmmr_lock);
}

bool
vmmr_is_empty()
{
	mutex_enter(&vmmr_lock);
	bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
	    vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
	mutex_exit(&vmmr_lock);
	return (res);
}

int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);

	if (!transient) {
		mutex_enter(&vmmr_lock);
		if (sz > vmmr_free_sz) {
			mutex_exit(&vmmr_lock);
			return (ENOSPC);
		}
	} else {
		int err;

		mutex_enter(&vmmr_lock);
		err = vmmr_add(sz, true);
		if (err != 0) {
			mutex_exit(&vmmr_lock);
			return (err);
		}
		VERIFY3U(vmmr_free_transient_sz, >=, sz);
	}

	vmmr_region_t *region;
	region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
	avl_create(&region->vr_spans, vmmr_cmp_region_addr,
	    sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
	region->vr_size = sz;

	size_t remain = sz;
	uintptr_t map_at = 0;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * We have already ensured that adequate free memory is present
		 * in the reservoir for this allocation.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		span->vs_region_addr = map_at;
		avl_add(&region->vr_spans, span);
		map_at += span->vs_size;
		remain -= span->vs_size;
	}

	if (!transient) {
		vmmr_free_sz -= sz;
		vmmr_alloc_sz += sz;
	} else {
		vmmr_free_transient_sz -= sz;
		vmmr_alloc_transient_sz += sz;
		region->vr_transient = true;
	}
	list_insert_tail(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	*resp = region;
	return (0);
}

void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
	/* just use KPM region for now */
	return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
	VERIFY3U(off & PAGEOFFSET, ==, 0);
	VERIFY3U(off, <, region->vr_size);

	vmmr_span_t search = {
		.vs_region_addr = off
	};
	avl_index_t where;
	vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

	if (span == NULL) {
		span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
		ASSERT3P(span, !=, NULL);
	}
	uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
	page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
	VERIFY(pp != NULL);
	return (pp->p_pagenum);
}

void
vmmr_free(vmmr_region_t *region)
{
	mutex_enter(&vmmr_lock);
	if (!region->vr_transient) {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
	} else {
		VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
	}
	list_remove(&vmmr_alloc_regions, region);
	mutex_exit(&vmmr_lock);

	/* Zero the contents (while not monopolizing vmmr_lock) */
	for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
		bzero(vmmr_region_mem_at(region, off), PAGESIZE);
	}

	mutex_enter(&vmmr_lock);

	/* Put the contained span(s) back in the free pool */
	void *cookie = NULL;
	vmmr_span_t *span;
	while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
		span->vs_region_addr = 0;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}
	avl_destroy(&region->vr_spans);
	if (!region->vr_transient) {
		vmmr_free_sz += region->vr_size;
		vmmr_alloc_sz -= region->vr_size;
	} else {
		vmmr_free_transient_sz += region->vr_size;
		vmmr_alloc_transient_sz -= region->vr_size;
	}

	if (region->vr_transient) {
		/*
		 * Since the transient capacity was previously allocated for
		 * this region, its removal should not fail.
		 */
		VERIFY0(vmmr_remove(region->vr_size, true));
	}
	kmem_free(region, sizeof (*region));
	mutex_exit(&vmmr_lock);
}

static void
vmmr_destroy_pages(vmmr_span_t *span)
{
	const uintptr_t end = span->vs_addr + span->vs_size;
	struct vnode *vp = &kvps[KV_VVP];
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		/* Page-free logic cribbed from segkmem_xfree(): */
		pp = page_find(vp, (u_offset_t)pos);
		VERIFY(pp != NULL);
		if (!page_tryupgrade(pp)) {
			/*
			 * Some other thread has a sharelock. Wait for
			 * it to drop the lock so we can free this page.
			 */
			page_unlock(pp);
			pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
		}

		/*
		 * Clear p_lckcnt so page_destroy() doesn't update availrmem.
		 * That will be taken care of later via page_unresv().
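		 * (page_unresv() is invoked by vmmr_remove(), or by the
		 * unwind path in vmmr_add() when page allocation fails.)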
		 */
		pp->p_lckcnt = 0;
		page_destroy(pp, 0);
	}
}

static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
	struct seg kseg = {
		.s_as = &kas
	};
	struct vnode *vp = &kvps[KV_VVP];

	const uintptr_t end = span->vs_addr + span->vs_size;
	for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
		page_t *pp;

		pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
		    PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

		if (pp == NULL) {
			/* Destroy any already-created pages */
			if (pos != span->vs_addr) {
				vmmr_span_t destroy_span = {
					.vs_addr = span->vs_addr,
					.vs_size = pos - span->vs_addr,
				};

				vmmr_destroy_pages(&destroy_span);
			}
			return (ENOMEM);
		}

		/* mimic page state from segkmem */
		ASSERT(PAGE_EXCL(pp));
		page_io_unlock(pp);
		pp->p_lckcnt = 1;
		page_downgrade(pp);

		/* pre-zero the page */
		bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
	}

	return (0);
}

static int
vmmr_resv_wait()
{
	if (delay_sig(hz >> 2) != 0) {
		/* bail due to interruption */
		return (0);
	}
	return (1);
}

static void
vmmr_remove_raw(size_t sz)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	size_t remain = sz;
	while (remain > 0) {
		vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

		/*
		 * The caller must ensure that at least `sz` amount is present
		 * in the free treepair.
		 */
		VERIFY3P(span, !=, NULL);
		ASSERT3U(span->vs_size, <=, remain);

		/* TODO: perhaps arrange to destroy pages outside the lock? */
		vmmr_destroy_pages(span);

		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_empty_tp);
	}

	vmmr_empty_sz += sz;
}

/*
 * Add memory to vmm reservoir. Memory may be marked for transient use, where
 * the addition is part of a transient allocation from the reservoir. Otherwise
 * it is placed in the reservoir to be available for non-transient allocations.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * will drop it during portions of the addition.
 */
static int
vmmr_add(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY3U(sz, >, 0);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	/*
	 * Make sure that the amount added is not going to breach the limits
	 * we've chosen
	 */
	const size_t current_total =
	    vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
	    vmmr_alloc_transient_sz + vmmr_free_transient_sz;
	if ((current_total + sz) < current_total) {
		return (EOVERFLOW);
	}
	if ((current_total + sz) > vmmr_total_limit) {
		return (ENOSPC);
	}
	vmmr_adding_sz += sz;
	mutex_exit(&vmmr_lock);

	/* Wait for enough pages to become available */
	if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
		mutex_enter(&vmmr_lock);
		vmmr_adding_sz -= sz;
		return (EINTR);
	}

	mutex_enter(&vmmr_lock);
	size_t added = 0;
	size_t remain = sz;
	while (added < sz) {
		vmmr_span_t *span = NULL;

		if (vmmr_empty_sz > 0) {
			span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

			vmmr_empty_sz -= span->vs_size;
		} else {
			/*
			 * No empty space to fill with new pages, so just tack
			 * it on at the end instead.
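			 * (vmmr_empty_last is the next never-before-used
			 * offset in the backing vnode, so this new span lands
			 * beyond any previously created span.)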
			 */
			span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
			span->vs_addr = vmmr_empty_last;
			span->vs_size = remain;
			vmmr_empty_last += remain;
		}
		VERIFY3P(span, !=, NULL);

		/* Allocate the actual pages to back this span */
		mutex_exit(&vmmr_lock);
		int err = vmmr_alloc_pages(span);
		mutex_enter(&vmmr_lock);

		/*
		 * If an error is encountered during page allocation for the
		 * span, unwind any progress made by the addition request.
		 */
		if (err != 0) {
			/*
			 * Without pages allocated to this span, it is now
			 * tracked as empty.
			 */
			vmmr_empty_sz += span->vs_size;
			vmmr_tp_insert_concat(span, &vmmr_empty_tp);

			if (added != 0) {
				vmmr_remove_raw(added);
			}

			vmmr_adding_sz -= sz;

			page_unresv(sz >> PAGESHIFT);
			return (err);
		}

		/*
		 * The allocated-page-bearing span is placed in the "free"
		 * treepair now, but is not officially exposed for consumption
		 * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
		 *
		 * This allows us to unwind the allocation in case of a failure
		 * without the risk of the freshly added span(s) being snapped
		 * up by a consumer already.
		 */
		added += span->vs_size;
		remain -= span->vs_size;
		vmmr_tp_insert_concat(span, &vmmr_free_tp);
	}

	/* Make the added memory usable by exposing it to the size accounting */
	if (!transient) {
		vmmr_free_sz += added;
	} else {
		vmmr_free_transient_sz += added;
	}
	ASSERT3U(added, ==, sz);
	vmmr_adding_sz -= added;

	return (0);
}

/*
 * Remove memory from vmm reservoir. Normally this will remove memory from the
 * reservoir which was available for non-transient allocations. If the removal
 * is part of a vmmr_free() of a transient allocation, it will act on only that
 * transient region being freed, not the available memory in the reservoir.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * may drop it during portions of the removal.
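 *
 * (Unlike vmmr_remove_raw(), which destroys the backing pages and shifts the
 * spans from the free treepair into the empty treepair, this also updates the
 * free-size tallies and releases the page reservation via page_unresv().)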
 */
static int
vmmr_remove(size_t sz, bool transient)
{
	VERIFY3U(sz & PAGEOFFSET, ==, 0);
	VERIFY(sz);
	VERIFY(MUTEX_HELD(&vmmr_lock));

	if ((!transient && sz > vmmr_free_sz) ||
	    (transient && sz > vmmr_free_transient_sz)) {
		return (ENOSPC);
	}

	vmmr_remove_raw(sz);

	if (!transient) {
		vmmr_free_sz -= sz;
	} else {
		vmmr_free_transient_sz -= sz;
	}
	page_unresv(sz >> PAGESHIFT);
	return (0);
}

static int
vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
{
	VERIFY(resp != NULL);

	mutex_enter(&vmmr_lock);

	size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;

	/* Be sure to communicate current size in case of an early bail-out */
	*resp = current_sz;

	if ((target_sz & PAGEOFFSET) != 0 ||
	    (chunk_sz & PAGEOFFSET) != 0) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}
	/* Reject sentinel value */
	if (target_sz == VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EINVAL);
	}

	/* Already at target size */
	if (target_sz == current_sz) {
		mutex_exit(&vmmr_lock);
		return (0);
	}

	/* Reject racing resize requests */
	if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
		mutex_exit(&vmmr_lock);
		return (EALREADY);
	}
	/* Record the target now to exclude a racing request */
	vmmr_target_sz = target_sz;

	int err = 0;
	do {
		/* Be sensitive to signal interruption */
		if (issig(JUSTLOOKING) != 0) {
			mutex_exit(&vmmr_lock);
			const bool sig_bail = issig(FORREAL) != 0;
			mutex_enter(&vmmr_lock);
			if (sig_bail) {
				err = EINTR;
				break;
			}
		}

		if (current_sz > target_sz) {
			/* Shrinking reservoir */

			size_t req_sz = current_sz - target_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_remove(req_sz, false);
		} else {
			/* Growing reservoir */
			ASSERT(current_sz < target_sz);

			size_t req_sz = target_sz - current_sz;
			if (chunk_sz != 0) {
				req_sz = MIN(req_sz, chunk_sz);
			}
			err = vmmr_add(req_sz, false);
		}

		current_sz = vmmr_alloc_sz + vmmr_free_sz;
	} while (err == 0 && current_sz != target_sz);

	/* Clear the target now that we are done (success or not) */
	vmmr_target_sz = VMMR_TARGET_INACTIVE;
	mutex_exit(&vmmr_lock);
	*resp = current_sz;
	return (err);
}

int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
	/*
	 * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
	 * do not need to duplicate such checks here.
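	 *
	 * As an illustrative sketch (not part of this driver), a privileged
	 * userspace consumer might drive this interface roughly as follows;
	 * the /dev/vmmctl device path is an assumption for the example:
	 *
	 *	struct vmm_resv_target tgt = {
	 *		.vrt_target_sz = 2UL << 30,	// grow to 2 GiB
	 *		.vrt_chunk_sz = 64UL << 20,	// in 64 MiB steps
	 *	};
	 *	int fd = open("/dev/vmmctl", O_RDWR);
	 *	if (fd >= 0 && ioctl(fd, VMM_RESV_SET_TARGET, &tgt) == 0) {
	 *		// tgt.vrt_result_sz now holds the resulting size
	 *	}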
	 */

	switch (cmd) {
	case VMM_RESV_QUERY: {
		struct vmm_resv_query res;
		void *datap = (void *)(uintptr_t)arg;

		/* For now, anyone with access to vmmctl device can query */
		mutex_enter(&vmmr_lock);
		res.vrq_free_sz = vmmr_free_sz;
		res.vrq_alloc_sz = vmmr_alloc_sz;
		res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
		res.vrq_limit = vmmr_total_limit;
		mutex_exit(&vmmr_lock);
		if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
			return (EFAULT);
		}
		break;
	}
	case VMM_RESV_SET_TARGET: {
		if (secpolicy_sys_config(cr, B_FALSE) != 0) {
			return (EPERM);
		}

		struct vmm_resv_target tgt;
		void *datap = (void *)(uintptr_t)arg;

		if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
			return (EFAULT);
		}

		int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
		    &tgt.vrt_result_sz);

		/*
		 * Attempt to communicate the resultant size of the reservoir
		 * if setting it to the target was a success, or if we were
		 * interrupted (by a signal) while doing so.
		 */
		if (err == 0 || err == EINTR) {
			if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
				err = EFAULT;
			}
		}

		return (err);
	}
	default:
		return (ENOTTY);
	}
	return (0);
}