/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2023 Oxide Computer Company
 */

/*
 * VMM Memory Reservoir
 *
 *
 * In order to make the allocation of large (multi-GiB) chunks of memory
 * for bhyve VMs easier, we introduce the "VMM Reservoir", where system
 * operators can set aside a substantial portion of system memory exclusively
 * for VMs. This memory is unavailable for general use by the rest of the
 * system. Rather than having to scour the freelist, reap kmem caches, or put
 * pressure on the ARC, bhyve guest memory allocations can quickly determine if
 * there is adequate reservoir memory available. Since the pages stored in the
 * reservoir are pre-zeroed, they can be used immediately when allocated to a
 * guest. When the memory is returned to the reservoir, it is zeroed once more
 * to avoid leaking any sensitive data from that guest.
 *
 *
 * Transient Allocations
 *
 * While the explicit reservoir model may work well for some applications,
 * others may want a more traditional model, where pages for guest memory
 * objects are allocated on demand, rather than from a pool set aside from the
 * system. In this case, the allocation can be made in "transient" mode, where
 * the memory is allocated normally, even if there is free capacity in the
 * reservoir. When use of the transient allocation is complete (the guest is
 * halted and destroyed), the pages will be freed back to the system, rather
 * than added back to the reservoir.
 *
 * From an implementation standpoint, transient allocations follow the same
 * code paths as ones using the reservoir normally. Those allocations have a
 * tag which marks them as transient, and used/free size tallies are maintained
 * separately for normal and transient operations. When performing a transient
 * allocation, that amount of memory is immediately added to the reservoir,
 * from which the allocation can be made. When freeing a transient allocation,
 * a matching amount of memory is removed from the reservoir as part of the
 * operation. This allows both allocation types to coexist without too much
 * additional machinery.
 *
 *
 * Administration
 *
 * Operators may attempt to alter the amount of memory allocated to the
 * reservoir via an ioctl against the vmmctl device. The total amount of memory
 * in the reservoir (free, or allocated to VMs) is arbitrarily limited at this
 * time by `vmmr_total_limit`, which defaults to 80% of physmem. This is done
 * to prevent the reservoir from inadvertently growing to a size where the
 * system has inadequate memory to make forward progress. Shrinking the
 * reservoir is only possible when it contains free (not allocated by any guest
 * VMs) memory.
 *
 *
 * Page Tracking
 *
 * The reservoir currently uses vnode association to keep track of pages under
 * its control (either designated to the reservoir and free, or allocated to a
 * guest VM object). This means using the existing VM system primitives for
 * page_t instances being associated with a given (vnode, offset) tuple. It
 * means that spans of pages, either free or allocated, need only store a
 * length (of the span) and an offset (into the vnode) in order to gain access
 * to all of the underlying pages associated with that span. Associating the
 * pages against `kvps[KV_VVP]` (the VMM kernel vnode) means they will be
 * properly tracked as KAS pages, but be excluded from normal dumps (unless the
 * operator has chosen to dump all of RAM).
 */
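
/*
 * As an informal illustration of the administrative interface described
 * above, a userland tool might query and resize the reservoir roughly as
 * follows. This sketch is not part of this module: the "/dev/vmmctl" path is
 * assumed, the struct and ioctl names are taken from vmm_dev.h as consumed by
 * vmmr_ioctl() at the bottom of this file, and error handling and headers are
 * omitted. VMM_RESV_SET_TARGET additionally requires sys_config privilege.
 *
 *    int fd = open("/dev/vmmctl", O_RDWR);
 *
 *    struct vmm_resv_query q = { 0 };
 *    if (ioctl(fd, VMM_RESV_QUERY, &q) == 0) {
 *        // q.vrq_free_sz, q.vrq_alloc_sz, q.vrq_alloc_transient_sz, and
 *        // q.vrq_limit describe the current state of the reservoir.
 *    }
 *
 *    struct vmm_resv_target t = {
 *        .vrt_target_sz = 8UL * 1024 * 1024 * 1024,    // grow to 8 GiB
 *        .vrt_chunk_sz = 512UL * 1024 * 1024,          // in 512 MiB steps
 *    };
 *    // On success (or EINTR), t.vrt_result_sz holds the resulting size.
 *    (void) ioctl(fd, VMM_RESV_SET_TARGET, &t);
 */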

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/machparam.h>
#include <sys/kmem.h>
#include <sys/stddef.h>
#include <sys/null.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/policy.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <sys/kstat.h>

#include <sys/vmm_reservoir.h>
#include <sys/vmm_dev.h>
#include <sys/vmm_impl.h>

#define	VMMR_TARGET_INACTIVE	SIZE_MAX

static kmutex_t vmmr_lock;

static size_t vmmr_free_sz;
static size_t vmmr_free_transient_sz;
static size_t vmmr_adding_sz;
static size_t vmmr_alloc_sz;
static size_t vmmr_alloc_transient_sz;
static size_t vmmr_empty_sz;

/*
 * Target size of the reservoir during an active vmmr_set_target() operation.
 * It holds the sentinel value of VMMR_TARGET_INACTIVE when no resize is
 * active.
 */
static size_t vmmr_target_sz;

static uintptr_t vmmr_empty_last;
/* Upper limit for the size (free + allocated) of the reservoir */
static size_t vmmr_total_limit;

/* VA range allocated from the VMM arena for the mappings */
static uintptr_t vmmr_va;
static uintptr_t vmmr_va_sz;

static kstat_t *vmmr_kstat;

/* Pair of AVL trees to store set of spans ordered by addr and size */
typedef struct vmmr_treepair {
    avl_tree_t by_addr;
    avl_tree_t by_size;
} vmmr_treepair_t;

/* Spans of free memory in the reservoir */
static vmmr_treepair_t vmmr_free_tp;

/* Spans of empty (not backed by memory) space in the reservoir */
static vmmr_treepair_t vmmr_empty_tp;

/* Regions of memory allocated from the reservoir */
static list_t vmmr_alloc_regions;

struct vmmr_span {
    uintptr_t	vs_addr;
    size_t	vs_size;
    avl_node_t	vs_by_addr;
    avl_node_t	vs_by_size;
    uintptr_t	vs_region_addr;
};
typedef struct vmmr_span vmmr_span_t;

struct vmmr_region {
    size_t	vr_size;
    avl_tree_t	vr_spans;
    list_node_t	vr_node;
    bool	vr_transient;
};

typedef struct vmmr_kstats {
    kstat_named_t	vmrks_bytes_free;
    kstat_named_t	vmrks_bytes_alloc;
    kstat_named_t	vmrks_bytes_transient;
    kstat_named_t	vmrks_bytes_limit;
} vmmr_kstats_t;


static int vmmr_add(size_t, bool);
static int vmmr_remove(size_t, bool);

static int
vmmr_cmp_addr(const void *a, const void *b)
{
    const vmmr_span_t *sa = a;
    const vmmr_span_t *sb = b;

    if (sa->vs_addr == sb->vs_addr) {
        return (0);
    } else if (sa->vs_addr < sb->vs_addr) {
        return (-1);
    } else {
        return (1);
    }
}

static int
vmmr_cmp_size(const void *a, const void *b)
{
    const vmmr_span_t *sa = a;
    const vmmr_span_t *sb = b;

    if (sa->vs_size == sb->vs_size) {
        /*
         * Since discontiguous spans could have the same size in a
         * by-size tree, differentiate them (as required by AVL) by
         * address so they can safely coexist while remaining sorted.
         */
        return (vmmr_cmp_addr(a, b));
    } else if (sa->vs_size < sb->vs_size) {
        return (-1);
    } else {
        return (1);
    }
}

static int
vmmr_cmp_region_addr(const void *a, const void *b)
{
    const vmmr_span_t *sa = a;
    const vmmr_span_t *sb = b;

    if (sa->vs_region_addr == sb->vs_region_addr) {
        return (0);
    } else if (sa->vs_region_addr < sb->vs_region_addr) {
        return (-1);
    } else {
        return (1);
    }
}

static void
vmmr_tp_init(vmmr_treepair_t *tree)
{
    avl_create(&tree->by_addr, vmmr_cmp_addr, sizeof (vmmr_span_t),
        offsetof(vmmr_span_t, vs_by_addr));
    avl_create(&tree->by_size, vmmr_cmp_size, sizeof (vmmr_span_t),
        offsetof(vmmr_span_t, vs_by_size));
}

static void
vmmr_tp_destroy(vmmr_treepair_t *tree)
{
    void *vcp = NULL;
    vmmr_span_t *span;

    while (avl_destroy_nodes(&tree->by_addr, &vcp) != NULL) {
        /* Freeing spans will be done when tearing down by-size tree */
    }

    /* The destruction cookie must be re-initialized for the second tree */
    vcp = NULL;
    while ((span = avl_destroy_nodes(&tree->by_size, &vcp)) != NULL) {
        kmem_free(span, sizeof (*span));
    }
    avl_destroy(&tree->by_addr);
    avl_destroy(&tree->by_size);
}

/*
 * Insert a vmmr_span_t into a treepair, concatenating if possible with adjacent
 * span(s). Such concatenation could result in the `to_add` span being freed,
 * so the caller cannot use it after this returns.
 */
static void
vmmr_tp_insert_concat(vmmr_span_t *to_add, vmmr_treepair_t *tree)
{
    avl_tree_t *by_addr = &tree->by_addr;
    avl_tree_t *by_size = &tree->by_size;
    vmmr_span_t *node;
    avl_index_t where;

    /* This addr should not already exist in the treepair */
    node = avl_find(by_addr, to_add, &where);
    ASSERT3P(node, ==, NULL);

    node = avl_nearest(by_addr, where, AVL_BEFORE);
    if (node != NULL &&
        (node->vs_addr + node->vs_size) == to_add->vs_addr) {
        /* concat with preceding item */
        avl_remove(by_addr, node);
        avl_remove(by_size, node);
        node->vs_size += to_add->vs_size;
        kmem_free(to_add, sizeof (*to_add));

        /*
         * Since this now-concatenated span could be adjacent to one
         * trailing it, fall through to perform that check.
         */
        to_add = node;
    }

    node = avl_nearest(by_addr, where, AVL_AFTER);
    if (node != NULL &&
        (to_add->vs_addr + to_add->vs_size) == node->vs_addr) {
        /* concat with trailing item */
        avl_remove(by_addr, node);
        avl_remove(by_size, node);
        node->vs_addr = to_add->vs_addr;
        node->vs_size += to_add->vs_size;
        avl_add(by_addr, node);
        avl_add(by_size, node);

        kmem_free(to_add, sizeof (*to_add));
        return;
    }

    /* simply insert */
    avl_add(by_addr, to_add);
    avl_add(by_size, to_add);
}
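
/*
 * A brief worked example of the concatenation behavior above, using
 * hypothetical offsets into the reservoir vnode:
 *
 *    // treepair currently holds spans covering [0, 64K) and [128K, 192K)
 *    span->vs_addr = 0x10000;    // 64K
 *    span->vs_size = 0x10000;    // 64K
 *    vmmr_tp_insert_concat(span, &vmmr_free_tp);
 *    // the new span merges with both neighbors, leaving a single [0, 192K)
 *    // span in both trees; the two absorbed vmmr_span_t structs are freed
 */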

/*
 * Remove a vmmr_span_t from a treepair, splitting if necessary when a span of
 * the exact target size is not present, but a larger one is. May return a span
 * with a size smaller than the target if splitting is not an option.
 */
static vmmr_span_t *
vmmr_tp_remove_split(size_t target_sz, vmmr_treepair_t *tree)
{
    avl_tree_t *by_addr = &tree->by_addr;
    avl_tree_t *by_size = &tree->by_size;
    vmmr_span_t *span;
    avl_index_t where;

    ASSERT3U(target_sz, !=, 0);
    ASSERT(!avl_is_empty(by_addr));
    ASSERT(!avl_is_empty(by_size));

    vmmr_span_t search = { .vs_size = target_sz };
    span = avl_find(by_size, &search, &where);
    if (span == NULL) {
        /* Try for a larger span (instead of exact match) */
        span = avl_nearest(by_size, where, AVL_AFTER);
        if (span == NULL) {
            /*
             * Caller will need to collect several smaller spans in
             * order to fulfill their request.
             */
            span = avl_nearest(by_size, where, AVL_BEFORE);
            ASSERT3P(span, !=, NULL);
        }
    }

    if (span->vs_size <= target_sz) {
        avl_remove(by_size, span);
        avl_remove(by_addr, span);

        return (span);
    } else {
        /* Split off adequate chunk from larger span */
        uintptr_t start = span->vs_addr + span->vs_size - target_sz;

        avl_remove(by_size, span);
        span->vs_size -= target_sz;
        avl_add(by_size, span);

        vmmr_span_t *split_span =
            kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
        split_span->vs_addr = start;
        split_span->vs_size = target_sz;

        return (split_span);
    }
}
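
/*
 * For illustration, a caller needing more than any single span can provide
 * will loop, as vmmr_alloc() and vmmr_remove_raw() below do (sizes here are
 * hypothetical):
 *
 *    size_t remain = 0x10000;    // want 64K total
 *    while (remain > 0) {
 *        vmmr_span_t *s = vmmr_tp_remove_split(remain, &vmmr_free_tp);
 *        // s->vs_size may be smaller than `remain` if only smaller spans
 *        // are left in the treepair; account for it and keep going
 *        remain -= s->vs_size;
 *    }
 */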

static int
vmmr_kstat_update(struct kstat *ksp, int rw)
{
    vmmr_kstats_t *vkp = ksp->ks_data;

    mutex_enter(&vmmr_lock);
    vkp->vmrks_bytes_free.value.ui64 = vmmr_free_sz;
    vkp->vmrks_bytes_alloc.value.ui64 = vmmr_alloc_sz;
    /*
     * In addition to the memory which is actually allocated to transient
     * consumers, memory which is considered free-for-transient is also
     * included in the sizing.
     */
    vkp->vmrks_bytes_transient.value.ui64 =
        vmmr_alloc_transient_sz + vmmr_free_transient_sz;
    vkp->vmrks_bytes_limit.value.ui64 = vmmr_total_limit;
    mutex_exit(&vmmr_lock);

    return (0);
}

int
vmmr_init()
{
    mutex_init(&vmmr_lock, NULL, MUTEX_DEFAULT, NULL);

    /*
     * `vmmr_total_limit` represents the absolute maximum size of the VMM
     * memory reservoir. It is meant to provide some measure of protection
     * against an operator pushing the system into unrecoverable memory
     * starvation through explicit or transient additions to the reservoir.
     *
     * There will be many situations where this limit would be inadequate
     * to prevent kernel memory starvation in the face of certain operator
     * actions. It is a balance to be struck between safety and allowing
     * large systems to reach high utilization.
     *
     * The value is based off of pages_pp_maximum: "Number of currently
     * available pages that cannot be 'locked'". It is sized as all of
     * `physmem` less 120% of `pages_pp_maximum`.
     */
    vmmr_total_limit =
        (((physmem * 10) - (pages_pp_maximum * 12)) * PAGESIZE) / 10;

    vmmr_empty_last = 0;
    vmmr_free_sz = 0;
    vmmr_alloc_sz = 0;
    vmmr_empty_sz = 0;
    vmmr_adding_sz = 0;
    vmmr_free_transient_sz = 0;
    vmmr_alloc_transient_sz = 0;
    vmmr_target_sz = VMMR_TARGET_INACTIVE;

    /*
     * Attempt kstat allocation early, since it is the only part of
     * reservoir initialization which is fallible.
     */
    kstat_t *ksp = kstat_create_zone(VMM_MODULE_NAME, 0, "vmm_reservoir",
        VMM_KSTAT_CLASS, KSTAT_TYPE_NAMED,
        sizeof (vmmr_kstats_t) / sizeof (kstat_named_t), 0, GLOBAL_ZONEID);
    if (ksp == NULL) {
        mutex_destroy(&vmmr_lock);
        return (ENOMEM);
    }

    vmmr_kstats_t *vkp = ksp->ks_data;

    kstat_named_init(&vkp->vmrks_bytes_free, "bytes_free",
        KSTAT_DATA_UINT64);
    kstat_named_init(&vkp->vmrks_bytes_alloc, "bytes_alloc",
        KSTAT_DATA_UINT64);
    kstat_named_init(&vkp->vmrks_bytes_transient, "bytes_transient_alloc",
        KSTAT_DATA_UINT64);
    kstat_named_init(&vkp->vmrks_bytes_limit, "bytes_limit",
        KSTAT_DATA_UINT64);
    ksp->ks_private = NULL;
    ksp->ks_update = vmmr_kstat_update;
    vmmr_kstat = ksp;

    vmmr_tp_init(&vmmr_free_tp);
    vmmr_tp_init(&vmmr_empty_tp);

    list_create(&vmmr_alloc_regions, sizeof (vmmr_region_t),
        offsetof(vmmr_region_t, vr_node));

    /* Grab a chunk of VA for the reservoir */
    vmmr_va_sz = physmem * PAGESIZE;
    vmmr_va = (uintptr_t)vmem_alloc(kvmm_arena, vmmr_va_sz, VM_SLEEP);

    kstat_install(vmmr_kstat);

    return (0);
}
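
/*
 * The kstats installed above can be read from userland with libkstat. A
 * minimal sketch (assuming VMM_MODULE_NAME expands to "vmm"; error handling
 * omitted):
 *
 *    kstat_ctl_t *kc = kstat_open();
 *    kstat_t *ksp = kstat_lookup(kc, "vmm", 0, "vmm_reservoir");
 *    (void) kstat_read(kc, ksp, NULL);
 *    kstat_named_t *kn = kstat_data_lookup(ksp, "bytes_free");
 *    // kn->value.ui64 now holds the free reservoir bytes; the other
 *    // entries are "bytes_alloc", "bytes_transient_alloc", and
 *    // "bytes_limit"
 *    (void) kstat_close(kc);
 */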

void
vmmr_fini()
{
    mutex_enter(&vmmr_lock);
    VERIFY3U(vmmr_alloc_sz, ==, 0);
    VERIFY3U(vmmr_free_sz, ==, 0);
    VERIFY3U(vmmr_adding_sz, ==, 0);
    VERIFY3U(vmmr_alloc_transient_sz, ==, 0);
    VERIFY3U(vmmr_free_transient_sz, ==, 0);
    VERIFY(avl_is_empty(&vmmr_free_tp.by_addr));
    VERIFY(avl_is_empty(&vmmr_free_tp.by_size));
    VERIFY(list_is_empty(&vmmr_alloc_regions));

    kstat_delete(vmmr_kstat);
    vmmr_kstat = NULL;

    vmmr_tp_destroy(&vmmr_free_tp);
    vmmr_tp_destroy(&vmmr_empty_tp);
    list_destroy(&vmmr_alloc_regions);

    /* Release reservoir VA chunk */
    vmem_free(kvmm_arena, (void *)vmmr_va, vmmr_va_sz);
    vmmr_va = 0;
    vmmr_va_sz = 0;
    vmmr_total_limit = 0;
    vmmr_empty_last = 0;

    mutex_exit(&vmmr_lock);
    mutex_destroy(&vmmr_lock);
}

bool
vmmr_is_empty()
{
    mutex_enter(&vmmr_lock);
    bool res = (vmmr_alloc_sz == 0 && vmmr_alloc_transient_sz == 0 &&
        vmmr_free_sz == 0 && vmmr_free_transient_sz == 0);
    mutex_exit(&vmmr_lock);
    return (res);
}

int
vmmr_alloc(size_t sz, bool transient, vmmr_region_t **resp)
{
    VERIFY3U(sz & PAGEOFFSET, ==, 0);

    if (!transient) {
        mutex_enter(&vmmr_lock);
        if (sz > vmmr_free_sz) {
            mutex_exit(&vmmr_lock);
            return (ENOSPC);
        }
    } else {
        int err;

        mutex_enter(&vmmr_lock);
        err = vmmr_add(sz, true);
        if (err != 0) {
            mutex_exit(&vmmr_lock);
            return (err);
        }
        VERIFY3U(vmmr_free_transient_sz, >=, sz);
    }

    vmmr_region_t *region;
    region = kmem_zalloc(sizeof (vmmr_region_t), KM_SLEEP);
    avl_create(&region->vr_spans, vmmr_cmp_region_addr,
        sizeof (vmmr_span_t), offsetof(vmmr_span_t, vs_by_addr));
    region->vr_size = sz;

    size_t remain = sz;
    uintptr_t map_at = 0;
    while (remain > 0) {
        vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

        /*
         * We have already ensured that adequate free memory is present
         * in the reservoir for this allocation.
         */
        VERIFY3P(span, !=, NULL);
        ASSERT3U(span->vs_size, <=, remain);

        span->vs_region_addr = map_at;
        avl_add(&region->vr_spans, span);
        map_at += span->vs_size;
        remain -= span->vs_size;
    }

    if (!transient) {
        vmmr_free_sz -= sz;
        vmmr_alloc_sz += sz;
    } else {
        vmmr_free_transient_sz -= sz;
        vmmr_alloc_transient_sz += sz;
        region->vr_transient = true;
    }
    list_insert_tail(&vmmr_alloc_regions, region);
    mutex_exit(&vmmr_lock);

    *resp = region;
    return (0);
}

void *
vmmr_region_mem_at(vmmr_region_t *region, uintptr_t off)
{
    /* just use KPM region for now */
    return (hat_kpm_pfn2va(vmmr_region_pfn_at(region, off)));
}

pfn_t
vmmr_region_pfn_at(vmmr_region_t *region, uintptr_t off)
{
    VERIFY3U(off & PAGEOFFSET, ==, 0);
    VERIFY3U(off, <, region->vr_size);

    vmmr_span_t search = {
        .vs_region_addr = off
    };
    avl_index_t where;
    vmmr_span_t *span = avl_find(&region->vr_spans, &search, &where);

    if (span == NULL) {
        span = avl_nearest(&region->vr_spans, where, AVL_BEFORE);
        ASSERT3P(span, !=, NULL);
    }
    uintptr_t span_off = off - span->vs_region_addr + span->vs_addr;
    page_t *pp = page_find(&kvps[KV_VVP], (u_offset_t)span_off);
    VERIFY(pp != NULL);
    return (pp->p_pagenum);
}

void
vmmr_free(vmmr_region_t *region)
{
    mutex_enter(&vmmr_lock);
    if (!region->vr_transient) {
        VERIFY3U(region->vr_size, <=, vmmr_alloc_sz);
    } else {
        VERIFY3U(region->vr_size, <=, vmmr_alloc_transient_sz);
    }
    list_remove(&vmmr_alloc_regions, region);
    mutex_exit(&vmmr_lock);

    /* Zero the contents (while not monopolizing vmmr_lock) */
    for (uintptr_t off = 0; off < region->vr_size; off += PAGESIZE) {
        bzero(vmmr_region_mem_at(region, off), PAGESIZE);
    }

    mutex_enter(&vmmr_lock);

    /* Put the contained span(s) back in the free pool */
    void *cookie = NULL;
    vmmr_span_t *span;
    while ((span = avl_destroy_nodes(&region->vr_spans, &cookie)) != NULL) {
        span->vs_region_addr = 0;
        vmmr_tp_insert_concat(span, &vmmr_free_tp);
    }
    avl_destroy(&region->vr_spans);
    if (!region->vr_transient) {
        vmmr_free_sz += region->vr_size;
        vmmr_alloc_sz -= region->vr_size;
    } else {
        vmmr_free_transient_sz += region->vr_size;
        vmmr_alloc_transient_sz -= region->vr_size;
    }

    if (region->vr_transient) {
        /*
         * Since the transient capacity was previously allocated for
         * this region, its removal should not fail.
         */
        VERIFY0(vmmr_remove(region->vr_size, true));
    }
    kmem_free(region, sizeof (*region));
    mutex_exit(&vmmr_lock);
}
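
/*
 * Taken together, vmmr_alloc(), vmmr_region_pfn_at()/vmmr_region_mem_at(), and
 * vmmr_free() form the interface consumed by the guest memory object code. A
 * rough, illustrative sketch of a consumer (error handling elided):
 *
 *    vmmr_region_t *region;
 *    if (vmmr_alloc(len, transient, &region) == 0) {
 *        // back a guest-physical page with reservoir memory
 *        pfn_t pfn = vmmr_region_pfn_at(region, off);
 *        // ... map pfn into the guest ...
 *        // when the guest is torn down, return (and re-zero) the pages
 *        vmmr_free(region);
 *    }
 */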

static void
vmmr_destroy_pages(vmmr_span_t *span)
{
    const uintptr_t end = span->vs_addr + span->vs_size;
    struct vnode *vp = &kvps[KV_VVP];
    for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
        page_t *pp;

        /* Page-free logic cribbed from segkmem_xfree(): */
        pp = page_find(vp, (u_offset_t)pos);
        VERIFY(pp != NULL);
        if (!page_tryupgrade(pp)) {
            /*
             * Some other thread has a sharelock. Wait for
             * it to drop the lock so we can free this page.
             */
            page_unlock(pp);
            pp = page_lookup(vp, (u_offset_t)pos, SE_EXCL);
        }

        /*
         * Clear p_lckcnt so page_destroy() doesn't update availrmem.
         * That will be taken care of later via page_unresv().
         */
        pp->p_lckcnt = 0;
        page_destroy(pp, 0);
    }
}

static int
vmmr_alloc_pages(const vmmr_span_t *span)
{
    struct seg kseg = {
        .s_as = &kas
    };
    struct vnode *vp = &kvps[KV_VVP];

    const uintptr_t end = span->vs_addr + span->vs_size;
    for (uintptr_t pos = span->vs_addr; pos < end; pos += PAGESIZE) {
        page_t *pp;

        pp = page_create_va(vp, (u_offset_t)pos, PAGESIZE,
            PG_EXCL | PG_NORELOC, &kseg, (void *)(vmmr_va + pos));

        if (pp == NULL) {
            /* Destroy any already-created pages */
            if (pos != span->vs_addr) {
                vmmr_span_t destroy_span = {
                    .vs_addr = span->vs_addr,
                    .vs_size = pos - span->vs_addr,
                };

                vmmr_destroy_pages(&destroy_span);
            }
            return (ENOMEM);
        }

        /* mimic page state from segkmem */
        ASSERT(PAGE_EXCL(pp));
        page_io_unlock(pp);
        pp->p_lckcnt = 1;
        page_downgrade(pp);

        /* pre-zero the page */
        bzero(hat_kpm_pfn2va(pp->p_pagenum), PAGESIZE);
    }

    return (0);
}

/*
 * Wait callback passed to page_xresv() when reserving pages for the
 * reservoir: returns 0 to abandon the wait if interrupted by a signal,
 * or 1 to try again.
 */
static int
vmmr_resv_wait()
{
    if (delay_sig(hz >> 2) != 0) {
        /* bail due to interruption */
        return (0);
    }
    return (1);
}

static void
vmmr_remove_raw(size_t sz)
{
    VERIFY3U(sz & PAGEOFFSET, ==, 0);
    VERIFY(MUTEX_HELD(&vmmr_lock));

    size_t remain = sz;
    while (remain > 0) {
        vmmr_span_t *span = vmmr_tp_remove_split(remain, &vmmr_free_tp);

        /*
         * The caller must ensure that at least `sz` amount is present
         * in the free treepair.
         */
        VERIFY3P(span, !=, NULL);
        ASSERT3U(span->vs_size, <=, remain);

        /* TODO: perhaps arrange to destroy pages outside the lock? */
        vmmr_destroy_pages(span);

        remain -= span->vs_size;
        vmmr_tp_insert_concat(span, &vmmr_empty_tp);
    }

    vmmr_empty_sz += sz;
}

/*
 * Add memory to vmm reservoir. Memory may be marked for transient use, where
 * the addition is part of a transient allocation from the reservoir. Otherwise
 * it is placed in the reservoir to be available for non-transient allocations.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * will drop it during portions of the addition.
 */
static int
vmmr_add(size_t sz, bool transient)
{
    VERIFY3U(sz & PAGEOFFSET, ==, 0);
    VERIFY3U(sz, >, 0);
    VERIFY(MUTEX_HELD(&vmmr_lock));

    /*
     * Make sure that the amount added is not going to breach the limits
     * we've chosen.
     */
    const size_t current_total =
        vmmr_alloc_sz + vmmr_free_sz + vmmr_adding_sz +
        vmmr_alloc_transient_sz + vmmr_free_transient_sz;
    if ((current_total + sz) < current_total) {
        return (EOVERFLOW);
    }
    if ((current_total + sz) > vmmr_total_limit) {
        return (ENOSPC);
    }
    vmmr_adding_sz += sz;
    mutex_exit(&vmmr_lock);

    /* Wait for enough pages to become available */
    if (page_xresv(sz >> PAGESHIFT, KM_SLEEP, vmmr_resv_wait) == 0) {
        mutex_enter(&vmmr_lock);
        vmmr_adding_sz -= sz;
        return (EINTR);
    }

    mutex_enter(&vmmr_lock);
    size_t added = 0;
    size_t remain = sz;
    while (added < sz) {
        vmmr_span_t *span = NULL;

        if (vmmr_empty_sz > 0) {
            span = vmmr_tp_remove_split(remain, &vmmr_empty_tp);

            vmmr_empty_sz -= span->vs_size;
        } else {
            /*
             * No empty space to fill with new pages, so just tack
             * it on at the end instead.
             */
            span = kmem_zalloc(sizeof (vmmr_span_t), KM_SLEEP);
            span->vs_addr = vmmr_empty_last;
            span->vs_size = remain;
            vmmr_empty_last += remain;
        }
        VERIFY3P(span, !=, NULL);

        /* Allocate the actual pages to back this span */
        mutex_exit(&vmmr_lock);
        int err = vmmr_alloc_pages(span);
        mutex_enter(&vmmr_lock);

        /*
         * If an error is encountered during page allocation for the
         * span, unwind any progress made by the addition request.
         */
        if (err != 0) {
            /*
             * Without pages allocated to this span, it is now
             * tracked as empty.
             */
            vmmr_empty_sz += span->vs_size;
            vmmr_tp_insert_concat(span, &vmmr_empty_tp);

            if (added != 0) {
                vmmr_remove_raw(added);
            }

            vmmr_adding_sz -= sz;

            page_unresv(sz >> PAGESHIFT);
            return (err);
        }

        /*
         * The allocated-page-bearing span is placed in the "free"
         * treepair now, but is not officially exposed for consumption
         * until `vmmr_free_sz` or `vmmr_free_transient_sz` are updated.
         *
         * This allows us to unwind the allocation in case of a failure
         * without the risk of the freshly added span(s) being snapped
         * up by a consumer already.
         */
        added += span->vs_size;
        remain -= span->vs_size;
        vmmr_tp_insert_concat(span, &vmmr_free_tp);
    }

    /* Make the added memory usable by exposing it to the size accounting */
    if (!transient) {
        vmmr_free_sz += added;
    } else {
        vmmr_free_transient_sz += added;
    }
    ASSERT3U(added, ==, sz);
    vmmr_adding_sz -= added;

    return (0);
}

/*
 * Remove memory from vmm reservoir. Normally this will remove memory from the
 * reservoir which was available for non-transient allocations. If the removal
 * is part of a vmmr_free() of a transient allocation, it will act on only that
 * transient region being freed, not the available memory in the reservoir.
 *
 * Expects vmmr_lock to be held when called, and will return with it held, but
 * may drop it during portions of the removal.
 */
static int
vmmr_remove(size_t sz, bool transient)
{
    VERIFY3U(sz & PAGEOFFSET, ==, 0);
    VERIFY(sz);
    VERIFY(MUTEX_HELD(&vmmr_lock));

    if ((!transient && sz > vmmr_free_sz) ||
        (transient && sz > vmmr_free_transient_sz)) {
        return (ENOSPC);
    }

    vmmr_remove_raw(sz);

    if (!transient) {
        vmmr_free_sz -= sz;
    } else {
        vmmr_free_transient_sz -= sz;
    }
    page_unresv(sz >> PAGESHIFT);
    return (0);
}

static int
vmmr_set_target(size_t target_sz, size_t chunk_sz, size_t *resp)
{
    VERIFY(resp != NULL);

    mutex_enter(&vmmr_lock);

    size_t current_sz = vmmr_alloc_sz + vmmr_free_sz;

    /* Be sure to communicate current size in case of an early bail-out */
    *resp = current_sz;

    if ((target_sz & PAGEOFFSET) != 0 ||
        (chunk_sz & PAGEOFFSET) != 0) {
        mutex_exit(&vmmr_lock);
        return (EINVAL);
    }
    /* Reject sentinel value */
    if (target_sz == VMMR_TARGET_INACTIVE) {
        mutex_exit(&vmmr_lock);
        return (EINVAL);
    }

    /* Already at target size */
    if (target_sz == current_sz) {
        mutex_exit(&vmmr_lock);
        return (0);
    }

    /* Reject racing requests */
    if (vmmr_target_sz != VMMR_TARGET_INACTIVE) {
        mutex_exit(&vmmr_lock);
        return (EALREADY);
    }
    /* Record the target now to exclude a racing request */
    vmmr_target_sz = target_sz;

    int err = 0;
    do {
        /* Be sensitive to signal interruption */
        if (issig(JUSTLOOKING) != 0) {
            mutex_exit(&vmmr_lock);
            const bool sig_bail = issig(FORREAL) != 0;
            mutex_enter(&vmmr_lock);
            if (sig_bail) {
                err = EINTR;
                break;
            }
        }

        if (current_sz > target_sz) {
            /* Shrinking reservoir */

            size_t req_sz = current_sz - target_sz;
            if (chunk_sz != 0) {
                req_sz = MIN(req_sz, chunk_sz);
            }
            err = vmmr_remove(req_sz, false);
        } else {
            /* Growing reservoir */
            ASSERT(current_sz < target_sz);

            size_t req_sz = target_sz - current_sz;
            if (chunk_sz != 0) {
                req_sz = MIN(req_sz, chunk_sz);
            }
            err = vmmr_add(req_sz, false);
        }

        current_sz = vmmr_alloc_sz + vmmr_free_sz;
    } while (err == 0 && current_sz != target_sz);

    /* Clear the target now that we are done (success or not) */
    vmmr_target_sz = VMMR_TARGET_INACTIVE;
    mutex_exit(&vmmr_lock);
    *resp = current_sz;
    return (err);
}
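
/*
 * Worked example of the resize loop above (sizes are hypothetical): a request
 * to grow the reservoir from 0 to 8 GiB with chunk_sz set to 512 MiB proceeds
 * in sixteen 512 MiB additions, re-checking for pending signals between each
 * step. If a signal arrives partway through, the call bails with EINTR and
 * reports the size reached so far via *resp, so the operator can observe the
 * partial progress and re-issue the request to continue.
 */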

int
vmmr_ioctl(int cmd, intptr_t arg, int md, cred_t *cr, int *rvalp)
{
    /*
     * Since an LP64 datamodel is enforced by our caller (vmm_ioctl()), we
     * do not need to duplicate such checks here.
     */

    switch (cmd) {
    case VMM_RESV_QUERY: {
        struct vmm_resv_query res;
        void *datap = (void *)(uintptr_t)arg;

        /* For now, anyone in GZ can query */
        if (crgetzoneid(cr) != GLOBAL_ZONEID) {
            return (EPERM);
        }
        mutex_enter(&vmmr_lock);
        res.vrq_free_sz = vmmr_free_sz;
        res.vrq_alloc_sz = vmmr_alloc_sz;
        res.vrq_alloc_transient_sz = vmmr_alloc_transient_sz;
        res.vrq_limit = vmmr_total_limit;
        mutex_exit(&vmmr_lock);
        if (ddi_copyout(&res, datap, sizeof (res), md) != 0) {
            return (EFAULT);
        }
        break;
    }
    case VMM_RESV_SET_TARGET: {
        if (secpolicy_sys_config(cr, B_FALSE) != 0) {
            return (EPERM);
        }

        struct vmm_resv_target tgt;
        void *datap = (void *)(uintptr_t)arg;

        if (ddi_copyin(datap, &tgt, sizeof (tgt), md) != 0) {
            return (EFAULT);
        }

        int err = vmmr_set_target(tgt.vrt_target_sz, tgt.vrt_chunk_sz,
            &tgt.vrt_result_sz);

        /*
         * Attempt to communicate the resultant size of the reservoir
         * if setting it to the target was a success, or if we were
         * interrupted (by a signal) while doing so.
         */
        if (err == 0 || err == EINTR) {
            if (ddi_copyout(&tgt, datap, sizeof (tgt), md) != 0) {
                err = EFAULT;
            }
        }

        return (err);
    }
    default:
        return (ENOTTY);
    }
    return (0);
}