/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/vmm_vm.h>
#include <sys/seg_vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_gpt.h>
#include "vmm_util.h"


/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses.  In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory.  The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures.  Keeping
 * source differences to a minimum was a priority, so illumos-bhyve implemented
 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
 * boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use.  As source differences with
 * the upstream kernel code became less of a concern, and upcoming features
 * (such as live migration) would demand more of those VM interfaces, it became
 * clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what it
 * replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a
 *   vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing
 *   access to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers.  Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address).  Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are performed
 * through a `vm_client_t` handle.  That VM client abstraction is meant to
 * reduce contention and overhead for frequent access operations and provide
 * debugging insight into how different subcomponents are accessing the
 * vmspace.
 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
 * interface) and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions)
 * requires other accessors be excluded while the change is underway to prevent
 * them from observing invalid intermediate states.  A simple approach could
 * use a mutex or rwlock to achieve this, but that risks contention when the
 * rate of access to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * at a per-vm_client_t basis.  While this raises the cost for vmspace changes,
 * it means that the much more common page accesses through the vm_client can
 * normally proceed unimpeded and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace in
 * a 'hold' state, iterating over all associated vm_client instances, waiting
 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
 * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
 * will block until the hold condition is cleared.  Once the hold is asserted
 * for all clients, the vmspace change can proceed with confidence.  Upon
 * completion of that operation, VCS_HOLD is cleared from the clients, and they
 * are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: they solely rely on the contents of
 * the page tables.  Furthermore, the existing VCS_HOLD mechanism used to
 * exclude client access is not feasible when entering guest context, since
 * interrupts are disabled, making it impossible to block entry.  This is not a
 * concern as long as vmspace modifications never place the page tables in
 * invalid states (either intermediate, or final).  The vm_client hold
 * mechanism does provide the means to IPI vCPU consumers which will trigger a
 * notification once they report their exit from guest context.  This can be
 * used to ensure that page table modifications are made visible to those vCPUs
 * within a certain time frame.
 */
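
/*
 * As a rough illustration of the data path described above (a sketch only,
 * with error handling trimmed), a consumer which needs to touch guest memory
 * would hold a page through its vm_client_t and release it when done.
 * `my_gpa` below is a hypothetical, page-aligned guest-physical address
 * supplied by the caller:
 *
 *	vm_client_t *vmc = vmspace_client_alloc(vms);
 *	vm_page_t *vmp = vmc_hold(vmc, my_gpa, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		void *datap = vmp_get_writable(vmp);
 *		(... perform the emulated access through datap ...)
 *		(void) vmp_release(vmp);
 *	}
 *	vmc_destroy(vmc);
 *
 * While such a hold is outstanding, an unmap of the covered region transfers
 * the page to a direct vm_object reference (see vmc_space_unmap()), so the
 * backing memory stays valid until the hold is released.
 */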

typedef struct vmspace_mapping {
	list_node_t vmsm_node;
	vm_object_t *vmsm_object;	/* object backing this mapping */
	uintptr_t vmsm_addr;		/* start addr in vmspace for mapping */
	size_t vmsm_len;		/* length (in bytes) of mapping */
	off_t vmsm_offset;		/* byte offset into object */
	uint_t vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr)	( \
	(vmsm)->vmsm_offset + \
	((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t	vms_lock;
	kcondvar_t	vms_cv;
	bool		vms_held;
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;
	uint64_t	vms_pages_mapped;
	bool		vms_track_dirty;

	list_t		vms_maplist;
	list_t		vms_clients;
};

struct vm_client {
	vmspace_t	*vmc_space;
	list_node_t	vmc_node;

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;
	vm_client_state_t vmc_state;
	int		vmc_cpu_active;
	uint64_t	vmc_cpu_gen;
	bool		vmc_track_dirty;
	vmc_inval_cb_t	vmc_inval_func;
	void		*vmc_inval_data;

	list_t		vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;
	void		*vmo_data;
	uint8_t		vmo_attr;
};

/* Convenience consolidation of all flag(s) for validity checking */
#define	VPF_ALL		(VPF_DEFER_DIRTY)

struct vm_page {
	vm_client_t	*vmp_client;
	list_node_t	vmp_node;
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;
	pfn_t		vmp_pfn;
	uint64_t	*vmp_ptep;
	vm_object_t	*vmp_obj_ref;
	uint8_t		vmp_prot;
	uint8_t		vmp_flags;
};

static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
    uint64_t *);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);

bool
vmm_vm_init(void)
{
	if (vmm_is_intel()) {
		extern struct vmm_pte_impl ept_pte_impl;
		return (vmm_gpt_init(&ept_pte_impl));
	} else if (vmm_is_svm()) {
		extern struct vmm_pte_impl rvi_pte_impl;
		return (vmm_gpt_init(&rvi_pte_impl));
	} else {
		/* Caller should have already rejected other vendors */
		panic("Unexpected hypervisor hardware vendor");
	}
}

void
vmm_vm_fini(void)
{
	vmm_gpt_fini();
}

/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc();
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = false;

	return (vms);
}

/*
 * Destroy a vmspace.  All regions in the space must be unmapped.  Any
 * remaining clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}

/*
 * Perform an operation on the status (accessed/dirty) bits held in the page
 * tables of this vmspace.
 *
 * Such manipulations race against both hardware writes (from running vCPUs)
 * and emulated accesses reflected from userspace.  Safe functionality depends
 * on the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
 * operations from changing the page tables during the walk.
 */
void
vmspace_bits_operate(vmspace_t *vms, const uint64_t gpa, size_t len,
    vmspace_bit_oper_t oper, uint8_t *bitmap)
{
	const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
	const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
	const vmspace_bit_oper_t oper_only =
	    oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
	vmm_gpt_t *gpt = vms->vms_gpt;

	/*
	 * The bitmap cannot be NULL if the requested operation involves
	 * reading or writing from it.
	 */
	ASSERT(bitmap != NULL || (!bit_input && !bit_output));

	vmm_gpt_iter_t iter;
	vmm_gpt_iter_entry_t entry;
	vmm_gpt_iter_init(&iter, gpt, gpa, len);

	while (vmm_gpt_iter_next(&iter, &entry)) {
		const size_t offset = (entry.vgie_gpa - gpa);
		const uint64_t pfn_offset = offset >> PAGESHIFT;
		const size_t bit_offset = pfn_offset / 8;
		const uint8_t bit_mask = 1 << (pfn_offset % 8);

		if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
			continue;
		}

		bool value = false;
		uint64_t *ptep = entry.vgie_ptep;
		if (ptep == NULL) {
			if (bit_output) {
				bitmap[bit_offset] &= ~bit_mask;
			}
			continue;
		}

		switch (oper_only) {
		case VBO_GET_DIRTY:
			value = vmm_gpte_query_dirty(ptep);
			break;
		case VBO_SET_DIRTY: {
			uint_t prot = 0;
			bool present_writable = false;
			pfn_t pfn;

			/*
			 * To avoid blindly setting the dirty bit on otherwise
			 * empty PTEs, we must first check if the entry for the
			 * address in question has been populated.
			 *
			 * Only if the page is marked both Present and Writable
			 * will we permit the dirty bit to be set.
			 */
			if (!vmm_gpte_is_mapped(ptep, &pfn, &prot)) {
				int err = vmspace_ensure_mapped(vms,
				    entry.vgie_gpa, PROT_WRITE, &pfn, ptep);
				if (err == 0) {
					present_writable = true;
				}
			} else if ((prot & PROT_WRITE) != 0) {
				present_writable = true;
			}

			if (present_writable) {
				value = !vmm_gpte_reset_dirty(ptep, true);
			}
			break;
		}
		case VBO_RESET_DIRTY:
			/*
			 * Although at first glance, it may seem like the act
			 * of resetting the dirty bit may require the same care
			 * as setting it, the constraints make for a simpler
			 * task.
			 *
			 * Any PTEs with the dirty bit set will have already
			 * been properly populated.
			 */
			value = vmm_gpte_reset_dirty(ptep, false);
			break;
		default:
			panic("unrecognized operator: %d", oper_only);
			break;
		}
		if (bit_output) {
			if (value) {
				bitmap[bit_offset] |= bit_mask;
			} else {
				bitmap[bit_offset] &= ~bit_mask;
			}
		}
	}

	/*
	 * Invalidate the address range potentially affected by the changes to
	 * page table bits, issuing shoot-downs for those who might have it in
	 * cache.
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	vmspace_clients_invalidate(vms, gpa, len);
	vmspace_hold_exit(vms, true);
}
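
/*
 * For illustration (a sketch only, not taken from a real consumer): a caller
 * harvesting dirty state for a region sizes its bitmap at one bit per page of
 * the region and tests bits using the same byte/bit layout as above.
 * `track_gpa` and `track_len` are hypothetical, page-aligned values chosen by
 * the caller, and the VM instance must be read-locked per the comment above:
 *
 *	size_t npages = track_len >> PAGESHIFT;
 *	uint8_t *bitmap = kmem_zalloc(howmany(npages, 8), KM_SLEEP);
 *
 *	vmspace_bits_operate(vms, track_gpa, track_len,
 *	    VBO_GET_DIRTY | VBO_FLAG_BITMAP_OUT, bitmap);
 *
 *	for (uint64_t i = 0; i < npages; i++) {
 *		if ((bitmap[i / 8] & (1 << (i % 8))) != 0) {
 *			(page at track_gpa + (i << PAGESHIFT) is dirty)
 *		}
 *	}
 */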

/*
 * Is dirty-page-tracking enabled for the vmspace?
 */
bool
vmspace_get_tracking(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	const bool val = vms->vms_track_dirty;
	mutex_exit(&vms->vms_lock);
	return (val);
}

/*
 * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
 */
int
vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
{
	if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
		/* Do not allow this to be set if it is not supported */
		return (ENOTSUP);
	}

	vmspace_hold_enter(vms);
	if (vms->vms_track_dirty == enable_dirty_tracking) {
		/* No further effort required if state already matches */
		vmspace_hold_exit(vms, false);
		return (0);
	}

	vms->vms_track_dirty = enable_dirty_tracking;

	/* Configure all existing clients for new tracking behavior */
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		mutex_enter(&vmc->vmc_lock);
		vmc->vmc_track_dirty = enable_dirty_tracking;
		mutex_exit(&vmc->vmc_lock);
	}

	/*
	 * Notify all clients of what is considered an invalidation of the
	 * entire vmspace.
	 */
	vms->vms_pt_gen++;
	vmspace_clients_invalidate(vms, 0, vms->vms_size);

	vmspace_hold_exit(vms, true);
	return (0);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase refcount for vm_object reference
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within [addr, addr + size) span in the
 * vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace.  This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or any other actions which would require accessing vmspace state subject to
 * potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}

/*
 * Exit a hold state on the vmspace.  This releases all VM clients associated
 * with the vmspace to be able to establish new page holds, and partake in
 * other actions which require accessing changed vmspace state.  If
 * `kick_on_cpu` is true, then any CPUs actively using the page tables will be
 * IPIed, and the call will block until they have acknowledged being ready to
 * use the latest state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

static void
vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
    size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
{
	const uintptr_t end = addr + len;
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT3U(addr, <, end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, addr, len);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		vmspace_clients_invalidate(vms, addr, len);
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

/*
 * For a given GPA in the vmspace, ensure that the backing page (if any) is
 * properly mapped as present in the provided PTE.
 */
static int
vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t *leaf_pte)
{
	vmspace_mapping_t *vmsm;
	vm_object_t *vmo;
	pfn_t pfn;

	ASSERT(pfnp != NULL);
	ASSERT(leaf_pte != NULL);

	vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
	if (vmsm == NULL) {
		return (FC_NOMAP);
	}
	if ((req_prot & vmsm->vmsm_prot) != req_prot) {
		return (FC_PROT);
	}

	vmo = vmsm->vmsm_object;
	pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
	VERIFY(pfn != PFN_INVALID);

	if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
	    vmo->vmo_attr)) {
		atomic_inc_64(&vms->vms_pages_mapped);
	}

	*pfnp = pfn;
	return (0);
}

/*
 * Look up the PTE for a given GPA in the vmspace, populating it with
 * appropriate contents (pfn, protection, etc) if it is empty, but backed by a
 * valid mapping.
 */
static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpte_is_mapped(leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
		if (err != 0) {
			return (err);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	vmspace_mapping_t *vmsm;
	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	vmm_gpt_t *gpt = vms->vms_gpt;
	size_t populated = 0;

	vmm_gpt_iter_t iter;
	vmm_gpt_iter_entry_t entry;
	vmm_gpt_iter_init(&iter, gpt, addr, len);
	while (vmm_gpt_iter_next(&iter, &entry)) {
		const pfn_t pfn =
		    vm_object_pfn(vmo, VMSM_OFFSET(vmsm, entry.vgie_gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, entry.vgie_ptep, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active.  This will block if/while the client is held by
 * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It
 * will fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active.  It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this
 * VM client.  Interrupts (and/or the GIF) are expected to be disabled when
 * calling this function.  Returns the generation number of the nested page
 * table (to be used for TLB invalidations).
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}
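
/*
 * To illustrate the intended protocol (a sketch only, not an excerpt from a
 * real vCPU loop): a vCPU bracket around guest entry would look roughly like
 * the following, where `invalidate_npt_asid()` stands in for whatever
 * VMX/SVM-specific TLB invalidation the caller performs:
 *
 *	uint64_t gen = vmc_table_enter(vmc);	(interrupts disabled)
 *	if (gen != last_gen_seen) {
 *		invalidate_npt_asid();		(hypothetical helper)
 *		last_gen_seen = gen;
 *	}
 *	(... VM entry, guest execution, VM exit ...)
 *	(interrupts re-enabled)
 *	vmc_table_exit(vmc);
 *
 * Between enter and exit the client is marked VCS_ON_CPU, which is what allows
 * vmc_space_release() and vmc_space_invalidate() to poke the CPU and wait for
 * it to leave guest context.
 */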

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer.  Since the
		 * client is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not
	 * expect a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned.  Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down.  All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should
		 * have direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
	ASSERT0(prot & ~PROT_ALL);
	ASSERT0(flags & ~VPF_ALL);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = (uint8_t)prot;
	vmp->vmp_flags = (uint8_t)flags;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one.  Only the
 * association with the vmspace is cloned, not existing holds or any configured
 * invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}
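
/*
 * Illustrative sketch of callback registration (not an excerpt from a real
 * consumer): the callback signature is inferred from how vmc_space_invalidate()
 * invokes it, i.e. (data, addr, size).  A consumer which caches translations
 * might register something like the hypothetical `my_inval()` below so its
 * cache is flushed whenever part of the vmspace is invalidated:
 *
 *	static void
 *	my_inval(void *data, uintptr_t addr, size_t size)
 *	{
 *		(drop any cached translations covering [addr, addr + size))
 *	}
 *
 *	(void) vmc_set_inval_cb(vmc, my_inval, my_state);
 *
 * The callback is issued with the client under VCS_HOLD but without vmc_lock
 * held, so it may take its own locks.
 */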

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done
		 * carefully: The vmspace could attempt to orphan this
		 * vm_client while we release vmc_lock in order to take
		 * vms_lock (the required order).  The client is marked to
		 * indicate that destruction is under way.  Doing so prevents
		 * any racing orphan operation from applying to this client,
		 * allowing us to deassociate from the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}

static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * If this page was deferring dirty-marking in the corresponding vmspace page
 * tables, clear such a state so it is considered dirty from now on.
 */
void
vmp_mark_dirty(vm_page_t *vmp)
{
	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);

	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the pointer from the page-chaining in `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}
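
/*
 * A rough illustration of the chaining helpers (a sketch only, with error
 * handling trimmed): an access spanning two guest pages could be handled by
 * holding both pages, chaining the second onto the first, and later releasing
 * the whole chain at once.  `gpa` is assumed to be page-aligned:
 *
 *	vm_page_t *first = vmc_hold(vmc, gpa, PROT_READ);
 *	vm_page_t *second = vmc_hold(vmc, gpa + PAGESIZE, PROT_READ);
 *	if (first != NULL && second != NULL) {
 *		vmp_chain(first, second);
 *		(... access vmp_get_readable(first) and
 *		vmp_get_readable(second) ...)
 *		(void) vmp_release_chain(first);
 *	}
 *
 * vmp_release_chain() walks the vmp_chain links, so all pages in the chain
 * must have been held through the same vm_client_t.
 */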

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		/*
		 * Track appropriate (accessed/dirty) bits for the
		 * guest-virtual address corresponding to this page, if it is
		 * from the vmspace rather than a direct reference to an
		 * underlying object.
		 *
		 * The protection and/or configured flags may obviate the need
		 * for such an update.
		 */
		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
		    vmc->vmc_track_dirty) {
			(void) vmm_gpte_reset_dirty(vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release held page.  Returns true if page resided on region which was
 * subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting
 * page-chaining pointer).  Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);

	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}


int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}