/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/vmm_vm.h>
#include <sys/seg_vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_gpt.h>


/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses. In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory. The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures. Keeping
 * source differences to a minimum was a priority, so illumos-bhyve implemented
 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
 * boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use. As source differences with
 * the upstream kernel code became less of a concern, and upcoming features
 * (such as live migration) would demand more of those VM interfaces, it became
 * clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what it
 * replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing access
 *   to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers. Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address). Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are performed
 * through a `vm_client_t` handle. That VM client abstraction is meant to
 * reduce contention and overhead for frequent access operations and provide
 * debugging insight into how different subcomponents are accessing the vmspace.
 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
 * interface), and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions) requires
 * other accessors be excluded while the change is underway to prevent them from
 * observing invalid intermediate states. A simple approach could use a mutex
 * or rwlock to achieve this, but that risks contention when the rate of access
 * to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * on a per-vm_client_t basis. While this raises the cost for vmspace changes,
 * it means that the much more common page accesses through the vm_client can
 * normally proceed unimpeded and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace in
 * a 'hold' state, iterating over all associated vm_client instances, waiting
 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
 * will block until the hold condition is cleared. Once the hold is asserted
 * for all clients, the vmspace change can proceed with confidence. Upon
 * completion of that operation, VCS_HOLD is cleared from the clients, and they
 * are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: they solely rely on the contents of the
 * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude
 * client access is not feasible when entering guest context, since interrupts
 * are disabled, making it impossible to block entry. This is not a concern as
 * long as vmspace modifications never place the page tables in invalid states
 * (either intermediate, or final). The vm_client hold mechanism does provide
 * the means to IPI vCPU consumers which will trigger a notification once they
 * report their exit from guest context. This can be used to ensure that page
 * table modifications are made visible to those vCPUs within a certain
 * time frame.
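 *
 * As an illustrative (not prescriptive) sketch of the data path described
 * above, an emulation consumer holding and writing a single guest page might
 * look roughly like this, assuming it already has a vm_client_t:
 *
 *	vm_page_t *vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_WRITE);
 *	if (vmp != NULL) {
 *		uint8_t *datap = vmp_get_writable(vmp);
 *		datap[gpa & PAGEOFFSET] = value;
 *		(void) vmp_release(vmp);
 *	}
 *
 * The administrative path (vmspace_map()/vmspace_unmap()) is expected to be
 * comparatively rare, and pays the cost of holding every client.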
 */

typedef struct vmspace_mapping {
	list_node_t vmsm_node;
	vm_object_t *vmsm_object; /* object backing this mapping */
	uintptr_t vmsm_addr; /* start addr in vmspace for mapping */
	size_t vmsm_len; /* length (in bytes) of mapping */
	off_t vmsm_offset; /* byte offset into object */
	uint_t vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr) ( \
	(vmsm)->vmsm_offset + \
	((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE = 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE = (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD = (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU = (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED = (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY = (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t vms_lock;
	kcondvar_t vms_cv;
	bool vms_held;
	uintptr_t vms_size; /* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t *vms_gpt;
	uint64_t vms_pt_gen;
	uint64_t vms_pages_mapped;
	bool vms_track_dirty;

	list_t vms_maplist;
	list_t vms_clients;
};

struct vm_client {
	vmspace_t *vmc_space;
	list_node_t vmc_node;

	kmutex_t vmc_lock;
	kcondvar_t vmc_cv;
	vm_client_state_t vmc_state;
	int vmc_cpu_active;
	uint64_t vmc_cpu_gen;
	bool vmc_track_dirty;
	vmc_inval_cb_t vmc_inval_func;
	void *vmc_inval_data;

	list_t vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t vmo_refcnt; /* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t vmo_size;
	void *vmo_data;
	uint8_t vmo_attr;
};

/* Convenience consolidation of all flag(s) for validity checking */
#define	VPF_ALL (VPF_DEFER_DIRTY)

struct vm_page {
	vm_client_t *vmp_client;
	list_node_t vmp_node;
	vm_page_t *vmp_chain;
	uintptr_t vmp_gpa;
	pfn_t vmp_pfn;
	uint64_t *vmp_ptep;
	vm_object_t *vmp_obj_ref;
	uint8_t vmp_prot;
	uint8_t vmp_flags;
};

static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
    uint64_t *);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);


/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = track_dirty;

	return (vms);
}

/*
 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
 * clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}

/*
 * Perform an operation on the status (accessed/dirty) bits held in the page
 * tables of this vmspace.
 *
 * Such manipulations race against both hardware writes (from running vCPUs) and
 * emulated accesses reflected from userspace. Safe functionality depends on
 * the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
 * operations from changing the page tables during the walk.
 */
void
vmspace_bits_operate(vmspace_t *vms, const uint64_t gpa, size_t len,
    vmspace_bit_oper_t oper, uint8_t *bitmap)
{
	const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
	const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
	const vmspace_bit_oper_t oper_only =
	    oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
	vmm_gpt_t *gpt = vms->vms_gpt;

	/*
	 * The bitmap cannot be NULL if the requested operation involves
	 * reading from or writing to it.
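	 *
	 * Each bitmap bit corresponds to one page-sized step through the
	 * requested range. For example, the page which is 21 pages past `gpa`
	 * (pfn_offset == 21) is represented by bit 5 (21 % 8) of bitmap byte 2
	 * (21 / 8), matching the indexing math in the loop below.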
	 */
	ASSERT(bitmap != NULL || (!bit_input && !bit_output));

	vmm_gpt_iter_t iter;
	vmm_gpt_iter_entry_t entry;
	vmm_gpt_iter_init(&iter, gpt, gpa, len);

	while (vmm_gpt_iter_next(&iter, &entry)) {
		const size_t offset = (entry.vgie_gpa - gpa);
		const uint64_t pfn_offset = offset >> PAGESHIFT;
		const size_t bit_offset = pfn_offset / 8;
		const uint8_t bit_mask = 1 << (pfn_offset % 8);

		if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
			continue;
		}

		bool value = false;
		uint64_t *ptep = entry.vgie_ptep;
		if (ptep == NULL) {
			if (bit_output) {
				bitmap[bit_offset] &= ~bit_mask;
			}
			continue;
		}

		switch (oper_only) {
		case VBO_GET_DIRTY:
			value = vmm_gpt_query(gpt, ptep, VGQ_DIRTY);
			break;
		case VBO_SET_DIRTY: {
			uint_t prot = 0;
			bool present_writable = false;
			pfn_t pfn;

			/*
			 * To avoid blindly setting the dirty bit on otherwise
			 * empty PTEs, we must first check if the entry for the
			 * address in question has been populated.
			 *
			 * Only if the page is marked both Present and Writable
			 * will we permit the dirty bit to be set.
			 */
			if (!vmm_gpt_is_mapped(gpt, ptep, &pfn, &prot)) {
				int err = vmspace_ensure_mapped(vms,
				    entry.vgie_gpa, PROT_WRITE, &pfn, ptep);
				if (err == 0) {
					present_writable = true;
				}
			} else if ((prot & PROT_WRITE) != 0) {
				present_writable = true;
			}

			if (present_writable) {
				value = !vmm_gpt_reset_dirty(gpt, ptep, true);
			}
			break;
		}
		case VBO_RESET_DIRTY:
			/*
			 * Although at first glance it may seem that resetting
			 * the dirty bit requires the same care as setting it,
			 * the constraints make for a simpler task.
			 *
			 * Any PTEs with the dirty bit set will have already
			 * been properly populated.
			 */
			value = vmm_gpt_reset_dirty(gpt, ptep, false);
			break;
		default:
			panic("unrecognized operator: %d", oper_only);
			break;
		}
		if (bit_output) {
			if (value) {
				bitmap[bit_offset] |= bit_mask;
			} else {
				bitmap[bit_offset] &= ~bit_mask;
			}
		}
	}

	/*
	 * Invalidate the address range potentially affected by the changes to
	 * page table bits, issuing shoot-downs for those who might have it in
	 * cache.
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	vmspace_clients_invalidate(vms, gpa, len);
	vmspace_hold_exit(vms, true);
}

/*
 * Is dirty-page-tracking enabled for the vmspace?
 */
bool
vmspace_get_tracking(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	const bool val = vms->vms_track_dirty;
	mutex_exit(&vms->vms_lock);
	return (val);
}

/*
 * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
 */
int
vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
{
	if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
		/* Do not allow this to be set if it is not supported */
		return (ENOTSUP);
	}

	vmspace_hold_enter(vms);
	if (vms->vms_track_dirty == enable_dirty_tracking) {
		/* No further effort required if state already matches */
		vmspace_hold_exit(vms, false);
		return (0);
	}

	vms->vms_track_dirty = enable_dirty_tracking;

	/* Configure all existing clients for new tracking behavior */
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		mutex_enter(&vmc->vmc_lock);
		vmc->vmc_track_dirty = enable_dirty_tracking;
		mutex_exit(&vmc->vmc_lock);
	}

	/*
	 * Notify all clients of what is considered an invalidation of the
	 * entire vmspace.
	 */
	vms->vms_pt_gen++;
	vmspace_clients_invalidate(vms, 0, vms->vms_size);

	vmspace_hold_exit(vms, true);
	return (0);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase refcount for vm_object reference
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within the [addr, addr + size) span in
 * the vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
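		 * For example, with the inclusive end values computed above,
		 * [0x1000, 0x3000) and [0x5000, 0x6000) are disjoint because
		 * 0x5000 is greater than 0x2fff.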
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace. This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or performing any other actions which would require accessing vmspace state
 * subject to potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}

/*
 * Exit a hold state on the vmspace. This releases all VM clients associated
 * with the vmspace, allowing them to establish new page holds and partake in
 * other actions which require accessing the changed vmspace state. If
 * `kick_on_cpu` is true, then any CPUs actively using the page tables will be
 * IPIed, and the call will block until they have acknowledged being ready to
 * use the latest state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

static void
vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
    size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
{
	const uintptr_t end = addr + len;
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT3U(addr, <, end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, addr, len);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		vmspace_clients_invalidate(vms, addr, len);
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

/*
 * For a given GPA in the vmspace, ensure that the backing page (if any) is
 * properly mapped as present in the provided PTE.
 */
static int
vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t *leaf_pte)
{
	vmspace_mapping_t *vmsm;
	vm_object_t *vmo;
	pfn_t pfn;

	ASSERT(pfnp != NULL);
	ASSERT(leaf_pte != NULL);

	vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
	if (vmsm == NULL) {
		return (FC_NOMAP);
	}
	if ((req_prot & vmsm->vmsm_prot) != req_prot) {
		return (FC_PROT);
	}

	vmo = vmsm->vmsm_object;
	pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
	VERIFY(pfn != PFN_INVALID);

	if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
	    vmo->vmo_attr)) {
		atomic_inc_64(&vms->vms_pages_mapped);
	}

	*pfnp = pfn;
	return (0);
}

/*
 * Look up the PTE for a given GPA in the vmspace, populating it with
 * appropriate contents (pfn, protection, etc) if it is empty, but backed by a
 * valid mapping.
 */
static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
		if (err != 0) {
			return (err);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	vmspace_mapping_t *vmsm;
	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	vmm_gpt_t *gpt = vms->vms_gpt;
	size_t populated = 0;

	vmm_gpt_iter_t iter;
	vmm_gpt_iter_entry_t entry;
	vmm_gpt_iter_init(&iter, gpt, addr, len);
	while (vmm_gpt_iter_next(&iter, &entry)) {
		const pfn_t pfn =
		    vm_object_pfn(vmo, VMSM_OFFSET(vmsm, entry.vgie_gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, entry.vgie_ptep, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active. This will block if/while the client is held by
 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
 * fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active. It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this VM
 * client. Interrupts (and/or the GIF) are expected to be disabled when calling
 * this function. Returns the generation number of the nested page table (to be
 * used for TLB invalidations).
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock as would normally be done.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
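 *
 * As a rough sketch (not a literal excerpt of the VMX/SVM run loops), a vCPU
 * consumer is expected to bracket guest execution like so:
 *
 *	<disable interrupts>
 *	gen = vmc_table_enter(vmc);
 *	<flush TLB/EPT state if `gen` differs from the last-seen generation>
 *	<enter and later exit guest context, re-enable interrupts>
 *	vmc_table_exit(vmc);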
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer. Since the client
		 * is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not expect
	 * a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned. Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down. All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should have
		 * direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
	ASSERT0(prot & ~PROT_ALL);
	ASSERT0(flags & ~VPF_ALL);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = (uint8_t)prot;
	vmp->vmp_flags = (uint8_t)flags;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one. Only the
 * association with the vmspace is cloned, not existing holds or any
 * configured invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (that is, a
 * call to vmc_table_exit() must have been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done carefully:
		 * The vmspace could attempt to orphan this vm_client while we
		 * release vmc_lock in order to take vms_lock (the required
		 * order). The client is marked to indicate that destruction is
		 * under way. Doing so prevents any racing orphan operation
		 * from applying to this client, allowing us to deassociate from
		 * the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}

static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * If this page was deferring dirty-marking in the corresponding vmspace page
 * tables, clear such a state so it is considered dirty from now on.
 */
void
vmp_mark_dirty(vm_page_t *vmp)
{
	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);

	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the pointer from the page-chaining in `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		/*
		 * Track appropriate (accessed/dirty) bits for the guest-virtual
		 * address corresponding to this page, if it is from the vmspace
		 * rather than a direct reference to an underlying object.
		 *
		 * The protection and/or configured flags may obviate the need
		 * for such an update.
		 */
		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
		    vmc->vmc_track_dirty) {
			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release held page. Returns true if page resided on region which was
 * subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting
 * page-chaining pointer). Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
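 *
 * As an illustration (error checks elided, `gpa1`/`gpa2` being hypothetical
 * page-aligned addresses), a two-page chain held through one client could be
 * built and later released as:
 *
 *	vm_page_t *first = vmc_hold(vmc, gpa1, PROT_READ);
 *	vm_page_t *second = vmc_hold(vmc, gpa2, PROT_READ);
 *	vmp_chain(first, second);
 *	...
 *	(void) vmp_release_chain(first);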
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);

	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}


int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{

	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}