/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/vmm_vm.h>
#include <sys/seg_vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_gpt.h>


/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses. In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory. The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures. Keeping
 * source differences to a minimum was a priority, so illumos-bhyve implemented
 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
 * boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use. As source differences with
 * the upstream kernel code became less of a concern, and upcoming features
 * (such as live migration) would demand more of those VM interfaces, it became
 * clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what it
 * replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a
 *   vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing
 *   access to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers. Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address). Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are performed
 * through a `vm_client_t` handle. That VM client abstraction is meant to
 * reduce contention and overhead for frequent access operations and provide
 * debugging insight into how different subcomponents are accessing the
 * vmspace.
 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
 * interface) and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions)
 * requires other accessors be excluded while the change is underway to
 * prevent them from observing invalid intermediate states. A simple approach
 * could use a mutex or rwlock to achieve this, but that risks contention when
 * the rate of access to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * at a per-vm_client_t basis. While this raises the cost for vmspace changes,
 * it means that the much more common page accesses through the vm_client can
 * normally proceed unimpeded and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace in
 * a 'hold' state, iterating over all associated vm_client instances, waiting
 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
 * will block until the hold condition is cleared. Once the hold is asserted
 * for all clients, the vmspace change can proceed with confidence. Upon
 * completion of that operation, VCS_HOLD is cleared from the clients, and they
 * are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: they solely rely on the contents of
 * the page tables. Furthermore, the existing VCS_HOLD mechanism used to
 * exclude client access is not feasible when entering guest context, since
 * interrupts are disabled, making it impossible to block entry. This is not a
 * concern as long as vmspace modifications never place the page tables in
 * invalid states (either intermediate, or final). The vm_client hold
 * mechanism does provide the means to IPI vCPU consumers, which will trigger
 * a notification once they report their exit from guest context. This can be
 * used to ensure that page table modifications are made visible to those
 * vCPUs within a certain time frame.
 */
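
/*
 * Illustrative sketch (hypothetical consumer, not part of this file): the
 * common data path pairs a page hold with a release around each access to
 * guest memory, e.g.
 *
 *	vm_client_t *vmc = vmspace_client_alloc(vms);
 *	vm_page_t *vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		void *datap = vmp_get_writable(vmp);
 *		... perform the emulated access ...
 *		(void) vmp_release(vmp);
 *	}
 *	vmc_destroy(vmc);
 *
 * Administrative operations (vmspace_map(), vmspace_unmap(), etc.) are made
 * directly against the vmspace_t, as described above.
 */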

typedef struct vmspace_mapping {
	list_node_t	vmsm_node;
	vm_object_t	*vmsm_object;	/* object backing this mapping */
	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
	size_t		vmsm_len;	/* length (in bytes) of mapping */
	off_t		vmsm_offset;	/* byte offset into object */
	uint_t		vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr)	( \
	(vmsm)->vmsm_offset + \
	((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t	vms_lock;
	kcondvar_t	vms_cv;
	bool		vms_held;
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;
	uint64_t	vms_pages_mapped;
	bool		vms_track_dirty;

	list_t		vms_maplist;
	list_t		vms_clients;
};

struct vm_client {
	vmspace_t	*vmc_space;
	list_node_t	vmc_node;

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;
	vm_client_state_t vmc_state;
	int		vmc_cpu_active;
	uint64_t	vmc_cpu_gen;
	bool		vmc_track_dirty;
	vmc_inval_cb_t	vmc_inval_func;
	void		*vmc_inval_data;

	list_t		vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;
	void		*vmo_data;
	uint8_t		vmo_attr;
};

/* Convenience consolidation of all flag(s) for validity checking */
#define	VPF_ALL		(VPF_DEFER_DIRTY)

struct vm_page {
	vm_client_t	*vmp_client;
	list_node_t	vmp_node;
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;
	pfn_t		vmp_pfn;
	uint64_t	*vmp_ptep;
	vm_object_t	*vmp_obj_ref;
	uint8_t		vmp_prot;
	uint8_t		vmp_flags;
};

static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmspace_clients_invalidate(vmspace_t *, uintptr_t, size_t);
static int vmspace_ensure_mapped(vmspace_t *, uintptr_t, int, pfn_t *,
    uint64_t *);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);

/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = track_dirty;

	return (vms);
}

/*
 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
 * clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}
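
/*
 * Sketch of a dirty-bitmap query (hypothetical caller; assumes the VM instance
 * is read-locked as required by the comment below, and that `len` is a
 * page-aligned multiple of NBBY pages): each bit in the bitmap corresponds to
 * one page of the [gpa, gpa + len) span, e.g.
 *
 *	uint8_t *bitmap = kmem_zalloc(len / PAGESIZE / NBBY, KM_SLEEP);
 *	vmspace_bits_operate(vms, gpa, len,
 *	    VBO_GET_DIRTY | VBO_FLAG_BITMAP_OUT, bitmap);
 */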

/*
 * Perform an operation on the status (accessed/dirty) bits held in the page
 * tables of this vmspace.
 *
 * Such manipulations race against both hardware writes (from running vCPUs)
 * and emulated accesses reflected from userspace. Safe functionality depends
 * on the VM instance being read-locked to prevent vmspace_map/vmspace_unmap
 * operations from changing the page tables during the walk.
 */
void
vmspace_bits_operate(vmspace_t *vms, uint64_t gpa, size_t len,
    vmspace_bit_oper_t oper, uint8_t *bitmap)
{
	const bool bit_input = (oper & VBO_FLAG_BITMAP_IN) != 0;
	const bool bit_output = (oper & VBO_FLAG_BITMAP_OUT) != 0;
	const vmspace_bit_oper_t oper_only =
	    oper & ~(VBO_FLAG_BITMAP_IN | VBO_FLAG_BITMAP_OUT);
	vmm_gpt_t *gpt = vms->vms_gpt;

	/*
	 * The bitmap cannot be NULL if the requested operation involves
	 * reading from or writing to it.
	 */
	ASSERT(bitmap != NULL || (!bit_input && !bit_output));

	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
		const uint64_t pfn_offset = offset >> PAGESHIFT;
		const size_t bit_offset = pfn_offset / 8;
		const uint8_t bit_mask = 1 << (pfn_offset % 8);

		if (bit_input && (bitmap[bit_offset] & bit_mask) == 0) {
			continue;
		}

		bool value = false;
		uint64_t *entry = vmm_gpt_lookup(gpt, gpa + offset);
		if (entry == NULL) {
			if (bit_output) {
				bitmap[bit_offset] &= ~bit_mask;
			}
			continue;
		}

		switch (oper_only) {
		case VBO_GET_DIRTY:
			value = vmm_gpt_query(gpt, entry, VGQ_DIRTY);
			break;
		case VBO_SET_DIRTY: {
			uint_t prot = 0;
			bool present_writable = false;
			pfn_t pfn;

			/*
			 * To avoid blindly setting the dirty bit on otherwise
			 * empty PTEs, we must first check if the entry for the
			 * address in question has been populated.
			 *
			 * Only if the page is marked both Present and Writable
			 * will we permit the dirty bit to be set.
			 */
			if (!vmm_gpt_is_mapped(gpt, entry, &pfn, &prot)) {
				int err = vmspace_ensure_mapped(vms, gpa,
				    PROT_WRITE, &pfn, entry);
				if (err == 0) {
					present_writable = true;
				}
			} else if ((prot & PROT_WRITE) != 0) {
				present_writable = true;
			}

			if (present_writable) {
				value = !vmm_gpt_reset_dirty(gpt, entry, true);
			}
			break;
		}
		case VBO_RESET_DIRTY:
			/*
			 * Although at first glance it may seem that resetting
			 * the dirty bit requires the same care as setting it,
			 * the constraints make for a simpler task.
			 *
			 * Any PTEs with the dirty bit set will have already
			 * been properly populated.
			 */
			value = vmm_gpt_reset_dirty(gpt, entry, false);
			break;
		default:
			panic("unrecognized operator: %d", oper_only);
			break;
		}
		if (bit_output) {
			if (value) {
				bitmap[bit_offset] |= bit_mask;
			} else {
				bitmap[bit_offset] &= ~bit_mask;
			}
		}
	}

	/*
	 * Invalidate the address range potentially affected by the changes to
	 * page table bits, issuing shoot-downs for those who might have it in
	 * cache.
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	vmspace_clients_invalidate(vms, gpa, len);
	vmspace_hold_exit(vms, true);
}

/*
 * Is dirty-page-tracking enabled for the vmspace?
 */
bool
vmspace_get_tracking(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	const bool val = vms->vms_track_dirty;
	mutex_exit(&vms->vms_lock);
	return (val);
}

/*
 * Set the state (enabled/disabled) of dirty-page-tracking for the vmspace.
 */
int
vmspace_set_tracking(vmspace_t *vms, bool enable_dirty_tracking)
{
	if (enable_dirty_tracking && !vmm_gpt_can_track_dirty(vms->vms_gpt)) {
		/* Do not allow this to be set if it is not supported */
		return (ENOTSUP);
	}

	vmspace_hold_enter(vms);
	if (vms->vms_track_dirty == enable_dirty_tracking) {
		/* No further effort required if state already matches */
		vmspace_hold_exit(vms, false);
		return (0);
	}

	vms->vms_track_dirty = enable_dirty_tracking;

	/* Configure all existing clients for new tracking behavior */
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		mutex_enter(&vmc->vmc_lock);
		vmc->vmc_track_dirty = enable_dirty_tracking;
		mutex_exit(&vmc->vmc_lock);
	}

	/*
	 * Notify all clients of what is considered an invalidation of the
	 * entire vmspace.
	 */
	vms->vms_pt_gen++;
	vmspace_clients_invalidate(vms, 0, vms->vms_size);

	vmspace_hold_exit(vms, true);
	return (0);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}
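
/*
 * Note on the pagers above: vmo_data is interpreted according to vmo_type.
 * For VMOT_MEM objects it is a vmmr_region_t pointer into the VMM reservoir,
 * while for VMOT_MMIO objects it holds the base host-physical address of the
 * region.
 */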

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase refcount for vm_object reference
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within [addr, addr + size) span in the
 * vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace. This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or any other actions which would require accessing vmspace state subject to
 * potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}
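
/*
 * Sketch of the hold pattern used by the vmspace-mutating callers in this file
 * (existing convention, not a new interface): a mutation is bracketed by
 * vmspace_hold_enter()/vmspace_hold_exit(), bumping the table generation and
 * notifying clients when anything was changed:
 *
 *	vmspace_hold_enter(vms);
 *	... modify mappings and/or page table entries ...
 *	vms->vms_pt_gen++;
 *	vmspace_clients_invalidate(vms, gpa, len);
 *	vmspace_hold_exit(vms, true);
 */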

/*
 * Exit a hold state on the vmspace. This releases all VM clients associated
 * with the vmspace to be able to establish new page holds, and partake in
 * other actions which require accessing changed vmspace state. If
 * `kick_on_cpu` is true, then any CPUs actively using the page tables will be
 * IPIed, and the call will block until they have acknowledged being ready to
 * use the latest state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

static void
vmspace_clients_invalidate(vmspace_t *vms, uintptr_t gpa, size_t len)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
    size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}
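
/*
 * Sketch of the mapping life cycle (hypothetical caller; object reference
 * counting elided): a memory-backed object is created, mapped at a
 * page-aligned guest-physical address, and later torn down with an exact-match
 * unmap of the same span:
 *
 *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
 *	int err = vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE);
 *	...
 *	err = vmspace_unmap(vms, gpa, len);
 */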

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
{
	const uintptr_t end = addr + len;
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT3U(addr, <, end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, addr, len);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		vmspace_clients_invalidate(vms, addr, len);
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

/*
 * For a given GPA in the vmspace, ensure that the backing page (if any) is
 * properly mapped as present in the provided PTE.
 */
static int
vmspace_ensure_mapped(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t *leaf_pte)
{
	vmspace_mapping_t *vmsm;
	vm_object_t *vmo;
	pfn_t pfn;

	ASSERT(pfnp != NULL);
	ASSERT(leaf_pte != NULL);

	vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
	if (vmsm == NULL) {
		return (FC_NOMAP);
	}
	if ((req_prot & vmsm->vmsm_prot) != req_prot) {
		return (FC_PROT);
	}

	vmo = vmsm->vmsm_object;
	pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
	VERIFY(pfn != PFN_INVALID);

	if (vmm_gpt_map_at(vms->vms_gpt, leaf_pte, pfn, vmsm->vmsm_prot,
	    vmo->vmo_attr)) {
		atomic_inc_64(&vms->vms_pages_mapped);
	}

	*pfnp = pfn;
	return (0);
}

/*
 * Look up the PTE for a given GPA in the vmspace, populating it with
 * appropriate contents (pfn, protection, etc) if it is empty, but backed by a
 * valid mapping.
 */
static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		int err = vmspace_ensure_mapped(vms, gpa, req_prot, &pfn, leaf);
		if (err != 0) {
			return (err);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
{
	vmspace_mapping_t *vmsm;
	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	size_t populated = 0;
	const size_t end = addr + len;
	for (uintptr_t gpa = addr & PAGEMASK; gpa < end; gpa += PAGESIZE) {
		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active. This will block if/while the client is held by
 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
 * fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active. It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this VM
 * client. Interrupts (and/or the GIF) are expected to be disabled when calling
 * this function. Returns the generation number of the nested page table (to be
 * used for TLB invalidations).
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}
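
/*
 * Sketch of how a vCPU consumer pairs these calls around guest entry
 * (hypothetical caller; the returned generation feeds the TLB invalidation
 * noted above):
 *
 *	uint64_t gen = vmc_table_enter(vmc);
 *	... enter guest context with interrupts disabled ...
 *	vmc_table_exit(vmc);
 */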

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer. Since the client
		 * is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}
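
/*
 * Prepare a client's held pages for an unmap of the [addr, addr + size) span.
 * Any hold within the span takes its own reference on the backing object and
 * drops its (now meaningless) PTE pointer, so that releasing the hold after
 * the unmap does not touch freed state.
 */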
static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not
	 * expect a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned. Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down. All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should
		 * have direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
	ASSERT0(prot & ~PROT_ALL);
	ASSERT0(flags & ~VPF_ALL);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = (uint8_t)prot;
	vmp->vmp_flags = (uint8_t)flags;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one. Only the
 * association with the vmspace is cloned, not existing holds or any
 * configured invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done
		 * carefully: The vmspace could attempt to orphan this
		 * vm_client while we release vmc_lock in order to take
		 * vms_lock (the required order). The client is marked to
		 * indicate that destruction is under way. Doing so prevents
		 * any racing orphan operation from applying to this client,
		 * allowing us to deassociate from the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}
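
/*
 * Compute the kernel-virtual address of a held page via the kpm segment
 * (the kernel's direct mapping of physical memory), using the page's PFN.
 */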
static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * If this page was deferring dirty-marking in the corresponding vmspace page
 * tables, clear such a state so it is considered dirty from now on.
 */
void
vmp_mark_dirty(vm_page_t *vmp)
{
	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);

	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the pointer from the page-chaining in `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		/*
		 * Track appropriate (accessed/dirty) bits for the
		 * guest-virtual address corresponding to this page, if it is
		 * from the vmspace rather than a direct reference to an
		 * underlying object.
		 *
		 * The protection and/or configured flags may obviate the need
		 * for such an update.
		 */
		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
		    vmc->vmc_track_dirty) {
			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release held page. Returns true if page resided on region which was
 * subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting
 * page-chaining pointer). Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);

	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}


int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}