/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/vmm_vm.h>
#include <sys/seg_vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_gpt.h>


/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses. In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory. The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures. Keeping
 * source differences to a minimum was a priority, so illumos-bhyve implemented
 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
 * boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use. As source differences with
 * the upstream kernel code became less of a concern, and upcoming features
 * (such as live migration) would demand more of those VM interfaces, it became
 * clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what it
 * replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a
 *   vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing
 *   access to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers. Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address). Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are performed
 * through a `vm_client_t` handle. That VM client abstraction is meant to
 * reduce contention and overhead for frequent access operations and provide
 * debugging insight into how different subcomponents are accessing the
 * vmspace.
 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
 * interface) and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions)
 * requires other accessors be excluded while the change is underway to
 * prevent them from observing invalid intermediate states. A simple approach
 * could use a mutex or rwlock to achieve this, but that risks contention when
 * the rate of access to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * on a per-vm_client_t basis. While this raises the cost for vmspace changes,
 * it means that the much more common page accesses through the vm_client can
 * normally proceed unimpeded and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace in
 * a 'hold' state, iterating over all associated vm_client instances, waiting
 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
 * will block until the hold condition is cleared. Once the hold is asserted
 * for all clients, the vmspace change can proceed with confidence. Upon
 * completion of that operation, VCS_HOLD is cleared from the clients, and they
 * are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: they solely rely on the contents of
 * the page tables. Furthermore, the existing VCS_HOLD mechanism used to
 * exclude client access is not feasible when entering guest context, since
 * interrupts are disabled, making it impossible to block entry. This is not a
 * concern as long as vmspace modifications never place the page tables in
 * invalid states (either intermediate, or final). The vm_client hold mechanism
 * does provide the means to IPI vCPU consumers which will trigger a
 * notification once they report their exit from guest context. This can be
 * used to ensure that page table modifications are made visible to those vCPUs
 * within a certain time frame.
 */
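
/*
 * An illustrative sketch (not compiled in) of the common data path described
 * above, assuming `vmc` was obtained earlier via vmspace_client_alloc() or
 * vmc_clone() and `gpa` is a page-aligned guest-physical address:
 *
 *	vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		void *datap = vmp_get_writable(vmp);
 *		... perform the emulated access against datap ...
 *		(void) vmp_release(vmp);
 *	}
 *
 * Administrative operations (vmspace_map(), vmspace_unmap(), and the like)
 * never go through the client handle; they act on the vmspace_t directly.
 */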

typedef struct vmspace_mapping {
	list_node_t	vmsm_node;
	vm_object_t	*vmsm_object;	/* object backing this mapping */
	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
	size_t		vmsm_len;	/* length (in bytes) of mapping */
	off_t		vmsm_offset;	/* byte offset into object */
	uint_t		vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr)	( \
	    (vmsm)->vmsm_offset + \
	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t	vms_lock;
	kcondvar_t	vms_cv;
	bool		vms_held;
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;
	uint64_t	vms_pages_mapped;
	bool		vms_track_dirty;

	list_t		vms_maplist;
	list_t		vms_clients;
};

struct vm_client {
	vmspace_t	*vmc_space;
	list_node_t	vmc_node;

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;
	vm_client_state_t vmc_state;
	int		vmc_cpu_active;
	uint64_t	vmc_cpu_gen;
	bool		vmc_track_dirty;
	vmc_inval_cb_t	vmc_inval_func;
	void		*vmc_inval_data;

	list_t		vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;
	void		*vmo_data;
	uint8_t		vmo_attr;
};

/* Convenience consolidation of all flag(s) for validity checking */
#define	VPF_ALL		(VPF_DEFER_DIRTY)

struct vm_page {
	vm_client_t	*vmp_client;
	list_node_t	vmp_node;
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;
	pfn_t		vmp_pfn;
	uint64_t	*vmp_ptep;
	vm_object_t	*vmp_obj_ref;
	uint8_t		vmp_prot;
	uint8_t		vmp_flags;
};

static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);


/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = track_dirty;

	return (vms);
}

/*
 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
 * clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}

int
vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	if (!vms->vms_track_dirty)
		return (EPERM);

	/*
	 * Accumulate dirty bits into the given bit vector. Note that this
	 * races both against hardware writes from running vCPUs and
	 * reflections from userspace.
	 *
	 * Called from a userspace-visible ioctl, this depends on the VM
	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
	 * operations from changing the page tables during the walk.
	 */
	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
		bool bit = false;
		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
		if (entry != NULL)
			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
		uint64_t pfn_offset = offset >> PAGESHIFT;
		size_t bit_offset = pfn_offset / 8;
		size_t bit_index = pfn_offset % 8;
		bitmap[bit_offset] |= (bit << bit_index);
	}

	/*
	 * Now invalidate those bits and shoot down address spaces that
	 * may have them cached.
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
	vmspace_hold_exit(vms, true);

	return (0);
}
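
/*
 * An illustrative sketch (not compiled in) of how the bitmap above is packed,
 * inferred from the loop in vmspace_track_dirty(): page `i` of the requested
 * span lands in bit (i % 8) of bitmap byte (i / 8). For a 16-page span:
 *
 *	uint8_t bitmap[2] = { 0 };
 *	if (vmspace_track_dirty(vms, gpa, 16 * PAGESIZE, bitmap) == 0 &&
 *	    (bitmap[1] & (1 << 2)) != 0) {
 *		... the page at (gpa + 10 * PAGESIZE) was written ...
 *	}
 */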

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase refcount for vm_object reference
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within [addr, addr + size) span in the
 * vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace. This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or any other actions which would require accessing vmspace state subject to
 * potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}

/*
 * Exit a hold state on the vmspace. This releases all VM clients associated
 * with the vmspace to be able to establish new page holds, and partake in
 * other actions which require accessing changed vmspace state. If
 * `kick_on_cpu` is true, then any CPUs actively using the page tables will be
 * IPIed, and the call will block until they have acknowledged being ready to
 * use the latest state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
    size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}
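
/*
 * An illustrative sketch (not compiled in) of the allocate-then-map pattern
 * used by callers of vmspace_map(), assuming `vms`, `gpa`, and `len` come
 * from the caller and `len` is page-aligned:
 *
 *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
 *	if (vmo == NULL)
 *		return (ENOMEM);
 *	int err = vmspace_map(vms, vmo, 0, gpa, len, PROT_ALL);
 *	if (err != 0)
 *		vm_object_release(vmo);
 *
 * On success the mapping owns the reference taken at allocation; it is
 * released by vm_mapping_remove() when the region is later unmapped.
 */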

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = (size_t)(end - start);
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT(start < end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, start, end);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
		    vmc = list_next(&vms->vms_clients, vmc)) {
			vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
		}
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		vmspace_mapping_t *vmsm;
		vm_object_t *vmo;

		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
		if (vmsm == NULL) {
			return (FC_NOMAP);
		}

		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
			return (FC_PROT);
		}
		vmo = vmsm->vmsm_object;
		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
		    vmo->vmo_attr)) {
			atomic_inc_64(&vms->vms_pages_mapped);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = end - start;
	vmspace_mapping_t *vmsm;

	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	size_t populated = 0;
	for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active. This will block if/while the client is held by
 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
 * fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active. It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this VM
 * client. Interrupts (and/or the GIF) are expected to be disabled when calling
 * this function. Returns the generation number of the nested page table (to be
 * used for TLB invalidations).
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}
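
/*
 * An illustrative sketch (not compiled in) of how a vCPU consumer brackets
 * guest entry with the two functions above, assuming `vmc` is that vCPU's
 * client and `last_gen` is the table generation it loaded previously:
 *
 *	(interrupts disabled)
 *	uint64_t gen = vmc_table_enter(vmc);
 *	if (gen != last_gen) {
 *		... flush guest TLB state before entry ...
 *		last_gen = gen;
 *	}
 *	... run in guest context, then exit ...
 *	(interrupts re-enabled)
 *	vmc_table_exit(vmc);
 */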

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer. Since the client
		 * is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not
	 * expect a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned. Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down. All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should
		 * have direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
	ASSERT0(prot & ~PROT_ALL);
	ASSERT0(flags & ~VPF_ALL);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = (uint8_t)prot;
	vmp->vmp_flags = (uint8_t)flags;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot,
		    NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one. Only the
 * association with the vmspace is cloned, not existing holds or any
 * configured invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}
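
/*
 * An illustrative sketch (not compiled in) of registering an invalidation
 * callback. The callback name and its argument are hypothetical; the
 * signature is inferred from the call site in vmc_space_invalidate():
 *
 *	static void
 *	example_inval_cb(void *arg, uintptr_t gpa, size_t len)
 *	{
 *		... drop any cached translations overlapping
 *		    [gpa, gpa + len) ...
 *	}
 *
 *	if (vmc_set_inval_cb(vmc, example_inval_cb, arg) != 0) {
 *		... client was orphaned; bail out ...
 *	}
 */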

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done
		 * carefully: The vmspace could attempt to orphan this
		 * vm_client while we release vmc_lock in order to take
		 * vms_lock (the required order). The client is marked to
		 * indicate that destruction is under way. Doing so prevents
		 * any racing orphan operation from applying to this client,
		 * allowing us to deassociate from the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}

static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * If this page was deferring dirty-marking in the corresponding vmspace page
 * tables, clear such a state so it is considered dirty from now on.
 */
void
vmp_mark_dirty(vm_page_t *vmp)
{
	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);

	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the pointer from the page-chaining in `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}
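
/*
 * An illustrative sketch (not compiled in) of page chaining, assuming all of
 * the holds are made through the same vm_client_t:
 *
 *	vm_page_t *head = NULL;
 *	for (uint_t i = 0; i < npages; i++) {
 *		vm_page_t *vmp =
 *		    vmc_hold(vmc, gpa + i * PAGESIZE, PROT_READ);
 *		if (vmp == NULL)
 *			break;
 *		vmp_chain(vmp, head);
 *		head = vmp;
 *	}
 *	... access the pages, walking the chain via vmp_next() ...
 *	if (head != NULL)
 *		(void) vmp_release_chain(head);
 */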

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		/*
		 * Track appropriate (accessed/dirty) bits for the
		 * guest-physical address corresponding to this page, if it is
		 * from the vmspace rather than a direct reference to an
		 * underlying object.
		 *
		 * The protection and/or configured flags may obviate the need
		 * for such an update.
		 */
		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
		    vmc->vmc_track_dirty) {
			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release held page. Returns true if page resided on region which was
 * subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting
 * page-chaining pointer). Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);

	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}


int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (size_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}