1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2019 Joyent, Inc. 15 * Copyright 2022 Oxide Computer Company 16 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 17 */ 18 19 #include <sys/param.h> 20 #include <sys/kmem.h> 21 #include <sys/thread.h> 22 #include <sys/list.h> 23 #include <sys/mman.h> 24 #include <sys/types.h> 25 #include <sys/ddi.h> 26 #include <sys/sysmacros.h> 27 #include <sys/machsystm.h> 28 #include <sys/vmsystm.h> 29 #include <sys/x86_archext.h> 30 #include <vm/as.h> 31 #include <vm/hat_i86.h> 32 #include <vm/seg_vn.h> 33 #include <vm/seg_kmem.h> 34 35 #include <sys/vmm_vm.h> 36 #include <sys/seg_vmm.h> 37 #include <sys/vmm_kernel.h> 38 #include <sys/vmm_reservoir.h> 39 #include <sys/vmm_gpt.h> 40 41 42 /* 43 * VMM Virtual Memory 44 * 45 * History 46 * 47 * When bhyve was ported to illumos, one significant hole was handling guest 48 * memory and memory accesses. In the original Pluribus port, bhyve itself 49 * manually handled the EPT structures for guest memory. The updated sources 50 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM 51 * system for memory allocations and management of the EPT structures. Keeping 52 * source differences to a minimum was a priority, so illumos-bhyve implemented 53 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to 54 * boot and run guests. 55 * 56 * While the VM shim was successful in getting illumos-bhyve to a functional 57 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the 58 * compatibility interfaces made it awkward to use. As source differences with 59 * the upstream kernel code became less of a concern, and upcoming features 60 * (such as live migration) would demand more of those VM interfaces, it became 61 * clear that an overhaul was prudent. 62 * 63 * Design 64 * 65 * The new VM system for bhyve retains a number of the same concepts as what it 66 * replaces: 67 * 68 * - `vmspace_t` is the top-level entity for a guest memory space 69 * - `vm_object_t` represents a memory object which can be mapped into a vmspace 70 * - `vm_page_t` represents a page hold within a given vmspace, providing access 71 * to the underlying memory page 72 * 73 * Unlike the old code, where most of the involved structures were exposed via 74 * public definitions, this replacement VM interface keeps all involved 75 * structures opaque to consumers. Furthermore, there is a clear delineation 76 * between infrequent administrative operations (such as mapping/unmapping 77 * regions) and common data-path operations (attempting a page hold at a given 78 * guest-physical address). Those administrative operations are performed 79 * directly against the vmspace, whereas the data-path operations are performed 80 * through a `vm_client_t` handle. That VM client abstraction is meant to 81 * reduce contention and overhead for frequent access operations and provide 82 * debugging insight into how different subcomponents are accessing the vmspace. 
83 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv 84 * interface) and each VMM userspace segment mapping. 85 * 86 * Exclusion 87 * 88 * Making changes to the vmspace (such as mapping or unmapping regions) requires 89 * other accessors be excluded while the change is underway to prevent them from 90 * observing invalid intermediate states. A simple approach could use a mutex 91 * or rwlock to achieve this, but that risks contention when the rate of access 92 * to the vmspace is high. 93 * 94 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion 95 * at a per-vm_client_t basis. While this raises the cost for vmspace changes, 96 * it means that the much more common page accesses through the vm_client can 97 * normally proceed unimpeded and independently. 98 * 99 * When a change to the vmspace is required, the caller will put the vmspace in 100 * a 'hold' state, iterating over all associated vm_client instances, waiting 101 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before 102 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on 103 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault) 104 * will block until the hold condition is cleared. Once the hold is asserted 105 * for all clients, the vmspace change can proceed with confidence. Upon 106 * completion of that operation, VCS_HOLD is cleared from the clients, and they 107 * are released to resume vmspace accesses. 108 * 109 * vCPU Consumers 110 * 111 * Access to the vmspace for vCPUs running in guest context is different from 112 * emulation-related vm_client activity: they solely rely on the contents of the 113 * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude 114 * client access is not feasible when entering guest context, since interrupts 115 * are disabled, making it impossible to block entry. This is not a concern as 116 * long as vmspace modifications never place the page tables in invalid states 117 * (either intermediate, or final). The vm_client hold mechanism does provide 118 * the means to IPI vCPU consumers which will trigger a notification once they 119 * report their exit from guest context. This can be used to ensure that page 120 * table modifications are made visible to those vCPUs within a certain 121 * time frame. 
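 *
 * As an illustrative sketch only (no interfaces beyond those defined later in
 * this file are implied), a vmspace change follows roughly this shape:
 *
 *	vmspace_hold_enter(vms);	// VCS_HOLD set on every client, after
 *					// waiting out any VCS_ACTIVE operation
 *	// ... update the map list and/or the nested page tables ...
 *	vms->vms_pt_gen++;		// new table generation, if PTEs changed
 *	vmspace_hold_exit(vms, true);	// clear VCS_HOLD; `true` requests an
 *					// IPI for any client still VCS_ON_CPU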
122 */ 123 124 typedef struct vmspace_mapping { 125 list_node_t vmsm_node; 126 vm_object_t *vmsm_object; /* object backing this mapping */ 127 uintptr_t vmsm_addr; /* start addr in vmspace for mapping */ 128 size_t vmsm_len; /* length (in bytes) of mapping */ 129 off_t vmsm_offset; /* byte offset into object */ 130 uint_t vmsm_prot; 131 } vmspace_mapping_t; 132 133 #define VMSM_OFFSET(vmsm, addr) ( \ 134 (vmsm)->vmsm_offset + \ 135 ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) 136 137 typedef enum vm_client_state { 138 VCS_IDLE = 0, 139 /* currently accessing vmspace for client operation (hold or fault) */ 140 VCS_ACTIVE = (1 << 0), 141 /* client hold requested/asserted */ 142 VCS_HOLD = (1 << 1), 143 /* vCPU is accessing page tables in guest context */ 144 VCS_ON_CPU = (1 << 2), 145 /* client has been orphaned (no more access to vmspace) */ 146 VCS_ORPHANED = (1 << 3), 147 /* client undergoing destroy operation */ 148 VCS_DESTROY = (1 << 4), 149 } vm_client_state_t; 150 151 struct vmspace { 152 kmutex_t vms_lock; 153 kcondvar_t vms_cv; 154 bool vms_held; 155 uintptr_t vms_size; /* immutable after creation */ 156 157 /* (nested) page table state */ 158 vmm_gpt_t *vms_gpt; 159 uint64_t vms_pt_gen; 160 uint64_t vms_pages_mapped; 161 bool vms_track_dirty; 162 163 list_t vms_maplist; 164 list_t vms_clients; 165 }; 166 167 struct vm_client { 168 vmspace_t *vmc_space; 169 list_node_t vmc_node; 170 171 kmutex_t vmc_lock; 172 kcondvar_t vmc_cv; 173 vm_client_state_t vmc_state; 174 int vmc_cpu_active; 175 uint64_t vmc_cpu_gen; 176 bool vmc_track_dirty; 177 vmc_inval_cb_t vmc_inval_func; 178 void *vmc_inval_data; 179 180 list_t vmc_held_pages; 181 }; 182 183 typedef enum vm_object_type { 184 VMOT_NONE, 185 VMOT_MEM, 186 VMOT_MMIO, 187 } vm_object_type_t; 188 189 struct vm_object { 190 uint_t vmo_refcnt; /* manipulated with atomic ops */ 191 192 /* Fields below are fixed at creation time */ 193 vm_object_type_t vmo_type; 194 size_t vmo_size; 195 void *vmo_data; 196 uint8_t vmo_attr; 197 }; 198 199 struct vm_page { 200 vm_client_t *vmp_client; 201 list_node_t vmp_node; 202 vm_page_t *vmp_chain; 203 uintptr_t vmp_gpa; 204 pfn_t vmp_pfn; 205 uint64_t *vmp_ptep; 206 vm_object_t *vmp_obj_ref; 207 int vmp_prot; 208 }; 209 210 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t); 211 static void vmspace_hold_enter(vmspace_t *); 212 static void vmspace_hold_exit(vmspace_t *, bool); 213 static void vmc_space_hold(vm_client_t *); 214 static void vmc_space_release(vm_client_t *, bool); 215 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t); 216 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *); 217 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *); 218 219 220 /* 221 * Create a new vmspace with a maximum address of `end`. 222 */ 223 vmspace_t * 224 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty) 225 { 226 vmspace_t *vms; 227 const uintptr_t size = end + 1; 228 229 /* 230 * This whole mess is built on the assumption that a 64-bit address 231 * space is available to work with for the various pagetable tricks. 
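	 * (Note that `end` is an inclusive maximum address, which is why the
	 * size is computed as end + 1 above.)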
232 */ 233 VERIFY(size > 0 && (size & PAGEOFFSET) == 0 && 234 size <= (uintptr_t)USERLIMIT); 235 236 vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); 237 vms->vms_size = size; 238 list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), 239 offsetof(vmspace_mapping_t, vmsm_node)); 240 list_create(&vms->vms_clients, sizeof (vm_client_t), 241 offsetof(vm_client_t, vmc_node)); 242 243 vms->vms_gpt = vmm_gpt_alloc(pte_ops); 244 vms->vms_pt_gen = 1; 245 vms->vms_track_dirty = track_dirty; 246 247 return (vms); 248 } 249 250 /* 251 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining 252 * clients will be orphaned. 253 */ 254 void 255 vmspace_destroy(vmspace_t *vms) 256 { 257 mutex_enter(&vms->vms_lock); 258 VERIFY(list_is_empty(&vms->vms_maplist)); 259 260 if (!list_is_empty(&vms->vms_clients)) { 261 vm_client_t *vmc = list_head(&vms->vms_clients); 262 while (vmc != NULL) { 263 vmc = vmc_space_orphan(vmc, vms); 264 } 265 /* 266 * Wait for any clients which were in the process of destroying 267 * themselves to disappear. 268 */ 269 while (!list_is_empty(&vms->vms_clients)) { 270 cv_wait(&vms->vms_cv, &vms->vms_lock); 271 } 272 } 273 VERIFY(list_is_empty(&vms->vms_clients)); 274 275 vmm_gpt_free(vms->vms_gpt); 276 mutex_exit(&vms->vms_lock); 277 278 mutex_destroy(&vms->vms_lock); 279 cv_destroy(&vms->vms_cv); 280 list_destroy(&vms->vms_maplist); 281 list_destroy(&vms->vms_clients); 282 283 kmem_free(vms, sizeof (*vms)); 284 } 285 286 /* 287 * Retrieve the count of resident (mapped into the page tables) pages. 288 */ 289 uint64_t 290 vmspace_resident_count(vmspace_t *vms) 291 { 292 return (vms->vms_pages_mapped); 293 } 294 295 int 296 vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap) 297 { 298 if (!vms->vms_track_dirty) 299 return (EPERM); 300 301 /* 302 * Accumulate dirty bits into the given bit vector. Note that this 303 * races both against hardware writes from running vCPUs and 304 * reflections from userspace. 305 * 306 * Called from a userspace-visible ioctl, this depends on the VM 307 * instance being read-locked to prevent vmspace_map/vmspace_unmap 308 * operations from changing the page tables during the walk. 309 */ 310 for (size_t offset = 0; offset < len; offset += PAGESIZE) { 311 bool bit = false; 312 uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset); 313 if (entry != NULL) 314 bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false); 315 uint64_t pfn_offset = offset >> PAGESHIFT; 316 size_t bit_offset = pfn_offset / 8; 317 size_t bit_index = pfn_offset % 8; 318 bitmap[bit_offset] |= (bit << bit_index); 319 } 320 321 /* 322 * Now invalidate those bits and shoot down address spaces that 323 * may have them cached. 
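	 *
	 * (For reference, the bitmap filled above carries one bit per page in
	 * [gpa, gpa + len), LSB-first within each byte: page i of the range
	 * corresponds to bit (i % 8) of bitmap[i / 8].)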
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
	vmspace_hold_exit(vms, true);

	return (0);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
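 *
 * A minimal usage sketch; the BAR-style names below are hypothetical
 * placeholders for whatever the caller is passing through:
 *
 *	vm_object_t *obj = vmm_mmio_alloc(vmspace, bar_gpa, bar_len, bar_hpa);
 *	if (obj == NULL)
 *		return (ENOMEM);
 *	// on success, [bar_gpa, bar_gpa + bar_len) is mapped in the vmspace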
418 */ 419 vm_object_t * 420 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa) 421 { 422 int error; 423 vm_object_t *obj; 424 425 obj = vm_object_mmio_allocate(len, hpa); 426 if (obj != NULL) { 427 error = vmspace_map(vmspace, obj, 0, gpa, len, 428 PROT_READ | PROT_WRITE); 429 if (error != 0) { 430 vm_object_release(obj); 431 obj = NULL; 432 } 433 } 434 435 return (obj); 436 } 437 438 /* 439 * Release a vm_object reference 440 */ 441 void 442 vm_object_release(vm_object_t *vmo) 443 { 444 ASSERT(vmo != NULL); 445 446 uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); 447 /* underflow would be a deadly serious mistake */ 448 VERIFY3U(ref, !=, UINT_MAX); 449 if (ref != 0) { 450 return; 451 } 452 453 switch (vmo->vmo_type) { 454 case VMOT_MEM: 455 vmmr_free((vmmr_region_t *)vmo->vmo_data); 456 break; 457 case VMOT_MMIO: 458 break; 459 default: 460 panic("unexpected object type %u", vmo->vmo_type); 461 break; 462 } 463 464 vmo->vmo_data = NULL; 465 vmo->vmo_size = 0; 466 kmem_free(vmo, sizeof (*vmo)); 467 } 468 469 /* 470 * Increase refcount for vm_object reference 471 */ 472 void 473 vm_object_reference(vm_object_t *vmo) 474 { 475 ASSERT(vmo != NULL); 476 477 uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); 478 /* overflow would be a deadly serious mistake */ 479 VERIFY3U(ref, !=, 0); 480 } 481 482 /* 483 * Get the host-physical PFN for a given offset into a vm_object. 484 * 485 * The provided `off` must be within the allocated size of the vm_object. 486 */ 487 pfn_t 488 vm_object_pfn(vm_object_t *vmo, uintptr_t off) 489 { 490 const uintptr_t aligned_off = off & PAGEMASK; 491 492 switch (vmo->vmo_type) { 493 case VMOT_MEM: 494 return (vm_object_pager_reservoir(vmo, aligned_off)); 495 case VMOT_MMIO: 496 return (vm_object_pager_mmio(vmo, aligned_off)); 497 case VMOT_NONE: 498 break; 499 } 500 panic("unexpected object type %u", vmo->vmo_type); 501 } 502 503 static vmspace_mapping_t * 504 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size) 505 { 506 vmspace_mapping_t *vmsm; 507 list_t *ml = &vms->vms_maplist; 508 const uintptr_t range_end = addr + size; 509 510 ASSERT3U(addr, <=, range_end); 511 512 if (addr >= vms->vms_size) { 513 return (NULL); 514 } 515 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { 516 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; 517 518 if (addr >= vmsm->vmsm_addr && addr < seg_end) { 519 if (range_end <= seg_end) { 520 return (vmsm); 521 } else { 522 return (NULL); 523 } 524 } 525 } 526 return (NULL); 527 } 528 529 /* 530 * Check to see if any mappings reside within [addr, addr + size) span in the 531 * vmspace, returning true if that span is indeed empty. 532 */ 533 static bool 534 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size) 535 { 536 vmspace_mapping_t *vmsm; 537 list_t *ml = &vms->vms_maplist; 538 const uintptr_t range_end = addr + size - 1; 539 540 ASSERT(MUTEX_HELD(&vms->vms_lock)); 541 ASSERT(size > 0); 542 543 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { 544 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1; 545 546 /* 547 * The two ranges do not overlap if the start of either of 548 * them is after the end of the other. 
549 */ 550 if (vmsm->vmsm_addr > range_end || addr > seg_end) 551 continue; 552 return (false); 553 } 554 return (true); 555 } 556 557 static void 558 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm) 559 { 560 list_t *ml = &vms->vms_maplist; 561 562 ASSERT(MUTEX_HELD(&vms->vms_lock)); 563 ASSERT(vms->vms_held); 564 565 list_remove(ml, vmsm); 566 vm_object_release(vmsm->vmsm_object); 567 kmem_free(vmsm, sizeof (*vmsm)); 568 } 569 570 /* 571 * Enter a hold state on the vmspace. This ensures that all VM clients 572 * associated with the vmspace are excluded from establishing new page holds, 573 * or any other actions which would require accessing vmspace state subject to 574 * potential change. 575 * 576 * Returns with vmspace_t`vms_lock held. 577 */ 578 static void 579 vmspace_hold_enter(vmspace_t *vms) 580 { 581 mutex_enter(&vms->vms_lock); 582 VERIFY(!vms->vms_held); 583 584 vm_client_t *vmc = list_head(&vms->vms_clients); 585 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { 586 vmc_space_hold(vmc); 587 } 588 vms->vms_held = true; 589 } 590 591 /* 592 * Exit a hold state on the vmspace. This releases all VM clients associated 593 * with the vmspace to be able to establish new page holds, and partake in other 594 * actions which require accessing changed vmspace state. If `kick_on_cpu` is 595 * true, then any CPUs actively using the page tables will be IPIed, and the 596 * call will block until they have acknowledged being ready to use the latest 597 * state of the tables. 598 * 599 * Requires vmspace_t`vms_lock be held, which is released as part of the call. 600 */ 601 static void 602 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu) 603 { 604 ASSERT(MUTEX_HELD(&vms->vms_lock)); 605 VERIFY(vms->vms_held); 606 607 vm_client_t *vmc = list_head(&vms->vms_clients); 608 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { 609 vmc_space_release(vmc, kick_on_cpu); 610 } 611 vms->vms_held = false; 612 mutex_exit(&vms->vms_lock); 613 } 614 615 /* 616 * Attempt to map a vm_object span into the vmspace. 617 * 618 * Requirements: 619 * - `obj_off`, `addr`, and `len` must be page-aligned 620 * - `obj_off` cannot be greater than the allocated size of the object 621 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated 622 * size of the object 623 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address 624 * of the vmspace 625 */ 626 int 627 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr, 628 size_t len, uint8_t prot) 629 { 630 vmspace_mapping_t *vmsm; 631 int res = 0; 632 633 if (len == 0 || (addr + len) < addr || 634 obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) { 635 return (EINVAL); 636 } 637 if ((addr + len) >= vms->vms_size) { 638 return (ENOMEM); 639 } 640 641 vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); 642 643 vmspace_hold_enter(vms); 644 if (!vm_mapping_gap(vms, addr, len)) { 645 kmem_free(vmsm, sizeof (*vmsm)); 646 res = ENOMEM; 647 } else { 648 vmsm->vmsm_object = vmo; 649 vmsm->vmsm_addr = addr; 650 vmsm->vmsm_len = len; 651 vmsm->vmsm_offset = (off_t)obj_off; 652 vmsm->vmsm_prot = prot; 653 list_insert_tail(&vms->vms_maplist, vmsm); 654 655 /* 656 * Make sure the GPT has tables ready for leaf entries across 657 * the entire new mapping. 658 */ 659 vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len); 660 } 661 vmspace_hold_exit(vms, false); 662 return (res); 663 } 664 665 /* 666 * Unmap a region of the vmspace. 
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = (size_t)(end - start);
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT(start < end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, start, end);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
		    vmc = list_next(&vms->vms_clients, vmc)) {
			vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
		}
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		vmspace_mapping_t *vmsm;
		vm_object_t *vmo;

		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
		if (vmsm == NULL) {
			return (FC_NOMAP);
		}

		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
			return (FC_PROT);
		}
		vmo = vmsm->vmsm_object;
		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
		    vmo->vmo_attr)) {
			atomic_inc_64(&vms->vms_pages_mapped);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
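 *
 * A minimal sketch of the expected ordering, with the address and length
 * treated as hypothetical placeholders:
 *
 *	vm_object_t *vmo = vm_object_mem_allocate(len, false);
 *	if (vmspace_map(vms, vmo, 0, gpa, len, PROT_READ | PROT_WRITE) == 0) {
 *		// optional: eagerly install leaf PTEs now, rather than letting
 *		// vmc_hold()/vmc_fault() populate them on demand
 *		(void) vmspace_populate(vms, gpa, gpa + len);
 *	}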
783 */ 784 int 785 vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end) 786 { 787 const size_t size = end - start; 788 vmspace_mapping_t *vmsm; 789 790 mutex_enter(&vms->vms_lock); 791 792 /* For the time being, only exact-match mappings are expected */ 793 if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) { 794 mutex_exit(&vms->vms_lock); 795 return (FC_NOMAP); 796 } 797 798 vm_object_t *vmo = vmsm->vmsm_object; 799 const int prot = vmsm->vmsm_prot; 800 const uint8_t attr = vmo->vmo_attr; 801 size_t populated = 0; 802 for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) { 803 const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); 804 VERIFY(pfn != PFN_INVALID); 805 806 if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) { 807 populated++; 808 } 809 } 810 atomic_add_64(&vms->vms_pages_mapped, populated); 811 812 mutex_exit(&vms->vms_lock); 813 return (0); 814 } 815 816 /* 817 * Allocate a client from a given vmspace. 818 */ 819 vm_client_t * 820 vmspace_client_alloc(vmspace_t *vms) 821 { 822 vm_client_t *vmc; 823 824 vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP); 825 vmc->vmc_space = vms; 826 mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL); 827 cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL); 828 vmc->vmc_state = VCS_IDLE; 829 vmc->vmc_cpu_active = -1; 830 list_create(&vmc->vmc_held_pages, sizeof (vm_page_t), 831 offsetof(vm_page_t, vmp_node)); 832 vmc->vmc_track_dirty = vms->vms_track_dirty; 833 834 mutex_enter(&vms->vms_lock); 835 list_insert_tail(&vms->vms_clients, vmc); 836 mutex_exit(&vms->vms_lock); 837 838 return (vmc); 839 } 840 841 /* 842 * Get the nested page table root pointer (EPTP/NCR3) value. 843 */ 844 uint64_t 845 vmspace_table_root(vmspace_t *vms) 846 { 847 return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty)); 848 } 849 850 /* 851 * Get the current generation number of the nested page table. 852 */ 853 uint64_t 854 vmspace_table_gen(vmspace_t *vms) 855 { 856 return (vms->vms_pt_gen); 857 } 858 859 /* 860 * Mark a vm_client as active. This will block if/while the client is held by 861 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will 862 * fail if the vm_client has been orphaned. 863 */ 864 static int 865 vmc_activate(vm_client_t *vmc) 866 { 867 mutex_enter(&vmc->vmc_lock); 868 VERIFY0(vmc->vmc_state & VCS_ACTIVE); 869 if ((vmc->vmc_state & VCS_ORPHANED) != 0) { 870 mutex_exit(&vmc->vmc_lock); 871 return (ENXIO); 872 } 873 while ((vmc->vmc_state & VCS_HOLD) != 0) { 874 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); 875 } 876 vmc->vmc_state |= VCS_ACTIVE; 877 return (0); 878 } 879 880 /* 881 * Mark a vm_client as no longer active. It must be called with 882 * vm_client_t`vmc_lock already held, and will return with it released. 883 */ 884 static void 885 vmc_deactivate(vm_client_t *vmc) 886 { 887 ASSERT(MUTEX_HELD(&vmc->vmc_lock)); 888 VERIFY(vmc->vmc_state & VCS_ACTIVE); 889 890 vmc->vmc_state ^= VCS_ACTIVE; 891 if ((vmc->vmc_state & VCS_HOLD) != 0) { 892 cv_broadcast(&vmc->vmc_cv); 893 } 894 mutex_exit(&vmc->vmc_lock); 895 } 896 897 /* 898 * Indicate that a CPU will be utilizing the nested page tables through this VM 899 * client. Interrupts (and/or the GIF) are expected to be disabled when calling 900 * this function. Returns the generation number of the nested page table (to be 901 * used for TLB invalidations). 
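 *
 * A rough sketch of the expected bracketing from a vCPU run loop follows; the
 * names not defined in this file are hypothetical:
 *
 *	// interrupts (and/or GIF) already disabled
 *	uint64_t gen = vmc_table_enter(vmc);
 *	if (gen != last_gen_seen) {
 *		// flush guest TLB state (e.g. INVEPT or a fresh ASID)
 *		last_gen_seen = gen;
 *	}
 *	// ... VM entry, guest execution, VM exit ...
 *	// interrupts re-enabled
 *	vmc_table_exit(vmc);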
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer. Since the client
		 * is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not expect
	 * a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned. Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down. All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should have
		 * direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
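 *
 * Returns NULL if the hold cannot be established: no mapping covers `gpa`,
 * the mapping lacks the requested protection, or the client is orphaned.
 * A minimal consumer sketch:
 *
 *	vm_page_t *vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		uint8_t *datap = vmp_get_writable(vmp);
 *		// ... access the guest page through datap ...
 *		(void) vmp_release(vmp);
 *	}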
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = prot;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one. Only the
 * association with the vmspace is cloned, not existing holds or any
 * configured invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done carefully:
		 * The vmspace could attempt to orphan this vm_client while we
		 * release vmc_lock in order to take vms_lock (the required
		 * order). The client is marked to indicate that destruction is
		 * under way. Doing so prevents any racing orphan operation
		 * from applying to this client, allowing us to deassociate from
		 * the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
1236 */ 1237 cv_signal(&vms->vms_cv); 1238 mutex_exit(&vmc->vmc_lock); 1239 mutex_exit(&vms->vms_lock); 1240 } else { 1241 VERIFY3P(vmc->vmc_space, ==, NULL); 1242 mutex_exit(&vmc->vmc_lock); 1243 } 1244 1245 mutex_destroy(&vmc->vmc_lock); 1246 cv_destroy(&vmc->vmc_cv); 1247 list_destroy(&vmc->vmc_held_pages); 1248 1249 kmem_free(vmc, sizeof (*vmc)); 1250 } 1251 1252 static __inline void * 1253 vmp_ptr(const vm_page_t *vmp) 1254 { 1255 ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID); 1256 1257 const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT); 1258 return ((void *)((uintptr_t)kpm_vbase + paddr)); 1259 } 1260 1261 /* 1262 * Get a readable kernel-virtual pointer for a held page. 1263 * 1264 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold() 1265 * call to acquire this page reference. 1266 */ 1267 const void * 1268 vmp_get_readable(const vm_page_t *vmp) 1269 { 1270 ASSERT(vmp->vmp_prot & PROT_READ); 1271 1272 return (vmp_ptr(vmp)); 1273 } 1274 1275 /* 1276 * Get a writable kernel-virtual pointer for a held page. 1277 * 1278 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold() 1279 * call to acquire this page reference. 1280 */ 1281 void * 1282 vmp_get_writable(const vm_page_t *vmp) 1283 { 1284 ASSERT(vmp->vmp_prot & PROT_WRITE); 1285 1286 return (vmp_ptr(vmp)); 1287 } 1288 1289 /* 1290 * Get the host-physical PFN for a held page. 1291 */ 1292 pfn_t 1293 vmp_get_pfn(const vm_page_t *vmp) 1294 { 1295 return (vmp->vmp_pfn); 1296 } 1297 1298 /* 1299 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`. 1300 */ 1301 void 1302 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain) 1303 { 1304 ASSERT3P(vmp->vmp_chain, ==, NULL); 1305 1306 vmp->vmp_chain = to_chain; 1307 } 1308 1309 /* 1310 * Retrieve the pointer from the page-chaining in `vmp`. 1311 */ 1312 vm_page_t * 1313 vmp_next(const vm_page_t *vmp) 1314 { 1315 return (vmp->vmp_chain); 1316 } 1317 1318 static __inline bool 1319 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc) 1320 { 1321 ASSERT(MUTEX_HELD(&vmc->vmc_lock)); 1322 1323 bool was_unmapped = false; 1324 1325 list_remove(&vmc->vmc_held_pages, vmp); 1326 if (vmp->vmp_obj_ref != NULL) { 1327 ASSERT3P(vmp->vmp_ptep, ==, NULL); 1328 1329 vm_object_release(vmp->vmp_obj_ref); 1330 was_unmapped = true; 1331 } else { 1332 ASSERT3P(vmp->vmp_ptep, !=, NULL); 1333 1334 if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) { 1335 vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt; 1336 (void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true); 1337 } 1338 } 1339 kmem_free(vmp, sizeof (*vmp)); 1340 return (was_unmapped); 1341 } 1342 1343 /* 1344 * Release held page. Returns true if page resided on region which was 1345 * subsequently unmapped. 1346 */ 1347 bool 1348 vmp_release(vm_page_t *vmp) 1349 { 1350 vm_client_t *vmc = vmp->vmp_client; 1351 1352 VERIFY(vmc != NULL); 1353 1354 mutex_enter(&vmc->vmc_lock); 1355 const bool was_unmapped = vmp_release_inner(vmp, vmc); 1356 mutex_exit(&vmc->vmc_lock); 1357 return (was_unmapped); 1358 } 1359 1360 /* 1361 * Release a chain of pages which were associated via vmp_chain() (setting 1362 * page-chaining pointer). Returns true if any pages resided upon a region 1363 * which was subsequently unmapped. 1364 * 1365 * All of those pages must have been held through the same vm_client_t. 
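 *
 * A minimal sketch of building up and tearing down a multi-page hold; the
 * range bounds are hypothetical:
 *
 *	vm_page_t *head = NULL;
 *	for (uintptr_t gpa = first; gpa < first + len; gpa += PAGESIZE) {
 *		vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ);
 *		if (vmp == NULL)
 *			break;	// previously chained holds still need release
 *		vmp_chain(vmp, head);
 *		head = vmp;
 *	}
 *	// ... access the held pages ...
 *	if (head != NULL)
 *		(void) vmp_release_chain(head);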
1366 */ 1367 bool 1368 vmp_release_chain(vm_page_t *vmp) 1369 { 1370 vm_client_t *vmc = vmp->vmp_client; 1371 bool any_unmapped = false; 1372 1373 ASSERT(vmp != NULL); 1374 1375 mutex_enter(&vmc->vmc_lock); 1376 while (vmp != NULL) { 1377 vm_page_t *next = vmp->vmp_chain; 1378 1379 /* We expect all pages in chain to be from same client */ 1380 ASSERT3P(vmp->vmp_client, ==, vmc); 1381 1382 if (vmp_release_inner(vmp, vmc)) { 1383 any_unmapped = true; 1384 } 1385 vmp = next; 1386 } 1387 mutex_exit(&vmc->vmc_lock); 1388 return (any_unmapped); 1389 } 1390 1391 1392 int 1393 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len, 1394 struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) 1395 { 1396 vm_object_t *vmo; 1397 int err; 1398 1399 if (segoff < 0 || len <= 0 || 1400 (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) { 1401 return (EINVAL); 1402 } 1403 if ((prot & PROT_USER) == 0) { 1404 return (ENOTSUP); 1405 } 1406 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); 1407 if (err != 0) { 1408 return (err); 1409 } 1410 1411 VERIFY(segoff >= 0); 1412 VERIFY(len <= vmo->vmo_size); 1413 VERIFY((len + segoff) <= vmo->vmo_size); 1414 1415 if (vmo->vmo_type != VMOT_MEM) { 1416 /* Only support memory objects for now */ 1417 return (ENOTSUP); 1418 } 1419 1420 as_rangelock(as); 1421 1422 err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags); 1423 if (err == 0) { 1424 segvmm_crargs_t svma; 1425 1426 svma.prot = prot; 1427 svma.offset = segoff; 1428 svma.vmo = vmo; 1429 svma.vmc = NULL; 1430 1431 err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma); 1432 } 1433 1434 as_rangeunlock(as); 1435 return (err); 1436 } 1437 1438 int 1439 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp, 1440 off_t len, uint_t prot, uint_t maxprot, uint_t flags) 1441 { 1442 1443 const uintptr_t gpa = (uintptr_t)off; 1444 const size_t size = (uintptr_t)len; 1445 int err; 1446 1447 if (off < 0 || len <= 0 || 1448 (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { 1449 return (EINVAL); 1450 } 1451 if ((prot & PROT_USER) == 0) { 1452 return (ENOTSUP); 1453 } 1454 1455 as_rangelock(as); 1456 1457 err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); 1458 if (err == 0) { 1459 segvmm_crargs_t svma; 1460 1461 svma.prot = prot; 1462 svma.offset = gpa; 1463 svma.vmo = NULL; 1464 svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm)); 1465 1466 err = as_map(as, *addrp, len, segvmm_create, &svma); 1467 } 1468 1469 as_rangeunlock(as); 1470 return (err); 1471 } 1472