/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/malloc.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/vmm_vm.h>
#include <sys/seg_vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_gpt.h>


/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses.  In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory.  The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures.  Keeping
 * source differences to a minimum was a priority, so illumos-bhyve implemented
 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
 * boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use.  As source differences with
 * the upstream kernel code became less of a concern, and upcoming features
 * (such as live migration) would demand more of those VM interfaces, it became
 * clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what it
 * replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a
 *   vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing
 *   access to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers.  Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address).  Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are performed
 * through a `vm_client_t` handle.  That VM client abstraction is meant to
 * reduce contention and overhead for frequent access operations and provide
 * debugging insight into how different subcomponents are accessing the
 * vmspace.
 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
 * interface) and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions)
 * requires that other accessors be excluded while the change is underway, so
 * they cannot observe invalid intermediate states.  A simple approach could
 * use a mutex or rwlock to achieve this, but that risks contention when the
 * rate of access to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * on a per-vm_client_t basis.  While this raises the cost for vmspace changes,
 * it means that the much more common page accesses through the vm_client can
 * normally proceed unimpeded and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace in
 * a 'hold' state, iterating over all associated vm_client instances, waiting
 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
 * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
 * will block until the hold condition is cleared.  Once the hold is asserted
 * for all clients, the vmspace change can proceed with confidence.  Upon
 * completion of that operation, VCS_HOLD is cleared from the clients, and they
 * are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: such vCPUs rely solely on the contents
 * of the page tables.  Furthermore, the existing VCS_HOLD mechanism used to
 * exclude client access is not feasible when entering guest context, since
 * interrupts are disabled, making it impossible to block entry.  This is not a
 * concern as long as vmspace modifications never place the page tables in
 * invalid states (either intermediate, or final).  The vm_client hold
 * mechanism does provide the means to IPI vCPU consumers, which triggers a
 * notification once they report their exit from guest context.  This can be
 * used to ensure that page table modifications are made visible to those
 * vCPUs within a certain time frame.
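 *
 * As an illustrative (non-normative) sketch of the common data path, an
 * emulation consumer holding and releasing a guest page through its client
 * might look roughly like the following, where `vmc` and `gpa` are assumed to
 * come from the caller:
 *
 *	vm_page_t *vmp;
 *
 *	vmp = vmc_hold(vmc, gpa & PAGEMASK, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		void *datap = vmp_get_writable(vmp);
 *
 *		... emulate the access against the page contents ...
 *
 *		(void) vmp_release(vmp);
 *	}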
 */

typedef struct vmspace_mapping {
	list_node_t	vmsm_node;
	vm_object_t	*vmsm_object;	/* object backing this mapping */
	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
	size_t		vmsm_len;	/* length (in bytes) of mapping */
	off_t		vmsm_offset;	/* byte offset into object */
	uint_t		vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr)	( \
	    (vmsm)->vmsm_offset + \
	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t	vms_lock;
	kcondvar_t	vms_cv;
	bool		vms_held;
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;
	uint64_t	vms_pages_mapped;
	bool		vms_track_dirty;

	list_t		vms_maplist;
	list_t		vms_clients;
};

struct vm_client {
	vmspace_t	*vmc_space;
	list_node_t	vmc_node;

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;
	vm_client_state_t vmc_state;
	int		vmc_cpu_active;
	uint64_t	vmc_cpu_gen;
	bool		vmc_track_dirty;
	vmc_inval_cb_t	vmc_inval_func;
	void		*vmc_inval_data;

	list_t		vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;
	void		*vmo_data;
	uint8_t		vmo_attr;
};

struct vm_page {
	vm_client_t	*vmp_client;
	list_node_t	vmp_node;
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;
	pfn_t		vmp_pfn;
	uint64_t	*vmp_ptep;
	vm_object_t	*vmp_obj_ref;
	int		vmp_prot;
};

static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);


/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = track_dirty;

	return (vms);
}

/*
 * Destroy a vmspace.  All regions in the space must be unmapped.  Any
 * remaining clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}

void
vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	/*
	 * Accumulate dirty bits into the given bit vector.  Note that this
	 * races both against hardware writes from running vCPUs and
	 * reflections from userspace.
	 *
	 * Called from a userspace-visible ioctl, this depends on the VM
	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
	 * operations from changing the page tables during the walk.
	 */
	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
		bool bit = false;
		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
		if (entry != NULL)
			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
		uint64_t pfn_offset = offset >> PAGESHIFT;
		size_t bit_offset = pfn_offset / 8;
		size_t bit_index = pfn_offset % 8;
		bitmap[bit_offset] |= (bit << bit_index);
	}

	/*
	 * Now invalidate those bits and shoot down address spaces that
	 * may have them cached.
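	 *
	 * Bumping vms_pt_gen and issuing vmc_space_invalidate() to each
	 * client pokes any vCPU still on-CPU with an older page table
	 * generation, so stale dirty bits are not left cached in its TLB.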
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
	vmspace_hold_exit(vms, true);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference.
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase the refcount of a vm_object.
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within the [addr, addr + size) span in
 * the vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace.  This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or performing any other actions which would require accessing vmspace state
 * subject to potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}

/*
 * Exit a hold state on the vmspace.  This releases all VM clients associated
 * with the vmspace, allowing them to establish new page holds and partake in
 * other actions which require accessing the changed vmspace state.  If
 * `kick_on_cpu` is true, then any CPUs actively using the page tables will be
 * IPIed, and the call will block until they have acknowledged being ready to
 * use the latest state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
    size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = (size_t)(end - start);
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT(start < end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, start, end);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
		    vmc = list_next(&vms->vms_clients, vmc)) {
			vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
		}
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		vmspace_mapping_t *vmsm;
		vm_object_t *vmo;

		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
		if (vmsm == NULL) {
			return (FC_NOMAP);
		}

		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
			return (FC_PROT);
		}
		vmo = vmsm->vmsm_object;
		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
		    vmo->vmo_attr)) {
			atomic_inc_64(&vms->vms_pages_mapped);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = end - start;
	vmspace_mapping_t *vmsm;

	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	size_t populated = 0;
	for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active.  This will block if/while the client is held by
 * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It
 * will fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active.  It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this
 * VM client.  Interrupts (and/or the GIF) are expected to be disabled when
 * calling this function.  Returns the generation number of the nested page
 * table (to be used for TLB invalidations).
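 *
 * A rough, illustrative sketch of the expected pairing around guest entry
 * (not taken from an actual caller; `vmc` and `last_gen` are assumed to be
 * provided by the vCPU run loop):
 *
 *	uint64_t gen = vmc_table_enter(vmc);	(interrupts disabled)
 *	... flush the guest TLB context if `gen` differs from `last_gen`,
 *	    then enter the guest ...
 *	... after guest exit, re-enable interrupts ...
 *	vmc_table_exit(vmc);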
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer.  Since the
		 * client is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not
	 * expect a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned.  Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down.  All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should
		 * have direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
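 *
 * Returns NULL if the client has been orphaned or if no mapping with at least
 * `prot` protection covers `gpa`.  A successful hold remains valid (and the
 * backing page remains accessible via the vmp_get_* accessors) until it is
 * released with vmp_release() or vmp_release_chain().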
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = prot;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one.  Only the
 * association with the vmspace is cloned, not existing holds or any configured
 * invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done
		 * carefully: the vmspace could attempt to orphan this
		 * vm_client while we release vmc_lock in order to take
		 * vms_lock (the required lock order).  The client is marked to
		 * indicate that destruction is under way, which prevents any
		 * racing orphan operation from applying to it and allows us to
		 * deassociate from the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}

static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the page-chaining pointer from `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) {
			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release a held page.  Returns true if the page resided on a region which was
 * subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting the
 * page-chaining pointer).  Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
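 *
 * An illustrative (non-normative) sketch of building and releasing such a
 * chain, where `vmc` and the page-aligned GPAs `first`/`last` are assumed to
 * come from the caller:
 *
 *	vm_page_t *head = NULL;
 *
 *	for (uintptr_t gpa = first; gpa < last; gpa += PAGESIZE) {
 *		vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ);
 *		if (vmp == NULL)
 *			break;
 *		vmp_chain(vmp, head);
 *		head = vmp;
 *	}
 *	if (head != NULL)
 *		(void) vmp_release_chain(head);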
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);

	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}


int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}