/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/thread.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/vmsystm.h>
#include <sys/x86_archext.h>
#include <vm/as.h>
#include <vm/hat_i86.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/vmm_vm.h>
#include <sys/seg_vmm.h>
#include <sys/vmm_kernel.h>
#include <sys/vmm_reservoir.h>
#include <sys/vmm_gpt.h>


/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses. In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory. The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures. Keeping
 * source differences to a minimum was a priority, so illumos-bhyve implemented
 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
 * boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use. As source differences with
 * the upstream kernel code became less of a concern, and upcoming features
 * (such as live migration) would demand more of those VM interfaces, it became
 * clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what it
 * replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a
 *   vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing
 *   access to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers. Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address). Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are performed
 * through a `vm_client_t` handle. That VM client abstraction is meant to
 * reduce contention and overhead for frequent access operations and provide
 * debugging insight into how different subcomponents are accessing the
 * vmspace.
 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
 * interface) and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions) requires
 * other accessors be excluded while the change is underway to prevent them from
 * observing invalid intermediate states. A simple approach could use a mutex
 * or rwlock to achieve this, but that risks contention when the rate of access
 * to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * on a per-vm_client_t basis. While this raises the cost for vmspace changes,
 * it means that the much more common page accesses through the vm_client can
 * normally proceed unimpeded and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace in
 * a 'hold' state, iterating over all associated vm_client instances, waiting
 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on
 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
 * will block until the hold condition is cleared. Once the hold is asserted
 * for all clients, the vmspace change can proceed with confidence. Upon
 * completion of that operation, VCS_HOLD is cleared from the clients, and they
 * are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: they solely rely on the contents of the
 * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude
 * client access is not feasible when entering guest context, since interrupts
 * are disabled, making it impossible to block entry. This is not a concern as
 * long as vmspace modifications never place the page tables in invalid states
 * (either intermediate, or final). The vm_client hold mechanism does provide
 * the means to IPI vCPU consumers which will trigger a notification once they
 * report their exit from guest context. This can be used to ensure that page
 * table modifications are made visible to those vCPUs within a certain
 * time frame.
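 *
 * Typical Data-Path Usage
 *
 * A minimal sketch of how an emulation-side consumer would use these
 * interfaces; error handling is elided and `vms`, `gpa`, and the surrounding
 * context are stand-ins supplied by the consumer:
 *
 *	vm_client_t *vmc = vmspace_client_alloc(vms);
 *
 *	vm_page_t *vmp = vmc_hold(vmc, gpa, PROT_READ | PROT_WRITE);
 *	if (vmp != NULL) {
 *		void *va = vmp_get_writable(vmp);
 *		... perform the emulated access through `va` ...
 *		(void) vmp_release(vmp);
 *	}
 *
 *	vmc_destroy(vmc);
 *
 * vCPU consumers instead bracket guest execution with vmc_table_enter() and
 * vmc_table_exit(), using the returned generation number to decide when TLB
 * invalidation is required.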
 */

typedef struct vmspace_mapping {
	list_node_t	vmsm_node;
	vm_object_t	*vmsm_object;	/* object backing this mapping */
	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
	size_t		vmsm_len;	/* length (in bytes) of mapping */
	off_t		vmsm_offset;	/* byte offset into object */
	uint_t		vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr)	( \
	    (vmsm)->vmsm_offset + \
	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t	vms_lock;
	kcondvar_t	vms_cv;
	bool		vms_held;
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;
	uint64_t	vms_pages_mapped;
	bool		vms_track_dirty;

	list_t		vms_maplist;
	list_t		vms_clients;
};

struct vm_client {
	vmspace_t	*vmc_space;
	list_node_t	vmc_node;

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;
	vm_client_state_t vmc_state;
	int		vmc_cpu_active;
	uint64_t	vmc_cpu_gen;
	bool		vmc_track_dirty;
	vmc_inval_cb_t	vmc_inval_func;
	void		*vmc_inval_data;

	list_t		vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;
	void		*vmo_data;
	uint8_t		vmo_attr;
};

struct vm_page {
	vm_client_t	*vmp_client;
	list_node_t	vmp_node;
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;
	pfn_t		vmp_pfn;
	uint64_t	*vmp_ptep;
	vm_object_t	*vmp_obj_ref;
	int		vmp_prot;
};

static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);


/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = track_dirty;

	return (vms);
}

/*
 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining
 * clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}

void
vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	/*
	 * Accumulate dirty bits into the given bit vector. Note that this
	 * races both against hardware writes from running vCPUs and
	 * reflections from userspace.
	 *
	 * Called from a userspace-visible ioctl, this depends on the VM
	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
	 * operations from changing the page tables during the walk.
	 */
	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
		bool bit = false;
		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
		if (entry != NULL)
			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
		uint64_t pfn_offset = offset >> PAGESHIFT;
		size_t bit_offset = pfn_offset / 8;
		size_t bit_index = pfn_offset % 8;
		bitmap[bit_offset] |= (bit << bit_index);
	}

	/*
	 * Now invalidate those bits and shoot down address spaces that
	 * may have them cached.
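	 *
	 * Clients currently on-CPU are IPIed (via vmc_space_invalidate() and
	 * the kick_on_cpu exit below) so they leave guest context and re-enter
	 * on the new page-table generation, rather than continuing to run on
	 * cached translations which still carry the old dirty state.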
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
	vmspace_hold_exit(vms, true);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
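 *
 * The resulting object is mapped read/write at [gpa, gpa + len) in the given
 * vmspace; if that mapping fails, the object is released and NULL is returned.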
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase refcount for vm_object reference
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within [addr, addr + size) span in the
 * vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
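		 * For example, with inclusive ends, [0x1000, 0x1fff] and
		 * [0x3000, 0x3fff] do not overlap (0x3000 > 0x1fff), while
		 * [0x1000, 0x2fff] and [0x2000, 0x3fff] do.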
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace. This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or any other actions which would require accessing vmspace state subject to
 * potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}

/*
 * Exit a hold state on the vmspace. This releases all VM clients associated
 * with the vmspace to be able to establish new page holds, and partake in other
 * actions which require accessing changed vmspace state. If `kick_on_cpu` is
 * true, then any CPUs actively using the page tables will be IPIed, and the
 * call will block until they have acknowledged being ready to use the latest
 * state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
    size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = (size_t)(end - start);
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT(start < end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, start, end);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
		    vmc = list_next(&vms->vms_clients, vmc)) {
			vmc_space_invalidate(vmc, start, size, vms->vms_pt_gen);
		}
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		vmspace_mapping_t *vmsm;
		vm_object_t *vmo;

		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
		if (vmsm == NULL) {
			return (FC_NOMAP);
		}

		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
			return (FC_PROT);
		}
		vmo = vmsm->vmsm_object;
		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
		    vmo->vmo_attr)) {
			atomic_inc_64(&vms->vms_pages_mapped);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = end - start;
	vmspace_mapping_t *vmsm;

	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	size_t populated = 0;
	for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active. This will block if/while the client is held by
 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will
 * fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active. It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this VM
 * client. Interrupts (and/or the GIF) are expected to be disabled when calling
 * this function. Returns the generation number of the nested page table (to be
 * used for TLB invalidations).
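 *
 * The caller is expected to pair this with a subsequent vmc_table_exit() call
 * once the vCPU has left guest context.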
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the TLB
		 * for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer. Since the client
		 * is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not expect
	 * a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr ||
		    vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned. Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down. All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should have
		 * direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
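 *
 * Returns NULL if the client has been orphaned or if no mapping with the
 * requested protection exists at `gpa`. A successfully held page must later
 * be released via vmp_release() (or vmp_release_chain()).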
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = prot;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one. Only the
 * association with the vmspace is cloned, not existing holds or any
 * configured invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done carefully:
		 * The vmspace could attempt to orphan this vm_client while we
		 * release vmc_lock in order to take vms_lock (the required
		 * order). The client is marked to indicate that destruction is
		 * under way. Doing so prevents any racing orphan operation
		 * from applying to this client, allowing us to deassociate from
		 * the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}

static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the pointer from the page-chaining in `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		if ((vmp->vmp_prot & PROT_WRITE) != 0 && vmc->vmc_track_dirty) {
			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release held page. Returns true if page resided on region which was
 * subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting
 * page-chaining pointer). Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);

	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}


int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}