/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * VMM Virtual Memory
 *
 * History
 *
 * When bhyve was ported to illumos, one significant hole was handling guest
 * memory and memory accesses.  In the original Pluribus port, bhyve itself
 * manually handled the EPT structures for guest memory.  The updated sources
 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
 * system for memory allocations and management of the EPT structures.
 * Keeping source differences to a minimum was a priority, so illumos-bhyve
 * implemented a makeshift "VM shim" which exposed the bare minimum of those
 * interfaces to boot and run guests.
 *
 * While the VM shim was successful in getting illumos-bhyve to a functional
 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
 * compatibility interfaces made it awkward to use.  As source differences
 * with the upstream kernel code became less of a concern, and upcoming
 * features (such as live migration) would demand more of those VM interfaces,
 * it became clear that an overhaul was prudent.
 *
 * Design
 *
 * The new VM system for bhyve retains a number of the same concepts as what
 * it replaces:
 *
 * - `vmspace_t` is the top-level entity for a guest memory space
 * - `vm_object_t` represents a memory object which can be mapped into a
 *   vmspace
 * - `vm_page_t` represents a page hold within a given vmspace, providing
 *   access to the underlying memory page
 *
 * Unlike the old code, where most of the involved structures were exposed via
 * public definitions, this replacement VM interface keeps all involved
 * structures opaque to consumers.  Furthermore, there is a clear delineation
 * between infrequent administrative operations (such as mapping/unmapping
 * regions) and common data-path operations (attempting a page hold at a given
 * guest-physical address).  Those administrative operations are performed
 * directly against the vmspace, whereas the data-path operations are
 * performed through a `vm_client_t` handle.  That VM client abstraction is
 * meant to reduce contention and overhead for frequent access operations and
 * provide debugging insight into how different subcomponents are accessing
 * the vmspace.  A VM client is allocated for each vCPU, each viona ring (via
 * the vmm_drv interface) and each VMM userspace segment mapping.
 *
 * Exclusion
 *
 * Making changes to the vmspace (such as mapping or unmapping regions)
 * requires other accessors be excluded while the change is underway to
 * prevent them from observing invalid intermediate states.  A simple approach
 * could use a mutex or rwlock to achieve this, but that risks contention when
 * the rate of access to the vmspace is high.
 *
 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
 * on a per-vm_client_t basis.
 * While this raises the cost for vmspace changes, it means that the much more
 * common page accesses through the vm_client can normally proceed unimpeded
 * and independently.
 *
 * When a change to the vmspace is required, the caller will put the vmspace
 * in a 'hold' state, iterating over all associated vm_client instances,
 * waiting for them to complete any in-flight lookup (indicated by VCS_ACTIVE)
 * before setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any
 * call on the vm_client which would access the vmspace state (vmc_hold or
 * vmc_fault) will block until the hold condition is cleared.  Once the hold
 * is asserted for all clients, the vmspace change can proceed with
 * confidence.  Upon completion of that operation, VCS_HOLD is cleared from
 * the clients, and they are released to resume vmspace accesses.
 *
 * vCPU Consumers
 *
 * Access to the vmspace for vCPUs running in guest context is different from
 * emulation-related vm_client activity: they solely rely on the contents of
 * the page tables.  Furthermore, the existing VCS_HOLD mechanism used to
 * exclude client access is not feasible when entering guest context, since
 * interrupts are disabled, making it impossible to block entry.  This is not
 * a concern as long as vmspace modifications never place the page tables in
 * invalid states (either intermediate, or final).  The vm_client hold
 * mechanism does provide the means to IPI vCPU consumers which will trigger a
 * notification once they report their exit from guest context.  This can be
 * used to ensure that page table modifications are made visible to those
 * vCPUs within a certain time frame.
 */

typedef struct vmspace_mapping {
	list_node_t	vmsm_node;
	vm_object_t	*vmsm_object;	/* object backing this mapping */
	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
	size_t		vmsm_len;	/* length (in bytes) of mapping */
	off_t		vmsm_offset;	/* byte offset into object */
	uint_t		vmsm_prot;
} vmspace_mapping_t;

#define	VMSM_OFFSET(vmsm, addr)	(			\
	    (vmsm)->vmsm_offset +			\
	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))

typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;

struct vmspace {
	kmutex_t	vms_lock;
	kcondvar_t	vms_cv;
	bool		vms_held;
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;
	uint64_t	vms_pages_mapped;
	bool		vms_track_dirty;

	list_t		vms_maplist;
	list_t		vms_clients;
};

struct vm_client {
	vmspace_t	*vmc_space;
	list_node_t	vmc_node;

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;
	vm_client_state_t vmc_state;
	int		vmc_cpu_active;
	uint64_t	vmc_cpu_gen;
	bool		vmc_track_dirty;
	vmc_inval_cb_t	vmc_inval_func;
	void		*vmc_inval_data;

	list_t		vmc_held_pages;
};

typedef enum vm_object_type {
	VMOT_NONE,
	VMOT_MEM,
	VMOT_MMIO,
} vm_object_type_t;

struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;
	void		*vmo_data;
	uint8_t		vmo_attr;
};

struct vm_page {
	vm_client_t	*vmp_client;
	list_node_t	vmp_node;
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;
	pfn_t		vmp_pfn;
	uint64_t	*vmp_ptep;
	vm_object_t	*vmp_obj_ref;
	int		vmp_prot;
};

static vmspace_mapping_t
    *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);

/*
 * Create a new vmspace with a maximum address of `end`.
 */
vmspace_t *
vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
{
	vmspace_t *vms;
	const uintptr_t size = end + 1;

	/*
	 * This whole mess is built on the assumption that a 64-bit address
	 * space is available to work with for the various pagetable tricks.
	 */
	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
	    size <= (uintptr_t)USERLIMIT);

	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
	vms->vms_size = size;
	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
	    offsetof(vmspace_mapping_t, vmsm_node));
	list_create(&vms->vms_clients, sizeof (vm_client_t),
	    offsetof(vm_client_t, vmc_node));

	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
	vms->vms_pt_gen = 1;
	vms->vms_track_dirty = track_dirty;

	return (vms);
}

/*
 * Destroy a vmspace.  All regions in the space must be unmapped.  Any
 * remaining clients will be orphaned.
 */
void
vmspace_destroy(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(list_is_empty(&vms->vms_maplist));

	if (!list_is_empty(&vms->vms_clients)) {
		vm_client_t *vmc = list_head(&vms->vms_clients);
		while (vmc != NULL) {
			vmc = vmc_space_orphan(vmc, vms);
		}
		/*
		 * Wait for any clients which were in the process of destroying
		 * themselves to disappear.
		 */
		while (!list_is_empty(&vms->vms_clients)) {
			cv_wait(&vms->vms_cv, &vms->vms_lock);
		}
	}
	VERIFY(list_is_empty(&vms->vms_clients));

	vmm_gpt_free(vms->vms_gpt);
	mutex_exit(&vms->vms_lock);

	mutex_destroy(&vms->vms_lock);
	cv_destroy(&vms->vms_cv);
	list_destroy(&vms->vms_maplist);
	list_destroy(&vms->vms_clients);

	kmem_free(vms, sizeof (*vms));
}

/*
 * Retrieve the count of resident (mapped into the page tables) pages.
 */
uint64_t
vmspace_resident_count(vmspace_t *vms)
{
	return (vms->vms_pages_mapped);
}

int
vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	if (!vms->vms_track_dirty)
		return (EPERM);

	/*
	 * Accumulate dirty bits into the given bit vector.  Note that this
	 * races both against hardware writes from running vCPUs and
	 * reflections from userspace.
	 *
	 * Called from a userspace-visible ioctl, this depends on the VM
	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
	 * operations from changing the page tables during the walk.
	 */
	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
		bool bit = false;
		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
		if (entry != NULL)
			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
		uint64_t pfn_offset = offset >> PAGESHIFT;
		size_t bit_offset = pfn_offset / 8;
		size_t bit_index = pfn_offset % 8;
		bitmap[bit_offset] |= (bit << bit_index);
	}

	/*
	 * Now invalidate those bits and shoot down address spaces that
	 * may have them cached.
	 */
	vmspace_hold_enter(vms);
	vms->vms_pt_gen++;
	for (vm_client_t *vmc = list_head(&vms->vms_clients);
	    vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
	}
	vmspace_hold_exit(vms, true);

	return (0);
}

static pfn_t
vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
{
	vmmr_region_t *region;
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);

	region = vmo->vmo_data;
	pfn = vmmr_region_pfn_at(region, off);

	return (pfn);
}

static pfn_t
vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
{
	pfn_t pfn;

	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
	ASSERT3P(vmo->vmo_data, !=, NULL);
	ASSERT3U(off, <, vmo->vmo_size);

	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;

	return (pfn);
}

/*
 * Allocate a VM object backed by VMM reservoir memory.
 */
vm_object_t *
vm_object_mem_allocate(size_t size, bool transient)
{
	int err;
	vmmr_region_t *region = NULL;
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);

	err = vmmr_alloc(size, transient, &region);
	if (err != 0) {
		return (NULL);
	}

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MEM;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_WB;
	vmo->vmo_data = region;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

static vm_object_t *
vm_object_mmio_allocate(size_t size, uintptr_t hpa)
{
	vm_object_t *vmo;

	ASSERT3U(size, !=, 0);
	ASSERT3U(size & PAGEOFFSET, ==, 0);
	ASSERT3U(hpa & PAGEOFFSET, ==, 0);

	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);

	/* For now, these are to stay fixed after allocation */
	vmo->vmo_type = VMOT_MMIO;
	vmo->vmo_size = size;
	vmo->vmo_attr = MTRR_TYPE_UC;
	vmo->vmo_data = (void *)hpa;
	vmo->vmo_refcnt = 1;

	return (vmo);
}

/*
 * Allocate a VM object backed by an existing range of physical memory.
 */
vm_object_t *
vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
{
	int error;
	vm_object_t *obj;

	obj = vm_object_mmio_allocate(len, hpa);
	if (obj != NULL) {
		error = vmspace_map(vmspace, obj, 0, gpa, len,
		    PROT_READ | PROT_WRITE);
		if (error != 0) {
			vm_object_release(obj);
			obj = NULL;
		}
	}

	return (obj);
}

/*
 * Release a vm_object reference
 */
void
vm_object_release(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
	/* underflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, UINT_MAX);
	if (ref != 0) {
		return;
	}

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		vmmr_free((vmmr_region_t *)vmo->vmo_data);
		break;
	case VMOT_MMIO:
		break;
	default:
		panic("unexpected object type %u", vmo->vmo_type);
		break;
	}

	vmo->vmo_data = NULL;
	vmo->vmo_size = 0;
	kmem_free(vmo, sizeof (*vmo));
}

/*
 * Increase refcount for vm_object reference
 */
void
vm_object_reference(vm_object_t *vmo)
{
	ASSERT(vmo != NULL);

	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
	/* overflow would be a deadly serious mistake */
	VERIFY3U(ref, !=, 0);
}

/*
 * Get the host-physical PFN for a given offset into a vm_object.
 *
 * The provided `off` must be within the allocated size of the vm_object.
 */
pfn_t
vm_object_pfn(vm_object_t *vmo, uintptr_t off)
{
	const uintptr_t aligned_off = off & PAGEMASK;

	switch (vmo->vmo_type) {
	case VMOT_MEM:
		return (vm_object_pager_reservoir(vmo, aligned_off));
	case VMOT_MMIO:
		return (vm_object_pager_mmio(vmo, aligned_off));
	case VMOT_NONE:
		break;
	}
	panic("unexpected object type %u", vmo->vmo_type);
}

static vmspace_mapping_t *
vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size;

	ASSERT3U(addr, <=, range_end);

	if (addr >= vms->vms_size) {
		return (NULL);
	}
	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;

		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
			if (range_end <= seg_end) {
				return (vmsm);
			} else {
				return (NULL);
			}
		}
	}
	return (NULL);
}

/*
 * Check to see if any mappings reside within the [addr, addr + size) span in
 * the vmspace, returning true if that span is indeed empty.
 */
static bool
vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
{
	vmspace_mapping_t *vmsm;
	list_t *ml = &vms->vms_maplist;
	const uintptr_t range_end = addr + size - 1;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(size > 0);

	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;

		/*
		 * The two ranges do not overlap if the start of either of
		 * them is after the end of the other.
		 */
		if (vmsm->vmsm_addr > range_end || addr > seg_end)
			continue;
		return (false);
	}
	return (true);
}

static void
vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
{
	list_t *ml = &vms->vms_maplist;

	ASSERT(MUTEX_HELD(&vms->vms_lock));
	ASSERT(vms->vms_held);

	list_remove(ml, vmsm);
	vm_object_release(vmsm->vmsm_object);
	kmem_free(vmsm, sizeof (*vmsm));
}

/*
 * Enter a hold state on the vmspace.  This ensures that all VM clients
 * associated with the vmspace are excluded from establishing new page holds,
 * or any other actions which would require accessing vmspace state subject to
 * potential change.
 *
 * Returns with vmspace_t`vms_lock held.
 */
static void
vmspace_hold_enter(vmspace_t *vms)
{
	mutex_enter(&vms->vms_lock);
	VERIFY(!vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_hold(vmc);
	}
	vms->vms_held = true;
}

/*
 * Exit a hold state on the vmspace.  This releases all VM clients associated
 * with the vmspace, allowing them to establish new page holds and partake in
 * other actions which require accessing changed vmspace state.  If
 * `kick_on_cpu` is true, then any CPUs actively using the page tables will be
 * IPIed, and the call will block until they have acknowledged being ready to
 * use the latest state of the tables.
 *
 * Requires vmspace_t`vms_lock be held, which is released as part of the call.
 */
static void
vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
{
	ASSERT(MUTEX_HELD(&vms->vms_lock));
	VERIFY(vms->vms_held);

	vm_client_t *vmc = list_head(&vms->vms_clients);
	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_release(vmc, kick_on_cpu);
	}
	vms->vms_held = false;
	mutex_exit(&vms->vms_lock);
}

/*
 * Attempt to map a vm_object span into the vmspace.
 *
 * Requirements:
 * - `obj_off`, `addr`, and `len` must be page-aligned
 * - `obj_off` cannot be greater than the allocated size of the object
 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
 *   size of the object
 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
 *   of the vmspace
 */
int
vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off,
    uintptr_t addr, size_t len, uint8_t prot)
{
	vmspace_mapping_t *vmsm;
	int res = 0;

	if (len == 0 || (addr + len) < addr ||
	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
		return (EINVAL);
	}
	if ((addr + len) >= vms->vms_size) {
		return (ENOMEM);
	}

	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);

	vmspace_hold_enter(vms);
	if (!vm_mapping_gap(vms, addr, len)) {
		kmem_free(vmsm, sizeof (*vmsm));
		res = ENOMEM;
	} else {
		vmsm->vmsm_object = vmo;
		vmsm->vmsm_addr = addr;
		vmsm->vmsm_len = len;
		vmsm->vmsm_offset = (off_t)obj_off;
		vmsm->vmsm_prot = prot;
		list_insert_tail(&vms->vms_maplist, vmsm);

		/*
		 * Make sure the GPT has tables ready for leaf entries across
		 * the entire new mapping.
		 */
		vmm_gpt_populate_region(vms->vms_gpt, addr, addr + len);
	}
	vmspace_hold_exit(vms, false);
	return (res);
}

/*
 * Unmap a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_unmap(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = (size_t)(end - start);
	vmspace_mapping_t *vmsm;
	vm_client_t *vmc;
	uint64_t gen = 0;

	ASSERT(start < end);

	vmspace_hold_enter(vms);
	/* expect to match existing mapping exactly */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL ||
	    vmsm->vmsm_addr != start || vmsm->vmsm_len != size) {
		vmspace_hold_exit(vms, false);
		return (ENOENT);
	}

	/* Prepare clients (and their held pages) for the unmap. */
	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
	    vmc = list_next(&vms->vms_clients, vmc)) {
		vmc_space_unmap(vmc, start, size, vmsm->vmsm_object);
	}

	/* Clear all PTEs for region */
	if (vmm_gpt_unmap_region(vms->vms_gpt, start, end) != 0) {
		vms->vms_pt_gen++;
		gen = vms->vms_pt_gen;
	}
	/* ... and the intermediate (directory) PTEs as well */
	vmm_gpt_vacate_region(vms->vms_gpt, start, end);

	/*
	 * If pages were actually unmapped from the GPT, provide clients with
	 * an invalidation notice.
	 */
	if (gen != 0) {
		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
		    vmc = list_next(&vms->vms_clients, vmc)) {
			vmc_space_invalidate(vmc, start, size,
			    vms->vms_pt_gen);
		}
	}

	vm_mapping_remove(vms, vmsm);
	vmspace_hold_exit(vms, true);
	return (0);
}

static int
vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
    uint64_t **ptepp)
{
	vmm_gpt_t *gpt = vms->vms_gpt;
	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
	pfn_t pfn = PFN_INVALID;
	uint_t prot;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	leaf = entries[LEVEL1];
	if (leaf == NULL) {
		/*
		 * Since we populated the intermediate tables for any regions
		 * mapped in the GPT, an empty leaf entry indicates there is no
		 * mapping, populated or not, at this GPA.
		 */
		return (FC_NOMAP);
	}

	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
		if ((req_prot & prot) != req_prot) {
			return (FC_PROT);
		}
	} else {
		vmspace_mapping_t *vmsm;
		vm_object_t *vmo;

		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
		if (vmsm == NULL) {
			return (FC_NOMAP);
		}

		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
			return (FC_PROT);
		}
		vmo = vmsm->vmsm_object;
		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
		    vmo->vmo_attr)) {
			atomic_inc_64(&vms->vms_pages_mapped);
		}
	}

	ASSERT(pfn != PFN_INVALID && leaf != NULL);
	if (pfnp != NULL) {
		*pfnp = pfn;
	}
	if (ptepp != NULL) {
		*ptepp = leaf;
	}
	return (0);
}

/*
 * Populate (make resident in the page tables) a region of the vmspace.
 *
 * Presently the [start, end) span must equal a region previously mapped by a
 * call to vmspace_map().
 */
int
vmspace_populate(vmspace_t *vms, uintptr_t start, uintptr_t end)
{
	const size_t size = end - start;
	vmspace_mapping_t *vmsm;

	mutex_enter(&vms->vms_lock);

	/* For the time being, only exact-match mappings are expected */
	if ((vmsm = vm_mapping_find(vms, start, size)) == NULL) {
		mutex_exit(&vms->vms_lock);
		return (FC_NOMAP);
	}

	vm_object_t *vmo = vmsm->vmsm_object;
	const int prot = vmsm->vmsm_prot;
	const uint8_t attr = vmo->vmo_attr;
	size_t populated = 0;
	for (uintptr_t gpa = start & PAGEMASK; gpa < end; gpa += PAGESIZE) {
		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
		VERIFY(pfn != PFN_INVALID);

		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
			populated++;
		}
	}
	atomic_add_64(&vms->vms_pages_mapped, populated);

	mutex_exit(&vms->vms_lock);
	return (0);
}

/*
 * Allocate a client from a given vmspace.
 */
vm_client_t *
vmspace_client_alloc(vmspace_t *vms)
{
	vm_client_t *vmc;

	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
	vmc->vmc_space = vms;
	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
	vmc->vmc_state = VCS_IDLE;
	vmc->vmc_cpu_active = -1;
	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
	    offsetof(vm_page_t, vmp_node));
	vmc->vmc_track_dirty = vms->vms_track_dirty;

	mutex_enter(&vms->vms_lock);
	list_insert_tail(&vms->vms_clients, vmc);
	mutex_exit(&vms->vms_lock);

	return (vmc);
}

/*
 * Get the nested page table root pointer (EPTP/NCR3) value.
 */
uint64_t
vmspace_table_root(vmspace_t *vms)
{
	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
}

/*
 * Get the current generation number of the nested page table.
 */
uint64_t
vmspace_table_gen(vmspace_t *vms)
{
	return (vms->vms_pt_gen);
}

/*
 * Mark a vm_client as active.  This will block if/while the client is held by
 * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It
 * will fail if the vm_client has been orphaned.
 */
static int
vmc_activate(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
		mutex_exit(&vmc->vmc_lock);
		return (ENXIO);
	}
	while ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	vmc->vmc_state |= VCS_ACTIVE;
	return (0);
}

/*
 * Mark a vm_client as no longer active.  It must be called with
 * vm_client_t`vmc_lock already held, and will return with it released.
 */
static void
vmc_deactivate(vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
	VERIFY(vmc->vmc_state & VCS_ACTIVE);

	vmc->vmc_state ^= VCS_ACTIVE;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}
	mutex_exit(&vmc->vmc_lock);
}

/*
 * Indicate that a CPU will be utilizing the nested page tables through this
 * VM client.  Interrupts (and/or the GIF) are expected to be disabled when
 * calling this function.  Returns the generation number of the nested page
 * table (to be used for TLB invalidations).
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}

/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
    uint64_t gen)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);
	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
		/*
		 * Wait for clients using an old generation of the page tables
		 * to exit guest context, where they subsequently flush the
		 * TLB for the new generation.
		 */
		if (vmc->vmc_cpu_gen < gen) {
			poke_cpu(vmc->vmc_cpu_active);

			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
			}
		}
	}
	if (vmc->vmc_inval_func != NULL) {
		vmc_inval_cb_t func = vmc->vmc_inval_func;
		void *data = vmc->vmc_inval_data;

		/*
		 * Perform the actual invalidation call outside vmc_lock to
		 * avoid lock ordering issues in the consumer.  Since the
		 * client is under VCS_HOLD, this is safe.
		 */
		mutex_exit(&vmc->vmc_lock);
		func(data, addr, size);
		mutex_enter(&vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}

static void
vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
    vm_object_t *vmo)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	/*
	 * With the current vCPU exclusion invariants in place, we do not
	 * expect a vCPU to be in guest context during an unmap.
	 */
	VERIFY0(vmc->vmc_state & VCS_ON_CPU);

	/*
	 * Any holds against the unmapped region need to establish their own
	 * reference to the underlying object to avoid a potential
	 * use-after-free.
	 */
	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
	    vmp != NULL;
	    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
		if (vmp->vmp_gpa < addr || vmp->vmp_gpa >= (addr + size)) {
			/* Hold outside region in question */
			continue;
		}
		if (vmp->vmp_obj_ref == NULL) {
			vm_object_reference(vmo);
			vmp->vmp_obj_ref = vmo;
			/* For an unmapped region, PTE is now meaningless */
			vmp->vmp_ptep = NULL;
		} else {
			/*
			 * Object could have gone through cycle of
			 * unmap-map-unmap before the hold was released.
			 */
			VERIFY3P(vmp->vmp_ptep, ==, NULL);
		}
	}
	mutex_exit(&vmc->vmc_lock);
}

static vm_client_t *
vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
{
	vm_client_t *next;

	ASSERT(MUTEX_HELD(&vms->vms_lock));

	mutex_enter(&vmc->vmc_lock);
	VERIFY3P(vmc->vmc_space, ==, vms);
	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
	if (vmc->vmc_state & VCS_DESTROY) {
		/*
		 * This vm_client is currently undergoing destruction, so it
		 * does not need to be orphaned.  Let it proceed with its own
		 * clean-up task.
		 */
		next = list_next(&vms->vms_clients, vmc);
	} else {
		/*
		 * Clients are only orphaned when the containing vmspace is
		 * being torn down.  All mappings from the vmspace should
		 * already be gone, meaning any remaining held pages should
		 * have direct references to the object.
		 */
		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
		    vmp != NULL;
		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
			ASSERT3P(vmp->vmp_ptep, ==, NULL);
			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
		}

		/*
		 * After this point, the client will be orphaned, unable to
		 * establish new page holds (or access any vmspace-related
		 * resources) and is in charge of cleaning up after itself.
		 */
		vmc->vmc_state |= VCS_ORPHANED;
		next = list_next(&vms->vms_clients, vmc);
		list_remove(&vms->vms_clients, vmc);
		vmc->vmc_space = NULL;
	}
	mutex_exit(&vmc->vmc_lock);
	return (next);
}

/*
 * Attempt to hold a page at `gpa` inside the referenced vmspace.
 */
vm_page_t *
vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	vm_page_t *vmp;
	pfn_t pfn = PFN_INVALID;
	uint64_t *ptep = NULL;

	ASSERT0(gpa & PAGEOFFSET);
	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);

	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
	if (vmc_activate(vmc) != 0) {
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}

	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
		vmc_deactivate(vmc);
		kmem_free(vmp, sizeof (*vmp));
		return (NULL);
	}
	ASSERT(pfn != PFN_INVALID && ptep != NULL);

	vmp->vmp_client = vmc;
	vmp->vmp_chain = NULL;
	vmp->vmp_gpa = gpa;
	vmp->vmp_pfn = pfn;
	vmp->vmp_ptep = ptep;
	vmp->vmp_obj_ref = NULL;
	vmp->vmp_prot = prot;
	list_insert_tail(&vmc->vmc_held_pages, vmp);
	vmc_deactivate(vmc);

	return (vmp);
}

/*
 * Perform a page-table lookup (and, if necessary, mapping) for `gpa` with the
 * requested protection, without establishing a page hold.
 */
int
vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
{
	vmspace_t *vms = vmc->vmc_space;
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot,
		    NULL, NULL);
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Allocate an additional vm_client_t, based on an existing one.
 * Only the association with the vmspace is cloned, not existing holds or any
 * configured invalidation function.
 */
vm_client_t *
vmc_clone(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;

	return (vmspace_client_alloc(vms));
}

/*
 * Register a function (and associated data pointer) to be called when an
 * address range in the vmspace is invalidated.
 */
int
vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
{
	int err;

	err = vmc_activate(vmc);
	if (err == 0) {
		vmc->vmc_inval_func = func;
		vmc->vmc_inval_data = data;
		vmc_deactivate(vmc);
	}

	return (err);
}

/*
 * Destroy a vm_client_t instance.
 *
 * No pages held through this vm_client_t may be outstanding when performing a
 * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (a call to
 * vmc_table_exit() has been made).
 */
void
vmc_destroy(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	VERIFY(list_is_empty(&vmc->vmc_held_pages));
	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));

	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
		vmspace_t *vms;

		/*
		 * Deassociation with the parent vmspace must be done
		 * carefully: the vmspace could attempt to orphan this
		 * vm_client while we release vmc_lock in order to take
		 * vms_lock (the required order).  The client is marked to
		 * indicate that destruction is under way.  Doing so prevents
		 * any racing orphan operation from applying to this client,
		 * allowing us to deassociate from the vmspace safely.
		 */
		vmc->vmc_state |= VCS_DESTROY;
		vms = vmc->vmc_space;
		mutex_exit(&vmc->vmc_lock);

		mutex_enter(&vms->vms_lock);
		mutex_enter(&vmc->vmc_lock);
		list_remove(&vms->vms_clients, vmc);
		/*
		 * If the vmspace began its own destruction operation while we
		 * were navigating the locks, be sure to notify it about this
		 * vm_client being deassociated.
		 */
		cv_signal(&vms->vms_cv);
		mutex_exit(&vmc->vmc_lock);
		mutex_exit(&vms->vms_lock);
	} else {
		VERIFY3P(vmc->vmc_space, ==, NULL);
		mutex_exit(&vmc->vmc_lock);
	}

	mutex_destroy(&vmc->vmc_lock);
	cv_destroy(&vmc->vmc_cv);
	list_destroy(&vmc->vmc_held_pages);

	kmem_free(vmc, sizeof (*vmc));
}

static __inline void *
vmp_ptr(const vm_page_t *vmp)
{
	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);

	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
	return ((void *)((uintptr_t)kpm_vbase + paddr));
}

/*
 * Get a readable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
const void *
vmp_get_readable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_READ);

	return (vmp_ptr(vmp));
}

/*
 * Get a writable kernel-virtual pointer for a held page.
 *
 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
 * call to acquire this page reference.
 */
void *
vmp_get_writable(const vm_page_t *vmp)
{
	ASSERT(vmp->vmp_prot & PROT_WRITE);

	return (vmp_ptr(vmp));
}

/*
 * Get the host-physical PFN for a held page.
 */
pfn_t
vmp_get_pfn(const vm_page_t *vmp)
{
	return (vmp->vmp_pfn);
}

/*
 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
 */
void
vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
{
	ASSERT3P(vmp->vmp_chain, ==, NULL);

	vmp->vmp_chain = to_chain;
}

/*
 * Retrieve the pointer from the page-chaining in `vmp`.
 */
vm_page_t *
vmp_next(const vm_page_t *vmp)
{
	return (vmp->vmp_chain);
}

static __inline bool
vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
{
	ASSERT(MUTEX_HELD(&vmc->vmc_lock));

	bool was_unmapped = false;

	list_remove(&vmc->vmc_held_pages, vmp);
	if (vmp->vmp_obj_ref != NULL) {
		ASSERT3P(vmp->vmp_ptep, ==, NULL);

		vm_object_release(vmp->vmp_obj_ref);
		was_unmapped = true;
	} else {
		ASSERT3P(vmp->vmp_ptep, !=, NULL);

		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
		    vmc->vmc_track_dirty) {
			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
		}
	}
	kmem_free(vmp, sizeof (*vmp));
	return (was_unmapped);
}

/*
 * Release a held page.  Returns true if the page resided on a region which
 * was subsequently unmapped.
 */
bool
vmp_release(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;

	VERIFY(vmc != NULL);

	mutex_enter(&vmc->vmc_lock);
	const bool was_unmapped = vmp_release_inner(vmp, vmc);
	mutex_exit(&vmc->vmc_lock);
	return (was_unmapped);
}

/*
 * Release a chain of pages which were associated via vmp_chain() (setting the
 * page-chaining pointer).  Returns true if any pages resided upon a region
 * which was subsequently unmapped.
 *
 * All of those pages must have been held through the same vm_client_t.
 */
bool
vmp_release_chain(vm_page_t *vmp)
{
	vm_client_t *vmc = vmp->vmp_client;
	bool any_unmapped = false;

	ASSERT(vmp != NULL);
	mutex_enter(&vmc->vmc_lock);
	while (vmp != NULL) {
		vm_page_t *next = vmp->vmp_chain;

		/* We expect all pages in chain to be from same client */
		ASSERT3P(vmp->vmp_client, ==, vmc);

		if (vmp_release_inner(vmp, vmc)) {
			any_unmapped = true;
		}
		vmp = next;
	}
	mutex_exit(&vmc->vmc_lock);
	return (any_unmapped);
}

/*
 * Map (a portion of) a VM memory segment into a userspace address space via
 * the segvmm segment driver.
 */
int
vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
    struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
{
	vm_object_t *vmo;
	int err;

	if (segoff < 0 || len <= 0 ||
	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}
	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
	if (err != 0) {
		return (err);
	}

	VERIFY(segoff >= 0);
	VERIFY(len <= vmo->vmo_size);
	VERIFY((len + segoff) <= vmo->vmo_size);

	if (vmo->vmo_type != VMOT_MEM) {
		/* Only support memory objects for now */
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = segoff;
		svma.vmo = vmo;
		svma.vmc = NULL;

		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}

/*
 * Map a span of guest-physical address space into a userspace address space
 * via the segvmm segment driver, backed by a dedicated VM client.
 */
int
vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
    off_t len, uint_t prot, uint_t maxprot, uint_t flags)
{
	const uintptr_t gpa = (uintptr_t)off;
	const size_t size = (uintptr_t)len;
	int err;

	if (off < 0 || len <= 0 ||
	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (EINVAL);
	}
	if ((prot & PROT_USER) == 0) {
		return (ENOTSUP);
	}

	as_rangelock(as);

	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
	if (err == 0) {
		segvmm_crargs_t svma;

		svma.prot = prot;
		svma.offset = gpa;
		svma.vmo = NULL;
		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));

		err = as_map(as, *addrp, len, segvmm_create, &svma);
	}

	as_rangeunlock(as);
	return (err);
}