/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space.  But the two use different
 * and mutually incompatible page table formats: Intel uses the EPT, which is
 * based on the Itanium page table format, while AMD uses the nPT, which is
 * based on the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix
 * tree.  It is parameterized with operations on PTEs that are specific to the
 * table type (EPT or nPT) and keeps a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its rightward siblings.  The node understands its
 * position in the tree in terms of the level it appears at and the index it
 * occupies at its parent's level, as well as how many children it has.  It
 * also owns the physical memory page for the hardware page table entries that
 * map its children.  Thus, for a node at any given level in the tree, the
 * nested PTE for that node's child at index $i$ is the $i$'th uint64_t in
 * that node's entry page, and the entry page is part of the paging structure
 * consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions.  Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root
 * to a leaf entry, populating an array of pointers to PTEs.  This walk uses
 * the hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor.  The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform.  The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
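/*
 * To make the node/index layout above concrete: a GPA selects one 9-bit index
 * per level, starting at bit 39 for the root table and ending at bit 12 for
 * the leaf table, which is exactly what vmm_gpt_node_index() below computes.
 * The helper here is purely illustrative (it is not used by the driver), and
 * the example GPA is arbitrary: 0x8040201000 decomposes to index 1 at each of
 * the four levels.
 */
static uint16_t
vmm_gpt_example_index(uint64_t gpa, uint_t level)
{
	/* Four levels of 512-entry tables: 9 index bits each, 4K leaf pages */
	const uint_t shift = 39 - (level * 9);

	ASSERT3U(level, <, 4);
	return ((gpa >> shift) & 0x1ff);
}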
/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * rightward siblings.  Interior nodes also maintain a reference count, and
 * each node contains its level and index in its parent's table.  Finally,
 * each node contains the host PFN of the page that it links into the page
 * table, as well as a kernel pointer to that table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace.  This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_siblings;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
	uint64_t	_vgn_pad;
};

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD).  It contains
 * a pointer to the root node in the tree, and is parameterized with a set of
 * PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * VMM Guest Page Tables
 */

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees the given node, first nulling out all of its links to other nodes in
 * the tree, adjusting its parent's reference count, and unlinking itself from
 * its parent's page table.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	if (node->vgn_parent != NULL) {
		uint64_t *parent_entries = node->vgn_parent->vgn_entries;
		parent_entries[node->vgn_index] = 0;
		node->vgn_parent->vgn_ref_cnt--;
	}
	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees the portion of the radix tree rooted at the given node.
 */
static void
vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);

	for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
	    child != NULL;
	    child = next) {
		next = child->vgn_siblings;
		vmm_gpt_node_tree_free(child);
	}
	vmm_gpt_node_free(node);
}

/*
 * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
 * it refers to.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	vmm_gpt_node_tree_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Return the index in the paging structure for the given level.
 */
static inline uint16_t
vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
{
	const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
	const uint_t MASK = (1U << 9) - 1;
	ASSERT(level < MAX_GPT_LEVEL);
	return ((gpa >> SHIFTS[level]) & MASK);
}
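/*
 * The node comment above notes that vmm_gpt_node_t is carefully sized to fit
 * exactly into a 64-byte cache line.  A compile-time check such as the
 * following (illustrative; not part of the original source) documents that
 * assumption: the PFN, the level/index/refcnt words, three node pointers, the
 * entries pointer, the GPA, and the pad add up to 64 bytes.
 */
CTASSERT(sizeof (vmm_gpt_node_t) == 64);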
/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to the node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	if (parent == NULL)
		return (NULL);

	const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_siblings) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.
 * If the walk terminates before the depth is reached, the remaining entries
 * are written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    enum vmm_gpt_node_level depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

/*
 * Looks up an entry for the given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}

/*
 * Adds a node for the given GPA to the GPT as a child of the given parent.
 */
static void
vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa)
{
	vmm_gpt_node_t **prevp;
	vmm_gpt_node_t *node;
	uint64_t *parent_entries, entry;

	ASSERT(gpt != NULL);
	ASSERT(gpt->vgpt_pte_ops != NULL);
	ASSERT(parent != NULL);
	ASSERT(child != NULL);
	ASSERT3U(parent->vgn_level, <, LEVEL1);

	const uint64_t gpa_mask[3] = {
		[LEVEL4] = 0xffffff8000000000ul,	/* entries cover 512G */
		[LEVEL3] = 0xffffffffc0000000ul,	/* entries cover 1G */
		[LEVEL2] = 0xffffffffffe00000ul,	/* entries cover 2M */
	};
	const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
	child->vgn_index = index;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_parent = parent;
	child->vgn_gpa = gpa & gpa_mask[parent->vgn_level];
	parent_entries = parent->vgn_entries;
	entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent_entries[index] = entry;

	for (prevp = &parent->vgn_children, node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node->vgn_index > child->vgn_index) {
			break;
		}
	}
	if (node != NULL)
		ASSERT3U(node->vgn_index, !=, child->vgn_index);
	child->vgn_siblings = node;
	*prevp = child;
	parent->vgn_ref_cnt++;
}

/*
 * Populates the GPT with nodes so that the entries for the given GPA exist.
 * Note that this does not actually map the entry, but simply ensures that the
 * entries exist.
 */
static void
vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *node, *child;

	ASSERT(gpt != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < LEVEL1; i++) {
		ASSERT(node != NULL);
		child = vmm_gpt_node_find_child(node, gpa);
		if (child == NULL) {
			child = vmm_gpt_node_alloc();
			ASSERT(child != NULL);
			vmm_gpt_add_child(gpt, node, child, gpa);
		}
		node = child;
	}

	/*
	 * Bump the reference count for this leaf for the PTE that is now
	 * usable by the mapping covering its GPA.
	 */
	ASSERT3U(node->vgn_level, ==, LEVEL1);
	ASSERT3U(node->vgn_ref_cnt, <, 512);
	node->vgn_ref_cnt++;
}
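/*
 * Illustrative sketch (a hypothetical helper, not part of this file's
 * interface): one way vmm_gpt_walk() can be used to determine how deep the
 * paging structures for a GPA currently go, i.e. the deepest level whose
 * entry for the GPA is present.  The name vmm_gpt_demo_deepest_level is made
 * up for this example.
 */
static int
vmm_gpt_demo_deepest_level(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];
	int deepest = -1;

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
		/* A NULL slot means the walk stopped above this level. */
		if (entries[i] == NULL ||
		    !gpt->vgpt_pte_ops->vpeo_pte_is_present(*entries[i])) {
			break;
		}
		deepest = (int)i;
	}

	return (deepest);
}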
/*
 * Ensures that PTEs for the region of address space bounded by
 * [start, end) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_populate_entry(gpt, page);
	}
}

/*
 * Format a PTE and install it in the provided PTE pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}

/*
 * Inserts an entry for a given GPA into the table.  The caller must
 * ensure that a conflicting PFN is not mapped at the requested location.
 * Racing operations to map the same PFN at one location are acceptable and
 * properly handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

/*
 * Removes a child node from its parent's list of children, and then frees
 * the now-orphaned child.
 */
static void
vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
{
	ASSERT(parent != NULL);
	ASSERT3P(child->vgn_children, ==, NULL);

	vmm_gpt_node_t **prevp = &parent->vgn_children;
	for (vmm_gpt_node_t *node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node == child) {
			*prevp = node->vgn_siblings;
			vmm_gpt_node_free(node);
			break;
		}
	}
}

/*
 * Cleans up unused inner nodes in the GPT.  Asserts that the leaf
 * corresponding to the entry does not map any additional pages.
 */
static void
vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
		nodes[i] = node;
		node = vmm_gpt_node_find_child(node, gpa);
	}
	for (uint_t i = LEVEL1; i > 0; i--) {
		node = nodes[i];

		if (node == NULL)
			continue;

		if (i == LEVEL1) {
			ASSERT0(node->vgn_entries[vmm_gpt_node_index(gpa, i)]);
			ASSERT3U(node->vgn_ref_cnt, !=, 0);

			/*
			 * Just as vmm_gpt_populate_entry() increments the
			 * reference count for leaf PTEs which become usable,
			 * here we decrement it as they become unusable when
			 * the mapping covering their GPA is removed.
			 */
			node->vgn_ref_cnt--;
		}

		if (node->vgn_ref_cnt != 0)
			break;
		vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
	}
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [start, end).  The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_vacate_entry(gpt, page);
	}
}

/*
 * Remove a mapping from the table.  Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}
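/*
 * Illustrative sketch (a hypothetical helper, not part of this file): the
 * ordering contract described in the overview comment.  A page-aligned GPA is
 * populated before any page is mapped at it, and the page is unmapped before
 * the region is vacated.  The protection and attribute values are whatever
 * the caller would normally pass to vmm_gpt_map().
 */
static void
vmm_gpt_demo_map_one(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	/* Create the paging structures down to the leaf level. */
	vmm_gpt_populate_region(gpt, gpa, gpa + PAGESIZE);

	/* Install the PTE; this fails only if a conflicting entry exists. */
	VERIFY(vmm_gpt_map(gpt, gpa, pfn, prot, attr));

	/* Remove the mapping, then release the now-unused inner nodes. */
	VERIFY(vmm_gpt_unmap(gpt, gpa));
	vmm_gpt_vacate_region(gpt, gpa, gpa + PAGESIZE);
}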
/*
 * Un-maps the region of guest physical address space bounded by
 * [start, end).  Returns the number of pages that were unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	size_t num_unmapped = 0;
	for (uint64_t page = start; page < end; page += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, page) != 0) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}

/*
 * Returns a value indicating whether or not this GPT maps the GPA whose leaf
 * PTE is pointed to by `ptep`.  If the GPA is mapped, *pfnp and *protp will
 * be filled with the PFN and protection bits of the entry.  Otherwise, they
 * are ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}

/*
 * Get the properly formatted PML4 (EPTP/nCR3) for the GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}
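/*
 * Illustrative sketch (a hypothetical helper, not part of this file): one way
 * a caller could harvest and clear dirty bits for a populated region, using
 * vmm_gpt_lookup() to reach each leaf PTE and vmm_gpt_reset_dirty() to read
 * and clear the bit in a single step.
 */
static size_t
vmm_gpt_demo_harvest_dirty(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	size_t ndirty = 0;

	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t gpa = start; gpa < end; gpa += PAGESIZE) {
		uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);

		/* Skip holes in the tree and entries which are not present. */
		if (ptep == NULL ||
		    !gpt->vgpt_pte_ops->vpeo_pte_is_present(*ptep)) {
			continue;
		}
		if (vmm_gpt_reset_dirty(gpt, ptep, false) != 0)
			ndirty++;
	}

	return (ndirty);
}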