/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space. But the two use different and
 * mutually incompatible page table formats: Intel uses the EPT, which is based
 * on the Itanium page table format, while AMD uses the nPT, which is based on
 * the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high-level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and also keeps track of how many pages the table maps, as
 * well as a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings. The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has. It also owns the
 * physical memory page for the hardware page table entries that map its
 * children. Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index $i$ is the i'th uint64_t in that node's entry
 * page and the entry page is part of the paging structure consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree for
 * regions in the guest physical address space, and for mapping and unmapping
 * pages in populated regions. Users must populate a region before mapping
 * pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs. This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor. The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform. The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */

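/*
 * For orientation, a typical (hypothetical) sequence a vmspace-level caller
 * might follow with this interface is sketched below; `ops`, `gpa`, `len`,
 * `pfn`, `prot`, and `attr` are assumed placeholders, not values defined in
 * this file:
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc(ops);
 *	vmm_gpt_populate_region(gpt, gpa, len);
 *	vmm_gpt_entry_t *ptes[MAX_GPT_LEVEL];
 *	(void) vmm_gpt_walk(gpt, gpa, ptes, LEVEL1);
 *	(void) vmm_gpt_map_at(gpt, ptes[LEVEL1], pfn, prot, attr);
 *	...
 *	(void) vmm_gpt_unmap_region(gpt, gpa, len);
 *	vmm_gpt_vacate_region(gpt, gpa, len);
 *	vmm_gpt_free(gpt);
 */
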
/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * siblings. Interior nodes also maintain a reference count, and each node
 * contains its level and index in its parent's table. Finally, each node
 * contains the host PFN of the page that it links into the page table, as well
 * as a kernel pointer to the table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace. This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_sib_next;
	vmm_gpt_node_t	*vgn_sib_prev;
	vmm_gpt_entry_t	*vgn_entries;
	uint64_t	vgn_gpa;
};
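
/*
 * A minimal sketch of how the cache-line claim above could be checked at
 * compile time (not part of this file; CTASSERT is assumed to be available
 * via the usual kernel debug headers):
 *
 *	CTASSERT(sizeof (vmm_gpt_node_t) == 64);
 *
 * 8 bytes of PFN, 8 bytes of level/index/refcount, five 8-byte pointers, and
 * an 8-byte GPA add up to exactly 64 bytes on LP64.
 */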

/* Maximum node index determined by number of entries in page table (512) */
#define	PTE_PER_TABLE	512
#define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD). It contains
 * a pointer to the root node in the tree and is parameterized with a set of
 * PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (vmm_gpt_entry_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees a given node. The node is expected to have no familial (parent,
 * children, siblings) associations at this point. Accordingly, its reference
 * count should be zero.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	ASSERT(node->vgn_parent == NULL);

	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees a vmm_gpt_t. Any lingering nodes in the GPT will be freed too.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	/* Empty anything remaining in the tree */
	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);

	VERIFY(gpt->vgpt_root != NULL);
	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);

	vmm_gpt_node_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}
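
/*
 * Illustrative only: a caller constructs a GPT by passing in the PTE ops
 * vector for the host platform. The EPT and nPT/RVI implementations live
 * elsewhere; the symbol names below are assumptions, not defined here:
 *
 *	vmm_pte_ops_t *ops = use_ept ? &ept_pte_ops : &rvi_pte_ops;
 *	vmm_gpt_t *gpt = vmm_gpt_alloc(ops);
 *	...
 *	vmm_gpt_free(gpt);
 */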

/*
 * Given a GPA, return its corresponding index in a paging structure at the
 * provided level.
 */
static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
{
	ASSERT(level < MAX_GPT_LEVEL);

	const uint16_t mask = MAX_NODE_IDX;
	switch (level) {
	case LEVEL4: return ((gpa >> 39) & mask);
	case LEVEL3: return ((gpa >> 30) & mask);
	case LEVEL2: return ((gpa >> 21) & mask);
	case LEVEL1: return ((gpa >> 12) & mask);
	default:
		panic("impossible level value");
	};
}

/* Get mask for addresses of entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0xffffff8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0xffffffffc0000000ul);	/* entries cover 1G */
	case LEVEL2: return (0xffffffffffe00000ul);	/* entries cover 2M */
	case LEVEL1: return (0xfffffffffffff000ul);	/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get length of GPA covered by entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0x8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0x40000000ul);	/* entries cover 1G */
	case LEVEL2: return (0x200000ul);	/* entries cover 2M */
	case LEVEL1: return (0x1000ul);		/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get the index of a PTE pointer based on its offset in that table */
static inline uint16_t
vmm_gpt_ptep_index(const vmm_gpt_entry_t *ptep)
{
	const uintptr_t offset = (uintptr_t)ptep & 0xffful;
	return (offset / sizeof (uint64_t));
}
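
/*
 * A worked example of the helpers above (illustrative, with an arbitrary GPA):
 * for gpa = 0x140203000, the per-level indices are
 *
 *	LEVEL4: (gpa >> 39) & 0x1ff = 0		(512G slot 0)
 *	LEVEL3: (gpa >> 30) & 0x1ff = 5		(1G slot 5)
 *	LEVEL2: (gpa >> 21) & 0x1ff = 1		(2M slot 1)
 *	LEVEL1: (gpa >> 12) & 0x1ff = 3		(4K slot 3)
 *
 * and vmm_gpt_lvl_mask(LEVEL2) truncates the GPA to its 2M boundary,
 * 0x140200000.
 */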

/*
 * Get the ending GPA which this node could possibly cover given its base
 * address and level.
 */
static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t *node)
{
	ASSERT(node->vgn_level > LEVEL4);
	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
}

/*
 * Is this node the last entry in its parent node, based solely on its GPA?
 */
static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t *node)
{
	return (node->vgn_index == MAX_NODE_IDX);
}

/*
 * How many table entries (if any) in this node are covered by the range of
 * [start, end).
 */
static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
{
	const uint64_t node_end = vmm_gpt_node_end(node);

	/* Is this node covered at all by the region? */
	if (start >= node_end || end <= node->vgn_gpa) {
		return (0);
	}

	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
	const uint64_t covered_end = MIN(node_end, end & mask);
	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);

	return ((covered_end - covered_start) / per_entry);
}
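
/*
 * For example (illustrative values): a LEVEL1 node with vgn_gpa = 0x200000
 * covers [0x200000, 0x400000). For a request of [0x1ff000, 0x203000), the
 * clamped interval is [0x200000, 0x203000), which spans three 4K entries, so
 * vmm_gpt_node_entries_covered() returns 3.
 */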

/*
 * Find the next node (by address) in the tree at the same level.
 *
 * Returns NULL if this is the last node in the tree or if `only_seq` was true
 * and there is an address gap between this node and the next.
 */
static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
{
	ASSERT3P(node->vgn_parent, !=, NULL);
	ASSERT3U(node->vgn_level, >, LEVEL4);

	/*
	 * Next node sequentially would be the one at the address starting at
	 * the end of what is covered by this node.
	 */
	const uint64_t gpa_match = vmm_gpt_node_end(node);

	/* Try our next sibling */
	vmm_gpt_node_t *next = node->vgn_sib_next;

	if (next == NULL) {
		/*
		 * If the next-sibling pointer is NULL on the node, it can mean
		 * one of two things:
		 *
		 * 1. This entry represents the space leading up to the trailing
		 *    boundary of what this node covers.
		 *
		 * 2. The node is not entirely populated, and there is a gap
		 *    between the last populated entry, and the trailing
		 *    boundary of the node.
		 *
		 * Either way, the proper course of action is to check the first
		 * child of our parent's next sibling (if the parent is not the
		 * root of the GPT itself).
		 */
		if (node->vgn_parent != NULL && node->vgn_level > LEVEL3) {
			vmm_gpt_node_t *psibling =
			    vmm_gpt_node_next(node->vgn_parent, true);
			if (psibling != NULL) {
				next = psibling->vgn_children;
			}
		}
	}

	if (next != NULL &&
	    (next->vgn_gpa == gpa_match || !only_seq)) {
		return (next);
	}

	return (NULL);
}

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to the node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_sib_next) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Add a child node to the GPT at a position determined by GPA, parent, and (if
 * present) preceding sibling.
 *
 * If `parent` already contains children which precede (by GPA) the to-be-added
 * child, `prev_sibling` must be populated with a pointer to the node directly
 * preceding the new child.
 */
static void
vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
{
	ASSERT3U(parent->vgn_level, <, LEVEL1);
	ASSERT3U(child->vgn_parent, ==, NULL);

	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	child->vgn_index = idx;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);

	/* Establish familial connections */
	child->vgn_parent = parent;
	if (prev_sibling != NULL) {
		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);

		child->vgn_sib_next = prev_sibling->vgn_sib_next;
		if (child->vgn_sib_next != NULL) {
			child->vgn_sib_next->vgn_sib_prev = child;
		}
		child->vgn_sib_prev = prev_sibling;
		prev_sibling->vgn_sib_next = child;
	} else if (parent->vgn_children != NULL) {
		vmm_gpt_node_t *next_sibling = parent->vgn_children;

		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);

		child->vgn_sib_next = next_sibling;
		child->vgn_sib_prev = NULL;
		next_sibling->vgn_sib_prev = child;
		parent->vgn_children = child;
	} else {
		parent->vgn_children = child;
		child->vgn_sib_next = NULL;
		child->vgn_sib_prev = NULL;
	}

	/* Configure PTE for child table */
	parent->vgn_entries[idx] =
	    gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent->vgn_ref_cnt++;
}

/*
 * Remove a child node from its relatives (parent, siblings) and free it.
 */
static void
vmm_gpt_node_remove(vmm_gpt_node_t *child)
{
	ASSERT3P(child->vgn_children, ==, NULL);
	ASSERT3U(child->vgn_ref_cnt, ==, 0);
	ASSERT3P(child->vgn_parent, !=, NULL);

	/* Unlink child from its siblings and parent */
	vmm_gpt_node_t *parent = child->vgn_parent;
	vmm_gpt_node_t *prev = child->vgn_sib_prev;
	vmm_gpt_node_t *next = child->vgn_sib_next;
	if (prev != NULL) {
		ASSERT3P(prev->vgn_sib_next, ==, child);
		prev->vgn_sib_next = next;
	}
	if (next != NULL) {
		ASSERT3P(next->vgn_sib_prev, ==, child);
		next->vgn_sib_prev = prev;
	}
	if (prev == NULL) {
		ASSERT3P(parent->vgn_children, ==, child);
		parent->vgn_children = next;
	}
	child->vgn_parent = NULL;
	child->vgn_sib_next = NULL;
	child->vgn_sib_prev = NULL;
	parent->vgn_entries[child->vgn_index] = 0;
	parent->vgn_ref_cnt--;

	vmm_gpt_node_free(child);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth. If
 * the walk terminates before the depth is reached, the remaining entries are
 * written with NULLs.
 *
 * Returns the GPA corresponding to the deepest populated (non-NULL) entry
 * encountered during the walk.
 */
uint64_t
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
    vmm_gpt_node_level_t depth)
{
	ASSERT(gpt != NULL);
	ASSERT3U(depth, <, MAX_GPT_LEVEL);

	vmm_gpt_entry_t *current_entries = gpt->vgpt_root->vgn_entries;
	uint64_t mask = 0;
	for (uint_t lvl = LEVEL4; lvl <= depth; lvl++) {
		if (current_entries == NULL) {
			entries[lvl] = NULL;
			continue;
		}
		entries[lvl] = &current_entries[vmm_gpt_lvl_index(lvl, gpa)];
		mask = vmm_gpt_lvl_mask(lvl);
		const vmm_gpt_entry_t pte = *entries[lvl];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
			current_entries = NULL;
			continue;
		}
		const pfn_t pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(pte);
		current_entries = (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
	}
	return (gpa & mask);
}
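
/*
 * A minimal usage sketch for the walk primitive (illustrative; `gpt` and `gpa`
 * are assumed to come from the caller):
 *
 *	vmm_gpt_entry_t *ptes[MAX_GPT_LEVEL];
 *	uint64_t base = vmm_gpt_walk(gpt, gpa, ptes, LEVEL1);
 *
 * If ptes[LEVEL1] is non-NULL, a LEVEL1 (4K) PTE slot exists for `gpa` and
 * `base` is the GPA truncated to that entry's boundary; otherwise the walk
 * stopped at the deepest table present.
 */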

/*
 * Given a `gpa`, and the corresponding `entries` array as queried by a call to
 * `vmm_gpt_walk()`, attempt to advance to the next PTE at the `depth` provided
 * by the caller.
 *
 * As there may not be PTEs present at the requested `depth`, the amount that
 * the GPA is advanced may be larger than what is expected for that provided
 * depth.
 *
 * Returns the GPA corresponding to the PTE to which `entries` was advanced.
 */
static uint64_t
vmm_gpt_walk_advance(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
    vmm_gpt_node_level_t depth)
{
	ASSERT(gpt != NULL);
	ASSERT3U(depth, <, MAX_GPT_LEVEL);
	ASSERT0(gpa & ~vmm_gpt_lvl_mask(depth));

	/*
	 * Advance to the next entry at the deepest level of PTE possible,
	 * ascending to higher levels if we reach the end of a table at any
	 * given point.
	 */
	int lvl;
	for (lvl = depth; lvl >= LEVEL4; lvl--) {
		vmm_gpt_entry_t *ptep = entries[lvl];

		if (ptep == NULL) {
			continue;
		}

		uint16_t index = vmm_gpt_ptep_index(ptep);
		ASSERT3U(vmm_gpt_lvl_index(lvl, gpa), ==, index);
		if (index == MAX_NODE_IDX) {
			continue;
		}

		gpa = (gpa & vmm_gpt_lvl_mask(lvl)) + vmm_gpt_lvl_len(lvl);
		entries[lvl] = ptep + 1;
		break;
	}

	if (lvl < LEVEL4) {
		/* Advanced off the end of the tables */
		return (UINT64_MAX);
	}

	/*
	 * Attempt to walk back down to the target level if any ascension was
	 * necessary during the above advancement.
	 */
	vmm_gpt_entry_t pte = *entries[lvl];
	lvl++;
	for (; lvl < MAX_GPT_LEVEL; lvl++) {
		if (lvl > depth ||
		    !gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
			entries[lvl] = NULL;
			continue;
		}

		const pfn_t pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(pte);
		vmm_gpt_entry_t *next_table =
		    (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
		const uint16_t index = vmm_gpt_lvl_index(lvl, gpa);
		pte = next_table[index];
		entries[lvl] = &next_table[index];
	}

	return (gpa);
}

/*
 * Initialize iteration over GPT leaf entries between [addr, addr + len). The
 * specified interval must be page-aligned and not overflow the end of the
 * address space.
 *
 * Subsequent calls to vmm_gpt_iter_next() will emit the encountered entries.
 */
void
vmm_gpt_iter_init(vmm_gpt_iter_t *iter, vmm_gpt_t *gpt, uint64_t addr,
    uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);
	ASSERT3U((addr + len), >=, addr);

	iter->vgi_gpt = gpt;
	iter->vgi_addr = addr;
	iter->vgi_end = addr + len;
	iter->vgi_current = vmm_gpt_walk(gpt, addr, iter->vgi_entries, LEVEL1);
}

/*
 * Fetch the next entry (if any) from an iterator initialized from a preceding
 * call to vmm_gpt_iter_init(). Returns true when the next GPT leaf entry
 * inside the iterator range has been populated in `entry`. Returns `false`
 * when there are no such entries available or remaining in the range.
 */
bool
vmm_gpt_iter_next(vmm_gpt_iter_t *iter, vmm_gpt_iter_entry_t *entry)
{
	if (iter->vgi_current >= iter->vgi_end) {
		return (false);
	}

	while (iter->vgi_current < iter->vgi_end) {
		bool found = false;
		if (iter->vgi_entries[LEVEL1] != NULL) {
			entry->vgie_gpa = iter->vgi_current;
			entry->vgie_ptep = iter->vgi_entries[LEVEL1];
			found = true;
		}

		iter->vgi_current = vmm_gpt_walk_advance(iter->vgi_gpt,
		    iter->vgi_current, iter->vgi_entries, LEVEL1);

		if (found) {
			return (true);
		}
	}
	return (false);
}
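
/*
 * Sketch of the intended iteration pattern (illustrative; counting present
 * leaf PTEs over an assumed range `addr`/`len`):
 *
 *	vmm_gpt_iter_t iter;
 *	vmm_gpt_iter_entry_t ent;
 *	size_t present = 0;
 *
 *	vmm_gpt_iter_init(&iter, gpt, addr, len);
 *	while (vmm_gpt_iter_next(&iter, &ent)) {
 *		if (gpt->vgpt_pte_ops->vpeo_pte_is_present(*ent.vgie_ptep))
 *			present++;
 *	}
 *
 * vmm_gpt_unmap_region() below follows this same pattern, clearing each
 * emitted PTE instead of just counting it.
 */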

/*
 * Populate child table nodes for a given level between the provided interval
 * of [addr, addr + len). Caller is expected to provide a pointer to the parent
 * node which would contain the child node for GPA at `addr`. A pointer to said
 * child node will be returned when the operation is complete.
 */
static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    vmm_gpt_node_t *node_start)
{
	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
	const uint64_t end = addr + len;
	const uint64_t incr = vmm_gpt_lvl_len(lvl);
	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
	vmm_gpt_node_t *parent = node_start;

	/* Try to locate node at starting address */
	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
	while (node != NULL && node->vgn_gpa < gpa) {
		prev = node;
		node = node->vgn_sib_next;
	}

	/*
	 * If no node exists at the starting address, create one and link it
	 * into the parent.
	 */
	if (node == NULL || node->vgn_gpa > gpa) {
		/* Need to insert node for starting GPA */
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	vmm_gpt_node_t *front_node = node;
	prev = node;
	gpa += incr;

	/*
	 * With a node at the starting address, walk forward creating nodes in
	 * any of the gaps.
	 */
	for (; gpa < end; gpa += incr, prev = node) {
		node = vmm_gpt_node_next(prev, true);
		if (node != NULL) {
			ASSERT3U(node->vgn_gpa, ==, gpa);

			/* We may have crossed into a new parent */
			parent = node->vgn_parent;
			continue;
		}

		if (vmm_gpt_node_is_last(prev)) {
			/*
			 * The node preceding this was the last one in its
			 * containing parent, so move on to that parent's
			 * sibling. We expect (demand) that it exist already.
			 */
			parent = vmm_gpt_node_next(parent, true);
			ASSERT(parent != NULL);

			/*
			 * Forget our previous sibling, since it is of no use
			 * for assigning the new node to the now-different
			 * parent.
			 */
			prev = NULL;
		}
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	return (front_node);
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [addr, addr + len) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	/*
	 * Starting at the top of the tree, ensure that tables covering the
	 * requested region exist at each level.
	 */
	vmm_gpt_node_t *node = gpt->vgpt_root;
	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		ASSERT3U(node->vgn_level, ==, lvl);

		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
	}

	/*
	 * Establish reference counts for the soon-to-be memory PTEs which will
	 * be filling these LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	const uint64_t end = addr + len;
	while (gpa < end) {
		ASSERT(node != NULL);
		ASSERT3U(node->vgn_level, ==, LEVEL1);

		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT(covered != 0);
		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);

		node->vgn_ref_cnt += covered;

		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
		if (next != NULL) {
			gpa = next->vgn_gpa;
			node = next;
		} else {
			/*
			 * We do not expect to find a subsequent node after
			 * filling the last node in the table, completing PTE
			 * accounting for the specified range.
			 */
			VERIFY3U(end, <=, vmm_gpt_node_end(node));
			break;
		}
	}
}

/*
 * Format a PTE and install it in the provided PTE-pointer.
 *
 * The caller must ensure that a conflicting PFN is not mapped at the requested
 * location. Racing operations to map the same PFN at one location are
 * acceptable and properly handled.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	vmm_gpt_entry_t pte, old_pte;

	pte = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_pte = atomic_cas_64(ptep, 0, pte);
	if (old_pte != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(pte), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_pte));
		return (false);
	}

	return (true);
}
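
/*
 * Sketch of how a fault path might install a single mapping once the region
 * has been populated (illustrative; `gpt`, `gpa`, `pfn`, `prot`, and `attr`
 * are assumed inputs):
 *
 *	vmm_gpt_entry_t *ptes[MAX_GPT_LEVEL];
 *	(void) vmm_gpt_walk(gpt, gpa, ptes, LEVEL1);
 *	ASSERT(ptes[LEVEL1] != NULL);
 *	(void) vmm_gpt_map_at(gpt, ptes[LEVEL1], pfn, prot, attr);
 *
 * A false return from vmm_gpt_map_at() indicates a racing caller already
 * installed the same PFN, which is acceptable per the comment above.
 */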

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [addr, addr + len). The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
		[LEVEL4] = gpt->vgpt_root,
	};

	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		node = vmm_gpt_node_find_child(starts[lvl], addr);
		if (node == NULL) {
			break;
		}
		starts[lvl + 1] = node;
	}

	/*
	 * Starting at the bottom of the tree, ensure that PTEs for pages have
	 * been cleared for the region, and remove the corresponding reference
	 * counts from the containing LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	node = starts[LEVEL1];
	while (gpa < end && node != NULL) {
		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT3U(node->vgn_ref_cnt, >=, covered);
		node->vgn_ref_cnt -= covered;

		node = vmm_gpt_node_next(node, false);
		if (node != NULL) {
			gpa = node->vgn_gpa;
		}
	}

	/*
	 * With the page PTE references eliminated, work up from the bottom of
	 * the table, removing nodes which have no remaining references.
	 *
	 * This stops short of LEVEL4, which is the root table of the GPT. It
	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
	 */
	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
		gpa = addr;
		node = starts[lvl];

		while (gpa < end && node != NULL) {
			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);

			if (node->vgn_ref_cnt == 0) {
				vmm_gpt_node_remove(node);
			}
			if (next != NULL) {
				gpa = next->vgn_gpa;
			}
			node = next;
		}
	}
}

/*
 * Remove a mapping from the table. Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL], pte;

	ASSERT(gpt != NULL);
	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
	if (entries[LEVEL1] == NULL)
		return (false);

	pte = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(pte));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [addr, addr + len). Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	vmm_gpt_iter_t state;
	vmm_gpt_iter_entry_t entry;
	size_t num_unmapped = 0;

	vmm_gpt_iter_init(&state, gpt, addr, len);
	while (vmm_gpt_iter_next(&state, &entry)) {
		if (entry.vgie_ptep == NULL) {
			continue;
		}

		const vmm_gpt_entry_t pte = *entry.vgie_ptep;
		*entry.vgie_ptep = 0;
		if (gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}
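
/*
 * Per the ordering rule in the header comment (pages must be unmapped before a
 * region is vacated), a vmspace-level teardown of a region would look roughly
 * like this (illustrative; `gpt`, `addr`, and `len` are assumed inputs):
 *
 *	size_t n = vmm_gpt_unmap_region(gpt, addr, len);
 *	vmm_gpt_vacate_region(gpt, addr, len);
 *
 * where `n` could feed back into the caller's accounting of released pages.
 */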

/*
 * Returns a value indicating whether or not the PTE pointed to by `ptep` maps
 * a present page. If it does, *pfnp and *protp will be filled with the PFN
 * and protection bits of the entry. Otherwise, they are left untouched.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t *pfnp,
    uint_t *protp)
{
	vmm_gpt_entry_t pte;

	ASSERT(pfnp != NULL);
	ASSERT(protp != NULL);

	if (ptep == NULL) {
		return (false);
	}
	pte = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(pte);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(pte);
	return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `ptep`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, bool on)
{
	ASSERT(ptep != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(ptep, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `ptep`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, bool on)
{
	ASSERT(ptep != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(ptep, on));
}
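
/*
 * Sketch of a dirty-page scan built from the primitives above (illustrative;
 * `addr` and `len` are assumed inputs):
 *
 *	vmm_gpt_iter_t iter;
 *	vmm_gpt_iter_entry_t ent;
 *
 *	vmm_gpt_iter_init(&iter, gpt, addr, len);
 *	while (vmm_gpt_iter_next(&iter, &ent)) {
 *		if (vmm_gpt_reset_dirty(gpt, ent.vgie_ptep, false) != 0)
 *			bitmap_mark(ent.vgie_gpa);
 *	}
 *
 * Here bitmap_mark() stands in for whatever bookkeeping the caller performs,
 * and the scan presumes hardware A/D support, which can be checked with
 * vmm_gpt_can_track_dirty() below.
 */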

/*
 * Query state from PTE pointed to by `ptep`.
 */
bool
vmm_gpt_query(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, vmm_gpt_query_t query)
{
	ASSERT(ptep != NULL);
	return (gpt->vgpt_pte_ops->vpeo_query(ptep, query));
}

/*
 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}

/*
 * Does the GPT hardware support dirty-page-tracking?
 */
bool
vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
{
	return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
}