1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12
13 /*
14 * Copyright 2019 Joyent, Inc.
15 * Copyright 2025 Oxide Computer Company
16 */
17
18 #include <sys/types.h>
19 #include <sys/atomic.h>
20 #include <sys/kmem.h>
21 #include <sys/sysmacros.h>
22 #include <sys/sunddi.h>
23 #include <sys/panic.h>
24 #include <vm/hat.h>
25 #include <vm/as.h>
26 #include <vm/hat_i86.h>
27
28 #include <sys/vmm_gpt.h>
29 #include <sys/vmm_gpt_impl.h>
30
31 /*
32 * VMM Generic Page Tables
33 *
34 * Bhyve runs on AMD and Intel hosts and both support nested page tables
35 * describing the guest's physical address space. But the two use different and
36 * mutually incompatible page table formats: Intel uses the EPT, which is based
37 * on the Itanium page table format, while AMD uses the nPT, which is based on
38 * the x86_64 page table format.
39 *
40 * The GPT abstracts these format differences, and provides a single interface
41 * for interacting with either kind of table structure.
42 *
43 * At a high-level, the GPT is a tree that mirrors the paging table radix tree.
44 * It is parameterized with operations on PTEs that are specific to the table
45 * type (EPT or nPT) and also keeps track of how many pages the table maps, as
46 * well as a pointer to the root node in the tree.
47 *
48  * A node in the GPT keeps pointers to its parent (NULL for the root), its
49 * left-most child, and its siblings. The node understands its position in the
50 * tree in terms of the level it appears at and the index it occupies at its
51 * parent's level, as well as how many children it has. It also owns the
52 * physical memory page for the hardware page table entries that map its
53 * children. Thus, for a node at any given level in the tree, the nested PTE
54 * for that node's child at index $i$ is the i'th uint64_t in that node's entry
55  * page, and the entry page is part of the paging structure consumed by hardware.
56 *
57 * The GPT interface provides functions for populating and vacating the tree for
58 * regions in the guest physical address space, and for mapping and unmapping
59 * pages in populated regions. Users must populate a region before mapping
60 * pages into it, and must unmap pages before vacating the region.
61 *
62 * The interface also exposes a function for walking the table from the root to
63 * a leaf entry, populating an array of pointers to PTEs. This walk uses the
64 * hardware page structure itself, and is thus fast, though as a result it
65 * potentially aliases entries; caveat emptor. The walk primitive is used for
66 * mapping, unmapping, and lookups.
67 *
68 * Format-specific differences are abstracted by parameterizing the GPT with a
69 * set of PTE operations specific to the platform. The GPT code makes use of
70 * these when mapping or populating entries, resetting accessed and dirty bits
71 * on entries, and similar operations.
72 */
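
/*
 * Illustrative sketch (not taken from an in-tree consumer) of the call order
 * implied above: a region is populated before pages are mapped into it, and
 * pages are unmapped before the region is vacated.  The values of `gpa`,
 * `len`, `pfn`, and `attr` here are hypothetical.
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc();
 *	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *
 *	vmm_gpt_populate_region(gpt, gpa, len);
 *	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *	(void) vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, PROT_READ, attr);
 *
 *	(void) vmm_gpt_unmap_region(gpt, gpa, len);
 *	vmm_gpt_vacate_region(gpt, gpa, len);
 *	vmm_gpt_free(gpt);
 */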
73
74 /*
75 * A GPT node.
76 *
77 * Each node contains pointers to its parent, its left-most child, and its
78 * siblings. Interior nodes also maintain a reference count, and each node
79 * contains its level and index in its parent's table. Finally, each node
80 * contains the host PFN of the page that it links into the page table, as well
81  * as a kernel pointer to that table.
82 *
83 * On leaf nodes, the reference count tracks how many entries in the table are
84  * covered by mappings from the containing vmspace. This is maintained during
85  * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of vmspace
86 * map/unmap operations, rather than in the data path of faults populating the
87 * PTEs themselves.
88 *
89 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
90 */
91 typedef struct vmm_gpt_node vmm_gpt_node_t;
92 struct vmm_gpt_node {
93 uint64_t vgn_host_pfn;
94 uint16_t vgn_level;
95 uint16_t vgn_index;
96 uint32_t vgn_ref_cnt;
97 vmm_gpt_node_t *vgn_parent;
98 vmm_gpt_node_t *vgn_children;
99 vmm_gpt_node_t *vgn_sib_next;
100 vmm_gpt_node_t *vgn_sib_prev;
101 vmm_gpt_entry_t *vgn_entries;
102 uint64_t vgn_gpa;
103 };
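
/*
 * A compile-time check of the cache-line sizing claim above. This assumes
 * CTASSERT is available here; it comes from sys/debug.h, the same place as
 * the ASSERT3*()/VERIFY*() macros used throughout this file.
 */
CTASSERT(sizeof (vmm_gpt_node_t) == 64);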
104
105 /* Maximum node index determined by number of entries in page table (512) */
106 #define PTE_PER_TABLE 512
107 #define MAX_NODE_IDX (PTE_PER_TABLE - 1)
108
109 /*
110 * A VMM Generic Page Table.
111 *
112 * The generic page table is a format-agnostic, 4-level paging structure
113 * modeling a second-level page table (EPT on Intel, nPT on AMD). It
114  * contains a pointer to the root node of the tree, and is parameterized with
115  * a set of PTE operations specific to the table type, supplied by the
116  * backend implementation at module initialization.
117 */
118 struct vmm_gpt {
119 vmm_gpt_node_t *vgpt_root;
120 };
121
122 void
123 vmm_gpt_impl_panic(void)
124 {
125 /*
126 * Bail out if a caller makes it to an indirect stub before hot-patching
127 * has completed.
128 */
129 panic("Indirect function not hot-patched");
130 }
131
132 /*
133 * Function indirection stubs
134 *
135 * These are made valid (no longer jumping to vmm_gpt_impl_panic()) by
136 * vmm_gpt_init(), and are only to be called after that has run successfully
137 * during module initialization.
138 */
139 uint64_t vmm_gpti_map_table(uint64_t);
140 uint64_t vmm_gpti_map_page(uint64_t, uint_t, uint8_t);
141 bool vmm_gpti_parse(uint64_t, pfn_t *, uint_t *);
142
143 const struct vmm_pte_impl vmm_pte_uninit_impl = {
144 .vpi_map_table = (void *)vmm_gpt_impl_panic,
145 .vpi_map_page = (void *)vmm_gpt_impl_panic,
146 .vpi_pte_parse = (void *)vmm_gpt_impl_panic,
147 .vpi_bit_accessed = 0,
148 .vpi_bit_dirty = 0,
149
150 .vpi_get_pmtp = (void *)vmm_gpt_impl_panic,
151 .vpi_hw_ad_supported = (void *)vmm_gpt_impl_panic,
152 };
153
154 static const struct vmm_pte_impl *vmm_pte_impl = &vmm_pte_uninit_impl;
155
156 #define JMP_NEAR_OPCODE 0xe9
157
158 struct jmp_instr {
159 uint8_t opcode;
160 int32_t off;
161 } __packed;
162
163
164 /*
165 * Given a pointer to an immediate-near-`jmp` instruction, calculate the address
166 * of its target.
167 */
168 static uintptr_t
169 jmp_off_to_addr(const struct jmp_instr *instp)
170 {
171 const uintptr_t next_rip = (uintptr_t)&instp[1];
172 const uintptr_t off = (uintptr_t)(intptr_t)instp->off;
173
174 return (next_rip + off);
175 }
176
177 /*
178 * Given a pointer to an immediate-near-`jmp` instruction, calculate the
179 * immediate value for a given target address, if possible.
180 *
181 * Returns false if the target address is not within range of the `jmp`.
182 * Stores the offset value in `resp` on success.
183 */
184 static bool
185 jmp_addr_to_off(const struct jmp_instr *instp, void *target, int32_t *resp)
186 {
187 const uintptr_t next_rip = (uintptr_t)&instp[1];
188 const intptr_t off = (uintptr_t)target - next_rip;
189
190 /* jump is not "near" */
191 if (off < INT32_MIN || off > INT32_MAX) {
192 return (false);
193 }
194
195 if (resp != NULL) {
196 *resp = (int32_t)off;
197 }
198 return (true);
199 }
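
/*
 * Worked example (hypothetical addresses): a near jmp occupies 5 bytes (the
 * 0xe9 opcode followed by a 32-bit displacement), so for a patch site at
 * 0xfffffffff7900000 targeting 0xfffffffff7901000:
 *
 *	next_rip = 0xfffffffff7900000 + 5 = 0xfffffffff7900005
 *	off      = 0xfffffffff7901000 - next_rip = 0xffb
 *
 * jmp_addr_to_off() would store 0xffb, and jmp_off_to_addr() applied to the
 * patched instruction recovers the original target address.
 */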
200
201 /*
202 * Try to get the symbol name for a given kernel-virtual address.
203 *
204 * Returns a valid string pointer regardless of success or failure.
205 */
206 static inline const char *
207 addr_to_sym(void *addr)
208 {
209 ulong_t ignored_offset;
210 const char *name = modgetsymname((uintptr_t)addr, &ignored_offset);
211
212 return (name != NULL ? name : "<null>");
213 }
214
215 /*
216 * Hot-patch frequently called PTE implementation functions.
217 *
218 * Since indirect function calls are relatively expensive in a post-Spectre
219 * world, we choose to call into the backend via stubs which are hot-patched
220 * during module initialization with the proper implementation. During module
221 * unload, this function is called again to re-patch the stubs back to their
222 * state of pointing to vmm_gpt_impl_panic().
223 */
224 static bool
225 vmm_gpt_patch_indirection(const struct vmm_pte_impl *old_impl,
226 const struct vmm_pte_impl *new_impl)
227 {
228 struct indirection_patch {
229 void (*patch_site)();
230 void (*old_implf)();
231 void (*new_implf)();
232 };
233 const struct indirection_patch patches[] = {
234 {
235 .patch_site = (void *)vmm_gpti_map_table,
236 .old_implf = (void *)old_impl->vpi_map_table,
237 .new_implf = (void *)new_impl->vpi_map_table,
238 },
239 {
240 .patch_site = (void *)vmm_gpti_map_page,
241 .old_implf = (void *)old_impl->vpi_map_page,
242 .new_implf = (void *)new_impl->vpi_map_page,
243 },
244 {
245 .patch_site = (void *)vmm_gpti_parse,
246 .old_implf = (void *)old_impl->vpi_pte_parse,
247 .new_implf = (void *)new_impl->vpi_pte_parse,
248 },
249 };
250
251 /* Check the patches for validity first */
252 for (uint_t i = 0; i < ARRAY_SIZE(patches); i++) {
253 const struct indirection_patch *patch = &patches[i];
254 const struct jmp_instr *instp = (void *)patch->patch_site;
255
256 /* Expecting a near `jmp` ... */
257 if (instp->opcode != JMP_NEAR_OPCODE) {
258 cmn_err(CE_WARN, "vmm: non-JMP instruction found when "
259 "attempting to hotpatch %s",
260 addr_to_sym(patch->patch_site));
261 return (false);
262 }
263 /* ... targeted at the expected function */
264 const uintptr_t old_target = jmp_off_to_addr(instp);
265 if (old_target != (uintptr_t)patch->old_implf) {
266 cmn_err(CE_WARN, "vmm: JMP instr @ %s has unexpected "
267 "target %s != %s",
268 addr_to_sym(patch->patch_site),
269 addr_to_sym((void *)old_target),
270 addr_to_sym(patch->old_implf));
271 return (false);
272 }
273 /*
274 * ... and which is close enough in the address space to the new
275 * intended target to be within range of the near `jmp`.
276 */
277 if (!jmp_addr_to_off(instp, patch->new_implf, NULL)) {
278 cmn_err(CE_WARN, "vmm: near-JMP to new target %s is "
279 "too far for site %s",
280 addr_to_sym(patch->new_implf),
281 addr_to_sym(patch->patch_site));
282 return (false);
283 }
284 }
285
286 for (uint_t i = 0; i < ARRAY_SIZE(patches); i++) {
287 const struct indirection_patch *patch = &patches[i];
288 struct jmp_instr *instp = (void *)patch->patch_site;
289
290 int32_t new_off;
291 VERIFY(jmp_addr_to_off(instp, patch->new_implf, &new_off));
292 /*
293 * We do not meet the demand from hot_patch_kernel_text() that
294 * the to-be-patched data be aligned with the patch size. On
295 * x86_64, with the caveat that no other CPU(s) will be
296 * concurrently executing the patched instructions, we can break
297 * the rules without fear of disaster.
298 */
299 hot_patch_kernel_text((caddr_t)&instp->off, new_off, 4);
300 }
301 return (true);
302 }
303
304 bool
305 vmm_gpt_init(const struct vmm_pte_impl *target_impl)
306 {
307 VERIFY3P(vmm_pte_impl, ==, &vmm_pte_uninit_impl);
308
309 if (vmm_gpt_patch_indirection(vmm_pte_impl, target_impl)) {
310 vmm_pte_impl = target_impl;
311 return (true);
312 }
313 return (false);
314 }
315
316 void
317 vmm_gpt_fini(void)
318 {
319 /*
320 * Restore the indirection stubs back to their original (panicking)
321 * implementations. All potential callers should be excluded since this
322 * is only done during module unload.
323 */
324 VERIFY(vmm_gpt_patch_indirection(vmm_pte_impl, &vmm_pte_uninit_impl));
325 vmm_pte_impl = &vmm_pte_uninit_impl;
326 }
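
/*
 * Illustrative sketch of backend registration.  All of the `ept_*` names and
 * constants below are hypothetical; the actual EPT and nPT implementations
 * are supplied by their respective backends.
 *
 *	static const struct vmm_pte_impl ept_pte_impl = {
 *		.vpi_map_table = ept_map_table,
 *		.vpi_map_page = ept_map_page,
 *		.vpi_pte_parse = ept_pte_parse,
 *		.vpi_bit_accessed = EPT_BIT_ACCESSED,
 *		.vpi_bit_dirty = EPT_BIT_DIRTY,
 *		.vpi_get_pmtp = ept_get_pmtp,
 *		.vpi_hw_ad_supported = ept_hw_ad_supported,
 *	};
 *
 *	if (!vmm_gpt_init(&ept_pte_impl))
 *		return (EIO);
 *
 * vmm_gpt_fini() would then be called during module unload, once all
 * potential callers have been excluded.
 */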
327
328 /*
329 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
330 * hold the PTEs it contains.
331 */
332 static vmm_gpt_node_t *
333 vmm_gpt_node_alloc(void)
334 {
335 vmm_gpt_node_t *node;
336 caddr_t page;
337
338 node = kmem_zalloc(sizeof (*node), KM_SLEEP);
339 /*
340 * Note: despite the man page, allocating PAGESIZE bytes is
341 * guaranteed to be page-aligned.
342 */
343 page = kmem_zalloc(PAGESIZE, KM_SLEEP);
344 node->vgn_entries = (vmm_gpt_entry_t *)page;
345 node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
346
347 return (node);
348 }
349
350 /*
351 * Allocates and initializes a vmm_gpt_t.
352 */
353 vmm_gpt_t *
354 vmm_gpt_alloc(void)
355 {
356 vmm_gpt_t *gpt = kmem_zalloc(sizeof (vmm_gpt_t), KM_SLEEP);
357 gpt->vgpt_root = vmm_gpt_node_alloc();
358
359 return (gpt);
360 }
361
362 /*
363 * Frees a given node. The node is expected to have no familial (parent,
364 * children, siblings) associations at this point. Accordingly, its reference
365 * count should be zero.
366 */
367 static void
368 vmm_gpt_node_free(vmm_gpt_node_t *node)
369 {
370 ASSERT(node != NULL);
371 ASSERT3U(node->vgn_ref_cnt, ==, 0);
372 ASSERT(node->vgn_host_pfn != PFN_INVALID);
373 ASSERT(node->vgn_entries != NULL);
374 ASSERT(node->vgn_parent == NULL);
375
376 kmem_free(node->vgn_entries, PAGESIZE);
377 kmem_free(node, sizeof (*node));
378 }
379
380 /*
381 * Frees a vmm_gpt_t. Any lingering nodes in the GPT will be freed too.
382 */
383 void
384 vmm_gpt_free(vmm_gpt_t *gpt)
385 {
386 /* Empty anything remaining in the tree */
387 vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);
388
389 VERIFY(gpt->vgpt_root != NULL);
390 VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);
391
392 vmm_gpt_node_free(gpt->vgpt_root);
393 kmem_free(gpt, sizeof (*gpt));
394 }
395
396 /*
397 * Given a GPA, return its corresponding index in a paging structure at the
398 * provided level.
399 */
400 static inline uint16_t
401 vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
402 {
403 ASSERT(level < MAX_GPT_LEVEL);
404
405 const uint16_t mask = MAX_NODE_IDX;
406 switch (level) {
407 case LEVEL4: return ((gpa >> 39) & mask);
408 case LEVEL3: return ((gpa >> 30) & mask);
409 case LEVEL2: return ((gpa >> 21) & mask);
410 case LEVEL1: return ((gpa >> 12) & mask);
411 default:
412 panic("impossible level value");
413 };
414 }
415
416 /* Get mask for addresses of entries at a given table level. */
417 static inline uint64_t
418 vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
419 {
420 ASSERT(level < MAX_GPT_LEVEL);
421
422 switch (level) {
423 case LEVEL4: return (0xffffff8000000000ul); /* entries cover 512G */
424 case LEVEL3: return (0xffffffffc0000000ul); /* entries cover 1G */
425 case LEVEL2: return (0xffffffffffe00000ul); /* entries cover 2M */
426 case LEVEL1: return (0xfffffffffffff000ul); /* entries cover 4K */
427 default:
428 panic("impossible level value");
429 };
430 }
431
432 /* Get length of GPA covered by entries at a given table level. */
433 static inline uint64_t
434 vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
435 {
436 ASSERT(level < MAX_GPT_LEVEL);
437
438 switch (level) {
439 case LEVEL4: return (0x8000000000ul); /* entries cover 512G */
440 case LEVEL3: return (0x40000000ul); /* entries cover 1G */
441 case LEVEL2: return (0x200000ul); /* entries cover 2M */
442 case LEVEL1: return (0x1000ul); /* entries cover 4K */
443 default:
444 panic("impossible level value");
445 };
446 }
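
/*
 * Worked example: for a GPA of 0x8040201000, vmm_gpt_lvl_index() yields index
 * 1 at every level, since that address is 1 * 512G + 1 * 1G + 1 * 2M + 1 * 4K:
 *
 *	LEVEL4: (0x8040201000 >> 39) & 0x1ff == 1
 *	LEVEL3: (0x8040201000 >> 30) & 0x1ff == 1
 *	LEVEL2: (0x8040201000 >> 21) & 0x1ff == 1
 *	LEVEL1: (0x8040201000 >> 12) & 0x1ff == 1
 *
 * Masking the same GPA with vmm_gpt_lvl_mask(LEVEL2) gives 0x8040200000, the
 * base of the 2M region containing it, while vmm_gpt_lvl_len(LEVEL2) is the
 * 0x200000 bytes which a single LEVEL2 entry covers.
 */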
447
448 /* Get the index of a PTE pointer based on its offset in that table */
449 static inline uint16_t
450 vmm_gpt_ptep_index(const vmm_gpt_entry_t *ptep)
451 {
452 const uintptr_t offset = (uintptr_t)ptep & 0xffful;
453 return (offset / sizeof (uint64_t));
454 }
455
456 /*
457 * Get the ending GPA which this node could possibly cover given its base
458 * address and level.
459 */
460 static inline uint64_t
461 vmm_gpt_node_end(vmm_gpt_node_t *node)
462 {
463 ASSERT(node->vgn_level > LEVEL4);
464 return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
465 }
466
467 /*
468 * Is this node the last entry in its parent node, based solely on its GPA?
469 */
470 static inline bool
471 vmm_gpt_node_is_last(vmm_gpt_node_t *node)
472 {
473 return (node->vgn_index == MAX_NODE_IDX);
474 }
475
476 /*
477 * Count how many table entries (if any) in this node are covered by the
478 * range [start, end).
479 */
480 static uint16_t
481 vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
482 {
483 const uint64_t node_end = vmm_gpt_node_end(node);
484
485 /* Is this node covered at all by the region? */
486 if (start >= node_end || end <= node->vgn_gpa) {
487 return (0);
488 }
489
490 const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
491 const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
492 const uint64_t covered_end = MIN(node_end, end & mask);
493 const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);
494
495 return ((covered_end - covered_start) / per_entry);
496 }
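
/*
 * Worked example (hypothetical values): for a LEVEL1 node with a vgn_gpa of
 * 0x200000 (covering [0x200000, 0x400000)), a request of start = 0x3f0000 and
 * end = 0x410000 clamps to [0x3f0000, 0x400000), which spans 16 of the node's
 * 4K entries.
 */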
497
498 /*
499 * Find the next node (by address) in the tree at the same level.
500 *
501 * Returns NULL if this is the last node in the tree or if `only_seq` was true
502 * and there is an address gap between this node and the next.
503 */
504 static vmm_gpt_node_t *
505 vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
506 {
507 ASSERT3P(node->vgn_parent, !=, NULL);
508 ASSERT3U(node->vgn_level, >, LEVEL4);
509
510 /*
511 * Next node sequentially would be the one at the address starting at
512 * the end of what is covered by this node.
513 */
514 const uint64_t gpa_match = vmm_gpt_node_end(node);
515
516 /* Try our next sibling */
517 vmm_gpt_node_t *next = node->vgn_sib_next;
518
519 if (next == NULL) {
520 /*
521 * If the next-sibling pointer is NULL on the node, it can mean
522 * one of two things:
523 *
524 * 1. This entry represents the space leading up to the trailing
525 * boundary of what this node covers.
526 *
527 * 2. The node is not entirely populated, and there is a gap
528 * between the last populated entry, and the trailing
529 * boundary of the node.
530 *
531 * Either way, the proper course of action is to check the first
532 * child of our parent's next sibling (if the parent is not the
533 * root of the GPT itself).
534 */
535 if (node->vgn_parent != NULL && node->vgn_level > LEVEL3) {
536 vmm_gpt_node_t *psibling =
537 vmm_gpt_node_next(node->vgn_parent, true);
538 if (psibling != NULL) {
539 next = psibling->vgn_children;
540 }
541 }
542 }
543
544 if (next != NULL &&
545 (next->vgn_gpa == gpa_match || !only_seq)) {
546 return (next);
547 }
548
549 return (NULL);
550 }
551
552
553 /*
554 * Finds the child for the given GPA in the given parent node.
555 * Returns a pointer to the node, or NULL if it is not found.
556 */
557 static vmm_gpt_node_t *
558 vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
559 {
560 const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
561 for (vmm_gpt_node_t *child = parent->vgn_children;
562 child != NULL && child->vgn_index <= index;
563 child = child->vgn_sib_next) {
564 if (child->vgn_index == index)
565 return (child);
566 }
567
568 return (NULL);
569 }
570
571 /*
572 * Add a child node to the GPT at a position determined by GPA, parent, and (if
573 * present) preceding sibling.
574 *
575 * If `parent` node contains any children, `prev_sibling` must be populated with
576 * a pointer to the node preceding (by GPA) the to-be-added child node.
577 */
578 static void
579 vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
580 vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
581 {
582 ASSERT3U(parent->vgn_level, <, LEVEL1);
583 ASSERT3P(child->vgn_parent, ==, NULL);
584
585 const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
586 child->vgn_index = idx;
587 child->vgn_level = parent->vgn_level + 1;
588 child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);
589
590 /* Establish familial connections */
591 child->vgn_parent = parent;
592 if (prev_sibling != NULL) {
593 ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);
594
595 child->vgn_sib_next = prev_sibling->vgn_sib_next;
596 if (child->vgn_sib_next != NULL) {
597 child->vgn_sib_next->vgn_sib_prev = child;
598 }
599 child->vgn_sib_prev = prev_sibling;
600 prev_sibling->vgn_sib_next = child;
601 } else if (parent->vgn_children != NULL) {
602 vmm_gpt_node_t *next_sibling = parent->vgn_children;
603
604 ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
605 ASSERT3P(next_sibling->vgn_sib_prev, ==, NULL);
606
607 child->vgn_sib_next = next_sibling;
608 child->vgn_sib_prev = NULL;
609 next_sibling->vgn_sib_prev = child;
610 parent->vgn_children = child;
611 } else {
612 parent->vgn_children = child;
613 child->vgn_sib_next = NULL;
614 child->vgn_sib_prev = NULL;
615 }
616
617 /* Configure PTE for child table */
618 parent->vgn_entries[idx] = vmm_gpti_map_table(child->vgn_host_pfn);
619 parent->vgn_ref_cnt++;
620 }
621
622 /*
623 * Remove a child node from its relatives (parent, siblings) and free it.
624 */
625 static void
626 vmm_gpt_node_remove(vmm_gpt_node_t *child)
627 {
628 ASSERT3P(child->vgn_children, ==, NULL);
629 ASSERT3U(child->vgn_ref_cnt, ==, 0);
630 ASSERT3P(child->vgn_parent, !=, NULL);
631
632 /* Unlink child from its siblings and parent */
633 vmm_gpt_node_t *parent = child->vgn_parent;
634 vmm_gpt_node_t *prev = child->vgn_sib_prev;
635 vmm_gpt_node_t *next = child->vgn_sib_next;
636 if (prev != NULL) {
637 ASSERT3P(prev->vgn_sib_next, ==, child);
638 prev->vgn_sib_next = next;
639 }
640 if (next != NULL) {
641 ASSERT3P(next->vgn_sib_prev, ==, child);
642 next->vgn_sib_prev = prev;
643 }
644 if (prev == NULL) {
645 ASSERT3P(parent->vgn_children, ==, child);
646 parent->vgn_children = next;
647 }
648 child->vgn_parent = NULL;
649 child->vgn_sib_next = NULL;
650 child->vgn_sib_prev = NULL;
651 parent->vgn_entries[child->vgn_index] = 0;
652 parent->vgn_ref_cnt--;
653
654 vmm_gpt_node_free(child);
655 }
656
657 /*
658 * Walks the GPT for the given GPA, accumulating entries to the given depth. If
659 * the walk terminates before the depth is reached, the remaining entries are
660 * written with NULLs.
661 *
662 * Returns the GPA corresponding to the deepest populated (non-NULL) entry
663 * encountered during the walk.
664 */
665 uint64_t
666 vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
667 vmm_gpt_node_level_t depth)
668 {
669 ASSERT(gpt != NULL);
670 ASSERT3U(depth, <, MAX_GPT_LEVEL);
671
672 vmm_gpt_entry_t *current_entries = gpt->vgpt_root->vgn_entries;
673 uint64_t mask = 0;
674 for (uint_t lvl = LEVEL4; lvl <= depth; lvl++) {
675 if (current_entries == NULL) {
676 entries[lvl] = NULL;
677 continue;
678 }
679 entries[lvl] = &current_entries[vmm_gpt_lvl_index(lvl, gpa)];
680 mask = vmm_gpt_lvl_mask(lvl);
681 const vmm_gpt_entry_t pte = *entries[lvl];
682
683 pfn_t pfn;
684 if (!vmm_gpti_parse(pte, &pfn, NULL)) {
685 current_entries = NULL;
686 continue;
687 }
688 current_entries = (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
689 }
690 return (gpa & mask);
691 }
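
/*
 * Illustrative sketch of a lookup built on the walk primitive (the `gpt` and
 * `gpa` values here are assumed to be caller-owned):
 *
 *	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *	pfn_t pfn;
 *	uint_t prot;
 *
 *	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *	if (entries[LEVEL1] != NULL &&
 *	    vmm_gpte_is_mapped(entries[LEVEL1], &pfn, &prot)) {
 *		... gpa is backed by pfn with protections prot ...
 *	}
 */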
692
693 /*
694 * Given a `gpa`, and the corresponding `entries` array as queried by a call to
695 * `vmm_gpt_walk()`, attempt to advance to the next PTE at the `depth` provided
696 * by the caller.
697 *
698 * As there may not be PTEs present at the requested `depth`, the amount that
699 * the GPA is advanced may be larger than what is expected for that provided
700 * depth.
701 *
702 * Returns the GPA corresponding to the PTE to which `entries` was advanced.
703 */
704 static uint64_t
705 vmm_gpt_walk_advance(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
706 vmm_gpt_node_level_t depth)
707 {
708 ASSERT(gpt != NULL);
709 ASSERT3U(depth, <, MAX_GPT_LEVEL);
710 ASSERT0(gpa & ~vmm_gpt_lvl_mask(depth));
711
712 /*
713 * Advance to the next entry in the deepest level of PTE possible,
714 * ascending to higher levels if we reach the end of a table at any
715 * given point.
716 */
717 int lvl;
718 for (lvl = depth; lvl >= LEVEL4; lvl--) {
719 vmm_gpt_entry_t *ptep = entries[lvl];
720
721 if (ptep == NULL) {
722 continue;
723 }
724
725 uint16_t index = vmm_gpt_ptep_index(ptep);
726 ASSERT3U(vmm_gpt_lvl_index(lvl, gpa), ==, index);
727 if (index == MAX_NODE_IDX) {
728 continue;
729 }
730
731 gpa = (gpa & vmm_gpt_lvl_mask(lvl)) + vmm_gpt_lvl_len(lvl);
732 entries[lvl] = ptep + 1;
733 break;
734 }
735
736 if (lvl < LEVEL4) {
737 /* Advanced off the end of the tables */
738 return (UINT64_MAX);
739 }
740
741 /*
742 * Attempt to walk back down to the target level if any ascension was
743 * necessary during the above advancement.
744 */
745 vmm_gpt_entry_t pte = *entries[lvl];
746 lvl++;
747 for (; lvl < MAX_GPT_LEVEL; lvl++) {
748 pfn_t pfn;
749
750 if (lvl > depth || !vmm_gpti_parse(pte, &pfn, NULL)) {
751 entries[lvl] = NULL;
752 continue;
753 }
754
755 vmm_gpt_entry_t *next_table =
756 (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
757 const uint16_t index = vmm_gpt_lvl_index(lvl, gpa);
758 pte = next_table[index];
759 entries[lvl] = &next_table[index];
760 }
761
762 return (gpa);
763 }
764
765 /*
766 * Initialize iteration over GPT leaf entries between [addr, addr + len). The
767 * specified interval must be page-aligned and not overflow the end of the
768 * address space.
769 *
770 * Subsequent calls to vmm_gpt_iter_next() will emit the encountered entries.
771 */
772 void
773 vmm_gpt_iter_init(vmm_gpt_iter_t *iter, vmm_gpt_t *gpt, uint64_t addr,
774 uint64_t len)
775 {
776 ASSERT0(addr & PAGEOFFSET);
777 ASSERT0(len & PAGEOFFSET);
778 ASSERT3U((addr + len), >=, addr);
779
780 iter->vgi_gpt = gpt;
781 iter->vgi_addr = addr;
782 iter->vgi_end = addr + len;
783 iter->vgi_current = vmm_gpt_walk(gpt, addr, iter->vgi_entries, LEVEL1);
784 }
785
786 /*
787 * Fetch the next entry (if any) from an iterator initialized from a preceding
788 * call to vmm_gpt_iter_init(). Returns true when the next GPT leaf entry
789 * inside the iterator range has been populated in `entry`. Returns `false`
790 * when there are no such entries available or remaining in the range.
791 */
792 bool
793 vmm_gpt_iter_next(vmm_gpt_iter_t *iter, vmm_gpt_iter_entry_t *entry)
794 {
795 if (iter->vgi_current >= iter->vgi_end) {
796 return (false);
797 }
798
799 while (iter->vgi_current < iter->vgi_end) {
800 bool found = false;
801 if (iter->vgi_entries[LEVEL1] != NULL) {
802 entry->vgie_gpa = iter->vgi_current;
803 entry->vgie_ptep = iter->vgi_entries[LEVEL1];
804 found = true;
805 }
806
807 iter->vgi_current = vmm_gpt_walk_advance(iter->vgi_gpt,
808 iter->vgi_current, iter->vgi_entries, LEVEL1);
809
810 if (found) {
811 return (true);
812 }
813 }
814 return (false);
815 }
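
/*
 * Illustrative sketch of iterator usage, counting the mapped pages in a
 * (hypothetical) region of [addr, addr + len):
 *
 *	vmm_gpt_iter_t iter;
 *	vmm_gpt_iter_entry_t entry;
 *	size_t mapped = 0;
 *
 *	vmm_gpt_iter_init(&iter, gpt, addr, len);
 *	while (vmm_gpt_iter_next(&iter, &entry)) {
 *		if (vmm_gpte_is_mapped(entry.vgie_ptep, NULL, NULL))
 *			mapped++;
 *	}
 */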
816
817 /*
818 * Populate child table nodes for a given level between the provided interval
819 * of [addr, addr + len). The caller is expected to provide a pointer to the
820 * parent node which would contain the child node for the GPA at `addr`. A
821 * pointer to said child node will be returned when the operation is complete.
822 */
823 static vmm_gpt_node_t *
824 vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
825 vmm_gpt_node_t *node_start)
826 {
827 const vmm_gpt_node_level_t lvl = node_start->vgn_level;
828 const uint64_t end = addr + len;
829 const uint64_t incr = vmm_gpt_lvl_len(lvl);
830 uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
831 vmm_gpt_node_t *parent = node_start;
832
833 /* Try to locate node at starting address */
834 vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
835 while (node != NULL && node->vgn_gpa < gpa) {
836 prev = node;
837 node = node->vgn_sib_next;
838 }
839
840 /*
841 * If no node exists at the starting address, create one and link it
842 * into the parent.
843 */
844 if (node == NULL || node->vgn_gpa > gpa) {
845 /* Need to insert node for starting GPA */
846 node = vmm_gpt_node_alloc();
847 vmm_gpt_node_add(gpt, parent, node, gpa, prev);
848 }
849
850 vmm_gpt_node_t *front_node = node;
851 prev = node;
852 gpa += incr;
853
854 /*
855 * With a node at the starting address, walk forward creating nodes in
856 * any of the gaps.
857 */
858 for (; gpa < end; gpa += incr, prev = node) {
859 node = vmm_gpt_node_next(prev, true);
860 if (node != NULL) {
861 ASSERT3U(node->vgn_gpa, ==, gpa);
862
863 /* We may have crossed into a new parent */
864 parent = node->vgn_parent;
865 continue;
866 }
867
868 if (vmm_gpt_node_is_last(prev)) {
869 /*
870 * The node preceding this was the last one in its
871 * containing parent, so move on to that parent's
872 * sibling. We expect (demand) that it exist already.
873 */
874 parent = vmm_gpt_node_next(parent, true);
875 ASSERT(parent != NULL);
876
877 /*
878 * Forget our previous sibling, since it is of no use
879 * for assigning the new node to the now-different
880 * parent.
881 */
882 prev = NULL;
883
884 }
885 node = vmm_gpt_node_alloc();
886 vmm_gpt_node_add(gpt, parent, node, gpa, prev);
887 }
888
889 return (front_node);
890 }
891
892 /*
893 * Ensures that PTEs for the region of address space bounded by
894 * [addr, addr + len) exist in the tree.
895 */
896 void
897 vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
898 {
899 ASSERT0(addr & PAGEOFFSET);
900 ASSERT0(len & PAGEOFFSET);
901
902 /*
903 * Starting at the top of the tree, ensure that tables covering the
904 * requested region exist at each level.
905 */
906 vmm_gpt_node_t *node = gpt->vgpt_root;
907 for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
908 ASSERT3U(node->vgn_level, ==, lvl);
909
910 node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
911 }
912
913
914 /*
915 * Establish reference counts for the soon-to-be memory PTEs which will
916 * be filling these LEVEL1 tables.
917 */
918 uint64_t gpa = addr;
919 const uint64_t end = addr + len;
920 while (gpa < end) {
921 ASSERT(node != NULL);
922 ASSERT3U(node->vgn_level, ==, LEVEL1);
923
924 const uint16_t covered =
925 vmm_gpt_node_entries_covered(node, addr, end);
926
927 ASSERT(covered != 0);
928 ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
929 ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);
930
931 node->vgn_ref_cnt += covered;
932
933 vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
934 if (next != NULL) {
935 gpa = next->vgn_gpa;
936 node = next;
937 } else {
938 /*
939 * We do not expect to find a subsequent node after
940 * filling the last node in the table, completing PTE
941 * accounting for the specified range.
942 */
943 VERIFY3U(end, <=, vmm_gpt_node_end(node));
944 break;
945 }
946 }
947 }
948
949 /*
950 * Format a PTE and install it in the provided PTE-pointer.
951 *
952 * The caller must ensure that a conflicting PFN is not mapped at the requested
953 * location. Racing operations to map the same PFN at one location are
954 * acceptable and properly handled.
955 */
956 bool
957 vmm_gpt_map_at(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t pfn, uint_t prot,
958 uint8_t attr)
959 {
960 const vmm_gpt_entry_t pte = vmm_gpti_map_page(pfn, prot, attr);
961 const vmm_gpt_entry_t old_pte = atomic_cas_64(ptep, 0, pte);
962 if (old_pte != 0) {
963 #ifdef DEBUG
964 pfn_t new_pfn, old_pfn;
965
966 ASSERT(vmm_gpti_parse(pte, &new_pfn, NULL));
967 ASSERT(vmm_gpti_parse(old_pte, &old_pfn, NULL));
968 ASSERT3U(old_pfn, ==, new_pfn);
969 #endif /* DEBUG */
970 return (false);
971 }
972
973 return (true);
974 }
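
/*
 * Illustrative sketch of the fault path this is meant for (the `gpt`, `gpa`,
 * `pfn`, and `attr` values are hypothetical): the region containing `gpa`
 * must already have been populated, so the LEVEL1 PTE pointer is expected to
 * exist.
 *
 *	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *
 *	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *	ASSERT(entries[LEVEL1] != NULL);
 *	if (!vmm_gpt_map_at(gpt, entries[LEVEL1], pfn,
 *	    PROT_READ | PROT_WRITE, attr)) {
 *		... lost the race; the same pfn was already mapped here ...
 *	}
 */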
975
976 /*
977 * Cleans up the unused inner nodes in the GPT for a region of guest physical
978 * address space of [addr, addr + len). The region must map no pages.
979 */
980 void
981 vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
982 {
983 ASSERT0(addr & PAGEOFFSET);
984 ASSERT0(len & PAGEOFFSET);
985
986 const uint64_t end = addr + len;
987 vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
988 [LEVEL4] = gpt->vgpt_root,
989 };
990
991 for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
992 node = vmm_gpt_node_find_child(starts[lvl], addr);
993 if (node == NULL) {
994 break;
995 }
996 starts[lvl + 1] = node;
997 }
998
999 /*
1000 * Starting at the bottom of the tree, ensure that PTEs for pages have
1001 * been cleared for the region, and remove the corresponding reference
1002 * counts from the containing LEVEL1 tables.
1003 */
1004 uint64_t gpa = addr;
1005 node = starts[LEVEL1];
1006 while (gpa < end && node != NULL) {
1007 const uint16_t covered =
1008 vmm_gpt_node_entries_covered(node, addr, end);
1009
1010 ASSERT3U(node->vgn_ref_cnt, >=, covered);
1011 node->vgn_ref_cnt -= covered;
1012
1013 node = vmm_gpt_node_next(node, false);
1014 if (node != NULL) {
1015 gpa = node->vgn_gpa;
1016 }
1017 }
1018
1019 /*
1020 * With the page PTE references eliminated, work up from the bottom of
1021 * the table, removing nodes which have no remaining references.
1022 *
1023 * This stops short of LEVEL4, which is the root table of the GPT. It
1024 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
1025 */
1026 for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
1027 gpa = addr;
1028 node = starts[lvl];
1029
1030 while (gpa < end && node != NULL) {
1031 vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);
1032
1033 if (node->vgn_ref_cnt == 0) {
1034 vmm_gpt_node_remove(node);
1035 }
1036 if (next != NULL) {
1037 gpa = next->vgn_gpa;
1038 }
1039 node = next;
1040 }
1041 }
1042 }
1043
1044 /*
1045 * Remove a mapping from the table. Returns false if the page was not mapped,
1046 * otherwise returns true.
1047 */
1048 bool
1049 vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
1050 {
1051 vmm_gpt_entry_t *entries[MAX_GPT_LEVEL], pte;
1052
1053 ASSERT(gpt != NULL);
1054 (void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
1055 if (entries[LEVEL1] == NULL)
1056 return (false);
1057
1058 pte = *entries[LEVEL1];
1059 *entries[LEVEL1] = 0;
1060 return (vmm_gpti_parse(pte, NULL, NULL));
1061 }
1062
1063 /*
1064 * Un-maps the region of guest physical address space bounded by [addr, addr + len).
1065 * Returns the number of pages that are unmapped.
1066 */
1067 size_t
1068 vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
1069 {
1070 ASSERT0(addr & PAGEOFFSET);
1071 ASSERT0(len & PAGEOFFSET);
1072
1073 vmm_gpt_iter_t state;
1074 vmm_gpt_iter_entry_t entry;
1075 size_t num_unmapped = 0;
1076
1077 vmm_gpt_iter_init(&state, gpt, addr, len);
1078 while (vmm_gpt_iter_next(&state, &entry)) {
1079 if (entry.vgie_ptep == NULL) {
1080 continue;
1081 }
1082
1083 const vmm_gpt_entry_t pte = *entry.vgie_ptep;
1084 *entry.vgie_ptep = 0;
1085 if (vmm_gpti_parse(pte, NULL, NULL)) {
1086 num_unmapped++;
1087 }
1088 }
1089
1090 return (num_unmapped);
1091 }
1092
1093 /*
1094 * Returns a value indicating whether or not this PTE maps a page. If it
1095 * does, *pfnp and *protp will be filled with the PFN and protection bits
1096 * of the entry. Otherwise, they are ignored.
1097 */
1098 bool
1099 vmm_gpte_is_mapped(const vmm_gpt_entry_t *ptep, pfn_t *pfnp, uint_t *protp)
1100 {
1101 ASSERT(ptep != NULL);
1102
1103 return (vmm_gpti_parse(*ptep, pfnp, protp));
1104 }
1105
1106
1107 static uint_t
1108 vmm_gpt_reset_bits(volatile uint64_t *ptep, uint64_t mask, uint64_t bits)
1109 {
1110 uint64_t pte, newpte, oldpte = 0;
1111
1112 /*
1113 * We use volatile and atomic ops here because we may be
1114 * racing against hardware modifying these bits.
1115 */
1116 VERIFY3P(ptep, !=, NULL);
1117 oldpte = *ptep;
1118 do {
1119 pte = oldpte;
1120 newpte = (pte & ~mask) | bits;
1121 oldpte = atomic_cas_64(ptep, pte, newpte);
1122 } while (oldpte != pte);
1123
1124 return (oldpte & mask);
1125 }
1126
1127 /*
1128 * Resets the accessed bit on the page table entry pointed to by `ptep`.
1129 * If `on` is true, the bit will be set, otherwise it will be cleared.
1130 * The old value of the bit is returned.
1131 */
1132 bool
1133 vmm_gpte_reset_accessed(vmm_gpt_entry_t *ptep, bool on)
1134 {
1135 ASSERT(ptep != NULL);
1136
1137 const uint64_t accessed_bit = vmm_pte_impl->vpi_bit_accessed;
1138 const uint64_t dirty_bit = vmm_pte_impl->vpi_bit_dirty;
1139
1140 const uint64_t old_state = vmm_gpt_reset_bits(ptep,
1141 accessed_bit | dirty_bit, on ? accessed_bit : 0);
1142 return (old_state != 0);
1143 }
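
/*
 * Illustrative sketch of accessed-bit harvesting over a (hypothetical) region,
 * as a page-aging scan might perform:
 *
 *	vmm_gpt_iter_t iter;
 *	vmm_gpt_iter_entry_t entry;
 *
 *	vmm_gpt_iter_init(&iter, gpt, addr, len);
 *	while (vmm_gpt_iter_next(&iter, &entry)) {
 *		if (vmm_gpte_reset_accessed(entry.vgie_ptep, false))
 *			... page at entry.vgie_gpa was touched ...
 *	}
 *
 * Note that clearing the accessed bit also clears the dirty bit, and the
 * return value reflects whether either had been set.
 */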
1144
1145 /*
1146 * Resets the dirty bit on the page table entry pointed to by `ptep`.
1147 * If `on` is true, the bit will be set, otherwise it will be cleared.
1148 * The old value of the bit is returned.
1149 */
1150 bool
1151 vmm_gpte_reset_dirty(vmm_gpt_entry_t *ptep, bool on)
1152 {
1153 ASSERT(ptep != NULL);
1154 const uint64_t dirty_bit = vmm_pte_impl->vpi_bit_dirty;
1155
1156 const uint64_t old_state =
1157 vmm_gpt_reset_bits(ptep, dirty_bit, on ? dirty_bit : 0);
1158 return (old_state != 0);
1159 }
1160
1161 bool
1162 vmm_gpte_query_accessed(const vmm_gpt_entry_t *ptep)
1163 {
1164 ASSERT(ptep != NULL);
1165 return ((*ptep & vmm_pte_impl->vpi_bit_accessed) != 0);
1166 }
1167
1168 bool
1169 vmm_gpte_query_dirty(const vmm_gpt_entry_t *ptep)
1170 {
1171 ASSERT(ptep != NULL);
1172 return ((*ptep & vmm_pte_impl->vpi_bit_dirty) != 0);
1173 }
1174
1175 /*
1176 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
1177 */
1178 uint64_t
1179 vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
1180 {
1181 const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
1182 return (vmm_pte_impl->vpi_get_pmtp(root_pfn, track_dirty));
1183 }
1184
1185 /*
1186 * Does the GPT hardware support dirty-page-tracking?
1187 */
1188 bool
1189 vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
1190 {
1191 return (vmm_pte_impl->vpi_hw_ad_supported());
1192 }
1193