xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_gpt.c (revision 50fe091cff3f2dccec5f588584a3ccb4f9933570)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2025 Oxide Computer Company
16  */
17 
18 #include <sys/types.h>
19 #include <sys/atomic.h>
20 #include <sys/kmem.h>
21 #include <sys/sysmacros.h>
22 #include <sys/sunddi.h>
23 #include <sys/panic.h>
24 #include <vm/hat.h>
25 #include <vm/as.h>
26 #include <vm/hat_i86.h>
27 
28 #include <sys/vmm_gpt.h>
29 #include <sys/vmm_gpt_impl.h>
30 
31 /*
32  * VMM Generic Page Tables
33  *
34  * Bhyve runs on AMD and Intel hosts and both support nested page tables
35  * describing the guest's physical address space.  But the two use different and
36  * mutually incompatible page table formats: Intel uses the EPT, which is based
37  * on the Itanium page table format, while AMD uses the nPT, which is based on
38  * the x86_64 page table format.
39  *
40  * The GPT abstracts these format differences, and provides a single interface
41  * for interacting with either kind of table structure.
42  *
43  * At a high level, the GPT is a tree that mirrors the paging table radix tree.
44  * It is parameterized with operations on PTEs that are specific to the table
45  * type (EPT or nPT) and also keeps track of how many pages the table maps, as
46  * well as a pointer to the root node in the tree.
47  *
48  * A node in the GPT keeps pointers to its parent (NULL for the root), its
49  * left-most child, and its siblings.  The node understands its position in the
50  * tree in terms of the level it appears at and the index it occupies at its
51  * parent's level, as well as how many children it has.  It also owns the
52  * physical memory page for the hardware page table entries that map its
53  * children.  Thus, for a node at any given level in the tree, the nested PTE
54  * for that node's child at index $i$ is the i'th uint64_t in that node's entry
55  * page and the entry page is part of the paging structure consumed by hardware.
56  *
57  * The GPT interface provides functions for populating and vacating the tree for
58  * regions in the guest physical address space, and for mapping and unmapping
59  * pages in populated regions.  Users must populate a region before mapping
60  * pages into it, and must unmap pages before vacating the region.
61  *
62  * The interface also exposes a function for walking the table from the root to
63  * a leaf entry, populating an array of pointers to PTEs.  This walk uses the
64  * hardware page structure itself, and is thus fast, though as a result it
65  * potentially aliases entries; caveat emptor.  The walk primitive is used for
66  * mapping, unmapping, and lookups.
67  *
68  * Format-specific differences are abstracted by parameterizing the GPT with a
69  * set of PTE operations specific to the platform.  The GPT code makes use of
70  * these when mapping or populating entries, resetting accessed and dirty bits
71  * on entries, and similar operations.
72  */
73 
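/*
 * Illustrative sketch (editor's addition, not part of the implementation):
 * the expected call sequence for a consumer of this interface, using
 * hypothetical GPA/PFN/attr values.  Regions are populated before any page
 * is mapped into them, and pages are unmapped before the region is vacated.
 */
#if 0
static void
vmm_gpt_usage_sketch(void)
{
	vmm_gpt_t *gpt = vmm_gpt_alloc();

	/* Ensure table nodes exist for a 2M region starting at 1G */
	vmm_gpt_populate_region(gpt, 0x40000000, 0x200000);

	/* Map one (hypothetical) host page at the start of the region */
	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
	(void) vmm_gpt_walk(gpt, 0x40000000, entries, LEVEL1);
	(void) vmm_gpt_map_at(gpt, entries[LEVEL1], 0x12345,
	    PROT_READ | PROT_WRITE, 0 /* backend-specific attr */);

	/* Tear down in the opposite order */
	(void) vmm_gpt_unmap_region(gpt, 0x40000000, 0x200000);
	vmm_gpt_vacate_region(gpt, 0x40000000, 0x200000);
	vmm_gpt_free(gpt);
}
#endif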
74 /*
75  * A GPT node.
76  *
77  * Each node contains pointers to its parent, its left-most child, and its
78  * siblings.  Interior nodes also maintain a reference count, and each node
79  * contains its level and index in its parent's table.  Finally, each node
80  * contains the host PFN of the page that it links into the page table, as well
81  * as a kernel pointer to table.
82  *
83  * On leaf nodes, the reference count tracks how many entries in the table are
84  * covered by mappings from the containing vmspace.  This is maintained during
85  * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
86  * vmspace map/unmap operations, rather than in the data path of faults
87  * populating the PTEs themselves.
88  *
89  * Note, this is carefully sized to fit exactly into a 64-byte cache line.
90  */
91 typedef struct vmm_gpt_node vmm_gpt_node_t;
92 struct vmm_gpt_node {
93 	uint64_t	vgn_host_pfn;
94 	uint16_t	vgn_level;
95 	uint16_t	vgn_index;
96 	uint32_t	vgn_ref_cnt;
97 	vmm_gpt_node_t	*vgn_parent;
98 	vmm_gpt_node_t	*vgn_children;
99 	vmm_gpt_node_t	*vgn_sib_next;
100 	vmm_gpt_node_t	*vgn_sib_prev;
101 	vmm_gpt_entry_t	*vgn_entries;
102 	uint64_t	vgn_gpa;
103 };
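/*
 * Illustrative sketch (editor's addition): the cache-line claim above could
 * be verified at build time with a compile-time assertion such as this.
 */
#if 0
CTASSERT(sizeof (vmm_gpt_node_t) == 64);
#endif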
104 
105 /* Maximum node index determined by number of entries in page table (512) */
106 #define	PTE_PER_TABLE	512
107 #define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)
108 
109 /*
110  * A VMM Generic Page Table.
111  *
112  * The generic page table is a format-agnostic, 4-level paging structure
113  * modeling a second-level page table (EPT on Intel, nPT on AMD).  It holds
114  * a pointer to the root node of the tree; the format-specific PTE
115  * operations are supplied by the implementation installed at module
116  * initialization.
117  */
118 struct vmm_gpt {
119 	vmm_gpt_node_t	*vgpt_root;
120 };
121 
122 void
123 vmm_gpt_impl_panic(void)
124 {
125 	/*
126 	 * Bail out if a caller makes it to an indirect stub before complete
127 	 * hot-patching has occurred.
128 	 */
129 	panic("Indirect function not hot-patched");
130 }
131 
132 /*
133  * Function indirection stubs
134  *
135  * These are made valid (no longer jumping to vmm_gpt_impl_panic()) by
136  * vmm_gpt_init(), and are only to be called after that has run successfully
137  * during module initialization.
138  */
139 uint64_t vmm_gpti_map_table(uint64_t);
140 uint64_t vmm_gpti_map_page(uint64_t, uint_t, uint8_t);
141 bool vmm_gpti_parse(uint64_t, pfn_t *, uint_t *);
142 
143 const struct vmm_pte_impl vmm_pte_uninit_impl = {
144 	.vpi_map_table		= (void *)vmm_gpt_impl_panic,
145 	.vpi_map_page		= (void *)vmm_gpt_impl_panic,
146 	.vpi_pte_parse		= (void *)vmm_gpt_impl_panic,
147 	.vpi_bit_accessed	= 0,
148 	.vpi_bit_dirty		= 0,
149 
150 	.vpi_get_pmtp		= (void *)vmm_gpt_impl_panic,
151 	.vpi_hw_ad_supported	= (void *)vmm_gpt_impl_panic,
152 };
153 
154 static const struct vmm_pte_impl *vmm_pte_impl = &vmm_pte_uninit_impl;
155 
156 #define	JMP_NEAR_OPCODE	0xe9
157 
158 struct jmp_instr {
159 	uint8_t opcode;
160 	int32_t off;
161 } __packed;
162 
163 
164 /*
165  * Given a pointer to an immediate-near-`jmp` instruction, calculate the address
166  * of its target.
167  */
168 static uintptr_t
169 jmp_off_to_addr(const struct jmp_instr *instp)
170 {
171 	const uintptr_t next_rip = (uintptr_t)&instp[1];
172 	const uintptr_t off = (uintptr_t)(intptr_t)instp->off;
173 
174 	return (next_rip + off);
175 }
176 
177 /*
178  * Given a pointer to an immediate-near-`jmp` instruction, calculate the
179  * immediate value for a given target address, if possible.
180  *
181  * Returns false if the target address is not within range of the `jmp`.
182  * Stores the offset value in `resp` on success.
183  */
184 static bool
185 jmp_addr_to_off(const struct jmp_instr *instp, void *target, int32_t *resp)
186 {
187 	const uintptr_t next_rip = (uintptr_t)&instp[1];
188 	const intptr_t off = (uintptr_t)target - next_rip;
189 
190 	/* jump is not "near" */
191 	if (off < INT32_MIN || off > INT32_MAX) {
192 		return (false);
193 	}
194 
195 	if (resp != NULL) {
196 		*resp = (int32_t)off;
197 	}
198 	return (true);
199 }
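/*
 * Worked example (editor's addition, hypothetical addresses): for a 5-byte
 * near jmp located at 0xfffffffff7900000, next_rip is 0xfffffffff7900005.
 * Retargeting it at 0xfffffffff7a00000 requires
 *	off = 0xfffffffff7a00000 - 0xfffffffff7900005 = 0xffffb
 * which fits in an int32_t, so jmp_addr_to_off() stores 0xffffb and returns
 * true; jmp_off_to_addr() on the patched instruction yields the new target.
 */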
200 
201 /*
202  * Try to get the symbol name for a given kernel-virtual address.
203  *
204  * Returns a valid string pointer regardless of success or failure.
205  */
206 static inline const char *
207 addr_to_sym(void *addr)
208 {
209 	ulong_t ignored_offset;
210 	const char *name = modgetsymname((uintptr_t)addr, &ignored_offset);
211 
212 	return (name != NULL ? name : "<null>");
213 }
214 
215 /*
216  * Hot-patch frequently called PTE implementation functions.
217  *
218  * Since indirect function calls are relatively expensive in a post-Spectre
219  * world, we choose to call into the backend via stubs which are hot-patched
220  * during module initialization with the proper implementation.  During module
221  * unload, it is called again to re-patch the stubs back to their uninitialized
222  * state of pointing to vmm_gpt_impl_panic().
223  */
224 static bool
225 vmm_gpt_patch_indirection(const struct vmm_pte_impl *old_impl,
226     const struct vmm_pte_impl *new_impl)
227 {
228 	struct indirection_patch {
229 		void (*patch_site)();
230 		void (*old_implf)();
231 		void (*new_implf)();
232 	};
233 	const struct indirection_patch patches[] = {
234 		{
235 			.patch_site	= (void *)vmm_gpti_map_table,
236 			.old_implf	= (void *)old_impl->vpi_map_table,
237 			.new_implf	= (void *)new_impl->vpi_map_table,
238 		},
239 		{
240 			.patch_site	= (void *)vmm_gpti_map_page,
241 			.old_implf	= (void *)old_impl->vpi_map_page,
242 			.new_implf	= (void *)new_impl->vpi_map_page,
243 		},
244 		{
245 			.patch_site	= (void *)vmm_gpti_parse,
246 			.old_implf	= (void *)old_impl->vpi_pte_parse,
247 			.new_implf	= (void *)new_impl->vpi_pte_parse,
248 		},
249 	};
250 
251 	/* Check the patches for validity first */
252 	for (uint_t i = 0; i < ARRAY_SIZE(patches); i++) {
253 		const struct indirection_patch *patch = &patches[i];
254 		const struct jmp_instr *instp = (void *)patch->patch_site;
255 
256 		/* Expecting a near `jmp` */
257 		if (instp->opcode != JMP_NEAR_OPCODE) {
258 			cmn_err(CE_WARN, "vmm: non-JMP instruction found when "
259 			    "attempting to hotpatch %s",
260 			    addr_to_sym(patch->patch_site));
261 			return (false);
262 		}
263 		/* ... targeted at the expected function */
264 		const uintptr_t old_target = jmp_off_to_addr(instp);
265 		if (old_target != (uintptr_t)patch->old_implf) {
266 			cmn_err(CE_WARN, "vmm: JMP instr @ %s has unexpected "
267 			    "target %s != %s",
268 			    addr_to_sym(patch->patch_site),
269 			    addr_to_sym((void *)old_target),
270 			    addr_to_sym(patch->old_implf));
271 			return (false);
272 		}
273 		/*
274 		 * ... and which is close enough in the address space to the new
275 		 * intended target to be within range of the near `jmp`.
276 		 */
277 		if (!jmp_addr_to_off(instp, patch->new_implf, NULL)) {
278 			cmn_err(CE_WARN, "vmm: near-JMP to new target %s is "
279 			    "too far for site %s",
280 			    addr_to_sym(patch->new_implf),
281 			    addr_to_sym(patch->patch_site));
282 			return (false);
283 		}
284 	}
285 
286 	for (uint_t i = 0; i < ARRAY_SIZE(patches); i++) {
287 		const struct indirection_patch *patch = &patches[i];
288 		struct jmp_instr *instp = (void *)patch->patch_site;
289 
290 		int32_t new_off;
291 		VERIFY(jmp_addr_to_off(instp, patch->new_implf, &new_off));
292 		/*
293 		 * We do not meet the demand from hot_patch_kernel_text() that
294 		 * the to-be-patched data be aligned with the patch size.  On
295 		 * x86_64, with the caveat that no other CPU(s) will be
296 		 * concurrently executing the patched instructions, we can break
297 		 * the rules without fear of disaster.
298 		 */
299 		hot_patch_kernel_text((caddr_t)&instp->off, new_off, 4);
300 	}
301 	return (true);
302 }
303 
304 bool
305 vmm_gpt_init(const struct vmm_pte_impl *target_impl)
306 {
307 	VERIFY3P(vmm_pte_impl, ==, &vmm_pte_uninit_impl);
308 
309 	if (vmm_gpt_patch_indirection(vmm_pte_impl, target_impl)) {
310 		vmm_pte_impl = target_impl;
311 		return (true);
312 	}
313 	return (false);
314 }
315 
316 void
317 vmm_gpt_fini(void)
318 {
319 	/*
320 	 * Restore the indirection stubs back to their original (panicking)
321 	 * implementations.  All potential callers should be excluded since this
322 	 * is only done during module unload.
323 	 */
324 	VERIFY(vmm_gpt_patch_indirection(vmm_pte_impl, &vmm_pte_uninit_impl));
325 	vmm_pte_impl = &vmm_pte_uninit_impl;
326 }
327 
328 /*
329  * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
330  * hold the PTEs it contains.
331  */
332 static vmm_gpt_node_t *
333 vmm_gpt_node_alloc(void)
334 {
335 	vmm_gpt_node_t *node;
336 	caddr_t page;
337 
338 	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
339 	/*
340 	 * Note: despite the man page, allocating PAGESIZE bytes is
341 	 * guaranteed to be page-aligned.
342 	 */
343 	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
344 	node->vgn_entries = (vmm_gpt_entry_t *)page;
345 	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
346 
347 	return (node);
348 }
349 
350 /*
351  * Allocates and initializes a vmm_gpt_t.
352  */
353 vmm_gpt_t *
354 vmm_gpt_alloc(void)
355 {
356 	vmm_gpt_t *gpt = kmem_zalloc(sizeof (vmm_gpt_t), KM_SLEEP);
357 	gpt->vgpt_root = vmm_gpt_node_alloc();
358 
359 	return (gpt);
360 }
361 
362 /*
363  * Frees a given node.  The node is expected to have no familial (parent,
364  * children, siblings) associations at this point.  Accordingly, its reference
365  * count should be zero.
366  */
367 static void
368 vmm_gpt_node_free(vmm_gpt_node_t *node)
369 {
370 	ASSERT(node != NULL);
371 	ASSERT3U(node->vgn_ref_cnt, ==, 0);
372 	ASSERT(node->vgn_host_pfn != PFN_INVALID);
373 	ASSERT(node->vgn_entries != NULL);
374 	ASSERT(node->vgn_parent == NULL);
375 
376 	kmem_free(node->vgn_entries, PAGESIZE);
377 	kmem_free(node, sizeof (*node));
378 }
379 
380 /*
381  * Frees a vmm_gpt_t.  Any lingering nodes in the GPT will be freed too.
382  */
383 void
384 vmm_gpt_free(vmm_gpt_t *gpt)
385 {
386 	/* Empty anything remaining in the tree */
387 	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);
388 
389 	VERIFY(gpt->vgpt_root != NULL);
390 	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);
391 
392 	vmm_gpt_node_free(gpt->vgpt_root);
393 	kmem_free(gpt, sizeof (*gpt));
394 }
395 
396 /*
397  * Given a GPA, return its corresponding index in a paging structure at the
398  * provided level.
399  */
400 static inline uint16_t
401 vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
402 {
403 	ASSERT(level < MAX_GPT_LEVEL);
404 
405 	const uint16_t mask = MAX_NODE_IDX;
406 	switch (level) {
407 	case LEVEL4: return ((gpa >> 39) & mask);
408 	case LEVEL3: return ((gpa >> 30) & mask);
409 	case LEVEL2: return ((gpa >> 21) & mask);
410 	case LEVEL1: return ((gpa >> 12) & mask);
411 	default:
412 		panic("impossible level value");
413 	};
414 }
415 
416 /* Get mask for addresses of entries at a given table level. */
417 static inline uint64_t
418 vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
419 {
420 	ASSERT(level < MAX_GPT_LEVEL);
421 
422 	switch (level) {
423 	case LEVEL4: return (0xffffff8000000000ul);	/* entries cover 512G */
424 	case LEVEL3: return (0xffffffffc0000000ul);	/* entries cover 1G */
425 	case LEVEL2: return (0xffffffffffe00000ul);	/* entries cover 2M */
426 	case LEVEL1: return (0xfffffffffffff000ul);	/* entries cover 4K */
427 	default:
428 		panic("impossible level value");
429 	};
430 }
431 
432 /* Get length of GPA covered by entries at a given table level. */
433 static inline uint64_t
434 vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
435 {
436 	ASSERT(level < MAX_GPT_LEVEL);
437 
438 	switch (level) {
439 	case LEVEL4: return (0x8000000000ul);	/* entries cover 512G */
440 	case LEVEL3: return (0x40000000ul);	/* entries cover 1G */
441 	case LEVEL2: return (0x200000ul);	/* entries cover 2M */
442 	case LEVEL1: return (0x1000ul);		/* entries cover 4K */
443 	default:
444 		panic("impossible level value");
445 	};
446 }
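/*
 * Worked example (editor's addition, hypothetical GPA): for
 * gpa = 0x12345678000, the per-level table indices are
 *	LEVEL4: (gpa >> 39) & 511 = 2
 *	LEVEL3: (gpa >> 30) & 511 = 141
 *	LEVEL2: (gpa >> 21) & 511 = 43
 *	LEVEL1: (gpa >> 12) & 511 = 120
 * and (gpa & vmm_gpt_lvl_mask(LEVEL2)) = 0x12345600000, the base of the 2M
 * span containing the GPA, with vmm_gpt_lvl_len(LEVEL2) = 0x200000.
 */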
447 
448 /* Get the index of a PTE pointer based on its offset in that table */
449 static inline uint16_t
450 vmm_gpt_ptep_index(const vmm_gpt_entry_t *ptep)
451 {
452 	const uintptr_t offset = (uintptr_t)ptep & 0xffful;
453 	return (offset / sizeof (uint64_t));
454 }
455 
456 /*
457  * Get the ending GPA which this node could possibly cover given its base
458  * address and level.
459  */
460 static inline uint64_t
461 vmm_gpt_node_end(vmm_gpt_node_t *node)
462 {
463 	ASSERT(node->vgn_level > LEVEL4);
464 	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
465 }
466 
467 /*
468  * Is this node the last entry in its parent node, based solely by its GPA?
469  */
470 static inline bool
471 vmm_gpt_node_is_last(vmm_gpt_node_t *node)
472 {
473 	return (node->vgn_index == MAX_NODE_IDX);
474 }
475 
476 /*
477  * How many table entries (if any) in this node are covered by the range of
478  * [start, end).
479  */
480 static uint16_t
481 vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
482 {
483 	const uint64_t node_end = vmm_gpt_node_end(node);
484 
485 	/* Is this node covered at all by the region? */
486 	if (start >= node_end || end <= node->vgn_gpa) {
487 		return (0);
488 	}
489 
490 	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
491 	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
492 	const uint64_t covered_end = MIN(node_end, end & mask);
493 	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);
494 
495 	return ((covered_end - covered_start) / per_entry);
496 }
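/*
 * Worked example (editor's addition, hypothetical values): a LEVEL1 node
 * with vgn_gpa = 0x40000000 covers [0x40000000, 0x40200000).  For the range
 * [0x40080000, 0x40100000), covered_start = 0x40080000 and
 * covered_end = 0x40100000, so (0x40100000 - 0x40080000) / 0x1000 = 128
 * of its 512 entries are covered.
 */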
497 
498 /*
499  * Find the next node (by address) in the tree at the same level.
500  *
501  * Returns NULL if this is the last node in the tree or if `only_seq` was true
502  * and there is an address gap between this node and the next.
503  */
504 static vmm_gpt_node_t *
505 vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
506 {
507 	ASSERT3P(node->vgn_parent, !=, NULL);
508 	ASSERT3U(node->vgn_level, >, LEVEL4);
509 
510 	/*
511 	 * Next node sequentially would be the one at the address starting at
512 	 * the end of what is covered by this node.
513 	 */
514 	const uint64_t gpa_match = vmm_gpt_node_end(node);
515 
516 	/* Try our next sibling */
517 	vmm_gpt_node_t *next = node->vgn_sib_next;
518 
519 	if (next == NULL) {
520 		/*
521 		 * If the next-sibling pointer is NULL on the node, it can mean
522 		 * one of two things:
523 		 *
524 		 * 1. This entry represents the space leading up to the trailing
525 		 *    boundary of what this node covers.
526 		 *
527 		 * 2. The node is not entirely populated, and there is a gap
528 		 *    between the last populated entry, and the trailing
529 		 *    boundary of the node.
530 		 *
531 		 * Either way, the proper course of action is to check the first
532 		 * child of our parent's next sibling (if the parent is not the
533 		 * root of the GPT itself).
534 		 */
535 		if (node->vgn_parent != NULL && node->vgn_level > LEVEL3) {
536 			vmm_gpt_node_t *psibling =
537 			    vmm_gpt_node_next(node->vgn_parent, true);
538 			if (psibling != NULL) {
539 				next = psibling->vgn_children;
540 			}
541 		}
542 	}
543 
544 	if (next != NULL &&
545 	    (next->vgn_gpa == gpa_match || !only_seq)) {
546 		return (next);
547 	}
548 
549 	return (NULL);
550 }
551 
552 
553 /*
554  * Finds the child for the given GPA in the given parent node.
555  * Returns a pointer to node, or NULL if it is not found.
556  */
557 static vmm_gpt_node_t *
558 vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
559 {
560 	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
561 	for (vmm_gpt_node_t *child = parent->vgn_children;
562 	    child != NULL && child->vgn_index <= index;
563 	    child = child->vgn_sib_next) {
564 		if (child->vgn_index == index)
565 			return (child);
566 	}
567 
568 	return (NULL);
569 }
570 
571 /*
572  * Add a child node to the GPT at a position determined by GPA, parent, and (if
573  * present) preceding sibling.
574  *
575  * If `parent` node contains any children, `prev_sibling` must be populated with
576  * a pointer to the node preceding (by GPA) the to-be-added child node.
577  */
578 static void
579 vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
580     vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
581 {
582 	ASSERT3U(parent->vgn_level, <, LEVEL1);
583 	ASSERT3U(child->vgn_parent, ==, NULL);
584 
585 	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
586 	child->vgn_index = idx;
587 	child->vgn_level = parent->vgn_level + 1;
588 	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);
589 
590 	/* Establish familial connections */
591 	child->vgn_parent = parent;
592 	if (prev_sibling != NULL) {
593 		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);
594 
595 		child->vgn_sib_next = prev_sibling->vgn_sib_next;
596 		if (child->vgn_sib_next != NULL) {
597 			child->vgn_sib_next->vgn_sib_prev = child;
598 		}
599 		child->vgn_sib_prev = prev_sibling;
600 		prev_sibling->vgn_sib_next = child;
601 	} else if (parent->vgn_children != NULL) {
602 		vmm_gpt_node_t *next_sibling = parent->vgn_children;
603 
604 		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
605 		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);
606 
607 		child->vgn_sib_next = next_sibling;
608 		child->vgn_sib_prev = NULL;
609 		next_sibling->vgn_sib_prev = child;
610 		parent->vgn_children = child;
611 	} else {
612 		parent->vgn_children = child;
613 		child->vgn_sib_next = NULL;
614 		child->vgn_sib_prev = NULL;
615 	}
616 
617 	/* Configure PTE for child table */
618 	parent->vgn_entries[idx] = vmm_gpti_map_table(child->vgn_host_pfn);
619 	parent->vgn_ref_cnt++;
620 }
621 
622 /*
623  * Remove a child node from its relatives (parent, siblings) and free it.
624  */
625 static void
626 vmm_gpt_node_remove(vmm_gpt_node_t *child)
627 {
628 	ASSERT3P(child->vgn_children, ==, NULL);
629 	ASSERT3U(child->vgn_ref_cnt, ==, 0);
630 	ASSERT3P(child->vgn_parent, !=, NULL);
631 
632 	/* Unlink child from its siblings and parent */
633 	vmm_gpt_node_t *parent = child->vgn_parent;
634 	vmm_gpt_node_t *prev = child->vgn_sib_prev;
635 	vmm_gpt_node_t *next = child->vgn_sib_next;
636 	if (prev != NULL) {
637 		ASSERT3P(prev->vgn_sib_next, ==, child);
638 		prev->vgn_sib_next = next;
639 	}
640 	if (next != NULL) {
641 		ASSERT3P(next->vgn_sib_prev, ==, child);
642 		next->vgn_sib_prev = prev;
643 	}
644 	if (prev == NULL) {
645 		ASSERT3P(parent->vgn_children, ==, child);
646 		parent->vgn_children = next;
647 	}
648 	child->vgn_parent = NULL;
649 	child->vgn_sib_next = NULL;
650 	child->vgn_sib_prev = NULL;
651 	parent->vgn_entries[child->vgn_index] = 0;
652 	parent->vgn_ref_cnt--;
653 
654 	vmm_gpt_node_free(child);
655 }
656 
657 /*
658  * Walks the GPT for the given GPA, accumulating entries to the given depth.  If
659  * the walk terminates before the depth is reached, the remaining entries are
660  * written with NULLs.
661  *
662  * Returns the GPA corresponding to the deepest populated (non-NULL) entry
663  * encountered during the walk.
664  */
665 uint64_t
666 vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
667     vmm_gpt_node_level_t depth)
668 {
669 	ASSERT(gpt != NULL);
670 	ASSERT3U(depth, <, MAX_GPT_LEVEL);
671 
672 	vmm_gpt_entry_t *current_entries = gpt->vgpt_root->vgn_entries;
673 	uint64_t mask = 0;
674 	for (uint_t lvl = LEVEL4; lvl <= depth; lvl++) {
675 		if (current_entries == NULL) {
676 			entries[lvl] = NULL;
677 			continue;
678 		}
679 		entries[lvl] = &current_entries[vmm_gpt_lvl_index(lvl, gpa)];
680 		mask = vmm_gpt_lvl_mask(lvl);
681 		const vmm_gpt_entry_t pte = *entries[lvl];
682 
683 		pfn_t pfn;
684 		if (!vmm_gpti_parse(pte, &pfn, NULL)) {
685 			current_entries = NULL;
686 			continue;
687 		}
688 		current_entries = (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
689 	}
690 	return (gpa & mask);
691 }
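/*
 * Illustrative sketch (editor's addition): a typical use of the walk
 * primitive for lookup, combining it with vmm_gpte_is_mapped() below.
 */
#if 0
static bool
vmm_gpt_lookup_sketch(vmm_gpt_t *gpt, uint64_t gpa, pfn_t *pfnp, uint_t *protp)
{
	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];

	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
	if (entries[LEVEL1] == NULL) {
		/* No LEVEL1 table is populated for this GPA */
		return (false);
	}
	return (vmm_gpte_is_mapped(entries[LEVEL1], pfnp, protp));
}
#endif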
692 
693 /*
694  * Given a `gpa`, and the corresponding `entries` array as queried by a call to
695  * `vmm_gpt_walk()`, attempt to advance to the next PTE at the `depth` provided
696  * by the caller.
697  *
698  * As there may not be PTEs present at the requested `depth`, the amount that
699  * the GPA is advanced may be larger than what is expected for the provided
700  * depth.
701  *
702  * Returns the GPA corresponding to the PTE to which `entries` was advanced.
703  */
704 static uint64_t
705 vmm_gpt_walk_advance(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
706     vmm_gpt_node_level_t depth)
707 {
708 	ASSERT(gpt != NULL);
709 	ASSERT3U(depth, <, MAX_GPT_LEVEL);
710 	ASSERT0(gpa & ~vmm_gpt_lvl_mask(depth));
711 
712 	/*
713 	 * Advance to the next entry in the deepest level of PTE possible,
714 	 * ascending to higher levels if we reach the end of a table at any
715 	 * given point.
716 	 */
717 	int lvl;
718 	for (lvl = depth; lvl >= LEVEL4; lvl--) {
719 		vmm_gpt_entry_t *ptep = entries[lvl];
720 
721 		if (ptep == NULL) {
722 			continue;
723 		}
724 
725 		uint16_t index = vmm_gpt_ptep_index(ptep);
726 		ASSERT3U(vmm_gpt_lvl_index(lvl, gpa), ==, index);
727 		if (index == MAX_NODE_IDX) {
728 			continue;
729 		}
730 
731 		gpa = (gpa & vmm_gpt_lvl_mask(lvl)) + vmm_gpt_lvl_len(lvl);
732 		entries[lvl] = ptep + 1;
733 		break;
734 	}
735 
736 	if (lvl < LEVEL4) {
737 		/* Advanced off the end of the tables */
738 		return (UINT64_MAX);
739 	}
740 
741 	/*
742 	 * Attempt to walk back down to the target level if any ascension was
743 	 * necessary during the above advancement.
744 	 */
745 	vmm_gpt_entry_t pte = *entries[lvl];
746 	lvl++;
747 	for (; lvl < MAX_GPT_LEVEL; lvl++) {
748 		pfn_t pfn;
749 
750 		if (lvl > depth || !vmm_gpti_parse(pte, &pfn, NULL)) {
751 			entries[lvl] = NULL;
752 			continue;
753 		}
754 
755 		vmm_gpt_entry_t *next_table =
756 		    (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
757 		const uint16_t index = vmm_gpt_lvl_index(lvl, gpa);
758 		pte = next_table[index];
759 		entries[lvl] = &next_table[index];
760 	}
761 
762 	return (gpa);
763 }
764 
765 /*
766  * Initialize iteration over GPT leaf entries between [addr, addr + len).  The
767  * specified interval must be page-aligned and not overflow the end of the
768  * address space.
769  *
770  * Subsequent calls to vmm_gpt_iter_next() will emit the encountered entries.
771  */
772 void
773 vmm_gpt_iter_init(vmm_gpt_iter_t *iter, vmm_gpt_t *gpt, uint64_t addr,
774     uint64_t len)
775 {
776 	ASSERT0(addr & PAGEOFFSET);
777 	ASSERT0(len & PAGEOFFSET);
778 	ASSERT3U((addr + len), >=, addr);
779 
780 	iter->vgi_gpt = gpt;
781 	iter->vgi_addr = addr;
782 	iter->vgi_end = addr + len;
783 	iter->vgi_current = vmm_gpt_walk(gpt, addr, iter->vgi_entries, LEVEL1);
784 }
785 
786 /*
787  * Fetch the next entry (if any) from an iterator initialized from a preceding
788  * call to vmm_gpt_iter_init().  Returns true when the next GPT leaf entry
789  * inside the iterator range has been populated in `entry`.  Returns `false`
790  * when there are no such entries available or remaining in the range.
791  */
792 bool
793 vmm_gpt_iter_next(vmm_gpt_iter_t *iter, vmm_gpt_iter_entry_t *entry)
794 {
795 	if (iter->vgi_current >= iter->vgi_end) {
796 		return (false);
797 	}
798 
799 	while (iter->vgi_current < iter->vgi_end) {
800 		bool found = false;
801 		if (iter->vgi_entries[LEVEL1] != NULL) {
802 			entry->vgie_gpa = iter->vgi_current;
803 			entry->vgie_ptep = iter->vgi_entries[LEVEL1];
804 			found = true;
805 		}
806 
807 		iter->vgi_current = vmm_gpt_walk_advance(iter->vgi_gpt,
808 		    iter->vgi_current, iter->vgi_entries, LEVEL1);
809 
810 		if (found) {
811 			return (true);
812 		}
813 	}
814 	return (false);
815 }
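/*
 * Illustrative sketch (editor's addition): count the mapped leaf PTEs in
 * [addr, addr + len), mirroring the pattern used by vmm_gpt_unmap_region()
 * further below.
 */
#if 0
static size_t
vmm_gpt_count_mapped_sketch(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	vmm_gpt_iter_t iter;
	vmm_gpt_iter_entry_t entry;
	size_t count = 0;

	vmm_gpt_iter_init(&iter, gpt, addr, len);
	while (vmm_gpt_iter_next(&iter, &entry)) {
		if (vmm_gpte_is_mapped(entry.vgie_ptep, NULL, NULL))
			count++;
	}
	return (count);
}
#endif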
816 
817 /*
818  * Populate child table nodes for a given level between the provided interval
819  * of [addr, addr + len).  Caller is expected to provide a pointer to the parent
820  * node which would contain the child node for GPA at `addr`.  A pointer to said
821  * child node will be returned when the operation is complete.
822  */
823 static vmm_gpt_node_t *
824 vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
825     vmm_gpt_node_t *node_start)
826 {
827 	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
828 	const uint64_t end = addr + len;
829 	const uint64_t incr = vmm_gpt_lvl_len(lvl);
830 	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
831 	vmm_gpt_node_t *parent = node_start;
832 
833 	/* Try to locate node at starting address */
834 	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
835 	while (node != NULL && node->vgn_gpa < gpa) {
836 		prev = node;
837 		node = node->vgn_sib_next;
838 	}
839 
840 	/*
841 	 * If no node exists at the starting address, create one and link it
842 	 * into the parent.
843 	 */
844 	if (node == NULL || node->vgn_gpa > gpa) {
845 		/* Need to insert node for starting GPA */
846 		node = vmm_gpt_node_alloc();
847 		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
848 	}
849 
850 	vmm_gpt_node_t *front_node = node;
851 	prev = node;
852 	gpa += incr;
853 
854 	/*
855 	 * With a node at the starting address, walk forward creating nodes in
856 	 * any of the gaps.
857 	 */
858 	for (; gpa < end; gpa += incr, prev = node) {
859 		node = vmm_gpt_node_next(prev, true);
860 		if (node != NULL) {
861 			ASSERT3U(node->vgn_gpa, ==, gpa);
862 
863 			/* We may have crossed into a new parent */
864 			parent = node->vgn_parent;
865 			continue;
866 		}
867 
868 		if (vmm_gpt_node_is_last(prev)) {
869 			/*
870 			 * The node preceding this was the last one in its
871 			 * containing parent, so move on to that parent's
872 			 * sibling.  We expect (demand) that it exist already.
873 			 */
874 			parent = vmm_gpt_node_next(parent, true);
875 			ASSERT(parent != NULL);
876 
877 			/*
878 			 * Forget our previous sibling, since it is of no use
879 			 * for assigning the new node to the now-different
880 			 * parent.
881 			 */
882 			prev = NULL;
883 
884 		}
885 		node = vmm_gpt_node_alloc();
886 		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
887 	}
888 
889 	return (front_node);
890 }
891 
892 /*
893  * Ensures that PTEs for the region of address space bounded by
894  * [addr, addr + len) exist in the tree.
895  */
896 void
897 vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
898 {
899 	ASSERT0(addr & PAGEOFFSET);
900 	ASSERT0(len & PAGEOFFSET);
901 
902 	/*
903 	 * Starting at the top of the tree, ensure that tables covering the
904 	 * requested region exist at each level.
905 	 */
906 	vmm_gpt_node_t *node = gpt->vgpt_root;
907 	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
908 		ASSERT3U(node->vgn_level, ==, lvl);
909 
910 		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
911 	}
912 
913 
914 	/*
915 	 * Establish reference counts for the soon-to-be memory PTEs which will
916 	 * be filling these LEVEL1 tables.
917 	 */
918 	uint64_t gpa = addr;
919 	const uint64_t end = addr + len;
920 	while (gpa < end) {
921 		ASSERT(node != NULL);
922 		ASSERT3U(node->vgn_level, ==, LEVEL1);
923 
924 		const uint16_t covered =
925 		    vmm_gpt_node_entries_covered(node, addr, end);
926 
927 		ASSERT(covered != 0);
928 		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
929 		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);
930 
931 		node->vgn_ref_cnt += covered;
932 
933 		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
934 		if (next != NULL) {
935 			gpa = next->vgn_gpa;
936 			node = next;
937 		} else {
938 			/*
939 			 * We do not expect to find a subsequent node after
940 			 * filling the last node in the table, completing PTE
941 			 * accounting for the specified range.
942 			 */
943 			VERIFY3U(end, <=, vmm_gpt_node_end(node));
944 			break;
945 		}
946 	}
947 }
948 
949 /*
950  * Format a PTE and install it in the provided PTE-pointer.
951  *
952  * The caller must ensure that a conflicting PFN is not mapped at the requested
953  * location.  Racing operations to map the same PFN at one location are
954  * acceptable and properly handled.
955  */
956 bool
957 vmm_gpt_map_at(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t pfn, uint_t prot,
958     uint8_t attr)
959 {
960 	const vmm_gpt_entry_t pte = vmm_gpti_map_page(pfn, prot, attr);
961 	const vmm_gpt_entry_t old_pte = atomic_cas_64(ptep, 0, pte);
962 	if (old_pte != 0) {
963 #ifdef DEBUG
964 		pfn_t new_pfn, old_pfn;
965 
966 		ASSERT(vmm_gpti_parse(pte, &new_pfn, NULL));
967 		ASSERT(vmm_gpti_parse(old_pte, &old_pfn, NULL));
968 		ASSERT3U(old_pfn, ==, new_pfn);
969 #endif /* DEBUG */
970 		return (false);
971 	}
972 
973 	return (true);
974 }
975 
976 /*
977  * Cleans up the unused inner nodes in the GPT for a region of guest physical
978  * address space of [addr, addr + len).  The region must map no pages.
979  */
980 void
981 vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
982 {
983 	ASSERT0(addr & PAGEOFFSET);
984 	ASSERT0(len & PAGEOFFSET);
985 
986 	const uint64_t end = addr + len;
987 	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
988 		[LEVEL4] = gpt->vgpt_root,
989 	};
990 
991 	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
992 		node = vmm_gpt_node_find_child(starts[lvl], addr);
993 		if (node == NULL) {
994 			break;
995 		}
996 		starts[lvl + 1] = node;
997 	}
998 
999 	/*
1000 	 * Starting at the bottom of the tree, ensure that PTEs for pages have
1001 	 * been cleared for the region, and remove the corresponding reference
1002 	 * counts from the containing LEVEL1 tables.
1003 	 */
1004 	uint64_t gpa = addr;
1005 	node = starts[LEVEL1];
1006 	while (gpa < end && node != NULL) {
1007 		const uint16_t covered =
1008 		    vmm_gpt_node_entries_covered(node, addr, end);
1009 
1010 		ASSERT3U(node->vgn_ref_cnt, >=, covered);
1011 		node->vgn_ref_cnt -= covered;
1012 
1013 		node = vmm_gpt_node_next(node, false);
1014 		if (node != NULL) {
1015 			gpa = node->vgn_gpa;
1016 		}
1017 	}
1018 
1019 	/*
1020 	 * With the page PTE references eliminated, work up from the bottom of
1021 	 * the table, removing nodes which have no remaining references.
1022 	 *
1023 	 * This stops short of LEVEL4, which is the root table of the GPT.  It
1024 	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
1025 	 */
1026 	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
1027 		gpa = addr;
1028 		node = starts[lvl];
1029 
1030 		while (gpa < end && node != NULL) {
1031 			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);
1032 
1033 			if (node->vgn_ref_cnt == 0) {
1034 				vmm_gpt_node_remove(node);
1035 			}
1036 			if (next != NULL) {
1037 				gpa = next->vgn_gpa;
1038 			}
1039 			node = next;
1040 		}
1041 	}
1042 }
1043 
1044 /*
1045  * Remove a mapping from the table.  Returns false if the page was not mapped,
1046  * otherwise returns true.
1047  */
1048 bool
1049 vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
1050 {
1051 	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL], pte;
1052 
1053 	ASSERT(gpt != NULL);
1054 	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
1055 	if (entries[LEVEL1] == NULL)
1056 		return (false);
1057 
1058 	pte = *entries[LEVEL1];
1059 	*entries[LEVEL1] = 0;
1060 	return (vmm_gpti_parse(pte, NULL, NULL));
1061 }
1062 
1063 /*
1064  * Un-maps the region of guest physical address space bounded by [start..end).
1065  * Returns the number of pages that are unmapped.
1066  */
1067 size_t
1068 vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
1069 {
1070 	ASSERT0(addr & PAGEOFFSET);
1071 	ASSERT0(len & PAGEOFFSET);
1072 
1073 	vmm_gpt_iter_t state;
1074 	vmm_gpt_iter_entry_t entry;
1075 	size_t num_unmapped = 0;
1076 
1077 	vmm_gpt_iter_init(&state, gpt, addr, len);
1078 	while (vmm_gpt_iter_next(&state, &entry)) {
1079 		if (entry.vgie_ptep == NULL) {
1080 			continue;
1081 		}
1082 
1083 		const vmm_gpt_entry_t pte = *entry.vgie_ptep;
1084 		*entry.vgie_ptep = 0;
1085 		if (vmm_gpti_parse(pte, NULL, NULL)) {
1086 			num_unmapped++;
1087 		}
1088 	}
1089 
1090 	return (num_unmapped);
1091 }
1092 
1093 /*
1094  * Returns a value indicating whether or not the given PTE maps a page.  If
1095  * it does, *pfnp and *protp (when non-NULL) will be filled with the PFN and
1096  * protection bits of the entry.  Otherwise, they are ignored.
1097  */
1098 bool
1099 vmm_gpte_is_mapped(const vmm_gpt_entry_t *ptep, pfn_t *pfnp, uint_t *protp)
1100 {
1101 	ASSERT(ptep != NULL);
1102 
1103 	return (vmm_gpti_parse(*ptep, pfnp, protp));
1104 }
1105 
1106 
1107 static uint_t
1108 vmm_gpt_reset_bits(volatile uint64_t *ptep, uint64_t mask, uint64_t bits)
1109 {
1110 	uint64_t pte, newpte, oldpte = 0;
1111 
1112 	/*
1113 	 * We use volatile and atomic ops here because we may be
1114 	 * racing against hardware modifying these bits.
1115 	 */
1116 	VERIFY3P(ptep, !=, NULL);
1117 	oldpte = *ptep;
1118 	do {
1119 		pte = oldpte;
1120 		newpte = (pte & ~mask) | bits;
1121 		oldpte = atomic_cas_64(ptep, pte, newpte);
1122 	} while (oldpte != pte);
1123 
1124 	return (oldpte & mask);
1125 }
1126 
1127 /*
1128  * Resets the accessed bit on the page table entry pointed to by `ptep`.
1129  * If `on` is true, the bit will be set, otherwise it will be cleared.
1130  * The old value of the bit is returned.
1131  */
1132 bool
1133 vmm_gpte_reset_accessed(vmm_gpt_entry_t *ptep, bool on)
1134 {
1135 	ASSERT(ptep != NULL);
1136 
1137 	const uint64_t accessed_bit = vmm_pte_impl->vpi_bit_accessed;
1138 	const uint64_t dirty_bit = vmm_pte_impl->vpi_bit_dirty;
1139 
1140 	const uint64_t old_state = vmm_gpt_reset_bits(ptep,
1141 	    accessed_bit | dirty_bit, on ? accessed_bit : 0);
1142 	return (old_state != 0);
1143 }
1144 
1145 /*
1146  * Resets the dirty bit on the page table entry pointed to by `ptep`.
1147  * If `on` is true, the bit will be set, otherwise it will be cleared.
1148  * The old value of the bit is returned.
1149  */
1150 bool
1151 vmm_gpte_reset_dirty(vmm_gpt_entry_t *ptep, bool on)
1152 {
1153 	ASSERT(ptep != NULL);
1154 	const uint64_t dirty_bit = vmm_pte_impl->vpi_bit_dirty;
1155 
1156 	const uint64_t old_state =
1157 	    vmm_gpt_reset_bits(ptep, dirty_bit, on ? dirty_bit : 0);
1158 	return (old_state != 0);
1159 }
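/*
 * Illustrative sketch (editor's addition): harvest and clear dirty bits for
 * the leaf PTEs in [addr, addr + len), as a dirty-page-tracking consumer
 * might, recording them in a caller-provided (hypothetical) bitmap.
 */
#if 0
static void
vmm_gpt_harvest_dirty_sketch(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    uint8_t *bitmap)
{
	vmm_gpt_iter_t iter;
	vmm_gpt_iter_entry_t entry;

	vmm_gpt_iter_init(&iter, gpt, addr, len);
	while (vmm_gpt_iter_next(&iter, &entry)) {
		if (vmm_gpte_reset_dirty(entry.vgie_ptep, false)) {
			const uint64_t idx =
			    (entry.vgie_gpa - addr) >> PAGESHIFT;
			bitmap[idx / 8] |= (uint8_t)(1 << (idx % 8));
		}
	}
}
#endif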
1160 
1161 bool
1162 vmm_gpte_query_accessed(const vmm_gpt_entry_t *ptep)
1163 {
1164 	ASSERT(ptep != NULL);
1165 	return ((*ptep & vmm_pte_impl->vpi_bit_accessed) != 0);
1166 }
1167 
1168 bool
1169 vmm_gpte_query_dirty(const vmm_gpt_entry_t *ptep)
1170 {
1171 	ASSERT(ptep != NULL);
1172 	return ((*ptep & vmm_pte_impl->vpi_bit_dirty) != 0);
1173 }
1174 
1175 /*
1176  * Get properly formatted PML4 (EPTP/nCR3) for GPT.
1177  */
1178 uint64_t
1179 vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
1180 {
1181 	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
1182 	return (vmm_pte_impl->vpi_get_pmtp(root_pfn, track_dirty));
1183 }
1184 
1185 /*
1186  * Does the GPT hardware support dirty-page-tracking?
1187  */
1188 bool
1189 vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
1190 {
1191 	return (vmm_pte_impl->vpi_hw_ad_supported());
1192 }
1193