xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_gpt.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2023 Oxide Computer Company
15  */
16 
17 #include <sys/types.h>
18 #include <sys/atomic.h>
19 #include <sys/kmem.h>
20 #include <sys/sysmacros.h>
21 #include <sys/sunddi.h>
22 #include <sys/panic.h>
23 #include <vm/hat.h>
24 #include <vm/as.h>
25 #include <vm/hat_i86.h>
26 
27 #include <sys/vmm_gpt.h>
28 
29 /*
30  * VMM Generic Page Tables
31  *
32  * Bhyve runs on AMD and Intel hosts and both support nested page tables
33  * describing the guest's physical address space.  But the two use different and
34  * mutually incompatible page table formats: Intel uses the EPT, which is based
35  * on the Itanium page table format, while AMD uses the nPT, which is based on
36  * the x86_64 page table format.
37  *
38  * The GPT abstracts these format differences, and provides a single interface
39  * for interacting with either kind of table structure.
40  *
41  * At a high-level, the GPT is a tree that mirrors the paging table radix tree.
42  * It is parameterized with operations on PTEs that are specific to the table
43  * type (EPT or nPT) and also keeps track of how many pages the table maps, as
44  * well as a pointer to the root node in the tree.
45  *
46  * A node in the GPT keep pointers to its parent (NULL for the root), its
47  * left-most child, and its siblings.  The node understands its position in the
48  * tree in terms of the level it appears at and the index it occupies at its
49  * parent's level, as well as how many children it has.  It also owns the
50  * physical memory page for the hardware page table entries that map its
51  * children.  Thus, for a node at any given level in the tree, the nested PTE
52  * for that node's child at index $i$ is the i'th uint64_t in that node's entry
53  * page and the entry page is part of the paging structure consumed by hardware.
54  *
55  * The GPT interface provides functions for populating and vacating the tree for
56  * regions in the guest physical address space, and for mapping and unmapping
57  * pages in populated regions.  Users must populate a region before mapping
58  * pages into it, and must unmap pages before vacating the region.
59  *
60  * The interface also exposes a function for walking the table from the root to
61  * a leaf entry, populating an array of pointers to PTEs.  This walk uses the
62  * hardware page structure itself, and is thus fast, though as a result it
63  * potentially aliases entries; caveat emptor.  The walk primitive is used for
64  * mapping, unmapping, and lookups.
65  *
66  * Format-specific differences are abstracted by parameterizing the GPT with a
67  * set of PTE operations specific to the platform.  The GPT code makes use of
68  * these when mapping or populating entries, resetting accessed and dirty bits
69  * on entries, and similar operations.
70  */
71 
72 /*
73  * A GPT node.
74  *
75  * Each node contains pointers to its parent, its left-most child, and its
76  * siblings.  Interior nodes also maintain a reference count, and each node
77  * contains its level and index in its parent's table.  Finally, each node
78  * contains the host PFN of the page that it links into the page table, as well
79  * as a kernel pointer to table.
80  *
81  * On leaf nodes, the reference count tracks how many entries in the table are
82  * covered by mapping from the containing vmspace.  This is maintained during
83  * calls to vmm_populate_region() and vmm_gpt_vacate_region() as part of vmspace
84  * map/unmap operations, rather than in the data path of faults populating the
85  * PTEs themselves.
86  *
87  * Note, this is carefully sized to fit exactly into a 64-byte cache line.
88  */
89 typedef struct vmm_gpt_node vmm_gpt_node_t;
90 struct vmm_gpt_node {
91 	uint64_t	vgn_host_pfn;
92 	uint16_t	vgn_level;
93 	uint16_t	vgn_index;
94 	uint32_t	vgn_ref_cnt;
95 	vmm_gpt_node_t	*vgn_parent;
96 	vmm_gpt_node_t	*vgn_children;
97 	vmm_gpt_node_t	*vgn_sib_next;
98 	vmm_gpt_node_t	*vgn_sib_prev;
99 	uint64_t	*vgn_entries;
100 	uint64_t	vgn_gpa;
101 };
102 
103 /* Maximum node index determined by number of entries in page table (512) */
104 #define	PTE_PER_TABLE	512
105 #define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)
106 
107 /*
108  * A VMM Generic Page Table.
109  *
110  * The generic page table is a format-agnostic, 4-level paging structure
111  * modeling a second-level page table (EPT on Intel, nPT on AMD).  It
112  * contains a counter of pages the table maps, a pointer to the root node
113  * in the table, and is parameterized with a set of PTE operations specific
114  * to the table type.
115  */
116 struct vmm_gpt {
117 	vmm_gpt_node_t	*vgpt_root;
118 	vmm_pte_ops_t	*vgpt_pte_ops;
119 };
120 
121 /*
122  * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
123  * hold the PTEs it contains.
124  */
125 static vmm_gpt_node_t *
126 vmm_gpt_node_alloc(void)
127 {
128 	vmm_gpt_node_t *node;
129 	caddr_t page;
130 
131 	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
132 	/*
133 	 * Note: despite the man page, allocating PAGESIZE bytes is
134 	 * guaranteed to be page-aligned.
135 	 */
136 	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
137 	node->vgn_entries = (uint64_t *)page;
138 	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
139 
140 	return (node);
141 }
142 
143 /*
144  * Allocates and initializes a vmm_gpt_t.
145  */
146 vmm_gpt_t *
147 vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
148 {
149 	vmm_gpt_t *gpt;
150 
151 	VERIFY(pte_ops != NULL);
152 	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
153 	gpt->vgpt_pte_ops = pte_ops;
154 	gpt->vgpt_root = vmm_gpt_node_alloc();
155 
156 	return (gpt);
157 }
158 
159 /*
160  * Frees a given node.  The node is expected to have no familial (parent,
161  * children, siblings) associations at this point.  Accordingly, its reference
162  * count should be zero.
163  */
164 static void
165 vmm_gpt_node_free(vmm_gpt_node_t *node)
166 {
167 	ASSERT(node != NULL);
168 	ASSERT3U(node->vgn_ref_cnt, ==, 0);
169 	ASSERT(node->vgn_host_pfn != PFN_INVALID);
170 	ASSERT(node->vgn_entries != NULL);
171 	ASSERT(node->vgn_parent == NULL);
172 
173 	kmem_free(node->vgn_entries, PAGESIZE);
174 	kmem_free(node, sizeof (*node));
175 }
176 
177 /*
178  * Frees a vmm_gpt_t.  Any lingering nodes in the GPT will be freed too.
179  */
180 void
181 vmm_gpt_free(vmm_gpt_t *gpt)
182 {
183 	/* Empty anything remaining in the tree */
184 	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);
185 
186 	VERIFY(gpt->vgpt_root != NULL);
187 	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);
188 
189 	vmm_gpt_node_free(gpt->vgpt_root);
190 	kmem_free(gpt, sizeof (*gpt));
191 }
192 
193 /*
194  * Given a GPA, return its corresponding index in a paging structure at the
195  * provided level.
196  */
197 static inline uint16_t
198 vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
199 {
200 	ASSERT(level < MAX_GPT_LEVEL);
201 
202 	const uint_t shifts[] = {
203 		[LEVEL4] = 39,
204 		[LEVEL3] = 30,
205 		[LEVEL2] = 21,
206 		[LEVEL1] = 12,
207 	};
208 	const uint16_t mask = (1U << 9) - 1;
209 	return ((gpa >> shifts[level]) & mask);
210 }
211 
212 /* Get mask for addresses of entries at a given table level. */
213 static inline uint64_t
214 vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
215 {
216 	ASSERT(level < MAX_GPT_LEVEL);
217 
218 	const uint64_t gpa_mask[] = {
219 		[LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */
220 		[LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */
221 		[LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */
222 		[LEVEL1] = 0xfffffffffffff000ul, /* entries cover 4K */
223 	};
224 	return (gpa_mask[level]);
225 }
226 
227 /* Get length of GPA covered by entries at a given table level. */
228 static inline uint64_t
229 vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
230 {
231 	ASSERT(level < MAX_GPT_LEVEL);
232 
233 	const uint64_t gpa_len[] = {
234 		[LEVEL4] = 0x8000000000ul,	/* entries cover 512G */
235 		[LEVEL3] = 0x40000000ul,	/* entries cover 1G */
236 		[LEVEL2] = 0x200000ul,		/* entries cover 2M */
237 		[LEVEL1] = 0x1000ul,		/* entries cover 4K */
238 	};
239 	return (gpa_len[level]);
240 }
241 
242 /*
243  * Get the ending GPA which this node could possibly cover given its base
244  * address and level.
245  */
246 static inline uint64_t
247 vmm_gpt_node_end(vmm_gpt_node_t *node)
248 {
249 	ASSERT(node->vgn_level > LEVEL4);
250 	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
251 }
252 
253 /*
254  * Is this node the last entry in its parent node, based solely by its GPA?
255  */
256 static inline bool
257 vmm_gpt_node_is_last(vmm_gpt_node_t *node)
258 {
259 	return (node->vgn_index == MAX_NODE_IDX);
260 }
261 
262 /*
263  * How many table entries (if any) in this node are covered by the range of
264  * [start, end).
265  */
266 static uint16_t
267 vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
268 {
269 	const uint64_t node_end = vmm_gpt_node_end(node);
270 
271 	/* Is this node covered at all by the region? */
272 	if (start >= node_end || end <= node->vgn_gpa) {
273 		return (0);
274 	}
275 
276 	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
277 	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
278 	const uint64_t covered_end = MIN(node_end, end & mask);
279 	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);
280 
281 	return ((covered_end - covered_start) / per_entry);
282 }
283 
284 /*
285  * Find the next node (by address) in the tree at the same level.
286  *
287  * Returns NULL if this is the last node in the tree or if `only_seq` was true
288  * and there is an address gap between this node and the next.
289  */
290 static vmm_gpt_node_t *
291 vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
292 {
293 	ASSERT3P(node->vgn_parent, !=, NULL);
294 	ASSERT3U(node->vgn_level, >, LEVEL4);
295 
296 	/*
297 	 * Next node sequentially would be the one at the address starting at
298 	 * the end of what is covered by this node.
299 	 */
300 	const uint64_t gpa_match = vmm_gpt_node_end(node);
301 
302 	/* Try our next sibling */
303 	vmm_gpt_node_t *next = node->vgn_sib_next;
304 	if (next != NULL) {
305 		if (next->vgn_gpa == gpa_match || !only_seq) {
306 			return (next);
307 		}
308 	} else {
309 		/*
310 		 * If the next-sibling pointer is NULL on the node, it can mean
311 		 * one of two things:
312 		 *
313 		 * 1. This entry represents the space leading up to the trailing
314 		 *    boundary of what this node covers.
315 		 *
316 		 * 2. The node is not entirely populated, and there is a gap
317 		 *    between the last populated entry, and the trailing
318 		 *    boundary of the node.
319 		 *
320 		 * Either way, the proper course of action is to check the first
321 		 * child of our parent's next sibling.
322 		 */
323 		vmm_gpt_node_t *pibling = node->vgn_parent->vgn_sib_next;
324 		if (pibling != NULL) {
325 			next = pibling->vgn_children;
326 			if (next != NULL) {
327 				if (next->vgn_gpa == gpa_match || !only_seq) {
328 					return (next);
329 				}
330 			}
331 		}
332 	}
333 
334 	return (NULL);
335 }
336 
337 
338 /*
339  * Finds the child for the given GPA in the given parent node.
340  * Returns a pointer to node, or NULL if it is not found.
341  */
342 static vmm_gpt_node_t *
343 vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
344 {
345 	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
346 	for (vmm_gpt_node_t *child = parent->vgn_children;
347 	    child != NULL && child->vgn_index <= index;
348 	    child = child->vgn_sib_next) {
349 		if (child->vgn_index == index)
350 			return (child);
351 	}
352 
353 	return (NULL);
354 }
355 
356 /*
357  * Add a child node to the GPT at a position determined by GPA, parent, and (if
358  * present) preceding sibling.
359  *
360  * If `parent` node contains any children, `prev_sibling` must be populated with
361  * a pointer to the node preceding (by GPA) the to-be-added child node.
362  */
363 static void
364 vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
365     vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
366 {
367 	ASSERT3U(parent->vgn_level, <, LEVEL1);
368 	ASSERT3U(child->vgn_parent, ==, NULL);
369 
370 	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
371 	child->vgn_index = idx;
372 	child->vgn_level = parent->vgn_level + 1;
373 	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);
374 
375 	/* Establish familial connections */
376 	child->vgn_parent = parent;
377 	if (prev_sibling != NULL) {
378 		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);
379 
380 		child->vgn_sib_next = prev_sibling->vgn_sib_next;
381 		if (child->vgn_sib_next != NULL) {
382 			child->vgn_sib_next->vgn_sib_prev = child;
383 		}
384 		child->vgn_sib_prev = prev_sibling;
385 		prev_sibling->vgn_sib_next = child;
386 	} else if (parent->vgn_children != NULL) {
387 		vmm_gpt_node_t *next_sibling = parent->vgn_children;
388 
389 		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
390 		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);
391 
392 		child->vgn_sib_next = next_sibling;
393 		child->vgn_sib_prev = NULL;
394 		next_sibling->vgn_sib_prev = child;
395 		parent->vgn_children = child;
396 	} else {
397 		parent->vgn_children = child;
398 		child->vgn_sib_next = NULL;
399 		child->vgn_sib_prev = NULL;
400 	}
401 
402 	/* Configure PTE for child table */
403 	parent->vgn_entries[idx] =
404 	    gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
405 	parent->vgn_ref_cnt++;
406 }
407 
408 /*
409  * Remove a child node from its relatives (parent, siblings) and free it.
410  */
411 static void
412 vmm_gpt_node_remove(vmm_gpt_node_t *child)
413 {
414 	ASSERT3P(child->vgn_children, ==, NULL);
415 	ASSERT3U(child->vgn_ref_cnt, ==, 0);
416 	ASSERT3P(child->vgn_parent, !=, NULL);
417 
418 	/* Unlink child from its siblings and parent */
419 	vmm_gpt_node_t *parent = child->vgn_parent;
420 	vmm_gpt_node_t *prev = child->vgn_sib_prev;
421 	vmm_gpt_node_t *next = child->vgn_sib_next;
422 	if (prev != NULL) {
423 		ASSERT3P(prev->vgn_sib_next, ==, child);
424 		prev->vgn_sib_next = next;
425 	}
426 	if (next != NULL) {
427 		ASSERT3P(next->vgn_sib_prev, ==, child);
428 		next->vgn_sib_prev = prev;
429 	}
430 	if (prev == NULL) {
431 		ASSERT3P(parent->vgn_children, ==, child);
432 		parent->vgn_children = next;
433 	}
434 	child->vgn_parent = NULL;
435 	child->vgn_sib_next = NULL;
436 	child->vgn_sib_prev = NULL;
437 	parent->vgn_entries[child->vgn_index] = 0;
438 	parent->vgn_ref_cnt--;
439 
440 	vmm_gpt_node_free(child);
441 }
442 
443 /*
444  * Walks the GPT for the given GPA, accumulating entries to the given depth.  If
445  * the walk terminates before the depth is reached, the remaining entries are
446  * written with NULLs.
447  */
448 void
449 vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
450     vmm_gpt_node_level_t depth)
451 {
452 	uint64_t *current_entries, entry;
453 	pfn_t pfn;
454 
455 	ASSERT(gpt != NULL);
456 	current_entries = gpt->vgpt_root->vgn_entries;
457 	for (uint_t i = 0; i < depth; i++) {
458 		if (current_entries == NULL) {
459 			entries[i] = NULL;
460 			continue;
461 		}
462 		entries[i] = &current_entries[vmm_gpt_lvl_index(i, gpa)];
463 		entry = *entries[i];
464 		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
465 			current_entries = NULL;
466 			continue;
467 		}
468 		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
469 		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
470 	}
471 }
472 
473 /*
474  * Looks up an entry given GPA.
475  */
476 uint64_t *
477 vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
478 {
479 	uint64_t *entries[MAX_GPT_LEVEL];
480 
481 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
482 
483 	return (entries[LEVEL1]);
484 }
485 
486 /*
487  * Populate child table nodes for a given level between the provided interval
488  * of [addr, addr + len).  Caller is expected to provide a pointer to the parent
489  * node which would contain the child node for GPA at `addr`.  A pointer to said
490  * child node will be returned when the operation is complete.
491  */
492 static vmm_gpt_node_t *
493 vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
494     vmm_gpt_node_t *node_start)
495 {
496 	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
497 	const uint64_t end = addr + len;
498 	const uint64_t incr = vmm_gpt_lvl_len(lvl);
499 	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
500 	vmm_gpt_node_t *parent = node_start;
501 
502 	/* Try to locate node at starting address */
503 	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
504 	while (node != NULL && node->vgn_gpa < gpa) {
505 		prev = node;
506 		node = node->vgn_sib_next;
507 	}
508 
509 	/*
510 	 * If no node exists at the starting address, create one and link it
511 	 * into the parent.
512 	 */
513 	if (node == NULL || node->vgn_gpa > gpa) {
514 		/* Need to insert node for starting GPA */
515 		node = vmm_gpt_node_alloc();
516 		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
517 	}
518 
519 	vmm_gpt_node_t *front_node = node;
520 	prev = node;
521 	gpa += incr;
522 
523 	/*
524 	 * With a node at the starting address, walk forward creating nodes in
525 	 * any of the gaps.
526 	 */
527 	for (; gpa < end; gpa += incr, prev = node) {
528 		node = vmm_gpt_node_next(prev, true);
529 		if (node != NULL) {
530 			ASSERT3U(node->vgn_gpa, ==, gpa);
531 
532 			/* We may have crossed into a new parent */
533 			parent = node->vgn_parent;
534 			continue;
535 		}
536 
537 		if (vmm_gpt_node_is_last(prev)) {
538 			/*
539 			 * The node preceding this was the last one in its
540 			 * containing parent, so move on to that parent's
541 			 * sibling.  We expect (demand) that it exist already.
542 			 */
543 			parent = vmm_gpt_node_next(parent, true);
544 			ASSERT(parent != NULL);
545 
546 			/*
547 			 * Forget our previous sibling, since it is of no use
548 			 * for assigning the new node to the a now-different
549 			 * parent.
550 			 */
551 			prev = NULL;
552 
553 		}
554 		node = vmm_gpt_node_alloc();
555 		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
556 	}
557 
558 	return (front_node);
559 }
560 
561 /*
562  * Ensures that PTEs for the region of address space bounded by
563  * [addr, addr + len) exist in the tree.
564  */
565 void
566 vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
567 {
568 	ASSERT0(addr & PAGEOFFSET);
569 	ASSERT0(len & PAGEOFFSET);
570 
571 	/*
572 	 * Starting at the top of the tree, ensure that tables covering the
573 	 * requested region exist at each level.
574 	 */
575 	vmm_gpt_node_t *node = gpt->vgpt_root;
576 	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
577 		ASSERT3U(node->vgn_level, ==, lvl);
578 
579 		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
580 	}
581 
582 
583 	/*
584 	 * Establish reference counts for the soon-to-be memory PTEs which will
585 	 * be filling these LEVEL1 tables.
586 	 */
587 	uint64_t gpa = addr;
588 	const uint64_t end = addr + len;
589 	while (gpa < end) {
590 		ASSERT(node != NULL);
591 		ASSERT3U(node->vgn_level, ==, LEVEL1);
592 
593 		const uint16_t covered =
594 		    vmm_gpt_node_entries_covered(node, addr, end);
595 
596 		ASSERT(covered != 0);
597 		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
598 		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);
599 
600 		node->vgn_ref_cnt += covered;
601 
602 		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
603 		if (next != NULL) {
604 			gpa = next->vgn_gpa;
605 			node = next;
606 		} else {
607 			/*
608 			 * We do not expect to find a subsequent node after
609 			 * filling the last node in the table, completing PTE
610 			 * accounting for the specified range.
611 			 */
612 			VERIFY3U(end, <=, vmm_gpt_node_end(node));
613 			break;
614 		}
615 	}
616 }
617 
618 /*
619  * Format a PTE and install it in the provided PTE-pointer.
620  */
621 bool
622 vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
623     uint8_t attr)
624 {
625 	uint64_t entry, old_entry;
626 
627 	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
628 	old_entry = atomic_cas_64(ptep, 0, entry);
629 	if (old_entry != 0) {
630 		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
631 		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
632 		return (false);
633 	}
634 
635 	return (true);
636 }
637 
638 /*
639  * Inserts an entry for a given GPA into the table.  The caller must
640  * ensure that a conflicting PFN is not mapped at the requested location.
641  * Racing operations to map the same PFN at one location is acceptable and
642  * properly handled.
643  */
644 bool
645 vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
646 {
647 	uint64_t *entries[MAX_GPT_LEVEL];
648 
649 	ASSERT(gpt != NULL);
650 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
651 	ASSERT(entries[LEVEL1] != NULL);
652 
653 	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
654 }
655 
656 /*
657  * Cleans up the unused inner nodes in the GPT for a region of guest physical
658  * address space of [addr, addr + len).  The region must map no pages.
659  */
660 void
661 vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
662 {
663 	ASSERT0(addr & PAGEOFFSET);
664 	ASSERT0(len & PAGEOFFSET);
665 
666 	const uint64_t end = addr + len;
667 	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
668 		[LEVEL4] = gpt->vgpt_root,
669 	};
670 
671 	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
672 		node = vmm_gpt_node_find_child(starts[lvl], addr);
673 		if (node == NULL) {
674 			break;
675 		}
676 		starts[lvl + 1] = node;
677 	}
678 
679 	/*
680 	 * Starting at the bottom of the tree, ensure that PTEs for pages have
681 	 * been cleared for the region, and remove the corresponding reference
682 	 * counts from the containing LEVEL1 tables.
683 	 */
684 	uint64_t gpa = addr;
685 	node = starts[LEVEL1];
686 	while (gpa < end && node != NULL) {
687 		const uint16_t covered =
688 		    vmm_gpt_node_entries_covered(node, addr, end);
689 
690 		ASSERT3U(node->vgn_ref_cnt, >=, covered);
691 		node->vgn_ref_cnt -= covered;
692 
693 		node = vmm_gpt_node_next(node, false);
694 		if (node != NULL) {
695 			gpa = node->vgn_gpa;
696 		}
697 	}
698 
699 	/*
700 	 * With the page PTE references eliminated, work up from the bottom of
701 	 * the table, removing nodes which have no remaining references.
702 	 *
703 	 * This stops short of LEVEL4, which is the root table of the GPT.  It
704 	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
705 	 */
706 	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
707 		gpa = addr;
708 		node = starts[lvl];
709 
710 		while (gpa < end && node != NULL) {
711 			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);
712 
713 			if (node->vgn_ref_cnt == 0) {
714 				vmm_gpt_node_remove(node);
715 			}
716 			if (next != NULL) {
717 				gpa = next->vgn_gpa;
718 			}
719 			node = next;
720 		}
721 	}
722 }
723 
724 /*
725  * Remove a mapping from the table.  Returns false if the page was not mapped,
726  * otherwise returns true.
727  */
728 bool
729 vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
730 {
731 	uint64_t *entries[MAX_GPT_LEVEL], entry;
732 
733 	ASSERT(gpt != NULL);
734 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
735 	if (entries[LEVEL1] == NULL)
736 		return (false);
737 
738 	entry = *entries[LEVEL1];
739 	*entries[LEVEL1] = 0;
740 	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
741 }
742 
743 /*
744  * Un-maps the region of guest physical address space bounded by [start..end).
745  * Returns the number of pages that are unmapped.
746  */
747 size_t
748 vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
749 {
750 	ASSERT0(addr & PAGEOFFSET);
751 	ASSERT0(len & PAGEOFFSET);
752 
753 	const uint64_t end = addr + len;
754 	size_t num_unmapped = 0;
755 	for (uint64_t gpa = addr; gpa < end; gpa += PAGESIZE) {
756 		if (vmm_gpt_unmap(gpt, gpa) != 0) {
757 			num_unmapped++;
758 		}
759 	}
760 
761 	return (num_unmapped);
762 }
763 
764 /*
765  * Returns a value indicating whether or not this GPT maps the given
766  * GPA.  If the GPA is mapped, *protp will be filled with the protection
767  * bits of the entry.  Otherwise, it will be ignored.
768  */
769 bool
770 vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
771 {
772 	uint64_t entry;
773 
774 	ASSERT(pfnp != NULL);
775 	ASSERT(protp != NULL);
776 
777 	if (ptep == NULL) {
778 		return (false);
779 	}
780 	entry = *ptep;
781 	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
782 		return (false);
783 	}
784 	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
785 	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
786 	return (true);
787 }
788 
789 /*
790  * Resets the accessed bit on the page table entry pointed to be `entry`.
791  * If `on` is true, the bit will be set, otherwise it will be cleared.
792  * The old value of the bit is returned.
793  */
794 uint_t
795 vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
796 {
797 	ASSERT(entry != NULL);
798 	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
799 }
800 
801 /*
802  * Resets the dirty bit on the page table entry pointed to be `entry`.
803  * If `on` is true, the bit will be set, otherwise it will be cleared.
804  * The old value of the bit is returned.
805  */
806 uint_t
807 vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
808 {
809 	ASSERT(entry != NULL);
810 	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
811 }
812 
813 /*
814  * Query state from PTE pointed to by `entry`.
815  */
816 bool
817 vmm_gpt_query(vmm_gpt_t *gpt, uint64_t *entry, vmm_gpt_query_t query)
818 {
819 	ASSERT(entry != NULL);
820 	return (gpt->vgpt_pte_ops->vpeo_query(entry, query));
821 }
822 
823 /*
824  * Get properly formatted PML4 (EPTP/nCR3) for GPT.
825  */
826 uint64_t
827 vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
828 {
829 	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
830 	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
831 }
832 
833 /*
834  * Does the GPT hardware support dirty-page-tracking?
835  */
836 bool
837 vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
838 {
839 	return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
840 }
841