/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space.  But the two use different and
 * mutually incompatible page table formats: Intel uses the EPT, which is based
 * on the Itanium page table format, while AMD uses the nPT, which is based on
 * the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and keeps a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings.  The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has.  It also owns the
 * physical memory page for the hardware page table entries that map its
 * children.  Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index $i$ is the i'th uint64_t in that node's entry
 * page and the entry page is part of the paging structure consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree for
 * regions in the guest physical address space, and for mapping and unmapping
 * pages in populated regions.  Users must populate a region before mapping
 * pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs.  This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor.  The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform.  The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
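
/*
 * An illustrative sketch of the expected call sequence for a consumer of this
 * interface (not a contract; locking and error handling are omitted, and
 * `pte_ops`, `gpa`, `len`, `pfn`, `prot`, and `attr` stand in for
 * caller-supplied values):
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc(pte_ops);
 *
 *	vmm_gpt_populate_region(gpt, gpa, len);
 *	(void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *	...
 *	(void) vmm_gpt_unmap_region(gpt, gpa, len);
 *	vmm_gpt_vacate_region(gpt, gpa, len);
 *
 *	vmm_gpt_free(gpt);
 */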

/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * siblings.  Interior nodes also maintain a reference count, and each node
 * contains its level and index in its parent's table.  Finally, each node
 * contains the host PFN of the page that it links into the page table, as well
 * as a kernel pointer to the table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace.  This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_sib_next;
	vmm_gpt_node_t	*vgn_sib_prev;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
};
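
/*
 * For reference, the arithmetic behind the cache-line note above (on the
 * LP64 kernel): 8 (vgn_host_pfn) + 2 + 2 + 4 (vgn_level, vgn_index,
 * vgn_ref_cnt) + 4 * 8 (the four node pointers) + 8 (vgn_entries) +
 * 8 (vgn_gpa) = 64 bytes.
 */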

/* Maximum node index determined by number of entries in page table (512) */
#define	PTE_PER_TABLE	512
#define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD).  It
 * contains a pointer to the root node in the table, and is parameterized
 * with a set of PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees a given node.  The node is expected to have no familial (parent,
 * children, siblings) associations at this point.  Accordingly, its reference
 * count should be zero.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	ASSERT(node->vgn_parent == NULL);

	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees a vmm_gpt_t.  Any lingering nodes in the GPT will be freed too.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	/* Empty anything remaining in the tree */
	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);

	VERIFY(gpt->vgpt_root != NULL);
	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);

	vmm_gpt_node_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Given a GPA, return its corresponding index in a paging structure at the
 * provided level.
 */
static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
{
	ASSERT(level < MAX_GPT_LEVEL);

	const uint16_t mask = (1U << 9) - 1;
	switch (level) {
	case LEVEL4: return ((gpa >> 39) & mask);
	case LEVEL3: return ((gpa >> 30) & mask);
	case LEVEL2: return ((gpa >> 21) & mask);
	case LEVEL1: return ((gpa >> 12) & mask);
	default:
		panic("impossible level value");
	};
}

/* Get mask for addresses of entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0xffffff8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0xffffffffc0000000ul);	/* entries cover 1G */
	case LEVEL2: return (0xffffffffffe00000ul);	/* entries cover 2M */
	case LEVEL1: return (0xfffffffffffff000ul);	/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get length of GPA covered by entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0x8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0x40000000ul);	/* entries cover 1G */
	case LEVEL2: return (0x200000ul);	/* entries cover 2M */
	case LEVEL1: return (0x1000ul);		/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}
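
/*
 * A worked example for the three helpers above: for a GPA of 0x40201000
 * (1 GiB + 2 MiB + 4 KiB), vmm_gpt_lvl_index() yields indices 0, 1, 1, and 1
 * at LEVEL4 through LEVEL1, vmm_gpt_lvl_mask(LEVEL2) trims the address to its
 * 2 MiB boundary (0x40200000), and vmm_gpt_lvl_len(LEVEL2) gives the 0x200000
 * bytes covered by a single LEVEL2 entry.
 */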

/*
 * Get the ending GPA which this node could possibly cover given its base
 * address and level.
 */
static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t *node)
{
	ASSERT(node->vgn_level > LEVEL4);
	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
}

/*
 * Is this node the last entry in its parent node, based solely on its GPA?
 */
static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t *node)
{
	return (node->vgn_index == MAX_NODE_IDX);
}

/*
 * How many table entries (if any) in this node are covered by the range of
 * [start, end).
 */
static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
{
	const uint64_t node_end = vmm_gpt_node_end(node);

	/* Is this node covered at all by the region? */
	if (start >= node_end || end <= node->vgn_gpa) {
		return (0);
	}

	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
	const uint64_t covered_end = MIN(node_end, end & mask);
	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);

	return ((covered_end - covered_start) / per_entry);
}
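
/*
 * As an example of the computation above: a LEVEL1 node with a vgn_gpa of 0
 * covers [0, 0x200000), so for a region of [0x1000, 0x5000) it reports
 * (0x5000 - 0x1000) / 0x1000 = 4 entries covered.
 */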

/*
 * Find the next node (by address) in the tree at the same level.
 *
 * Returns NULL if this is the last node in the tree or if `only_seq` was true
 * and there is an address gap between this node and the next.
 */
static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
{
	ASSERT3P(node->vgn_parent, !=, NULL);
	ASSERT3U(node->vgn_level, >, LEVEL4);

	/*
	 * Next node sequentially would be the one at the address starting at
	 * the end of what is covered by this node.
	 */
	const uint64_t gpa_match = vmm_gpt_node_end(node);

	/* Try our next sibling */
	vmm_gpt_node_t *next = node->vgn_sib_next;

	if (next == NULL) {
		/*
		 * If the next-sibling pointer is NULL on the node, it can mean
		 * one of two things:
		 *
		 * 1. This entry represents the space leading up to the trailing
		 *    boundary of what this node covers.
		 *
		 * 2. The node is not entirely populated, and there is a gap
		 *    between the last populated entry, and the trailing
		 *    boundary of the node.
		 *
		 * Either way, the proper course of action is to check the first
		 * child of our parent's next sibling (if the parent is not the
		 * root of the GPT itself).
		 */
		if (node->vgn_parent != NULL && node->vgn_level > LEVEL3) {
			vmm_gpt_node_t *psibling =
			    vmm_gpt_node_next(node->vgn_parent, true);
			if (psibling != NULL) {
				next = psibling->vgn_children;
			}
		}
	}

	if (next != NULL &&
	    (next->vgn_gpa == gpa_match || !only_seq)) {
		return (next);
	}

	return (NULL);
}
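
/*
 * An example of the `only_seq` distinction above: with LEVEL3 nodes populated
 * at 1 GiB and 3 GiB (and nothing at 2 GiB), asking for the node after the
 * 1 GiB node returns NULL when `only_seq` is true, since its sibling at 3 GiB
 * does not begin at the 2 GiB boundary where the 1 GiB node's coverage ends;
 * with `only_seq` false, that 3 GiB sibling is returned.
 */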

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to the node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_sib_next) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Add a child node to the GPT at a position determined by GPA, parent, and (if
 * present) preceding sibling.
 *
 * If `parent` node contains any children, `prev_sibling` must be populated with
 * a pointer to the node preceding (by GPA) the to-be-added child node.
 */
static void
vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
{
	ASSERT3U(parent->vgn_level, <, LEVEL1);
	ASSERT3U(child->vgn_parent, ==, NULL);

	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	child->vgn_index = idx;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);

	/* Establish familial connections */
	child->vgn_parent = parent;
	if (prev_sibling != NULL) {
		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);

		child->vgn_sib_next = prev_sibling->vgn_sib_next;
		if (child->vgn_sib_next != NULL) {
			child->vgn_sib_next->vgn_sib_prev = child;
		}
		child->vgn_sib_prev = prev_sibling;
		prev_sibling->vgn_sib_next = child;
	} else if (parent->vgn_children != NULL) {
		vmm_gpt_node_t *next_sibling = parent->vgn_children;

		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);

		child->vgn_sib_next = next_sibling;
		child->vgn_sib_prev = NULL;
		next_sibling->vgn_sib_prev = child;
		parent->vgn_children = child;
	} else {
		parent->vgn_children = child;
		child->vgn_sib_next = NULL;
		child->vgn_sib_prev = NULL;
	}

	/* Configure PTE for child table */
	parent->vgn_entries[idx] =
	    gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent->vgn_ref_cnt++;
}

/*
 * Remove a child node from its relatives (parent, siblings) and free it.
 */
static void
vmm_gpt_node_remove(vmm_gpt_node_t *child)
{
	ASSERT3P(child->vgn_children, ==, NULL);
	ASSERT3U(child->vgn_ref_cnt, ==, 0);
	ASSERT3P(child->vgn_parent, !=, NULL);

	/* Unlink child from its siblings and parent */
	vmm_gpt_node_t *parent = child->vgn_parent;
	vmm_gpt_node_t *prev = child->vgn_sib_prev;
	vmm_gpt_node_t *next = child->vgn_sib_next;
	if (prev != NULL) {
		ASSERT3P(prev->vgn_sib_next, ==, child);
		prev->vgn_sib_next = next;
	}
	if (next != NULL) {
		ASSERT3P(next->vgn_sib_prev, ==, child);
		next->vgn_sib_prev = prev;
	}
	if (prev == NULL) {
		ASSERT3P(parent->vgn_children, ==, child);
		parent->vgn_children = next;
	}
	child->vgn_parent = NULL;
	child->vgn_sib_next = NULL;
	child->vgn_sib_prev = NULL;
	parent->vgn_entries[child->vgn_index] = 0;
	parent->vgn_ref_cnt--;

	vmm_gpt_node_free(child);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.  If
 * the walk terminates before the depth is reached, the remaining entries are
 * written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    vmm_gpt_node_level_t depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_lvl_index(i, gpa)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

/*
 * Looks up an entry given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}

/*
 * Populate child table nodes for a given level covering the provided interval
 * of [addr, addr + len).  Caller is expected to provide a pointer to the parent
 * node which would contain the child node for GPA at `addr`.  A pointer to said
 * child node will be returned when the operation is complete.
 */
static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    vmm_gpt_node_t *node_start)
{
	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
	const uint64_t end = addr + len;
	const uint64_t incr = vmm_gpt_lvl_len(lvl);
	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
	vmm_gpt_node_t *parent = node_start;

	/* Try to locate node at starting address */
	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
	while (node != NULL && node->vgn_gpa < gpa) {
		prev = node;
		node = node->vgn_sib_next;
	}

	/*
	 * If no node exists at the starting address, create one and link it
	 * into the parent.
	 */
	if (node == NULL || node->vgn_gpa > gpa) {
		/* Need to insert node for starting GPA */
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	vmm_gpt_node_t *front_node = node;
	prev = node;
	gpa += incr;

	/*
	 * With a node at the starting address, walk forward creating nodes in
	 * any of the gaps.
	 */
	for (; gpa < end; gpa += incr, prev = node) {
		node = vmm_gpt_node_next(prev, true);
		if (node != NULL) {
			ASSERT3U(node->vgn_gpa, ==, gpa);

			/* We may have crossed into a new parent */
			parent = node->vgn_parent;
			continue;
		}

		if (vmm_gpt_node_is_last(prev)) {
			/*
			 * The node preceding this was the last one in its
			 * containing parent, so move on to that parent's
			 * sibling.  We expect (demand) that it exist already.
			 */
			parent = vmm_gpt_node_next(parent, true);
			ASSERT(parent != NULL);

			/*
			 * Forget our previous sibling, since it is of no use
			 * for assigning the new node to the now-different
			 * parent.
			 */
			prev = NULL;
		}
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	return (front_node);
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [addr, addr + len) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	/*
	 * Starting at the top of the tree, ensure that tables covering the
	 * requested region exist at each level.
	 */
	vmm_gpt_node_t *node = gpt->vgpt_root;
	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		ASSERT3U(node->vgn_level, ==, lvl);

		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
	}

	/*
	 * Establish reference counts for the soon-to-be memory PTEs which will
	 * be filling these LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	const uint64_t end = addr + len;
	while (gpa < end) {
		ASSERT(node != NULL);
		ASSERT3U(node->vgn_level, ==, LEVEL1);

		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT(covered != 0);
		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);

		node->vgn_ref_cnt += covered;

		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
		if (next != NULL) {
			gpa = next->vgn_gpa;
			node = next;
		} else {
			/*
			 * We do not expect to find a subsequent node after
			 * filling the last node in the table, completing PTE
			 * accounting for the specified range.
			 */
			VERIFY3U(end, <=, vmm_gpt_node_end(node));
			break;
		}
	}
}

/*
 * Format a PTE and install it in the provided PTE-pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}

/*
 * Inserts an entry for a given GPA into the table.  The caller must
 * ensure that a conflicting PFN is not mapped at the requested location.
 * Racing operations to map the same PFN at one location is acceptable and
 * properly handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [addr, addr + len).  The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
		[LEVEL4] = gpt->vgpt_root,
	};

	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		node = vmm_gpt_node_find_child(starts[lvl], addr);
		if (node == NULL) {
			break;
		}
		starts[lvl + 1] = node;
	}

	/*
	 * Starting at the bottom of the tree, ensure that PTEs for pages have
	 * been cleared for the region, and remove the corresponding reference
	 * counts from the containing LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	node = starts[LEVEL1];
	while (gpa < end && node != NULL) {
		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT3U(node->vgn_ref_cnt, >=, covered);
		node->vgn_ref_cnt -= covered;

		node = vmm_gpt_node_next(node, false);
		if (node != NULL) {
			gpa = node->vgn_gpa;
		}
	}

	/*
	 * With the page PTE references eliminated, work up from the bottom of
	 * the table, removing nodes which have no remaining references.
	 *
	 * This stops short of LEVEL4, which is the root table of the GPT.  It
	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
	 */
	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
		gpa = addr;
		node = starts[lvl];

		while (gpa < end && node != NULL) {
			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);

			if (node->vgn_ref_cnt == 0) {
				vmm_gpt_node_remove(node);
			}
			if (next != NULL) {
				gpa = next->vgn_gpa;
			}
			node = next;
		}
	}
}

/*
 * Remove a mapping from the table.  Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [addr, addr + len).  Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	size_t num_unmapped = 0;
	for (uint64_t gpa = addr; gpa < end; gpa += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, gpa) != 0) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}

/*
 * Returns a value indicating whether or not the PTE pointed to by `ptep`
 * (as produced by a walk or lookup of this GPT) maps a page.  If it does,
 * *pfnp and *protp will be filled with the PFN and protection bits of the
 * entry.  Otherwise, they are ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	ASSERT(pfnp != NULL);
	ASSERT(protp != NULL);

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}
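
/*
 * A minimal sketch of how the lookup and query functions above combine,
 * assuming `gpt` and `gpa` are provided by the caller:
 *
 *	uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *	pfn_t pfn;
 *	uint_t prot;
 *
 *	if (vmm_gpt_is_mapped(gpt, ptep, &pfn, &prot)) {
 *		... the GPA is backed by `pfn` with protections `prot` ...
 *	}
 */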

/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}
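
/*
 * An illustrative sketch of how a dirty-page scan could be built on the
 * primitive above, assuming `gpt` and `gpa` come from the caller and
 * mark_dirty() is a hypothetical caller-supplied routine (not how any
 * particular consumer is required to use it):
 *
 *	uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *
 *	if (ptep != NULL && vmm_gpt_reset_dirty(gpt, ptep, false) != 0) {
 *		mark_dirty(gpa);
 *	}
 */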

/*
 * Query state from PTE pointed to by `entry`.
 */
bool
vmm_gpt_query(vmm_gpt_t *gpt, uint64_t *entry, vmm_gpt_query_t query)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_query(entry, query));
}

/*
 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}

/*
 * Does the GPT hardware support dirty-page-tracking?
 */
bool
vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
{
	return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
}