/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space. But the two use different
 * and mutually incompatible page table formats: Intel uses the EPT, which is
 * based on the Itanium page table format, while AMD uses the nPT, which is
 * based on the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and keeps a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings. The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has. It also owns the
 * physical memory page for the hardware page table entries that map its
 * children. Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index i is the i'th uint64_t in that node's entry
 * page, and the entry page is part of the paging structure consumed by
 * hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions. Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root
 * to a leaf entry, populating an array of pointers to PTEs. This walk uses
 * the hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor. The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform. The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
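
/*
 * Illustrative call sequence (a simplified sketch, not a complete caller:
 * error handling and locking are omitted, and the vmm_pte_ops_t
 * implementation is assumed to be supplied by the platform backend for EPT
 * or nPT; these calls are normally issued by the vmspace code):
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc(pte_ops);
 *
 *	vmm_gpt_populate_region(gpt, gpa, len);
 *	(void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *
 *	... guest runs against vmm_gpt_get_pmtp(gpt, track_dirty) ...
 *
 *	(void) vmm_gpt_unmap_region(gpt, gpa, len);
 *	vmm_gpt_vacate_region(gpt, gpa, len);
 *	vmm_gpt_free(gpt);
 */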

/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * siblings. Interior nodes also maintain a reference count, and each node
 * contains its level and index in its parent's table. Finally, each node
 * contains the host PFN of the page that it links into the page table, as
 * well as a kernel pointer to that table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace. This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_sib_next;
	vmm_gpt_node_t	*vgn_sib_prev;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
};
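
/*
 * Field size accounting for the cache-line claim above (LP64): the PFN and
 * GPA are 8 bytes each, the level/index/refcnt fields pack into 8, the four
 * node pointers are 32, and the entries pointer is 8, for a total of 64
 * bytes.
 */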

/* Maximum node index determined by number of entries in page table (512) */
#define	PTE_PER_TABLE	512
#define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD). It holds a
 * pointer to the root node in the table and is parameterized with a set of
 * PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees a given node. The node is expected to have no familial (parent,
 * children, siblings) associations at this point. Accordingly, its reference
 * count should be zero.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	ASSERT(node->vgn_parent == NULL);

	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees a vmm_gpt_t. Any lingering nodes in the GPT will be freed too.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	/* Empty anything remaining in the tree */
	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);

	VERIFY(gpt->vgpt_root != NULL);
	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);

	vmm_gpt_node_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Given a GPA, return its corresponding index in a paging structure at the
 * provided level.
 */
static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
{
	ASSERT(level < MAX_GPT_LEVEL);

	const uint16_t mask = (1U << 9) - 1;
	switch (level) {
	case LEVEL4: return ((gpa >> 39) & mask);
	case LEVEL3: return ((gpa >> 30) & mask);
	case LEVEL2: return ((gpa >> 21) & mask);
	case LEVEL1: return ((gpa >> 12) & mask);
	default:
		panic("impossible level value");
	};
}
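
/*
 * For example, a GPA of (1UL << 39) | (2UL << 30) | (3UL << 21) | (4UL << 12)
 * (0x8080604000) yields indices 1, 2, 3, and 4 at LEVEL4 through LEVEL1
 * respectively, matching the 9-bit index fields of the 4-level x86 paging
 * structure.
 */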

/* Get mask for addresses of entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0xffffff8000000000ul); /* entries cover 512G */
	case LEVEL3: return (0xffffffffc0000000ul); /* entries cover 1G */
	case LEVEL2: return (0xffffffffffe00000ul); /* entries cover 2M */
	case LEVEL1: return (0xfffffffffffff000ul); /* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get length of GPA covered by entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0x8000000000ul); /* entries cover 512G */
	case LEVEL3: return (0x40000000ul); /* entries cover 1G */
	case LEVEL2: return (0x200000ul); /* entries cover 2M */
	case LEVEL1: return (0x1000ul); /* entries cover 4K */
	default:
		panic("impossible level value");
	};
}
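
/*
 * Note that vmm_gpt_lvl_mask(level) == ~(vmm_gpt_lvl_len(level) - 1) for
 * every level: the mask rounds a GPA down to the boundary of the region
 * covered by a single entry at that level.
 */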

/*
 * Get the ending GPA which this node could possibly cover given its base
 * address and level.
 */
static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t *node)
{
	ASSERT(node->vgn_level > LEVEL4);
	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
}

/*
 * Is this node the last entry in its parent's table (that is, at the maximum
 * index)?
 */
static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t *node)
{
	return (node->vgn_index == MAX_NODE_IDX);
}

/*
 * How many table entries (if any) in this node are covered by the range of
 * [start, end).
 */
static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
{
	const uint64_t node_end = vmm_gpt_node_end(node);

	/* Is this node covered at all by the region? */
	if (start >= node_end || end <= node->vgn_gpa) {
		return (0);
	}

	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
	const uint64_t covered_end = MIN(node_end, end & mask);
	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);

	return ((covered_end - covered_start) / per_entry);
}
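
/*
 * As an illustration, a LEVEL1 node at GPA 0x200000 covers
 * [0x200000, 0x400000) in 4K entries; for a request of [0x1f8000, 0x204000)
 * the covered range clamps to [0x200000, 0x204000), i.e. 4 entries.
 */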

/*
 * Find the next node (by address) in the tree at the same level.
 *
 * Returns NULL if this is the last node in the tree or if `only_seq` was true
 * and there is an address gap between this node and the next.
 */
static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
{
	ASSERT3P(node->vgn_parent, !=, NULL);
	ASSERT3U(node->vgn_level, >, LEVEL4);

	/*
	 * Next node sequentially would be the one at the address starting at
	 * the end of what is covered by this node.
	 */
	const uint64_t gpa_match = vmm_gpt_node_end(node);

	/* Try our next sibling */
	vmm_gpt_node_t *next = node->vgn_sib_next;
	if (next != NULL) {
		if (next->vgn_gpa == gpa_match || !only_seq) {
			return (next);
		}
	} else {
		/*
		 * If the next-sibling pointer is NULL on the node, it can mean
		 * one of two things:
		 *
		 * 1. This entry represents the space leading up to the
		 *    trailing boundary of what this node covers.
		 *
		 * 2. The node is not entirely populated, and there is a gap
		 *    between the last populated entry, and the trailing
		 *    boundary of the node.
		 *
		 * Either way, the proper course of action is to check the
		 * first child of our parent's next sibling.
		 */
		vmm_gpt_node_t *pibling = node->vgn_parent->vgn_sib_next;
		if (pibling != NULL) {
			next = pibling->vgn_children;
			if (next != NULL) {
				if (next->vgn_gpa == gpa_match || !only_seq) {
					return (next);
				}
			}
		}
	}

	return (NULL);
}
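
/*
 * For example, given a LEVEL1 node covering [0x200000, 0x400000) whose next
 * sibling begins at 0x600000, vmm_gpt_node_next() returns NULL when
 * `only_seq` is true (there is a gap at 0x400000), but returns that sibling
 * when `only_seq` is false.
 */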

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_sib_next) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Add a child node to the GPT at a position determined by GPA, parent, and
 * (if present) preceding sibling.
 *
 * If `parent` node contains any children, `prev_sibling` must be populated
 * with a pointer to the node preceding (by GPA) the to-be-added child node.
 */
static void
vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
{
	ASSERT3U(parent->vgn_level, <, LEVEL1);
	ASSERT3U(child->vgn_parent, ==, NULL);

	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	child->vgn_index = idx;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);

	/* Establish familial connections */
	child->vgn_parent = parent;
	if (prev_sibling != NULL) {
		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);

		child->vgn_sib_next = prev_sibling->vgn_sib_next;
		if (child->vgn_sib_next != NULL) {
			child->vgn_sib_next->vgn_sib_prev = child;
		}
		child->vgn_sib_prev = prev_sibling;
		prev_sibling->vgn_sib_next = child;
	} else if (parent->vgn_children != NULL) {
		vmm_gpt_node_t *next_sibling = parent->vgn_children;

		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);

		child->vgn_sib_next = next_sibling;
		child->vgn_sib_prev = NULL;
		next_sibling->vgn_sib_prev = child;
		parent->vgn_children = child;
	} else {
		parent->vgn_children = child;
		child->vgn_sib_next = NULL;
		child->vgn_sib_prev = NULL;
	}

	/* Configure PTE for child table */
	parent->vgn_entries[idx] =
	    gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent->vgn_ref_cnt++;
}

/*
 * Remove a child node from its relatives (parent, siblings) and free it.
 */
static void
vmm_gpt_node_remove(vmm_gpt_node_t *child)
{
	ASSERT3P(child->vgn_children, ==, NULL);
	ASSERT3U(child->vgn_ref_cnt, ==, 0);
	ASSERT3P(child->vgn_parent, !=, NULL);

	/* Unlink child from its siblings and parent */
	vmm_gpt_node_t *parent = child->vgn_parent;
	vmm_gpt_node_t *prev = child->vgn_sib_prev;
	vmm_gpt_node_t *next = child->vgn_sib_next;
	if (prev != NULL) {
		ASSERT3P(prev->vgn_sib_next, ==, child);
		prev->vgn_sib_next = next;
	}
	if (next != NULL) {
		ASSERT3P(next->vgn_sib_prev, ==, child);
		next->vgn_sib_prev = prev;
	}
	if (prev == NULL) {
		ASSERT3P(parent->vgn_children, ==, child);
		parent->vgn_children = next;
	}
	child->vgn_parent = NULL;
	child->vgn_sib_next = NULL;
	child->vgn_sib_prev = NULL;
	parent->vgn_entries[child->vgn_index] = 0;
	parent->vgn_ref_cnt--;

	vmm_gpt_node_free(child);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.
 * If the walk terminates before the depth is reached, the remaining entries
 * are written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    vmm_gpt_node_level_t depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_lvl_index(i, gpa)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

/*
 * Looks up an entry given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}

/*
 * Populate child table nodes for a given level between the provided interval
 * of [addr, addr + len). Caller is expected to provide a pointer to the
 * parent node which would contain the child node for GPA at `addr`. A pointer
 * to said child node will be returned when the operation is complete.
 */
static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    vmm_gpt_node_t *node_start)
{
	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
	const uint64_t end = addr + len;
	const uint64_t incr = vmm_gpt_lvl_len(lvl);
	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
	vmm_gpt_node_t *parent = node_start;

	/* Try to locate node at starting address */
	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
	while (node != NULL && node->vgn_gpa < gpa) {
		prev = node;
		node = node->vgn_sib_next;
	}

	/*
	 * If no node exists at the starting address, create one and link it
	 * into the parent.
	 */
	if (node == NULL || node->vgn_gpa > gpa) {
		/* Need to insert node for starting GPA */
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	vmm_gpt_node_t *front_node = node;
	prev = node;
	gpa += incr;

	/*
	 * With a node at the starting address, walk forward creating nodes in
	 * any of the gaps.
	 */
	for (; gpa < end; gpa += incr, prev = node) {
		node = vmm_gpt_node_next(prev, true);
		if (node != NULL) {
			ASSERT3U(node->vgn_gpa, ==, gpa);

			/* We may have crossed into a new parent */
			parent = node->vgn_parent;
			continue;
		}

		if (vmm_gpt_node_is_last(prev)) {
			/*
			 * The node preceding this was the last one in its
			 * containing parent, so move on to that parent's
			 * sibling. We expect (demand) that it exist already.
			 */
			parent = vmm_gpt_node_next(parent, true);
			ASSERT(parent != NULL);

			/*
			 * Forget our previous sibling, since it is of no use
			 * for assigning the new node to the now-different
			 * parent.
			 */
			prev = NULL;
		}
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	return (front_node);
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [addr, addr + len) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	/*
	 * Starting at the top of the tree, ensure that tables covering the
	 * requested region exist at each level.
	 */
	vmm_gpt_node_t *node = gpt->vgpt_root;
	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		ASSERT3U(node->vgn_level, ==, lvl);

		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
	}

	/*
	 * Establish reference counts for the soon-to-be memory PTEs which
	 * will be filling these LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	const uint64_t end = addr + len;
	while (gpa < end) {
		ASSERT(node != NULL);
		ASSERT3U(node->vgn_level, ==, LEVEL1);

		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT(covered != 0);
		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);

		node->vgn_ref_cnt += covered;

		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
		if (next != NULL) {
			gpa = next->vgn_gpa;
			node = next;
		} else {
			/*
			 * We do not expect to find a subsequent node after
			 * filling the last node in the table, completing PTE
			 * accounting for the specified range.
			 */
			VERIFY3U(end, <=, vmm_gpt_node_end(node));
			break;
		}
	}
}

/*
 * Format a PTE and install it in the provided PTE-pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}

/*
 * Inserts an entry for a given GPA into the table. The caller must ensure
 * that a conflicting PFN is not mapped at the requested location. Racing
 * operations to map the same PFN at one location are acceptable and properly
 * handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [addr, addr + len). The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
		[LEVEL4] = gpt->vgpt_root,
	};

	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		node = vmm_gpt_node_find_child(starts[lvl], addr);
		if (node == NULL) {
			break;
		}
		starts[lvl + 1] = node;
	}

	/*
	 * Starting at the bottom of the tree, ensure that PTEs for pages have
	 * been cleared for the region, and remove the corresponding reference
	 * counts from the containing LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	node = starts[LEVEL1];
	while (gpa < end && node != NULL) {
		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT3U(node->vgn_ref_cnt, >=, covered);
		node->vgn_ref_cnt -= covered;

		node = vmm_gpt_node_next(node, false);
		if (node != NULL) {
			gpa = node->vgn_gpa;
		}
	}

	/*
	 * With the page PTE references eliminated, work up from the bottom of
	 * the table, removing nodes which have no remaining references.
	 *
	 * This stops short of LEVEL4, which is the root table of the GPT. It
	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
	 */
	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
		gpa = addr;
		node = starts[lvl];

		while (gpa < end && node != NULL) {
			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);

			if (node->vgn_ref_cnt == 0) {
				vmm_gpt_node_remove(node);
			}
			if (next != NULL) {
				gpa = next->vgn_gpa;
			}
			node = next;
		}
	}
}

/*
 * Remove a mapping from the table. Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [addr, addr + len). Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	size_t num_unmapped = 0;
	for (uint64_t gpa = addr; gpa < end; gpa += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, gpa) != 0) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}

/*
 * Returns a value indicating whether or not this GPT maps the given GPA, as
 * determined from the PTE pointed to by `ptep` (which may be NULL). If the
 * GPA is mapped, *pfnp and *protp will be filled with the PFN and protection
 * bits of the entry. Otherwise, they are ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	ASSERT(pfnp != NULL);
	ASSERT(protp != NULL);

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}

/*
 * Query state from PTE pointed to by `entry`.
 */
bool
vmm_gpt_query(vmm_gpt_t *gpt, uint64_t *entry, vmm_gpt_query_t query)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_query(entry, query));
}

/*
 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}

/*
 * Does the GPT hardware support dirty-page-tracking?
 */
bool
vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
{
	return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
}