xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_gpt.c (revision badf94ff3599fab15963f6c532929e9bc411757a)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2022 Oxide Computer Company
15  */
16 
17 #include <sys/types.h>
18 #include <sys/malloc.h>
19 #include <sys/atomic.h>
20 #include <sys/kmem.h>
21 #include <sys/sysmacros.h>
22 #include <sys/sunddi.h>
23 #include <sys/panic.h>
24 #include <vm/hat.h>
25 #include <vm/as.h>
26 #include <vm/hat_i86.h>
27 
28 #include <sys/vmm_gpt.h>
29 
30 /*
31  * VMM Generic Page Tables
32  *
33  * Bhyve runs on AMD and Intel hosts and both support nested page tables
34  * describing the guest's physical address space.  But the two use different and
35  * mutually incompatible page table formats: Intel uses the EPT, which is based
36  * on the Itanium page table format, while AMD uses the nPT, which is based on
37  * the x86_64 page table format.
38  *
39  * The GPT abstracts these format differences, and provides a single interface
40  * for interacting with either kind of table structure.
41  *
42  * At a high-level, the GPT is a tree that mirrors the paging table radix tree.
43  * It is parameterized with operations on PTEs that are specific to the table
44  * type (EPT or nPT) and also keeps track of how many pages the table maps, as
45  * well as a pointer to the root node in the tree.
46  *
47  * A node in the GPT keep pointers to its parent (NULL for the root), its
48  * left-most child, and its rightward siblings.  The node understands its
49  * position in the tree in terms of its level it appears at and the index it
50  * occupies at its parent's level, as well as how many children it has.  It also
51  * owns the physical memory page for the hardware page table entries that map
52  * its children.  Thus, for a node at any given level in the tree, the nested
53  * PTE for that node's child at index $i$ is the i'th uint64_t in that node's
54  * entry page and the entry page is part of the paging structure consumed by
55  * hardware.
56  *
57  * The GPT interface provides functions for populating and vacating the tree for
58  * regions in the guest physical address space, and for mapping and unmapping
59  * pages in populated regions.  Users must populate a region before mapping
60  * pages into it, and must unmap pages before vacating the region.
61  *
62  * The interface also exposes a function for walking the table from the root to
63  * a leaf entry, populating an array of pointers to PTEs.  This walk uses the
64  * hardware page structure itself, and is thus fast, though as a result it
65  * potentially aliases entries; caveat emptor.  The walk primitive is used for
66  * mapping, unmapping, and lookups.
67  *
68  * Format-specific differences are abstracted by parameterizing the GPT with a
69  * set of PTE operations specific to the platform.  The GPT code makes use of
70  * these when mapping or populating entries, resetting accessed and dirty bits
71  * on entries, and similar operations.
72  */
73 
74 /*
75  * A GPT node.
76  *
77  * Each node contains pointers to its parent, its left-most child, and its
78  * rightward siblings.  Interior nodes also maintain a reference count, and
79  * each node contains its level and index in its parent's table.  Finally,
80  * each node contains the host PFN of the page that it links into the page
81  * table, as well as a kernel pointer to table.
82  *
83  * On leaf nodes, the reference count tracks how many entries in the table are
84  * covered by mapping from the containing vmspace.  This is maintained during
85  * calls to vmm_populate_region() and vmm_gpt_vacate_region() as part of vmspace
86  * map/unmap operations, rather than in the data path of faults populating the
87  * PTEs themselves.
88  *
89  * Note, this is carefully sized to fit exactly into a 64-byte cache line.
90  */
91 typedef struct vmm_gpt_node vmm_gpt_node_t;
92 struct vmm_gpt_node {
93 	uint64_t	vgn_host_pfn;
94 	uint16_t	vgn_level;
95 	uint16_t	vgn_index;
96 	uint32_t	vgn_ref_cnt;
97 	vmm_gpt_node_t	*vgn_parent;
98 	vmm_gpt_node_t	*vgn_children;
99 	vmm_gpt_node_t	*vgn_siblings;
100 	uint64_t	*vgn_entries;
101 	uint64_t	vgn_gpa;
102 	uint64_t	_vgn_pad;
103 };
104 
105 /*
106  * A VMM Generic Page Table.
107  *
108  * The generic page table is a format-agnostic, 4-level paging structure
109  * modeling a second-level page table (EPT on Intel, nPT on AMD).  It
110  * contains a counter of pages the table maps, a pointer to the root node
111  * in the table, and is parameterized with a set of PTE operations specific
112  * to the table type.
113  */
114 struct vmm_gpt {
115 	vmm_gpt_node_t	*vgpt_root;
116 	vmm_pte_ops_t	*vgpt_pte_ops;
117 };
118 
119 /*
120  * VMM Guest Page Tables
121  */
122 
123 /*
124  * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
125  * hold the PTEs it contains.
126  */
127 static vmm_gpt_node_t *
128 vmm_gpt_node_alloc(void)
129 {
130 	vmm_gpt_node_t *node;
131 	caddr_t page;
132 
133 	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
134 	/*
135 	 * Note: despite the man page, allocating PAGESIZE bytes is
136 	 * guaranteed to be page-aligned.
137 	 */
138 	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
139 	node->vgn_entries = (uint64_t *)page;
140 	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);
141 
142 	return (node);
143 }
144 
145 /*
146  * Allocates and initializes a vmm_gpt_t.
147  */
148 vmm_gpt_t *
149 vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
150 {
151 	vmm_gpt_t *gpt;
152 
153 	VERIFY(pte_ops != NULL);
154 	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
155 	gpt->vgpt_pte_ops = pte_ops;
156 	gpt->vgpt_root = vmm_gpt_node_alloc();
157 
158 	return (gpt);
159 }
160 
161 /*
162  * Frees the given node, first nulling out all of its links to other nodes in
163  * the tree, adjusting its parents reference count, and unlinking itself from
164  * its parents page table.
165  */
166 static void
167 vmm_gpt_node_free(vmm_gpt_node_t *node)
168 {
169 	ASSERT(node != NULL);
170 	ASSERT3U(node->vgn_ref_cnt, ==, 0);
171 	ASSERT(node->vgn_host_pfn != PFN_INVALID);
172 	ASSERT(node->vgn_entries != NULL);
173 	if (node->vgn_parent != NULL) {
174 		uint64_t *parent_entries = node->vgn_parent->vgn_entries;
175 		parent_entries[node->vgn_index] = 0;
176 		node->vgn_parent->vgn_ref_cnt--;
177 	}
178 	kmem_free(node->vgn_entries, PAGESIZE);
179 	kmem_free(node, sizeof (*node));
180 }
181 
182 /*
183  * Frees the portion of the radix tree rooted at the given node.
184  */
185 static void
186 vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
187 {
188 	ASSERT(node != NULL);
189 
190 	for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
191 	    child != NULL;
192 	    child = next) {
193 		next = child->vgn_siblings;
194 		vmm_gpt_node_tree_free(child);
195 	}
196 	vmm_gpt_node_free(node);
197 }
198 
199 /*
200  * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
201  * it refers to.
202  */
203 void
204 vmm_gpt_free(vmm_gpt_t *gpt)
205 {
206 	vmm_gpt_node_tree_free(gpt->vgpt_root);
207 	kmem_free(gpt, sizeof (*gpt));
208 }
209 
210 /*
211  * Return the index in the paging structure for the given level.
212  */
213 static inline uint16_t
214 vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
215 {
216 	const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
217 	const uint_t MASK = (1U << 9) - 1;
218 	ASSERT(level < MAX_GPT_LEVEL);
219 	return ((gpa >> SHIFTS[level]) & MASK);
220 }
221 
222 /*
223  * Finds the child for the given GPA in the given parent node.
224  * Returns a pointer to node, or NULL if it is not found.
225  */
226 static vmm_gpt_node_t *
227 vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
228 {
229 	if (parent == NULL)
230 		return (NULL);
231 
232 	const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
233 	for (vmm_gpt_node_t *child = parent->vgn_children;
234 	    child != NULL && child->vgn_index <= index;
235 	    child = child->vgn_siblings) {
236 		if (child->vgn_index == index)
237 			return (child);
238 	}
239 
240 	return (NULL);
241 }
242 
243 /*
244  * Walks the GPT for the given GPA, accumulating entries to the given depth.  If
245  * the walk terminates before the depth is reached, the remaining entries are
246  * written with NULLs.
247  */
248 void
249 vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
250     enum vmm_gpt_node_level depth)
251 {
252 	uint64_t *current_entries, entry;
253 	pfn_t pfn;
254 
255 	ASSERT(gpt != NULL);
256 	current_entries = gpt->vgpt_root->vgn_entries;
257 	for (uint_t i = 0; i < depth; i++) {
258 		if (current_entries == NULL) {
259 			entries[i] = NULL;
260 			continue;
261 		}
262 		entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
263 		entry = *entries[i];
264 		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
265 			current_entries = NULL;
266 			continue;
267 		}
268 		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
269 		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
270 	}
271 }
272 
273 /*
274  * Looks up an entry given GPA.
275  */
276 uint64_t *
277 vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
278 {
279 	uint64_t *entries[MAX_GPT_LEVEL];
280 
281 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
282 
283 	return (entries[LEVEL1]);
284 }
285 
286 /*
287  * Adds a node for the given GPA to the GPT as a child of the given parent.
288  */
289 static void
290 vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child,
291     uint64_t gpa)
292 {
293 	vmm_gpt_node_t **prevp;
294 	vmm_gpt_node_t *node;
295 	uint64_t *parent_entries, entry;
296 
297 	ASSERT(gpt != NULL);
298 	ASSERT(gpt->vgpt_pte_ops != NULL);
299 	ASSERT(parent != NULL);
300 	ASSERT(child != NULL);
301 	ASSERT3U(parent->vgn_level, <, LEVEL1);
302 
303 	const uint64_t gpa_mask[3] = {
304 		[LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */
305 		[LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */
306 		[LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */
307 	};
308 	const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
309 	child->vgn_index = index;
310 	child->vgn_level = parent->vgn_level + 1;
311 	child->vgn_parent = parent;
312 	child->vgn_gpa = gpa & gpa_mask[parent->vgn_level];
313 	parent_entries = parent->vgn_entries;
314 	entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
315 	parent_entries[index] = entry;
316 
317 	for (prevp = &parent->vgn_children, node = parent->vgn_children;
318 	    node != NULL;
319 	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
320 		if (node->vgn_index > child->vgn_index) {
321 			break;
322 		}
323 	}
324 	if (node != NULL)
325 		ASSERT3U(node->vgn_index, !=, child->vgn_index);
326 	child->vgn_siblings = node;
327 	*prevp = child;
328 	parent->vgn_ref_cnt++;
329 }
330 
331 /*
332  * Populate the GPT with nodes so that a entries for the given GPA exist.  Note
333  * that this does not actually map the entry, but simply ensures that the
334  * entries exist.
335  */
336 static void
337 vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
338 {
339 	vmm_gpt_node_t *node, *child;
340 
341 	ASSERT(gpt != NULL);
342 	ASSERT0(gpa & PAGEOFFSET);
343 
344 	node = gpt->vgpt_root;
345 	for (uint_t i = 0; i < LEVEL1; i++) {
346 		ASSERT(node != NULL);
347 		child = vmm_gpt_node_find_child(node, gpa);
348 		if (child == NULL) {
349 			child = vmm_gpt_node_alloc();
350 			ASSERT(child != NULL);
351 			vmm_gpt_add_child(gpt, node, child, gpa);
352 		}
353 		node = child;
354 	}
355 
356 	/*
357 	 * Bump the reference count for this leaf for the PTE that is now usable
358 	 * by the mapping covering its GPA.
359 	 */
360 	ASSERT3U(node->vgn_level, ==, LEVEL1);
361 	ASSERT3U(node->vgn_ref_cnt, <, 512);
362 	node->vgn_ref_cnt++;
363 }
364 
365 /*
366  * Ensures that PTEs for the region of address space bounded by
367  * [start, end) exist in the tree.
368  */
369 void
370 vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
371 {
372 	ASSERT0(start & PAGEOFFSET);
373 	ASSERT0(end & PAGEOFFSET);
374 
375 	for (uint64_t page = start; page < end; page += PAGESIZE) {
376 		vmm_gpt_populate_entry(gpt, page);
377 	}
378 }
379 
380 /*
381  * Format a PTE and install it in the provided PTE-pointer.
382  */
383 bool
384 vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
385     uint8_t attr)
386 {
387 	uint64_t entry, old_entry;
388 
389 	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
390 	old_entry = atomic_cas_64(ptep, 0, entry);
391 	if (old_entry != 0) {
392 		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
393 		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
394 		return (false);
395 	}
396 
397 	return (true);
398 }
399 
400 /*
401  * Inserts an entry for a given GPA into the table.  The caller must
402  * ensure that a conflicting PFN is not mapped at the requested location.
403  * Racing operations to map the same PFN at one location is acceptable and
404  * properly handled.
405  */
406 bool
407 vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
408 {
409 	uint64_t *entries[MAX_GPT_LEVEL];
410 
411 	ASSERT(gpt != NULL);
412 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
413 	ASSERT(entries[LEVEL1] != NULL);
414 
415 	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
416 }
417 
418 /*
419  * Removes a child node from its parent's list of children, and then frees
420  * the now-orphaned child.
421  */
422 static void
423 vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
424 {
425 	ASSERT(parent != NULL);
426 
427 	ASSERT3P(child->vgn_children, ==, NULL);
428 	vmm_gpt_node_t **prevp = &parent->vgn_children;
429 	for (vmm_gpt_node_t *node = parent->vgn_children;
430 	    node != NULL;
431 	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
432 		if (node == child) {
433 			*prevp = node->vgn_siblings;
434 			vmm_gpt_node_free(node);
435 			break;
436 		}
437 	}
438 }
439 
440 /*
441  * Cleans up unused inner nodes in the GPT.  Asserts that the leaf corresponding
442  * to the entry does not map any additional pages.
443  */
444 static void
445 vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
446 {
447 	vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;
448 
449 	node = gpt->vgpt_root;
450 	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
451 		nodes[i] = node;
452 		node = vmm_gpt_node_find_child(node, gpa);
453 	}
454 	for (uint_t i = LEVEL1; i > 0; i--) {
455 		node = nodes[i];
456 
457 		if (node == NULL)
458 			continue;
459 
460 		if (i == LEVEL1) {
461 			ASSERT0(node->vgn_entries[vmm_gpt_node_index(gpa, i)]);
462 			ASSERT3U(node->vgn_ref_cnt, !=, 0);
463 
464 			/*
465 			 * Just as vmm_gpt_populate_entry() increments the
466 			 * reference count for leaf PTEs which become usable,
467 			 * here we decrement it as they become unusable as the
468 			 * mapping covering its GPA is removed.
469 			 */
470 			node->vgn_ref_cnt--;
471 		}
472 
473 		if (node->vgn_ref_cnt != 0)
474 			break;
475 		vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
476 	}
477 }
478 
479 /*
480  * Cleans up the unused inner nodes in the GPT for a region of guest physical
481  * address space of [start, end).  The region must map no pages.
482  */
483 void
484 vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
485 {
486 	ASSERT0(start & PAGEOFFSET);
487 	ASSERT0(end & PAGEOFFSET);
488 
489 	for (uint64_t page = start; page < end; page += PAGESIZE) {
490 		vmm_gpt_vacate_entry(gpt, page);
491 	}
492 }
493 
494 /*
495  * Remove a mapping from the table.  Returns false if the page was not mapped,
496  * otherwise returns true.
497  */
498 bool
499 vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
500 {
501 	uint64_t *entries[MAX_GPT_LEVEL], entry;
502 
503 	ASSERT(gpt != NULL);
504 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
505 	if (entries[LEVEL1] == NULL)
506 		return (false);
507 
508 	entry = *entries[LEVEL1];
509 	*entries[LEVEL1] = 0;
510 	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
511 }
512 
513 /*
514  * Un-maps the region of guest physical address space bounded by [start..end).
515  * Returns the number of pages that are unmapped.
516  */
517 size_t
518 vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
519 {
520 	ASSERT0(start & PAGEOFFSET);
521 	ASSERT0(end & PAGEOFFSET);
522 
523 	size_t num_unmapped = 0;
524 	for (uint64_t page = start; page < end; page += PAGESIZE) {
525 		if (vmm_gpt_unmap(gpt, page) != 0) {
526 			num_unmapped++;
527 		}
528 	}
529 
530 	return (num_unmapped);
531 }
532 
533 /*
534  * Returns a value indicating whether or not this GPT maps the given
535  * GPA.  If the GPA is mapped, *protp will be filled with the protection
536  * bits of the entry.  Otherwise, it will be ignored.
537  */
538 bool
539 vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
540 {
541 	uint64_t entry;
542 
543 	if (ptep == NULL) {
544 		return (false);
545 	}
546 	entry = *ptep;
547 	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
548 		return (false);
549 	}
550 	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
551 	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
552 	return (true);
553 }
554 
555 /*
556  * Resets the accessed bit on the page table entry pointed to be `entry`.
557  * If `on` is true, the bit will be set, otherwise it will be cleared.
558  * The old value of the bit is returned.
559  */
560 uint_t
561 vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
562 {
563 	ASSERT(entry != NULL);
564 	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
565 }
566 
567 /*
568  * Resets the dirty bit on the page table entry pointed to be `entry`.
569  * If `on` is true, the bit will be set, otherwise it will be cleared.
570  * The old value of the bit is returned.
571  */
572 uint_t
573 vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
574 {
575 	ASSERT(entry != NULL);
576 	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
577 }
578 
579 /*
580  * Get properly formatted PML4 (EPTP/nCR3) for GPT.
581  */
582 uint64_t
583 vmm_gpt_get_pmtp(vmm_gpt_t *gpt)
584 {
585 	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(gpt->vgpt_root->vgn_host_pfn));
586 }
587