xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_gpt.c (revision b8052df9f609edb713f6828c9eecc3d7be19dfb3)
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space.  But the two use different
 * and mutually incompatible page table formats: Intel uses the EPT, which is
 * based on the Itanium page table format, while AMD uses the nPT, which is
 * based on the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and keeps a pointer to the root node of the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its rightward siblings.  The node understands its
 * position in the tree in terms of the level it appears at and the index it
 * occupies in its parent's table, as well as how many children it has.  It
 * also owns the physical memory page for the hardware page table entries that
 * map its children.  Thus, for a node at any given level in the tree, the
 * nested PTE for that node's child at index $i$ is the i'th uint64_t in that
 * node's entry page, and the entry page is part of the paging structure
 * consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions.  Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs.  This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor.  The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform.  The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */

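/*
 * As an illustrative sketch (not part of the original source), a consumer of
 * this interface typically drives it in the following order; `gpt', `gpa',
 * `pfn', `prot', and `attr' stand in for caller-supplied values:
 *
 *	vmm_gpt_populate_region(gpt, gpa, gpa + PAGESIZE);
 *	(void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *	...
 *	(void) vmm_gpt_unmap(gpt, gpa);
 *	vmm_gpt_vacate_region(gpt, gpa, gpa + PAGESIZE);
 */
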
/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * rightward siblings.  Interior nodes also maintain a reference count, and
 * each node contains its level and index in its parent's table.  Finally,
 * each node contains the host PFN of the page that it links into the page
 * table, as well as a kernel pointer to that table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace.  This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note that this structure is carefully sized to fit exactly into a 64-byte
 * cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_siblings;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
	uint64_t	_vgn_pad;
};

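/*
 * A compile-time check along the following lines (an illustrative sketch, not
 * present in the original source; CTASSERT comes from <sys/debug.h>) would
 * encode the cache-line sizing assumption documented above:
 *
 *	CTASSERT(sizeof (vmm_gpt_node_t) == 64);
 */
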
/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD).  It
 * contains a pointer to the root node of the tree and is parameterized
 * with a set of PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * VMM Guest Page Tables
 */

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees the given node, clearing its entry in its parent's page table and
 * decrementing the parent's reference count.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	if (node->vgn_parent != NULL) {
		uint64_t *parent_entries = node->vgn_parent->vgn_entries;
		parent_entries[node->vgn_index] = 0;
		node->vgn_parent->vgn_ref_cnt--;
	}
	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees the portion of the radix tree rooted at the given node.
 */
static void
vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);

	for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
	    child != NULL;
	    child = next) {
		next = child->vgn_siblings;
		vmm_gpt_node_tree_free(child);
	}
	vmm_gpt_node_free(node);
}

/*
 * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
 * it refers to.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	vmm_gpt_node_tree_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Return the index into the paging structure at the given level for the
 * given GPA.
 */
static inline uint16_t
vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
{
	const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
	const uint_t MASK = (1U << 9) - 1;
	ASSERT(level < MAX_GPT_LEVEL);
	return ((gpa >> SHIFTS[level]) & MASK);
}

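/*
 * As a worked illustration (not part of the original source), a GPA of
 * 0x40201000 decomposes into the following per-level indices:
 *
 *	vmm_gpt_node_index(0x40201000, LEVEL4) == 0	(bits 47:39, 512G slot)
 *	vmm_gpt_node_index(0x40201000, LEVEL3) == 1	(bits 38:30, 1G slot)
 *	vmm_gpt_node_index(0x40201000, LEVEL2) == 1	(bits 29:21, 2M slot)
 *	vmm_gpt_node_index(0x40201000, LEVEL1) == 1	(bits 20:12, 4K page)
 */
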
/*
 * Finds the child node for the given GPA in the given parent node.
 * Returns a pointer to the node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	if (parent == NULL)
		return (NULL);

	const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_siblings) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.  If
 * the walk terminates before the depth is reached, the remaining entries are
 * written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    enum vmm_gpt_node_level depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

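/*
 * An illustrative sketch (not part of the original source) of a full-depth
 * walk.  The caller must check the leaf pointer before dereferencing it,
 * since the walk writes NULL for levels below a non-present entry:
 *
 *	uint64_t *entries[MAX_GPT_LEVEL];
 *
 *	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
 *	if (entries[LEVEL1] != NULL) {
 *		uint64_t leaf_pte = *entries[LEVEL1];
 *		...
 *	}
 */
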
/*
 * Looks up the leaf entry for the given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}

/*
 * Adds a node for the given GPA to the GPT as a child of the given parent.
 */
static void
vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child,
    uint64_t gpa)
{
	vmm_gpt_node_t **prevp;
	vmm_gpt_node_t *node;
	uint64_t *parent_entries, entry;

	ASSERT(gpt != NULL);
	ASSERT(gpt->vgpt_pte_ops != NULL);
	ASSERT(parent != NULL);
	ASSERT(child != NULL);
	ASSERT3U(parent->vgn_level, <, LEVEL1);

	const uint64_t gpa_mask[3] = {
		[LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */
		[LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */
		[LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */
	};
	const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
	child->vgn_index = index;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_parent = parent;
	child->vgn_gpa = gpa & gpa_mask[parent->vgn_level];
	parent_entries = parent->vgn_entries;
	entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent_entries[index] = entry;

	for (prevp = &parent->vgn_children, node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node->vgn_index > child->vgn_index) {
			break;
		}
	}
	if (node != NULL)
		ASSERT3U(node->vgn_index, !=, child->vgn_index);
	child->vgn_siblings = node;
	*prevp = child;
	parent->vgn_ref_cnt++;
}

/*
 * Populate the GPT with nodes so that the entries for the given GPA exist.
 * Note that this does not actually map the entry, but simply ensures that the
 * entries exist.
 */
static void
vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *node, *child;

	ASSERT(gpt != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < LEVEL1; i++) {
		ASSERT(node != NULL);
		child = vmm_gpt_node_find_child(node, gpa);
		if (child == NULL) {
			child = vmm_gpt_node_alloc();
			ASSERT(child != NULL);
			vmm_gpt_add_child(gpt, node, child, gpa);
		}
		node = child;
	}

	/*
	 * Bump the reference count on this leaf for the PTE that is now usable
	 * by the mapping covering its GPA.
	 */
	ASSERT3U(node->vgn_level, ==, LEVEL1);
	ASSERT3U(node->vgn_ref_cnt, <, 512);
	node->vgn_ref_cnt++;
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [start, end) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_populate_entry(gpt, page);
	}
}

/*
 * Format a PTE and install it in the provided PTE-pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}

/*
 * Inserts an entry for a given GPA into the table.  The caller must
 * ensure that a conflicting PFN is not mapped at the requested location.
 * Racing operations to map the same PFN at one location are acceptable and
 * properly handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

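/*
 * An illustrative sketch (not part of the original source) of the racing
 * behavior described above, where two threads handling a fault on the same
 * GPA both attempt to install the same PFN:
 *
 *	bool installed = vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *
 * A return value of false means the caller lost the race: an identical
 * mapping was already installed, and vmm_gpt_map_at() asserts that the PFNs
 * match in that case.
 */
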
/*
 * Removes a child node from its parent's list of children, and then frees
 * the now-orphaned child.
 */
static void
vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
{
	ASSERT(parent != NULL);

	ASSERT3P(child->vgn_children, ==, NULL);
	vmm_gpt_node_t **prevp = &parent->vgn_children;
	for (vmm_gpt_node_t *node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node == child) {
			*prevp = node->vgn_siblings;
			vmm_gpt_node_free(node);
			break;
		}
	}
}

/*
 * Cleans up unused inner nodes in the GPT.  Asserts that the leaf corresponding
 * to the entry does not map any additional pages.
 */
static void
vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
		nodes[i] = node;
		node = vmm_gpt_node_find_child(node, gpa);
	}
	for (uint_t i = LEVEL1; i > 0; i--) {
		node = nodes[i];

		if (node == NULL)
			continue;

		if (i == LEVEL1) {
			ASSERT0(node->vgn_entries[vmm_gpt_node_index(gpa, i)]);
			ASSERT3U(node->vgn_ref_cnt, !=, 0);

			/*
			 * Just as vmm_gpt_populate_entry() increments the
			 * reference count for leaf PTEs which become usable,
			 * here we decrement it as they become unusable when
			 * the mapping covering their GPA is removed.
			 */
			node->vgn_ref_cnt--;
		}

		if (node->vgn_ref_cnt != 0)
			break;
		vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
	}
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [start, end).  The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_vacate_entry(gpt, page);
	}
}

/*
 * Remove a mapping from the table.  Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}

/*
 * Unmaps the region of guest physical address space bounded by [start, end).
 * Returns the number of pages that were unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	size_t num_unmapped = 0;
	for (uint64_t page = start; page < end; page += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, page) != 0) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}

/*
 * Returns a value indicating whether or not the given PTE (as found via
 * vmm_gpt_lookup() or vmm_gpt_walk()) maps a present page.  If it does, *pfnp
 * and *protp will be filled with the PFN and protection bits of the entry.
 * Otherwise, they will be ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}

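/*
 * An illustrative sketch (not part of the original source) of translating a
 * GPA to a host PFN using the lookup and query primitives above:
 *
 *	uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *	pfn_t pfn;
 *	uint_t prot;
 *
 *	if (vmm_gpt_is_mapped(gpt, ptep, &pfn, &prot))
 *		... the GPA is backed by host page `pfn' with `prot' ...
 */
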
/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}

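/*
 * An illustrative sketch (not part of the original source) of how a dirty
 * page logging pass might use these primitives, clearing the dirty bit on a
 * mapped GPA and recording whether the guest wrote to it since the last pass:
 *
 *	uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *
 *	if (ptep != NULL && vmm_gpt_reset_dirty(gpt, ptep, false) != 0)
 *		... the page at `gpa' was written since the previous pass ...
 */
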
/*
 * Get the properly formatted PML4 table pointer (EPTP/nCR3) for this GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt)
{
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(gpt->vgpt_root->vgn_host_pfn));
}