/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space.  But the two use different
 * and mutually incompatible page table formats: Intel uses the EPT, which is
 * based on the Itanium page table format, while AMD uses the nPT, which is
 * based on the x86_64 page table format.
 *
 * The GPT abstracts these format differences and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix
 * tree.  It is parameterized with operations on PTEs that are specific to the
 * table type (EPT or nPT) and keeps a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its rightward siblings.  The node understands its
 * position in the tree in terms of the level it appears at and the index it
 * occupies in its parent's table, as well as how many children it has.  It
 * also owns the physical memory page for the hardware page table entries that
 * map its children.  Thus, for a node at any given level in the tree, the
 * nested PTE for that node's child at index i is the i'th uint64_t in that
 * node's entry page, and the entry page is part of the paging structure
 * consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions.  Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root
 * to a leaf entry, populating an array of pointers to PTEs.  This walk uses
 * the hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor.  The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform.  The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
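
/*
 * As an illustration of the intended calling pattern (a sketch, not a
 * definitive recipe -- gpa, pfn, prot, attr, region_start, and region_end
 * below are placeholders), a consumer would use the interface roughly as
 * follows:
 *
 *	vmm_gpt_populate_region(gpt, region_start, region_end);
 *	(void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *	...
 *	(void) vmm_gpt_unmap(gpt, gpa);
 *	vmm_gpt_vacate_region(gpt, region_start, region_end);
 *
 * where `gpa` lies within the populated region.  Regions are populated
 * before any pages are mapped into them, and all pages are unmapped before
 * the region is vacated.
 */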

/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * rightward siblings.  Interior nodes also maintain a reference count, and
 * each node contains its level and index in its parent's table.  Finally,
 * each node contains the host PFN of the page that it links into the page
 * table, as well as a kernel pointer to that table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace.  This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_siblings;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
	uint64_t	_vgn_pad;
};

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD).  It
 * contains a pointer to the root node in the tree and is parameterized with
 * a set of PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};
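
/*
 * For orientation, the PTE operations vector used to parameterize the GPT
 * looks roughly like the sketch below, inferred from the call sites in this
 * file; the authoritative definition lives in sys/vmm_gpt.h, and the EPT- and
 * nPT-specific backends supply the actual function pointers.
 *
 *	struct vmm_pte_ops {
 *		uint64_t	(*vpeo_map_table)(pfn_t);
 *		uint64_t	(*vpeo_map_page)(pfn_t, uint_t, uint8_t);
 *		pfn_t		(*vpeo_pte_pfn)(uint64_t);
 *		bool		(*vpeo_pte_is_present)(uint64_t);
 *		uint_t		(*vpeo_pte_prot)(uint64_t);
 *		uint_t		(*vpeo_reset_accessed)(uint64_t *, bool);
 *		uint_t		(*vpeo_reset_dirty)(uint64_t *, bool);
 *		uint64_t	(*vpeo_get_pmtp)(pfn_t, bool);
 *	};
 */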

/*
 * VMM Guest Page Tables
 */

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees the given node, clearing its entry in its parent's page table and
 * adjusting the parent's reference count accordingly.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	if (node->vgn_parent != NULL) {
		uint64_t *parent_entries = node->vgn_parent->vgn_entries;
		parent_entries[node->vgn_index] = 0;
		node->vgn_parent->vgn_ref_cnt--;
	}
	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees the portion of the radix tree rooted at the given node.
 */
static void
vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);

	for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
	    child != NULL;
	    child = next) {
		next = child->vgn_siblings;
		vmm_gpt_node_tree_free(child);
	}
	vmm_gpt_node_free(node);
}

/*
 * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
 * it refers to.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	vmm_gpt_node_tree_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Returns the index in the paging structure for the given level.
 */
static inline uint16_t
vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
{
	const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
	const uint_t MASK = (1U << 9) - 1;
	ASSERT(level < MAX_GPT_LEVEL);
	return ((gpa >> SHIFTS[level]) & MASK);
}

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to the node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	if (parent == NULL)
		return (NULL);

	const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_siblings) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.
 * If the walk terminates before the depth is reached, the remaining entries
 * are written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    enum vmm_gpt_node_level depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

/*
 * Looks up the leaf entry for the given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}
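
/*
 * To illustrate the walk, consider the (hypothetical) GPA 0x8040201000,
 * which has exactly one bit set in each level's index field:
 *
 *	vmm_gpt_node_index(0x8040201000, LEVEL4) == 1	(gpa >> 39, 512G slot)
 *	vmm_gpt_node_index(0x8040201000, LEVEL3) == 1	(gpa >> 30, 1G slot)
 *	vmm_gpt_node_index(0x8040201000, LEVEL2) == 1	(gpa >> 21, 2M slot)
 *	vmm_gpt_node_index(0x8040201000, LEVEL1) == 1	(gpa >> 12, 4K slot)
 *
 * vmm_gpt_walk() applies these indices level by level to each node's entry
 * page, and vmm_gpt_lookup() simply returns the LEVEL1 (leaf) entry from such
 * a walk.
 */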

/*
 * Adds a node for the given GPA to the GPT as a child of the given parent.
 */
static void
vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent, vmm_gpt_node_t *child,
    uint64_t gpa)
{
	vmm_gpt_node_t **prevp;
	vmm_gpt_node_t *node;
	uint64_t *parent_entries, entry;

	ASSERT(gpt != NULL);
	ASSERT(gpt->vgpt_pte_ops != NULL);
	ASSERT(parent != NULL);
	ASSERT(child != NULL);
	ASSERT3U(parent->vgn_level, <, LEVEL1);

	const uint64_t gpa_mask[3] = {
		[LEVEL4] = 0xffffff8000000000ul,	/* entries cover 512G */
		[LEVEL3] = 0xffffffffc0000000ul,	/* entries cover 1G */
		[LEVEL2] = 0xffffffffffe00000ul,	/* entries cover 2M */
	};
	const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
	child->vgn_index = index;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_parent = parent;
	child->vgn_gpa = gpa & gpa_mask[parent->vgn_level];
	parent_entries = parent->vgn_entries;
	entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent_entries[index] = entry;

	for (prevp = &parent->vgn_children, node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node->vgn_index > child->vgn_index) {
			break;
		}
	}
	if (node != NULL)
		ASSERT3U(node->vgn_index, !=, child->vgn_index);
	child->vgn_siblings = node;
	*prevp = child;
	parent->vgn_ref_cnt++;
}

/*
 * Populates the GPT with nodes so that the entries for the given GPA exist.
 * Note that this does not actually map the entry, but simply ensures that the
 * entries exist.
 */
static void
vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *node, *child;

	ASSERT(gpt != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < LEVEL1; i++) {
		ASSERT(node != NULL);
		child = vmm_gpt_node_find_child(node, gpa);
		if (child == NULL) {
			child = vmm_gpt_node_alloc();
			ASSERT(child != NULL);
			vmm_gpt_add_child(gpt, node, child, gpa);
		}
		node = child;
	}

	/*
	 * Bump the reference count on this leaf to account for the PTE which
	 * is now available to a mapping covering its GPA.
	 */
	ASSERT3U(node->vgn_level, ==, LEVEL1);
	ASSERT3U(node->vgn_ref_cnt, <, 512);
	node->vgn_ref_cnt++;
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [start, end) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_populate_entry(gpt, page);
	}
}

/*
 * Format a PTE and install it in the provided PTE-pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}
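
/*
 * For illustration, a caller that holds a populated region might install a
 * mapping like so (gpa, pfn, prot, and attr are placeholders):
 *
 *	uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *	bool installed = vmm_gpt_map_at(gpt, ptep, pfn, prot, attr);
 *
 * If two threads race to install the same PFN at the same entry, the
 * atomic_cas_64() above guarantees that exactly one succeeds (returning
 * true); the loser observes the already-installed entry, asserts that it
 * names the same PFN, and returns false.  Installing a conflicting PFN over
 * an existing mapping is a caller bug and trips that assertion.
 */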

/*
 * Inserts an entry for a given GPA into the table.  The caller must
 * ensure that a conflicting PFN is not mapped at the requested location.
 * Racing operations to map the same PFN at one location are acceptable and
 * properly handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

/*
 * Removes a child node from its parent's list of children, and then frees
 * the now-orphaned child.
 */
static void
vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
{
	ASSERT(parent != NULL);

	ASSERT3P(child->vgn_children, ==, NULL);
	vmm_gpt_node_t **prevp = &parent->vgn_children;
	for (vmm_gpt_node_t *node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node == child) {
			*prevp = node->vgn_siblings;
			vmm_gpt_node_free(node);
			break;
		}
	}
}

/*
 * Cleans up the unused inner nodes in the GPT for the given GPA.  Asserts
 * that the leaf entry for the GPA has already been unmapped.
 */
static void
vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
		nodes[i] = node;
		node = vmm_gpt_node_find_child(node, gpa);
	}
	for (uint_t i = LEVEL1; i > 0; i--) {
		node = nodes[i];

		if (node == NULL)
			continue;

		if (i == LEVEL1) {
			ASSERT0(node->vgn_entries[vmm_gpt_node_index(gpa, i)]);
			ASSERT3U(node->vgn_ref_cnt, !=, 0);

			/*
			 * Just as vmm_gpt_populate_entry() increments the
			 * reference count for leaf PTEs which become usable,
			 * here we decrement it as they become unusable when
			 * the mapping covering their GPA is removed.
			 */
			node->vgn_ref_cnt--;
		}

		if (node->vgn_ref_cnt != 0)
			break;
		vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
	}
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [start, end).  The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_vacate_entry(gpt, page);
	}
}

/*
 * Removes a mapping from the table.  Returns false if the page was not
 * mapped, otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [start, end).  Returns the number of pages that were unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	size_t num_unmapped = 0;
	for (uint64_t page = start; page < end; page += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, page)) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}
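
/*
 * As a sketch of what vacating does to the tree: when the last PTE covered by
 * a leaf is vacated, the leaf's reference count drops to zero and the node is
 * freed; this repeats bottom-up, so an interior node is freed only once all
 * of its children are gone.  The root is never freed here, which is why
 * vmm_gpt_vacate_entry() stops before the root level.  All pages in a region
 * must already be unmapped (see vmm_gpt_unmap_region()) before the region is
 * vacated, since vmm_gpt_vacate_entry() asserts that each leaf entry is
 * already clear.
 */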

/*
 * Returns a value indicating whether or not the entry pointed to by `ptep`
 * maps a page.  If it does, *pfnp and *protp will be filled with the PFN and
 * protection bits of the entry.  Otherwise, they are left untouched.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}

/*
 * Get a properly formatted PML4 pointer (EPTP/nCR3) for the GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}
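
/*
 * As an illustrative note (not part of this file's interface contract): the
 * value returned by vmm_gpt_get_pmtp() is what the VMM ultimately programs as
 * the nested paging root for the guest, i.e. the EPT pointer on Intel or nCR3
 * on AMD, and `track_dirty` requests a format in which hardware maintains the
 * accessed/dirty bits that vmm_gpt_reset_accessed() and vmm_gpt_reset_dirty()
 * later inspect and clear.
 */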