/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space. But the two use different
 * and mutually incompatible page table formats: Intel uses the EPT, which is
 * based on the Itanium page table format, while AMD uses the nPT, which is
 * based on the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix
 * tree. It is parameterized with operations on PTEs that are specific to the
 * table type (EPT or nPT) and keeps a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its rightward siblings. The node understands its
 * position in the tree in terms of the level it appears at and the index it
 * occupies in its parent's table, as well as how many children it has. It
 * also owns the physical memory page for the hardware page table entries that
 * map its children. Thus, for a node at any given level in the tree, the
 * nested PTE for that node's child at index $i$ is the i'th uint64_t in that
 * node's entry page, and the entry page is part of the paging structure
 * consumed by hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions. Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root
 * to a leaf entry, populating an array of pointers to PTEs. This walk uses
 * the hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor. The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform. The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
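
/*
 * As an illustration of the interface described above, a consumer (such as
 * the vmspace code) would typically drive the GPT in the following order.
 * This is only a sketch: the PTE ops vector, protection, and attribute values
 * are placeholders supplied by the platform-specific and calling code.
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc(pte_ops);
 *
 *	vmm_gpt_populate_region(gpt, start, end);
 *	(void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *	...
 *	(void) vmm_gpt_unmap(gpt, gpa);
 *	vmm_gpt_vacate_region(gpt, start, end);
 *
 *	vmm_gpt_free(gpt);
 */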

/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * rightward siblings. Interior nodes also maintain a reference count, and
 * each node contains its level and index in its parent's table. Finally,
 * each node contains the host PFN of the page that it links into the page
 * table, as well as a kernel pointer to that page of table entries.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace. This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_siblings;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
	uint64_t	_vgn_pad;
};
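
/*
 * The fields above sum to exactly 64 bytes (8 + 2 + 2 + 4 + 4 pointers of 8
 * bytes each + 8 + 8), matching the cache-line claim in the comment. If
 * desired, that claim could be enforced at build time with a compile-time
 * assertion along the lines of the following (assuming the CTASSERT macro
 * from <sys/debug.h> is available here):
 *
 *	CTASSERT(sizeof (vmm_gpt_node_t) == 64);
 */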

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD). It contains
 * a pointer to the root node in the table, and is parameterized with a set of
 * PTE operations specific to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * VMM Guest Page Tables
 */

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees the given node, clearing its entry in its parent's page table and
 * adjusting the parent's reference count before releasing the node's memory.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	if (node->vgn_parent != NULL) {
		uint64_t *parent_entries = node->vgn_parent->vgn_entries;
		parent_entries[node->vgn_index] = 0;
		node->vgn_parent->vgn_ref_cnt--;
	}
	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees the portion of the radix tree rooted at the given node.
 */
static void
vmm_gpt_node_tree_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);

	for (vmm_gpt_node_t *child = node->vgn_children, *next = NULL;
	    child != NULL;
	    child = next) {
		next = child->vgn_siblings;
		vmm_gpt_node_tree_free(child);
	}
	vmm_gpt_node_free(node);
}

/*
 * Cleans up a vmm_gpt_t by removing any lingering vmm_gpt_node_t entries
 * it refers to.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	vmm_gpt_node_tree_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Returns the index into the paging structure at the given level for the
 * given GPA.
 */
static inline uint16_t
vmm_gpt_node_index(uint64_t gpa, enum vmm_gpt_node_level level)
{
	const int SHIFTS[MAX_GPT_LEVEL] = { 39, 30, 21, 12 };
	const uint_t MASK = (1U << 9) - 1;
	ASSERT(level < MAX_GPT_LEVEL);
	return ((gpa >> SHIFTS[level]) & MASK);
}

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to the child node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	if (parent == NULL)
		return (NULL);

	const uint16_t index = vmm_gpt_node_index(gpa, parent->vgn_level);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_siblings) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.
 * If the walk terminates before the depth is reached, the remaining entries
 * are written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    enum vmm_gpt_node_level depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_node_index(gpa, i)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

/*
 * Looks up the leaf entry for the given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}
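
/*
 * To illustrate the index computation performed by vmm_gpt_node_index() and
 * used by vmm_gpt_walk() above, consider (as a made-up example) the GPA
 * 0x8080604000:
 *
 *	(0x8080604000 >> 39) & 0x1ff == 1	LEVEL4 index
 *	(0x8080604000 >> 30) & 0x1ff == 2	LEVEL3 index
 *	(0x8080604000 >> 21) & 0x1ff == 3	LEVEL2 index
 *	(0x8080604000 >> 12) & 0x1ff == 4	LEVEL1 index
 *
 * A full-depth walk for that GPA thus visits entry 1 in the root node's
 * entry page, entry 2 in that child's page, entry 3 below it, and finally
 * the leaf PTE at entry 4.
 */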

/*
 * Adds a node for the given GPA to the GPT as a child of the given parent.
 */
static void
vmm_gpt_add_child(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa)
{
	vmm_gpt_node_t **prevp;
	vmm_gpt_node_t *node;
	uint64_t *parent_entries, entry;

	ASSERT(gpt != NULL);
	ASSERT(gpt->vgpt_pte_ops != NULL);
	ASSERT(parent != NULL);
	ASSERT(child != NULL);
	ASSERT3U(parent->vgn_level, <, LEVEL1);

	const uint64_t gpa_mask[3] = {
		[LEVEL4] = 0xffffff8000000000ul, /* entries cover 512G */
		[LEVEL3] = 0xffffffffc0000000ul, /* entries cover 1G */
		[LEVEL2] = 0xffffffffffe00000ul, /* entries cover 2M */
	};
	const int index = vmm_gpt_node_index(gpa, parent->vgn_level);
	child->vgn_index = index;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_parent = parent;
	child->vgn_gpa = gpa & gpa_mask[parent->vgn_level];
	parent_entries = parent->vgn_entries;
	entry = gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent_entries[index] = entry;

	for (prevp = &parent->vgn_children, node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node->vgn_index > child->vgn_index) {
			break;
		}
	}
	if (node != NULL)
		ASSERT3U(node->vgn_index, !=, child->vgn_index);
	child->vgn_siblings = node;
	*prevp = child;
	parent->vgn_ref_cnt++;
}

/*
 * Populates the GPT with nodes so that entries for the given GPA exist. Note
 * that this does not actually map the entry, but simply ensures that the
 * entries exist.
 */
static void
vmm_gpt_populate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *node, *child;

	ASSERT(gpt != NULL);
	ASSERT0(gpa & PAGEOFFSET);

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < LEVEL1; i++) {
		ASSERT(node != NULL);
		child = vmm_gpt_node_find_child(node, gpa);
		if (child == NULL) {
			child = vmm_gpt_node_alloc();
			ASSERT(child != NULL);
			vmm_gpt_add_child(gpt, node, child, gpa);
		}
		node = child;
	}

	/*
	 * Bump the reference count for this leaf for the PTE that is now
	 * usable by the mapping covering its GPA.
	 */
	ASSERT3U(node->vgn_level, ==, LEVEL1);
	ASSERT3U(node->vgn_ref_cnt, <, 512);
	node->vgn_ref_cnt++;
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [start, end) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_populate_entry(gpt, page);
	}
}

/*
 * Formats a PTE and installs it in the provided PTE-pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}
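
/*
 * For illustration: because installation is an atomic compare-and-swap
 * against an empty entry, two threads racing to install the same PFN at the
 * same GPA are benign. Exactly one CAS succeeds, and the loser returns false
 * after asserting that the already-present entry maps the same PFN. A
 * hypothetical caller could thus treat a false return as "already mapped by a
 * racing thread":
 *
 *	if (!vmm_gpt_map_at(gpt, ptep, pfn, prot, attr)) {
 *		... lost the race; an equivalent mapping is already in place ...
 *	}
 */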

/*
 * Inserts an entry for a given GPA into the table. The caller must
 * ensure that a conflicting PFN is not mapped at the requested location.
 * Racing operations to map the same PFN at one location are acceptable and
 * properly handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

/*
 * Removes a child node from its parent's list of children, and then frees
 * the now-orphaned child.
 */
static void
vmm_gpt_node_remove_child(vmm_gpt_node_t *parent, vmm_gpt_node_t *child)
{
	ASSERT(parent != NULL);

	ASSERT3P(child->vgn_children, ==, NULL);
	vmm_gpt_node_t **prevp = &parent->vgn_children;
	for (vmm_gpt_node_t *node = parent->vgn_children;
	    node != NULL;
	    prevp = &node->vgn_siblings, node = node->vgn_siblings) {
		if (node == child) {
			*prevp = node->vgn_siblings;
			vmm_gpt_node_free(node);
			break;
		}
	}
}

/*
 * Cleans up unused inner nodes in the GPT. Asserts that the leaf
 * corresponding to the entry does not map any additional pages.
 */
static void
vmm_gpt_vacate_entry(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_node_t *nodes[MAX_GPT_LEVEL], *node;

	node = gpt->vgpt_root;
	for (uint_t i = 0; i < MAX_GPT_LEVEL; i++) {
		nodes[i] = node;
		node = vmm_gpt_node_find_child(node, gpa);
	}
	for (uint_t i = LEVEL1; i > 0; i--) {
		node = nodes[i];

		if (node == NULL)
			continue;

		if (i == LEVEL1) {
			ASSERT0(node->vgn_entries[vmm_gpt_node_index(gpa, i)]);
			ASSERT3U(node->vgn_ref_cnt, !=, 0);

			/*
			 * Just as vmm_gpt_populate_entry() increments the
			 * reference count for leaf PTEs which become usable,
			 * here we decrement it as those PTEs become unusable
			 * when the mapping covering their GPA is removed.
			 */
			node->vgn_ref_cnt--;
		}

		if (node->vgn_ref_cnt != 0)
			break;
		vmm_gpt_node_remove_child(nodes[i - 1], nodes[i]);
	}
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space bounded by [start, end). The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	for (uint64_t page = start; page < end; page += PAGESIZE) {
		vmm_gpt_vacate_entry(gpt, page);
	}
}

/*
 * Removes a mapping from the table. Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [start, end). Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t start, uint64_t end)
{
	ASSERT0(start & PAGEOFFSET);
	ASSERT0(end & PAGEOFFSET);

	size_t num_unmapped = 0;
	for (uint64_t page = start; page < end; page += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, page) != 0) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}
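
/*
 * Note that teardown mirrors setup: as described in the block comment at the
 * top of this file, pages must be unmapped before the region backing them is
 * vacated. A minimal sketch of removing a region entirely:
 *
 *	(void) vmm_gpt_unmap_region(gpt, start, end);
 *	vmm_gpt_vacate_region(gpt, start, end);
 */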

/*
 * Returns a value indicating whether or not the PTE pointed to by `ptep` maps
 * a present page. If it does, *pfnp and *protp will be filled with the PFN
 * and protection bits of the entry. Otherwise, they are ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}

/*
 * Gets the properly formatted PML4 address (EPTP/nCR3) for the GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt)
{
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(gpt->vgpt_root->vgn_host_pfn));
}
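
/*
 * For illustration, the accessed/dirty primitives above could be combined
 * with vmm_gpt_lookup() to build a hypothetical dirty-page scan over
 * [start, end); this is only a sketch, with bookkeeping of the dirty GPAs
 * left to the caller:
 *
 *	for (uint64_t gpa = start; gpa < end; gpa += PAGESIZE) {
 *		uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *
 *		if (ptep == NULL)
 *			continue;
 *		if (vmm_gpt_reset_dirty(gpt, ptep, false) != 0) {
 *			... record gpa as dirty ...
 *		}
 *	}
 */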