/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>
#include <sys/vmm_gpt_impl.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space. But the two use different and
 * mutually incompatible page table formats: Intel uses the EPT, which is based
 * on the Itanium page table format, while AMD uses the nPT, which is based on
 * the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and tracks the root node of the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings. The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has. It also owns the
 * physical memory page for the hardware page table entries that map its
 * children. Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index $i$ is the i'th uint64_t in that node's entry
 * page, and the entry page is part of the paging structure consumed by
 * hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions. Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs. This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor. The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform. The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
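
/*
 * For illustration only: a minimal sketch of the expected call sequence for a
 * consumer of this interface (not part of the driver proper), assuming `gpa`,
 * `len`, `pfn`, `prot`, and `attr` have been produced elsewhere:
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc();
 *	vmm_gpt_populate_region(gpt, gpa, len);
 *
 *	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *	(void) vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr);
 *
 *	(void) vmm_gpt_unmap_region(gpt, gpa, len);
 *	vmm_gpt_vacate_region(gpt, gpa, len);
 *	vmm_gpt_free(gpt);
 */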

/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * siblings. Interior nodes also maintain a reference count, and each node
 * contains its level and index in its parent's table. Finally, each node
 * contains the host PFN of the page that it links into the page table, as well
 * as a kernel pointer to that table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace. This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note: this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_sib_next;
	vmm_gpt_node_t	*vgn_sib_prev;
	vmm_gpt_entry_t	*vgn_entries;
	uint64_t	vgn_gpa;
};

/* Maximum node index determined by number of entries in page table (512) */
#define	PTE_PER_TABLE	512
#define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD). It holds a
 * pointer to the root node in the tree; the format-specific PTE operations are
 * supplied by the implementation installed via vmm_gpt_init().
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
};

void
vmm_gpt_impl_panic(void)
{
	/*
	 * Bail out if a caller makes it to an indirect stub before
	 * hot-patching has completed.
	 */
	panic("Indirect function not hot-patched");
}

/*
 * Function indirection stubs
 *
 * These are made valid (no longer jumping to vmm_gpt_impl_panic()) by
 * vmm_gpt_init(), and are only to be called after that has run successfully
 * during module initialization.
 */
uint64_t vmm_gpti_map_table(uint64_t);
uint64_t vmm_gpti_map_page(uint64_t, uint_t, uint8_t);
bool vmm_gpti_parse(uint64_t, pfn_t *, uint_t *);

const struct vmm_pte_impl vmm_pte_uninit_impl = {
	.vpi_map_table = (void *)vmm_gpt_impl_panic,
	.vpi_map_page = (void *)vmm_gpt_impl_panic,
	.vpi_pte_parse = (void *)vmm_gpt_impl_panic,
	.vpi_bit_accessed = 0,
	.vpi_bit_dirty = 0,

	.vpi_get_pmtp = (void *)vmm_gpt_impl_panic,
	.vpi_hw_ad_supported = (void *)vmm_gpt_impl_panic,
};

static const struct vmm_pte_impl *vmm_pte_impl = &vmm_pte_uninit_impl;

#define	JMP_NEAR_OPCODE	0xe9

struct jmp_instr {
	uint8_t	opcode;
	int32_t	off;
} __packed;
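
/*
 * Each stub above is expected to begin with a single 5-byte near jump: the
 * 0xe9 opcode followed by a signed 32-bit displacement that is relative to the
 * address of the *next* instruction (the jump's own address + 5). As an
 * illustrative example (addresses invented for the arithmetic): a stub at
 * 0xfffffffff7900000 redirected to an implementation at 0xfffffffff7901000
 * would carry a displacement of 0x1000 - 5 = 0xffb. The helpers below convert
 * between that displacement and an absolute target address.
 */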

/*
 * Given a pointer to an immediate-near-`jmp` instruction, calculate the
 * address of its target.
 */
static uintptr_t
jmp_off_to_addr(const struct jmp_instr *instp)
{
	const uintptr_t next_rip = (uintptr_t)&instp[1];
	const uintptr_t off = (uintptr_t)(intptr_t)instp->off;

	return (next_rip + off);
}

/*
 * Given a pointer to an immediate-near-`jmp` instruction, calculate the
 * immediate value for a given target address, if possible.
 *
 * Returns false if the target address is not within range of the `jmp`.
 * Stores the offset value in `resp` on success.
 */
static bool
jmp_addr_to_off(const struct jmp_instr *instp, void *target, int32_t *resp)
{
	const uintptr_t next_rip = (uintptr_t)&instp[1];
	const intptr_t off = (uintptr_t)target - next_rip;

	/* jump is not "near" */
	if (off < INT32_MIN || off > INT32_MAX) {
		return (false);
	}

	if (resp != NULL) {
		*resp = (int32_t)off;
	}
	return (true);
}

/*
 * Try to get the symbol name for a given kernel-virtual address.
 *
 * Returns a valid string pointer regardless of success or failure.
 */
static inline const char *
addr_to_sym(void *addr)
{
	ulong_t ignored_offset;
	const char *name = modgetsymname((uintptr_t)addr, &ignored_offset);

	return (name != NULL ? name : "<null>");
}

/*
 * Hot-patch frequently called PTE implementation functions.
 *
 * Since indirect function calls are relatively expensive in a post-Spectre
 * world, we choose to call into the backend via stubs which are hot-patched
 * during module initialization with the proper implementation. During module
 * unload, this routine is called again to re-patch the stubs back to their
 * uninitialized state of pointing to vmm_gpt_impl_panic().
 */
static bool
vmm_gpt_patch_indirection(const struct vmm_pte_impl *old_impl,
    const struct vmm_pte_impl *new_impl)
{
	struct indirection_patch {
		void (*patch_site)();
		void (*old_implf)();
		void (*new_implf)();
	};
	const struct indirection_patch patches[] = {
		{
			.patch_site = (void *)vmm_gpti_map_table,
			.old_implf = (void *)old_impl->vpi_map_table,
			.new_implf = (void *)new_impl->vpi_map_table,
		},
		{
			.patch_site = (void *)vmm_gpti_map_page,
			.old_implf = (void *)old_impl->vpi_map_page,
			.new_implf = (void *)new_impl->vpi_map_page,
		},
		{
			.patch_site = (void *)vmm_gpti_parse,
			.old_implf = (void *)old_impl->vpi_pte_parse,
			.new_implf = (void *)new_impl->vpi_pte_parse,
		},
	};

	/* Check the patches for validity first */
	for (uint_t i = 0; i < ARRAY_SIZE(patches); i++) {
		const struct indirection_patch *patch = &patches[i];
		const struct jmp_instr *instp = (void *)patch->patch_site;

		/* Expecting a near `jmp` */
		if (instp->opcode != JMP_NEAR_OPCODE) {
			cmn_err(CE_WARN, "vmm: non-JMP instruction found when "
			    "attempting to hotpatch %s",
			    addr_to_sym(patch->patch_site));
			return (false);
		}
		/* ... targeted at the expected function */
		const uintptr_t old_target = jmp_off_to_addr(instp);
		if (old_target != (uintptr_t)patch->old_implf) {
			cmn_err(CE_WARN, "vmm: JMP instr @ %s has unexpected "
			    "target %s != %s",
			    addr_to_sym(patch->patch_site),
			    addr_to_sym((void *)old_target),
			    addr_to_sym(patch->old_implf));
			return (false);
		}
		/*
		 * ... and which is close enough in the address space to the
		 * new intended target to be within range of the near `jmp`.
		 */
		if (!jmp_addr_to_off(instp, patch->new_implf, NULL)) {
			cmn_err(CE_WARN, "vmm: near-JMP to new target %s is "
			    "too far for site %s",
			    addr_to_sym(patch->new_implf),
			    addr_to_sym(patch->patch_site));
			return (false);
		}
	}

	for (uint_t i = 0; i < ARRAY_SIZE(patches); i++) {
		const struct indirection_patch *patch = &patches[i];
		struct jmp_instr *instp = (void *)patch->patch_site;

		int32_t new_off;
		VERIFY(jmp_addr_to_off(instp, patch->new_implf, &new_off));
		/*
		 * We do not meet the demand from hot_patch_kernel_text() that
		 * the to-be-patched data be aligned with the patch size. On
		 * x86_64, with the caveat that no other CPU(s) will be
		 * concurrently executing the patched instructions, we can
		 * break the rules without fear of disaster.
		 */
		hot_patch_kernel_text((caddr_t)&instp->off, new_off, 4);
	}
	return (true);
}
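
/*
 * The format-specific PTE implementation is registered via vmm_gpt_init().
 * For illustration, a backend would be expected to do something along these
 * lines during its own initialization (the implementation functions named
 * here are hypothetical):
 *
 *	static const struct vmm_pte_impl ept_pte_impl = {
 *		.vpi_map_table = ept_map_table,
 *		.vpi_map_page = ept_map_page,
 *		.vpi_pte_parse = ept_pte_parse,
 *		...
 *	};
 *	VERIFY(vmm_gpt_init(&ept_pte_impl));
 */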

bool
vmm_gpt_init(const struct vmm_pte_impl *target_impl)
{
	VERIFY3P(vmm_pte_impl, ==, &vmm_pte_uninit_impl);

	if (vmm_gpt_patch_indirection(vmm_pte_impl, target_impl)) {
		vmm_pte_impl = target_impl;
		return (true);
	}
	return (false);
}

void
vmm_gpt_fini(void)
{
	/*
	 * Restore the indirection stubs back to their original (panicking)
	 * implementations. All potential callers should be excluded since this
	 * is only done during module unload.
	 */
	VERIFY(vmm_gpt_patch_indirection(vmm_pte_impl, &vmm_pte_uninit_impl));
	vmm_pte_impl = &vmm_pte_uninit_impl;
}

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (vmm_gpt_entry_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(void)
{
	vmm_gpt_t *gpt = kmem_zalloc(sizeof (vmm_gpt_t), KM_SLEEP);
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

/*
 * Frees a given node. The node is expected to have no familial (parent,
 * children, siblings) associations at this point. Accordingly, its reference
 * count should be zero.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	ASSERT(node->vgn_parent == NULL);

	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees a vmm_gpt_t. Any lingering nodes in the GPT will be freed too.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	/* Empty anything remaining in the tree */
	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);

	VERIFY(gpt->vgpt_root != NULL);
	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);

	vmm_gpt_node_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Given a GPA, return its corresponding index in a paging structure at the
 * provided level.
 */
static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
{
	ASSERT(level < MAX_GPT_LEVEL);

	const uint16_t mask = MAX_NODE_IDX;
	switch (level) {
	case LEVEL4: return ((gpa >> 39) & mask);
	case LEVEL3: return ((gpa >> 30) & mask);
	case LEVEL2: return ((gpa >> 21) & mask);
	case LEVEL1: return ((gpa >> 12) & mask);
	default:
		panic("impossible level value");
	};
}
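
/*
 * Each level of the radix tree consumes 9 bits of the GPA: bits 47:39 select
 * the LEVEL4 (root) entry, 38:30 the LEVEL3 entry, 29:21 the LEVEL2 entry, and
 * 20:12 the LEVEL1 entry. As a worked example (GPA chosen purely for
 * illustration), gpa = 0x8040201000 = (1 << 39) | (1 << 30) | (1 << 21) |
 * (1 << 12) yields index 1 at every level.
 */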

/* Get mask for addresses of entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0xffffff8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0xffffffffc0000000ul);	/* entries cover 1G */
	case LEVEL2: return (0xffffffffffe00000ul);	/* entries cover 2M */
	case LEVEL1: return (0xfffffffffffff000ul);	/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get length of GPA covered by entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0x8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0x40000000ul);	/* entries cover 1G */
	case LEVEL2: return (0x200000ul);	/* entries cover 2M */
	case LEVEL1: return (0x1000ul);		/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get the index of a PTE pointer based on its offset in that table */
static inline uint16_t
vmm_gpt_ptep_index(const vmm_gpt_entry_t *ptep)
{
	const uintptr_t offset = (uintptr_t)ptep & 0xffful;
	return (offset / sizeof (uint64_t));
}

/*
 * Get the ending GPA which this node could possibly cover given its base
 * address and level.
 */
static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t *node)
{
	ASSERT(node->vgn_level > LEVEL4);
	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
}

/*
 * Is this node the last entry in its parent node, based solely on its GPA?
 */
static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t *node)
{
	return (node->vgn_index == MAX_NODE_IDX);
}

/*
 * How many table entries (if any) in this node are covered by the range of
 * [start, end)?
 */
static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
{
	const uint64_t node_end = vmm_gpt_node_end(node);

	/* Is this node covered at all by the region? */
	if (start >= node_end || end <= node->vgn_gpa) {
		return (0);
	}

	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
	const uint64_t covered_end = MIN(node_end, end & mask);
	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);

	return ((covered_end - covered_start) / per_entry);
}
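
/*
 * For illustration: a LEVEL1 node with vgn_gpa = 0x200000 spans
 * [0x200000, 0x400000) with 512 4K entries. For a region of
 * [0x280000, 0x300000), covered_start = 0x280000 and covered_end = 0x300000,
 * so 0x80000 / 0x1000 = 128 entries are covered.
 */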

/*
 * Find the next node (by address) in the tree at the same level.
 *
 * Returns NULL if this is the last node in the tree or if `only_seq` was true
 * and there is an address gap between this node and the next.
 */
static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
{
	ASSERT3P(node->vgn_parent, !=, NULL);
	ASSERT3U(node->vgn_level, >, LEVEL4);

	/*
	 * Next node sequentially would be the one at the address starting at
	 * the end of what is covered by this node.
	 */
	const uint64_t gpa_match = vmm_gpt_node_end(node);

	/* Try our next sibling */
	vmm_gpt_node_t *next = node->vgn_sib_next;

	if (next == NULL) {
		/*
		 * If the next-sibling pointer is NULL on the node, it can mean
		 * one of two things:
		 *
		 * 1. This entry represents the space leading up to the trailing
		 *    boundary of what this node covers.
		 *
		 * 2. The node is not entirely populated, and there is a gap
		 *    between the last populated entry, and the trailing
		 *    boundary of the node.
		 *
		 * Either way, the proper course of action is to check the first
		 * child of our parent's next sibling (if the parent is not the
		 * root of the GPT itself).
		 */
		if (node->vgn_parent != NULL && node->vgn_level > LEVEL3) {
			vmm_gpt_node_t *psibling =
			    vmm_gpt_node_next(node->vgn_parent, true);
			if (psibling != NULL) {
				next = psibling->vgn_children;
			}
		}
	}

	if (next != NULL &&
	    (next->vgn_gpa == gpa_match || !only_seq)) {
		return (next);
	}

	return (NULL);
}

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to the node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_sib_next) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}
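
/*
 * Note that a parent's children list is kept sorted by index (and thus by
 * GPA): vmm_gpt_node_add() below inserts new children in GPA order, which is
 * what allows vmm_gpt_node_find_child() above to stop scanning once it passes
 * the index of interest.
 */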

/*
 * Add a child node to the GPT at a position determined by GPA, parent, and (if
 * present) preceding sibling.
 *
 * If the `parent` node contains any children, `prev_sibling` must be populated
 * with a pointer to the node preceding (by GPA) the to-be-added child node.
 */
static void
vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
{
	ASSERT3U(parent->vgn_level, <, LEVEL1);
	ASSERT3U(child->vgn_parent, ==, NULL);

	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	child->vgn_index = idx;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);

	/* Establish familial connections */
	child->vgn_parent = parent;
	if (prev_sibling != NULL) {
		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);

		child->vgn_sib_next = prev_sibling->vgn_sib_next;
		if (child->vgn_sib_next != NULL) {
			child->vgn_sib_next->vgn_sib_prev = child;
		}
		child->vgn_sib_prev = prev_sibling;
		prev_sibling->vgn_sib_next = child;
	} else if (parent->vgn_children != NULL) {
		vmm_gpt_node_t *next_sibling = parent->vgn_children;

		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);

		child->vgn_sib_next = next_sibling;
		child->vgn_sib_prev = NULL;
		next_sibling->vgn_sib_prev = child;
		parent->vgn_children = child;
	} else {
		parent->vgn_children = child;
		child->vgn_sib_next = NULL;
		child->vgn_sib_prev = NULL;
	}

	/* Configure PTE for child table */
	parent->vgn_entries[idx] = vmm_gpti_map_table(child->vgn_host_pfn);
	parent->vgn_ref_cnt++;
}

/*
 * Remove a child node from its relatives (parent, siblings) and free it.
 */
static void
vmm_gpt_node_remove(vmm_gpt_node_t *child)
{
	ASSERT3P(child->vgn_children, ==, NULL);
	ASSERT3U(child->vgn_ref_cnt, ==, 0);
	ASSERT3P(child->vgn_parent, !=, NULL);

	/* Unlink child from its siblings and parent */
	vmm_gpt_node_t *parent = child->vgn_parent;
	vmm_gpt_node_t *prev = child->vgn_sib_prev;
	vmm_gpt_node_t *next = child->vgn_sib_next;
	if (prev != NULL) {
		ASSERT3P(prev->vgn_sib_next, ==, child);
		prev->vgn_sib_next = next;
	}
	if (next != NULL) {
		ASSERT3P(next->vgn_sib_prev, ==, child);
		next->vgn_sib_prev = prev;
	}
	if (prev == NULL) {
		ASSERT3P(parent->vgn_children, ==, child);
		parent->vgn_children = next;
	}
	child->vgn_parent = NULL;
	child->vgn_sib_next = NULL;
	child->vgn_sib_prev = NULL;
	parent->vgn_entries[child->vgn_index] = 0;
	parent->vgn_ref_cnt--;

	vmm_gpt_node_free(child);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth. If
 * the walk terminates before the depth is reached, the remaining entries are
 * written with NULLs.
 *
 * Returns the GPA corresponding to the deepest populated (non-NULL) entry
 * encountered during the walk.
 */
uint64_t
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
    vmm_gpt_node_level_t depth)
{
	ASSERT(gpt != NULL);
	ASSERT3U(depth, <, MAX_GPT_LEVEL);

	vmm_gpt_entry_t *current_entries = gpt->vgpt_root->vgn_entries;
	uint64_t mask = 0;
	for (uint_t lvl = LEVEL4; lvl <= depth; lvl++) {
		if (current_entries == NULL) {
			entries[lvl] = NULL;
			continue;
		}
		entries[lvl] = &current_entries[vmm_gpt_lvl_index(lvl, gpa)];
		mask = vmm_gpt_lvl_mask(lvl);
		const vmm_gpt_entry_t pte = *entries[lvl];

		pfn_t pfn;
		if (!vmm_gpti_parse(pte, &pfn, NULL)) {
			current_entries = NULL;
			continue;
		}
		current_entries = (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
	}
	return (gpa & mask);
}
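
/*
 * For illustration of the return value: if a walk to LEVEL1 finds that tables
 * exist down to the LEVEL2 entry for the GPA, but that entry is not valid (so
 * there is no LEVEL1 table beneath it), then entries[LEVEL1] is set to NULL
 * and the returned GPA is truncated to the 2M boundary covered by that deepest
 * (LEVEL2) entry.
 */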

/*
 * Given a `gpa`, and the corresponding `entries` array as queried by a call to
 * `vmm_gpt_walk()`, attempt to advance to the next PTE at the `depth` provided
 * by the caller.
 *
 * As there may not be PTEs present at the requested `depth`, the amount that
 * the GPA is advanced may be larger than what is expected for that provided
 * depth.
 *
 * Returns the GPA corresponding to the PTE to which `entries` was advanced.
 */
static uint64_t
vmm_gpt_walk_advance(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
    vmm_gpt_node_level_t depth)
{
	ASSERT(gpt != NULL);
	ASSERT3U(depth, <, MAX_GPT_LEVEL);
	ASSERT0(gpa & ~vmm_gpt_lvl_mask(depth));

	/*
	 * Advance to the next entry at the deepest PTE level possible,
	 * ascending to higher levels if we reach the end of a table at any
	 * given point.
	 */
	int lvl;
	for (lvl = depth; lvl >= LEVEL4; lvl--) {
		vmm_gpt_entry_t *ptep = entries[lvl];

		if (ptep == NULL) {
			continue;
		}

		uint16_t index = vmm_gpt_ptep_index(ptep);
		ASSERT3U(vmm_gpt_lvl_index(lvl, gpa), ==, index);
		if (index == MAX_NODE_IDX) {
			continue;
		}

		gpa = (gpa & vmm_gpt_lvl_mask(lvl)) + vmm_gpt_lvl_len(lvl);
		entries[lvl] = ptep + 1;
		break;
	}

	if (lvl < LEVEL4) {
		/* Advanced off the end of the tables */
		return (UINT64_MAX);
	}

	/*
	 * Attempt to walk back down to the target level if any ascension was
	 * necessary during the above advancement.
	 */
	vmm_gpt_entry_t pte = *entries[lvl];
	lvl++;
	for (; lvl < MAX_GPT_LEVEL; lvl++) {
		pfn_t pfn;

		if (lvl > depth || !vmm_gpti_parse(pte, &pfn, NULL)) {
			entries[lvl] = NULL;
			continue;
		}

		vmm_gpt_entry_t *next_table =
		    (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
		const uint16_t index = vmm_gpt_lvl_index(lvl, gpa);
		pte = next_table[index];
		entries[lvl] = &next_table[index];
	}

	return (gpa);
}

/*
 * Initialize iteration over GPT leaf entries between [addr, addr + len). The
 * specified interval must be page-aligned and not overflow the end of the
 * address space.
 *
 * Subsequent calls to vmm_gpt_iter_next() will emit the encountered entries.
 */
void
vmm_gpt_iter_init(vmm_gpt_iter_t *iter, vmm_gpt_t *gpt, uint64_t addr,
    uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);
	ASSERT3U((addr + len), >=, addr);

	iter->vgi_gpt = gpt;
	iter->vgi_addr = addr;
	iter->vgi_end = addr + len;
	iter->vgi_current = vmm_gpt_walk(gpt, addr, iter->vgi_entries, LEVEL1);
}
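
/*
 * For illustration, the expected iteration pattern (mirroring what
 * vmm_gpt_unmap_region() does below):
 *
 *	vmm_gpt_iter_t iter;
 *	vmm_gpt_iter_entry_t entry;
 *
 *	vmm_gpt_iter_init(&iter, gpt, addr, len);
 *	while (vmm_gpt_iter_next(&iter, &entry)) {
 *		... operate on entry.vgie_gpa and entry.vgie_ptep ...
 *	}
 */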

/*
 * Fetch the next entry (if any) from an iterator initialized from a preceding
 * call to vmm_gpt_iter_init(). Returns true when the next GPT leaf entry
 * inside the iterator range has been populated in `entry`. Returns `false`
 * when there are no such entries available or remaining in the range.
 */
bool
vmm_gpt_iter_next(vmm_gpt_iter_t *iter, vmm_gpt_iter_entry_t *entry)
{
	if (iter->vgi_current >= iter->vgi_end) {
		return (false);
	}

	while (iter->vgi_current < iter->vgi_end) {
		bool found = false;
		if (iter->vgi_entries[LEVEL1] != NULL) {
			entry->vgie_gpa = iter->vgi_current;
			entry->vgie_ptep = iter->vgi_entries[LEVEL1];
			found = true;
		}

		iter->vgi_current = vmm_gpt_walk_advance(iter->vgi_gpt,
		    iter->vgi_current, iter->vgi_entries, LEVEL1);

		if (found) {
			return (true);
		}
	}
	return (false);
}

/*
 * Populate child table nodes for a given level between the provided interval
 * of [addr, addr + len). Caller is expected to provide a pointer to the parent
 * node which would contain the child node for GPA at `addr`. A pointer to said
 * child node will be returned when the operation is complete.
 */
static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    vmm_gpt_node_t *node_start)
{
	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
	const uint64_t end = addr + len;
	const uint64_t incr = vmm_gpt_lvl_len(lvl);
	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
	vmm_gpt_node_t *parent = node_start;

	/* Try to locate node at starting address */
	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
	while (node != NULL && node->vgn_gpa < gpa) {
		prev = node;
		node = node->vgn_sib_next;
	}

	/*
	 * If no node exists at the starting address, create one and link it
	 * into the parent.
	 */
	if (node == NULL || node->vgn_gpa > gpa) {
		/* Need to insert node for starting GPA */
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	vmm_gpt_node_t *front_node = node;
	prev = node;
	gpa += incr;

	/*
	 * With a node at the starting address, walk forward creating nodes in
	 * any of the gaps.
	 */
	for (; gpa < end; gpa += incr, prev = node) {
		node = vmm_gpt_node_next(prev, true);
		if (node != NULL) {
			ASSERT3U(node->vgn_gpa, ==, gpa);

			/* We may have crossed into a new parent */
			parent = node->vgn_parent;
			continue;
		}

		if (vmm_gpt_node_is_last(prev)) {
			/*
			 * The node preceding this was the last one in its
			 * containing parent, so move on to that parent's
			 * sibling. We expect (demand) that it exist already.
			 */
			parent = vmm_gpt_node_next(parent, true);
			ASSERT(parent != NULL);

			/*
			 * Forget our previous sibling, since it is of no use
			 * for assigning the new node to the now-different
			 * parent.
			 */
			prev = NULL;
		}
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	return (front_node);
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [addr, addr + len) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	/*
	 * Starting at the top of the tree, ensure that tables covering the
	 * requested region exist at each level.
	 */
	vmm_gpt_node_t *node = gpt->vgpt_root;
	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		ASSERT3U(node->vgn_level, ==, lvl);

		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
	}

	/*
	 * Establish reference counts for the soon-to-be memory PTEs which will
	 * be filling these LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	const uint64_t end = addr + len;
	while (gpa < end) {
		ASSERT(node != NULL);
		ASSERT3U(node->vgn_level, ==, LEVEL1);

		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT(covered != 0);
		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);

		node->vgn_ref_cnt += covered;

		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
		if (next != NULL) {
			gpa = next->vgn_gpa;
			node = next;
		} else {
			/*
			 * We do not expect to find a subsequent node after
			 * filling the last node in the table, completing PTE
			 * accounting for the specified range.
			 */
			VERIFY3U(end, <=, vmm_gpt_node_end(node));
			break;
		}
	}
}
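
/*
 * For illustration, populating [1 GiB, 1 GiB + 4 MiB) in an otherwise-empty
 * GPT creates one LEVEL3 node (spanning [0, 512 GiB)), one LEVEL2 node
 * (spanning [1 GiB, 2 GiB)), and two LEVEL1 leaf tables (each spanning 2 MiB),
 * with each leaf table taking 512 references for the pages that may later be
 * mapped into it.
 */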

/*
 * Format a PTE and install it in the provided PTE-pointer.
 *
 * The caller must ensure that a conflicting PFN is not mapped at the requested
 * location. Racing operations to map the same PFN at one location are
 * acceptable and properly handled.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	const vmm_gpt_entry_t pte = vmm_gpti_map_page(pfn, prot, attr);
	const vmm_gpt_entry_t old_pte = atomic_cas_64(ptep, 0, pte);
	if (old_pte != 0) {
#ifdef DEBUG
		pfn_t new_pfn, old_pfn;

		ASSERT(vmm_gpti_parse(pte, &new_pfn, NULL));
		ASSERT(vmm_gpti_parse(old_pte, &old_pfn, NULL));
		ASSERT3U(old_pfn, ==, new_pfn);
#endif /* DEBUG */
		return (false);
	}

	return (true);
}

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [addr, addr + len). The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
		[LEVEL4] = gpt->vgpt_root,
	};

	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		node = vmm_gpt_node_find_child(starts[lvl], addr);
		if (node == NULL) {
			break;
		}
		starts[lvl + 1] = node;
	}

	/*
	 * Starting at the bottom of the tree, ensure that PTEs for pages have
	 * been cleared for the region, and remove the corresponding reference
	 * counts from the containing LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	node = starts[LEVEL1];
	while (gpa < end && node != NULL) {
		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT3U(node->vgn_ref_cnt, >=, covered);
		node->vgn_ref_cnt -= covered;

		node = vmm_gpt_node_next(node, false);
		if (node != NULL) {
			gpa = node->vgn_gpa;
		}
	}

	/*
	 * With the page PTE references eliminated, work up from the bottom of
	 * the table, removing nodes which have no remaining references.
	 *
	 * This stops short of LEVEL4, which is the root table of the GPT. It
	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
	 */
	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
		gpa = addr;
		node = starts[lvl];

		while (gpa < end && node != NULL) {
			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);

			if (node->vgn_ref_cnt == 0) {
				vmm_gpt_node_remove(node);
			}
			if (next != NULL) {
				gpa = next->vgn_gpa;
			}
			node = next;
		}
	}
}

/*
 * Remove a mapping from the table. Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	vmm_gpt_entry_t *entries[MAX_GPT_LEVEL], pte;

	ASSERT(gpt != NULL);
	(void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
	if (entries[LEVEL1] == NULL)
		return (false);

	pte = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (vmm_gpti_parse(pte, NULL, NULL));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [addr, addr + len). Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	vmm_gpt_iter_t state;
	vmm_gpt_iter_entry_t entry;
	size_t num_unmapped = 0;

	vmm_gpt_iter_init(&state, gpt, addr, len);
	while (vmm_gpt_iter_next(&state, &entry)) {
		if (entry.vgie_ptep == NULL) {
			continue;
		}

		const vmm_gpt_entry_t pte = *entry.vgie_ptep;
		*entry.vgie_ptep = 0;
		if (vmm_gpti_parse(pte, NULL, NULL)) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}

/*
 * Returns a value indicating whether or not the given PTE maps a page. If it
 * does, *pfnp and *protp will be filled with the PFN and protection bits of
 * the entry. Otherwise, they are ignored.
 */
bool
vmm_gpte_is_mapped(const vmm_gpt_entry_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	ASSERT(ptep != NULL);

	return (vmm_gpti_parse(*ptep, pfnp, protp));
}

static uint_t
vmm_gpt_reset_bits(volatile uint64_t *ptep, uint64_t mask, uint64_t bits)
{
	uint64_t pte, newpte, oldpte = 0;

	/*
	 * We use volatile and atomic ops here because we may be
	 * racing against hardware modifying these bits.
	 */
	VERIFY3P(ptep, !=, NULL);
	oldpte = *ptep;
	do {
		pte = oldpte;
		newpte = (pte & ~mask) | bits;
		oldpte = atomic_cas_64(ptep, pte, newpte);
	} while (oldpte != pte);

	return (oldpte & mask);
}
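
/*
 * The accessed/dirty accessors below operate on PTE pointers obtained via
 * vmm_gpt_walk() or the iterator. For illustration, a dirty-page harvesting
 * pass might clear and test the dirty bit in one step:
 *
 *	if (vmm_gpte_reset_dirty(entry.vgie_ptep, false)) {
 *		... the page at entry.vgie_gpa was written since the
 *		    previous scan ...
 *	}
 */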

/*
 * Resets the accessed bit on the page table entry pointed to by `ptep`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
bool
vmm_gpte_reset_accessed(vmm_gpt_entry_t *ptep, bool on)
{
	ASSERT(ptep != NULL);

	const uint64_t accessed_bit = vmm_pte_impl->vpi_bit_accessed;
	const uint64_t dirty_bit = vmm_pte_impl->vpi_bit_dirty;

	const uint64_t old_state = vmm_gpt_reset_bits(ptep,
	    accessed_bit | dirty_bit, on ? accessed_bit : 0);
	return (old_state != 0);
}

/*
 * Resets the dirty bit on the page table entry pointed to by `ptep`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
bool
vmm_gpte_reset_dirty(vmm_gpt_entry_t *ptep, bool on)
{
	ASSERT(ptep != NULL);
	const uint64_t dirty_bit = vmm_pte_impl->vpi_bit_dirty;

	const uint64_t old_state =
	    vmm_gpt_reset_bits(ptep, dirty_bit, on ? dirty_bit : 0);
	return (old_state != 0);
}

bool
vmm_gpte_query_accessed(const vmm_gpt_entry_t *ptep)
{
	ASSERT(ptep != NULL);
	return ((*ptep & vmm_pte_impl->vpi_bit_accessed) != 0);
}

bool
vmm_gpte_query_dirty(const vmm_gpt_entry_t *ptep)
{
	ASSERT(ptep != NULL);
	return ((*ptep & vmm_pte_impl->vpi_bit_dirty) != 0);
}

/*
 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (vmm_pte_impl->vpi_get_pmtp(root_pfn, track_dirty));
}

/*
 * Does the GPT hardware support dirty-page-tracking?
 */
bool
vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
{
	return (vmm_pte_impl->vpi_hw_ad_supported());
}