/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space. But the two use different
 * and mutually incompatible page table formats: Intel uses the EPT, which is
 * based on the Itanium page table format, while AMD uses the nPT, which is
 * based on the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and also keeps track of how many pages the table maps, as
 * well as a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings. The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has. It also owns the
 * physical memory page for the hardware page table entries that map its
 * children. Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index $i$ is the i'th uint64_t in that node's entry
 * page, and the entry page is part of the paging structure consumed by
 * hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions. Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs. This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor. The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform. The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */
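
/*
 * Illustrative example (a sketch, not part of the build): the expected call
 * sequence for mapping a single 4K guest page is to populate the region, walk
 * to the leaf PTE, and install a mapping; teardown reverses the order. The
 * `gpt`, `gpa`, `pfn`, `prot`, and `attr` values below are assumed to be
 * caller-provided (e.g. from a vmspace fault handler) and are purely
 * hypothetical.
 *
 *        vmm_gpt_populate_region(gpt, gpa, PAGESIZE);
 *
 *        vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *        (void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *        if (entries[LEVEL1] != NULL) {
 *                (void) vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr);
 *        }
 *
 *        ...
 *
 *        (void) vmm_gpt_unmap(gpt, gpa);
 *        vmm_gpt_vacate_region(gpt, gpa, PAGESIZE);
 */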

/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * siblings. Interior nodes also maintain a reference count, and each node
 * contains its level and index in its parent's table. Finally, each node
 * contains the host PFN of the page that it links into the page table, as
 * well as a kernel pointer to the table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace. This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
        uint64_t vgn_host_pfn;
        uint16_t vgn_level;
        uint16_t vgn_index;
        uint32_t vgn_ref_cnt;
        vmm_gpt_node_t *vgn_parent;
        vmm_gpt_node_t *vgn_children;
        vmm_gpt_node_t *vgn_sib_next;
        vmm_gpt_node_t *vgn_sib_prev;
        vmm_gpt_entry_t *vgn_entries;
        uint64_t vgn_gpa;
};

/* Maximum node index determined by number of entries in page table (512) */
#define PTE_PER_TABLE   512
#define MAX_NODE_IDX    (PTE_PER_TABLE - 1)

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD). It contains
 * a pointer to the root node in the table and is parameterized with a set of
 * PTE operations specific to the table type.
 */
struct vmm_gpt {
        vmm_gpt_node_t *vgpt_root;
        vmm_pte_ops_t *vgpt_pte_ops;
};
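
/*
 * Illustrative example (a sketch, not part of the build): a GPT is created by
 * pairing it with a platform-specific PTE ops vector and is torn down with
 * vmm_gpt_free() once the guest's mappings have been removed. The `pte_ops`
 * variable below stands in for whichever vmm_pte_ops_t the host platform
 * provides (EPT ops on Intel, nPT ops on AMD); the name is purely
 * illustrative.
 *
 *        vmm_gpt_t *gpt = vmm_gpt_alloc(pte_ops);
 *        const bool td = vmm_gpt_can_track_dirty(gpt);
 *        const uint64_t pmtp = vmm_gpt_get_pmtp(gpt, td);
 *
 *        ... load `pmtp` as the EPTP/nCR3 for the guest, use the GPT ...
 *
 *        vmm_gpt_free(gpt);
 */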

/*
 * Allocates a vmm_gpt_node_t structure with corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
        vmm_gpt_node_t *node;
        caddr_t page;

        node = kmem_zalloc(sizeof (*node), KM_SLEEP);
        /*
         * Note: despite the man page, allocating PAGESIZE bytes is
         * guaranteed to be page-aligned.
         */
        page = kmem_zalloc(PAGESIZE, KM_SLEEP);
        node->vgn_entries = (vmm_gpt_entry_t *)page;
        node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

        return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
        vmm_gpt_t *gpt;

        VERIFY(pte_ops != NULL);
        gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
        gpt->vgpt_pte_ops = pte_ops;
        gpt->vgpt_root = vmm_gpt_node_alloc();

        return (gpt);
}

/*
 * Frees a given node. The node is expected to have no familial (parent,
 * children, siblings) associations at this point. Accordingly, its reference
 * count should be zero.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
        ASSERT(node != NULL);
        ASSERT3U(node->vgn_ref_cnt, ==, 0);
        ASSERT(node->vgn_host_pfn != PFN_INVALID);
        ASSERT(node->vgn_entries != NULL);
        ASSERT(node->vgn_parent == NULL);

        kmem_free(node->vgn_entries, PAGESIZE);
        kmem_free(node, sizeof (*node));
}

/*
 * Frees a vmm_gpt_t. Any lingering nodes in the GPT will be freed too.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
        /* Empty anything remaining in the tree */
        vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);

        VERIFY(gpt->vgpt_root != NULL);
        VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);

        vmm_gpt_node_free(gpt->vgpt_root);
        kmem_free(gpt, sizeof (*gpt));
}

/*
 * Given a GPA, return its corresponding index in a paging structure at the
 * provided level.
 */
static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
{
        ASSERT(level < MAX_GPT_LEVEL);

        const uint16_t mask = MAX_NODE_IDX;
        switch (level) {
        case LEVEL4: return ((gpa >> 39) & mask);
        case LEVEL3: return ((gpa >> 30) & mask);
        case LEVEL2: return ((gpa >> 21) & mask);
        case LEVEL1: return ((gpa >> 12) & mask);
        default:
                panic("impossible level value");
        };
}

/* Get mask for addresses of entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
{
        ASSERT(level < MAX_GPT_LEVEL);

        switch (level) {
        case LEVEL4: return (0xffffff8000000000ul); /* entries cover 512G */
        case LEVEL3: return (0xffffffffc0000000ul); /* entries cover 1G */
        case LEVEL2: return (0xffffffffffe00000ul); /* entries cover 2M */
        case LEVEL1: return (0xfffffffffffff000ul); /* entries cover 4K */
        default:
                panic("impossible level value");
        };
}

/* Get length of GPA covered by entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
{
        ASSERT(level < MAX_GPT_LEVEL);

        switch (level) {
        case LEVEL4: return (0x8000000000ul);   /* entries cover 512G */
        case LEVEL3: return (0x40000000ul);     /* entries cover 1G */
        case LEVEL2: return (0x200000ul);       /* entries cover 2M */
        case LEVEL1: return (0x1000ul);         /* entries cover 4K */
        default:
                panic("impossible level value");
        };
}
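
/*
 * As a worked example of the helpers above (illustrative only): the GPA
 * gpa = 0x8080604000 decomposes as
 *
 *        vmm_gpt_lvl_index(LEVEL4, gpa) == 1    (bits 47:39)
 *        vmm_gpt_lvl_index(LEVEL3, gpa) == 2    (bits 38:30)
 *        vmm_gpt_lvl_index(LEVEL2, gpa) == 3    (bits 29:21)
 *        vmm_gpt_lvl_index(LEVEL1, gpa) == 4    (bits 20:12)
 *
 * while vmm_gpt_lvl_mask(LEVEL2) truncates it to 0x8080600000, the base of
 * the 2M (vmm_gpt_lvl_len(LEVEL2)) span containing it.
 */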

/* Get the index of a PTE pointer based on its offset in that table */
static inline uint16_t
vmm_gpt_ptep_index(const vmm_gpt_entry_t *ptep)
{
        const uintptr_t offset = (uintptr_t)ptep & 0xffful;
        return (offset / sizeof (uint64_t));
}

/*
 * Get the ending GPA which this node could possibly cover given its base
 * address and level.
 */
static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t *node)
{
        ASSERT(node->vgn_level > LEVEL4);
        return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
}

/*
 * Is this node the last entry in its parent node, based solely on its GPA?
 */
static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t *node)
{
        return (node->vgn_index == MAX_NODE_IDX);
}

/*
 * How many table entries (if any) in this node are covered by the range of
 * [start, end).
 */
static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
{
        const uint64_t node_end = vmm_gpt_node_end(node);

        /* Is this node covered at all by the region? */
        if (start >= node_end || end <= node->vgn_gpa) {
                return (0);
        }

        const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
        const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
        const uint64_t covered_end = MIN(node_end, end & mask);
        const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);

        return ((covered_end - covered_start) / per_entry);
}
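
/*
 * For example (illustrative only): a LEVEL1 node with vgn_gpa 0x200000 covers
 * [0x200000, 0x400000). For the range [0x300000, 0x1000000), the covered
 * interval clamps to [0x300000, 0x400000), and with 4K per LEVEL1 entry,
 * vmm_gpt_node_entries_covered() reports 0x100000 / 0x1000 = 256 entries.
 */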

/*
 * Find the next node (by address) in the tree at the same level.
 *
 * Returns NULL if this is the last node in the tree or if `only_seq` was true
 * and there is an address gap between this node and the next.
 */
static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
{
        ASSERT3P(node->vgn_parent, !=, NULL);
        ASSERT3U(node->vgn_level, >, LEVEL4);

        /*
         * Next node sequentially would be the one at the address starting at
         * the end of what is covered by this node.
         */
        const uint64_t gpa_match = vmm_gpt_node_end(node);

        /* Try our next sibling */
        vmm_gpt_node_t *next = node->vgn_sib_next;

        if (next == NULL) {
                /*
                 * If the next-sibling pointer is NULL on the node, it can mean
                 * one of two things:
                 *
                 * 1. This entry represents the space leading up to the
                 *    trailing boundary of what this node covers.
                 *
                 * 2. The node is not entirely populated, and there is a gap
                 *    between the last populated entry, and the trailing
                 *    boundary of the node.
                 *
                 * Either way, the proper course of action is to check the
                 * first child of our parent's next sibling (if the parent is
                 * not the root of the GPT itself).
                 */
                if (node->vgn_parent != NULL && node->vgn_level > LEVEL3) {
                        vmm_gpt_node_t *psibling =
                            vmm_gpt_node_next(node->vgn_parent, true);
                        if (psibling != NULL) {
                                next = psibling->vgn_children;
                        }
                }
        }

        if (next != NULL &&
            (next->vgn_gpa == gpa_match || !only_seq)) {
                return (next);
        }

        return (NULL);
}

/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
        const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
        for (vmm_gpt_node_t *child = parent->vgn_children;
            child != NULL && child->vgn_index <= index;
            child = child->vgn_sib_next) {
                if (child->vgn_index == index)
                        return (child);
        }

        return (NULL);
}

/*
 * Add a child node to the GPT at a position determined by GPA, parent, and
 * (if present) preceding sibling.
 *
 * If the `parent` node contains any children, `prev_sibling` must be populated
 * with a pointer to the node preceding (by GPA) the to-be-added child node.
 */
static void
vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
{
        ASSERT3U(parent->vgn_level, <, LEVEL1);
        ASSERT3P(child->vgn_parent, ==, NULL);

        const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
        child->vgn_index = idx;
        child->vgn_level = parent->vgn_level + 1;
        child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);

        /* Establish familial connections */
        child->vgn_parent = parent;
        if (prev_sibling != NULL) {
                ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);

                child->vgn_sib_next = prev_sibling->vgn_sib_next;
                if (child->vgn_sib_next != NULL) {
                        child->vgn_sib_next->vgn_sib_prev = child;
                }
                child->vgn_sib_prev = prev_sibling;
                prev_sibling->vgn_sib_next = child;
        } else if (parent->vgn_children != NULL) {
                vmm_gpt_node_t *next_sibling = parent->vgn_children;

                ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
                ASSERT3P(next_sibling->vgn_sib_prev, ==, NULL);

                child->vgn_sib_next = next_sibling;
                child->vgn_sib_prev = NULL;
                next_sibling->vgn_sib_prev = child;
                parent->vgn_children = child;
        } else {
                parent->vgn_children = child;
                child->vgn_sib_next = NULL;
                child->vgn_sib_prev = NULL;
        }

        /* Configure PTE for child table */
        parent->vgn_entries[idx] =
            gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
        parent->vgn_ref_cnt++;
}

/*
 * Remove a child node from its relatives (parent, siblings) and free it.
 */
static void
vmm_gpt_node_remove(vmm_gpt_node_t *child)
{
        ASSERT3P(child->vgn_children, ==, NULL);
        ASSERT3U(child->vgn_ref_cnt, ==, 0);
        ASSERT3P(child->vgn_parent, !=, NULL);

        /* Unlink child from its siblings and parent */
        vmm_gpt_node_t *parent = child->vgn_parent;
        vmm_gpt_node_t *prev = child->vgn_sib_prev;
        vmm_gpt_node_t *next = child->vgn_sib_next;
        if (prev != NULL) {
                ASSERT3P(prev->vgn_sib_next, ==, child);
                prev->vgn_sib_next = next;
        }
        if (next != NULL) {
                ASSERT3P(next->vgn_sib_prev, ==, child);
                next->vgn_sib_prev = prev;
        }
        if (prev == NULL) {
                ASSERT3P(parent->vgn_children, ==, child);
                parent->vgn_children = next;
        }
        child->vgn_parent = NULL;
        child->vgn_sib_next = NULL;
        child->vgn_sib_prev = NULL;
        parent->vgn_entries[child->vgn_index] = 0;
        parent->vgn_ref_cnt--;

        vmm_gpt_node_free(child);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.
 * If the walk terminates before the depth is reached, the remaining entries
 * are written with NULLs.
 *
 * Returns the GPA corresponding to the deepest populated (non-NULL) entry
 * encountered during the walk.
 */
uint64_t
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
    vmm_gpt_node_level_t depth)
{
        ASSERT(gpt != NULL);
        ASSERT3U(depth, <, MAX_GPT_LEVEL);

        vmm_gpt_entry_t *current_entries = gpt->vgpt_root->vgn_entries;
        uint64_t mask = 0;
        for (uint_t lvl = LEVEL4; lvl <= depth; lvl++) {
                if (current_entries == NULL) {
                        entries[lvl] = NULL;
                        continue;
                }
                entries[lvl] = &current_entries[vmm_gpt_lvl_index(lvl, gpa)];
                mask = vmm_gpt_lvl_mask(lvl);
                const vmm_gpt_entry_t pte = *entries[lvl];
                if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
                        current_entries = NULL;
                        continue;
                }
                const pfn_t pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(pte);
                current_entries = (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
        }
        return (gpa & mask);
}

/*
 * Given a `gpa`, and the corresponding `entries` array as queried by a call to
 * `vmm_gpt_walk()`, attempt to advance to the next PTE at the `depth` provided
 * by the caller.
 *
 * As there may not be PTEs present at the requested `depth`, the amount that
 * the GPA is advanced may be larger than what is expected for that provided
 * depth.
 *
 * Returns the GPA corresponding to the PTE to which `entries` was advanced.
 */
static uint64_t
vmm_gpt_walk_advance(vmm_gpt_t *gpt, uint64_t gpa, vmm_gpt_entry_t **entries,
    vmm_gpt_node_level_t depth)
{
        ASSERT(gpt != NULL);
        ASSERT3U(depth, <, MAX_GPT_LEVEL);
        ASSERT0(gpa & ~vmm_gpt_lvl_mask(depth));

        /*
         * Advance to the next entry in the deepest level of PTE possible,
         * ascending to higher levels if we reach the end of a table at any
         * given point.
         */
        int lvl;
        for (lvl = depth; lvl >= LEVEL4; lvl--) {
                vmm_gpt_entry_t *ptep = entries[lvl];

                if (ptep == NULL) {
                        continue;
                }

                uint16_t index = vmm_gpt_ptep_index(ptep);
                ASSERT3U(vmm_gpt_lvl_index(lvl, gpa), ==, index);
                if (index == MAX_NODE_IDX) {
                        continue;
                }

                gpa = (gpa & vmm_gpt_lvl_mask(lvl)) + vmm_gpt_lvl_len(lvl);
                entries[lvl] = ptep + 1;
                break;
        }

        if (lvl < LEVEL4) {
                /* Advanced off the end of the tables */
                return (UINT64_MAX);
        }

        /*
         * Attempt to walk back down to the target level if any ascension was
         * necessary during the above advancement.
         */
        vmm_gpt_entry_t pte = *entries[lvl];
        lvl++;
        for (; lvl < MAX_GPT_LEVEL; lvl++) {
                if (lvl > depth ||
                    !gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
                        entries[lvl] = NULL;
                        continue;
                }

                const pfn_t pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(pte);
                vmm_gpt_entry_t *next_table =
                    (vmm_gpt_entry_t *)hat_kpm_pfn2va(pfn);
                const uint16_t index = vmm_gpt_lvl_index(lvl, gpa);
                pte = next_table[index];
                entries[lvl] = &next_table[index];
        }

        return (gpa);
}
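
/*
 * Illustrative example (a sketch, not part of the build): looking up the host
 * PFN and protection currently backing a guest page combines vmm_gpt_walk()
 * with vmm_gpt_is_mapped() (defined later in this file). The `gpt` and `gpa`
 * values are assumed to be caller-provided.
 *
 *        vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *        pfn_t pfn;
 *        uint_t prot;
 *
 *        (void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *        if (vmm_gpt_is_mapped(gpt, entries[LEVEL1], &pfn, &prot)) {
 *                ... `pfn` and `prot` describe the existing mapping ...
 *        }
 */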

/*
 * Initialize iteration over GPT leaf entries between [addr, addr + len). The
 * specified interval must be page-aligned and not overflow the end of the
 * address space.
 *
 * Subsequent calls to vmm_gpt_iter_next() will emit the encountered entries.
 */
void
vmm_gpt_iter_init(vmm_gpt_iter_t *iter, vmm_gpt_t *gpt, uint64_t addr,
    uint64_t len)
{
        ASSERT0(addr & PAGEOFFSET);
        ASSERT0(len & PAGEOFFSET);
        ASSERT3U((addr + len), >=, addr);

        iter->vgi_gpt = gpt;
        iter->vgi_addr = addr;
        iter->vgi_end = addr + len;
        iter->vgi_current = vmm_gpt_walk(gpt, addr, iter->vgi_entries, LEVEL1);
}

/*
 * Fetch the next entry (if any) from an iterator initialized from a preceding
 * call to vmm_gpt_iter_init(). Returns true when the next GPT leaf entry
 * inside the iterator range has been populated in `entry`. Returns `false`
 * when there are no such entries available or remaining in the range.
 */
bool
vmm_gpt_iter_next(vmm_gpt_iter_t *iter, vmm_gpt_iter_entry_t *entry)
{
        if (iter->vgi_current >= iter->vgi_end) {
                return (false);
        }

        while (iter->vgi_current < iter->vgi_end) {
                bool found = false;
                if (iter->vgi_entries[LEVEL1] != NULL) {
                        entry->vgie_gpa = iter->vgi_current;
                        entry->vgie_ptep = iter->vgi_entries[LEVEL1];
                        found = true;
                }

                iter->vgi_current = vmm_gpt_walk_advance(iter->vgi_gpt,
                    iter->vgi_current, iter->vgi_entries, LEVEL1);

                if (found) {
                        return (true);
                }
        }
        return (false);
}
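
/*
 * Illustrative example (a sketch, not part of the build): visiting every
 * populated leaf PTE in a region follows the same pattern used by
 * vmm_gpt_unmap_region() below. The `gpt`, `addr`, and `len` values are
 * assumed to be caller-provided and page-aligned.
 *
 *        vmm_gpt_iter_t iter;
 *        vmm_gpt_iter_entry_t entry;
 *
 *        vmm_gpt_iter_init(&iter, gpt, addr, len);
 *        while (vmm_gpt_iter_next(&iter, &entry)) {
 *                ... inspect entry.vgie_gpa and entry.vgie_ptep ...
 *        }
 */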

/*
 * Populate child table nodes for a given level between the provided interval
 * of [addr, addr + len). Caller is expected to provide a pointer to the parent
 * node which would contain the child node for GPA at `addr`. A pointer to said
 * child node will be returned when the operation is complete.
 */
static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    vmm_gpt_node_t *node_start)
{
        const vmm_gpt_node_level_t lvl = node_start->vgn_level;
        const uint64_t end = addr + len;
        const uint64_t incr = vmm_gpt_lvl_len(lvl);
        uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
        vmm_gpt_node_t *parent = node_start;

        /* Try to locate node at starting address */
        vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
        while (node != NULL && node->vgn_gpa < gpa) {
                prev = node;
                node = node->vgn_sib_next;
        }

        /*
         * If no node exists at the starting address, create one and link it
         * into the parent.
         */
        if (node == NULL || node->vgn_gpa > gpa) {
                /* Need to insert node for starting GPA */
                node = vmm_gpt_node_alloc();
                vmm_gpt_node_add(gpt, parent, node, gpa, prev);
        }

        vmm_gpt_node_t *front_node = node;
        prev = node;
        gpa += incr;

        /*
         * With a node at the starting address, walk forward creating nodes in
         * any of the gaps.
         */
        for (; gpa < end; gpa += incr, prev = node) {
                node = vmm_gpt_node_next(prev, true);
                if (node != NULL) {
                        ASSERT3U(node->vgn_gpa, ==, gpa);

                        /* We may have crossed into a new parent */
                        parent = node->vgn_parent;
                        continue;
                }

                if (vmm_gpt_node_is_last(prev)) {
                        /*
                         * The node preceding this was the last one in its
                         * containing parent, so move on to that parent's
                         * sibling. We expect (demand) that it exist already.
                         */
                        parent = vmm_gpt_node_next(parent, true);
                        ASSERT(parent != NULL);

                        /*
                         * Forget our previous sibling, since it is of no use
                         * for assigning the new node to the now-different
                         * parent.
                         */
                        prev = NULL;
                }
                node = vmm_gpt_node_alloc();
                vmm_gpt_node_add(gpt, parent, node, gpa, prev);
        }

        return (front_node);
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [addr, addr + len) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
        ASSERT0(addr & PAGEOFFSET);
        ASSERT0(len & PAGEOFFSET);

        /*
         * Starting at the top of the tree, ensure that tables covering the
         * requested region exist at each level.
         */
        vmm_gpt_node_t *node = gpt->vgpt_root;
        for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
                ASSERT3U(node->vgn_level, ==, lvl);

                node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
        }

        /*
         * Establish reference counts for the soon-to-be memory PTEs which will
         * be filling these LEVEL1 tables.
         */
        uint64_t gpa = addr;
        const uint64_t end = addr + len;
        while (gpa < end) {
                ASSERT(node != NULL);
                ASSERT3U(node->vgn_level, ==, LEVEL1);

                const uint16_t covered =
                    vmm_gpt_node_entries_covered(node, addr, end);

                ASSERT(covered != 0);
                ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
                ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);

                node->vgn_ref_cnt += covered;

                vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
                if (next != NULL) {
                        gpa = next->vgn_gpa;
                        node = next;
                } else {
                        /*
                         * We do not expect to find a subsequent node after
                         * filling the last node in the table, completing PTE
                         * accounting for the specified range.
                         */
                        VERIFY3U(end, <=, vmm_gpt_node_end(node));
                        break;
                }
        }
}
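
/*
 * As a worked example (illustrative only): populating a 4M region beginning
 * at a 1G-aligned `addr` in an otherwise empty GPT creates one LEVEL3 child
 * of the root, one LEVEL2 node beneath it, and two LEVEL1 tables, since 4M
 * spans two 2M leaf tables. Each LEVEL1 table is fully covered, so each
 * accumulates 512 (PTE_PER_TABLE) references, while the LEVEL2 node holds one
 * reference per child table added beneath it.
 */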

/*
 * Format a PTE and install it in the provided PTE-pointer.
 *
 * The caller must ensure that a conflicting PFN is not mapped at the requested
 * location. Racing operations to map the same PFN at one location are
 * acceptable and properly handled.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
        vmm_gpt_entry_t pte, old_pte;

        pte = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
        old_pte = atomic_cas_64(ptep, 0, pte);
        if (old_pte != 0) {
                ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(pte), ==,
                    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_pte));
                return (false);
        }

        return (true);
}
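
/*
 * Illustrative example (a sketch, not part of the build): because the PTE is
 * installed with a compare-and-swap against an empty entry, a false return
 * from vmm_gpt_map_at() means another thread won the race to install the same
 * PFN, and a caller may typically treat that as success. The leaf pointer is
 * assumed to come from a prior vmm_gpt_walk() of a populated region; `pfn`,
 * `prot`, and `attr` are assumed to be caller-provided.
 *
 *        vmm_gpt_entry_t *entries[MAX_GPT_LEVEL];
 *
 *        (void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
 *        if (!vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr)) {
 *                ... entry already held the same PFN; nothing more to do ...
 *        }
 */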

/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [addr, addr + len). The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
        ASSERT0(addr & PAGEOFFSET);
        ASSERT0(len & PAGEOFFSET);

        const uint64_t end = addr + len;
        vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
                [LEVEL4] = gpt->vgpt_root,
        };

        for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
                node = vmm_gpt_node_find_child(starts[lvl], addr);
                if (node == NULL) {
                        break;
                }
                starts[lvl + 1] = node;
        }

        /*
         * Starting at the bottom of the tree, ensure that PTEs for pages have
         * been cleared for the region, and remove the corresponding reference
         * counts from the containing LEVEL1 tables.
         */
        uint64_t gpa = addr;
        node = starts[LEVEL1];
        while (gpa < end && node != NULL) {
                const uint16_t covered =
                    vmm_gpt_node_entries_covered(node, addr, end);

                ASSERT3U(node->vgn_ref_cnt, >=, covered);
                node->vgn_ref_cnt -= covered;

                node = vmm_gpt_node_next(node, false);
                if (node != NULL) {
                        gpa = node->vgn_gpa;
                }
        }

        /*
         * With the page PTE references eliminated, work up from the bottom of
         * the table, removing nodes which have no remaining references.
         *
         * This stops short of LEVEL4, which is the root table of the GPT. It
         * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
         */
        for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
                gpa = addr;
                node = starts[lvl];

                while (gpa < end && node != NULL) {
                        vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);

                        if (node->vgn_ref_cnt == 0) {
                                vmm_gpt_node_remove(node);
                        }
                        if (next != NULL) {
                                gpa = next->vgn_gpa;
                        }
                        node = next;
                }
        }
}

/*
 * Remove a mapping from the table. Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
        vmm_gpt_entry_t *entries[MAX_GPT_LEVEL], pte;

        ASSERT(gpt != NULL);
        (void) vmm_gpt_walk(gpt, gpa, entries, LEVEL1);
        if (entries[LEVEL1] == NULL)
                return (false);

        pte = *entries[LEVEL1];
        *entries[LEVEL1] = 0;
        return (gpt->vgpt_pte_ops->vpeo_pte_is_present(pte));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [addr, addr + len). Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
        ASSERT0(addr & PAGEOFFSET);
        ASSERT0(len & PAGEOFFSET);

        vmm_gpt_iter_t state;
        vmm_gpt_iter_entry_t entry;
        size_t num_unmapped = 0;

        vmm_gpt_iter_init(&state, gpt, addr, len);
        while (vmm_gpt_iter_next(&state, &entry)) {
                if (entry.vgie_ptep == NULL) {
                        continue;
                }

                const vmm_gpt_entry_t pte = *entry.vgie_ptep;
                *entry.vgie_ptep = 0;
                if (gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
                        num_unmapped++;
                }
        }

        return (num_unmapped);
}

/*
 * Returns a value indicating whether or not this GPT maps the given GPA, as
 * referenced by the leaf PTE pointer `ptep` (typically obtained via
 * vmm_gpt_walk()). If the GPA is mapped, *pfnp and *protp will be filled with
 * the PFN and protection bits of the entry. Otherwise, they will be ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, pfn_t *pfnp,
    uint_t *protp)
{
        vmm_gpt_entry_t pte;

        ASSERT(pfnp != NULL);
        ASSERT(protp != NULL);

        if (ptep == NULL) {
                return (false);
        }
        pte = *ptep;
        if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(pte)) {
                return (false);
        }
        *pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(pte);
        *protp = gpt->vgpt_pte_ops->vpeo_pte_prot(pte);
        return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `ptep`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, bool on)
{
        ASSERT(ptep != NULL);
        return (gpt->vgpt_pte_ops->vpeo_reset_accessed(ptep, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `ptep`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, bool on)
{
        ASSERT(ptep != NULL);
        return (gpt->vgpt_pte_ops->vpeo_reset_dirty(ptep, on));
}

/*
 * Query state from PTE pointed to by `ptep`.
 */
bool
vmm_gpt_query(vmm_gpt_t *gpt, vmm_gpt_entry_t *ptep, vmm_gpt_query_t query)
{
        ASSERT(ptep != NULL);
        return (gpt->vgpt_pte_ops->vpeo_query(ptep, query));
}

/*
 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
        const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
        return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}

/*
 * Does the GPT hardware support dirty-page-tracking?
 */
bool
vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
{
        return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
}