/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/panic.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/hat_i86.h>

#include <sys/vmm_gpt.h>

/*
 * VMM Generic Page Tables
 *
 * Bhyve runs on AMD and Intel hosts and both support nested page tables
 * describing the guest's physical address space. But the two use different and
 * mutually incompatible page table formats: Intel uses the EPT, which is based
 * on the Itanium page table format, while AMD uses the nPT, which is based on
 * the x86_64 page table format.
 *
 * The GPT abstracts these format differences, and provides a single interface
 * for interacting with either kind of table structure.
 *
 * At a high level, the GPT is a tree that mirrors the paging table radix tree.
 * It is parameterized with operations on PTEs that are specific to the table
 * type (EPT or nPT) and also keeps track of how many pages the table maps, as
 * well as a pointer to the root node in the tree.
 *
 * A node in the GPT keeps pointers to its parent (NULL for the root), its
 * left-most child, and its siblings. The node understands its position in the
 * tree in terms of the level it appears at and the index it occupies at its
 * parent's level, as well as how many children it has. It also owns the
 * physical memory page for the hardware page table entries that map its
 * children. Thus, for a node at any given level in the tree, the nested PTE
 * for that node's child at index $i$ is the i'th uint64_t in that node's entry
 * page, and the entry page is part of the paging structure consumed by
 * hardware.
 *
 * The GPT interface provides functions for populating and vacating the tree
 * for regions in the guest physical address space, and for mapping and
 * unmapping pages in populated regions. Users must populate a region before
 * mapping pages into it, and must unmap pages before vacating the region.
 *
 * The interface also exposes a function for walking the table from the root to
 * a leaf entry, populating an array of pointers to PTEs. This walk uses the
 * hardware page structure itself, and is thus fast, though as a result it
 * potentially aliases entries; caveat emptor. The walk primitive is used for
 * mapping, unmapping, and lookups.
 *
 * Format-specific differences are abstracted by parameterizing the GPT with a
 * set of PTE operations specific to the platform. The GPT code makes use of
 * these when mapping or populating entries, resetting accessed and dirty bits
 * on entries, and similar operations.
 */

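/*
 * As a worked illustration of the radix decomposition described above: each
 * level consumes one 9-bit slice of the GPA (see vmm_gpt_lvl_index() below).
 * For example, the GPA 0x8040201000 selects index 1 at every level:
 *
 *	(0x8040201000 >> 39) & 0x1ff == 1	LEVEL4 (512G per entry)
 *	(0x8040201000 >> 30) & 0x1ff == 1	LEVEL3 (1G per entry)
 *	(0x8040201000 >> 21) & 0x1ff == 1	LEVEL2 (2M per entry)
 *	(0x8040201000 >> 12) & 0x1ff == 1	LEVEL1 (4K per entry)
 */
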
/*
 * A GPT node.
 *
 * Each node contains pointers to its parent, its left-most child, and its
 * siblings. Interior nodes also maintain a reference count, and each node
 * contains its level and index in its parent's table. Finally, each node
 * contains the host PFN of the page that it links into the page table, as
 * well as a kernel pointer to the table.
 *
 * On leaf nodes, the reference count tracks how many entries in the table are
 * covered by mappings from the containing vmspace. This is maintained during
 * calls to vmm_gpt_populate_region() and vmm_gpt_vacate_region() as part of
 * vmspace map/unmap operations, rather than in the data path of faults
 * populating the PTEs themselves.
 *
 * Note, this is carefully sized to fit exactly into a 64-byte cache line.
 */
typedef struct vmm_gpt_node vmm_gpt_node_t;
struct vmm_gpt_node {
	uint64_t	vgn_host_pfn;
	uint16_t	vgn_level;
	uint16_t	vgn_index;
	uint32_t	vgn_ref_cnt;
	vmm_gpt_node_t	*vgn_parent;
	vmm_gpt_node_t	*vgn_children;
	vmm_gpt_node_t	*vgn_sib_next;
	vmm_gpt_node_t	*vgn_sib_prev;
	uint64_t	*vgn_entries;
	uint64_t	vgn_gpa;
};

/* Maximum node index determined by number of entries in page table (512) */
#define	PTE_PER_TABLE	512
#define	MAX_NODE_IDX	(PTE_PER_TABLE - 1)

/*
 * A VMM Generic Page Table.
 *
 * The generic page table is a format-agnostic, 4-level paging structure
 * modeling a second-level page table (EPT on Intel, nPT on AMD). It
 * contains a counter of pages the table maps, a pointer to the root node
 * in the table, and is parameterized with a set of PTE operations specific
 * to the table type.
 */
struct vmm_gpt {
	vmm_gpt_node_t	*vgpt_root;
	vmm_pte_ops_t	*vgpt_pte_ops;
};

/*
 * Allocates a vmm_gpt_node_t structure with a corresponding page of memory to
 * hold the PTEs it contains.
 */
static vmm_gpt_node_t *
vmm_gpt_node_alloc(void)
{
	vmm_gpt_node_t *node;
	caddr_t page;

	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	/*
	 * Note: despite the man page, allocating PAGESIZE bytes is
	 * guaranteed to be page-aligned.
	 */
	page = kmem_zalloc(PAGESIZE, KM_SLEEP);
	node->vgn_entries = (uint64_t *)page;
	node->vgn_host_pfn = hat_getpfnum(kas.a_hat, page);

	return (node);
}

/*
 * Allocates and initializes a vmm_gpt_t.
 */
vmm_gpt_t *
vmm_gpt_alloc(vmm_pte_ops_t *pte_ops)
{
	vmm_gpt_t *gpt;

	VERIFY(pte_ops != NULL);
	gpt = kmem_zalloc(sizeof (*gpt), KM_SLEEP);
	gpt->vgpt_pte_ops = pte_ops;
	gpt->vgpt_root = vmm_gpt_node_alloc();

	return (gpt);
}

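/*
 * An illustrative sketch of the intended call sequence (the pte_ops pointer
 * and the pfn/prot/attr values are placeholders supplied by the caller):
 *
 *	vmm_gpt_t *gpt = vmm_gpt_alloc(pte_ops);
 *
 *	vmm_gpt_populate_region(gpt, gpa, len);
 *	(void) vmm_gpt_map(gpt, gpa, pfn, prot, attr);
 *	...
 *	(void) vmm_gpt_unmap_region(gpt, gpa, len);
 *	vmm_gpt_vacate_region(gpt, gpa, len);
 *	vmm_gpt_free(gpt);
 *
 * As noted above, pages must be unmapped before their region is vacated;
 * vmm_gpt_free() will vacate any tables which remain at that point.
 */
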
/*
 * Frees a given node. The node is expected to have no familial (parent,
 * children, siblings) associations at this point. Accordingly, its reference
 * count should be zero.
 */
static void
vmm_gpt_node_free(vmm_gpt_node_t *node)
{
	ASSERT(node != NULL);
	ASSERT3U(node->vgn_ref_cnt, ==, 0);
	ASSERT(node->vgn_host_pfn != PFN_INVALID);
	ASSERT(node->vgn_entries != NULL);
	ASSERT(node->vgn_parent == NULL);

	kmem_free(node->vgn_entries, PAGESIZE);
	kmem_free(node, sizeof (*node));
}

/*
 * Frees a vmm_gpt_t. Any lingering nodes in the GPT will be freed too.
 */
void
vmm_gpt_free(vmm_gpt_t *gpt)
{
	/* Empty anything remaining in the tree */
	vmm_gpt_vacate_region(gpt, 0, UINT64_MAX & PAGEMASK);

	VERIFY(gpt->vgpt_root != NULL);
	VERIFY3U(gpt->vgpt_root->vgn_ref_cnt, ==, 0);

	vmm_gpt_node_free(gpt->vgpt_root);
	kmem_free(gpt, sizeof (*gpt));
}

/*
 * Given a GPA, return its corresponding index in a paging structure at the
 * provided level.
 */
static inline uint16_t
vmm_gpt_lvl_index(vmm_gpt_node_level_t level, uint64_t gpa)
{
	ASSERT(level < MAX_GPT_LEVEL);

	const uint16_t mask = (1U << 9) - 1;
	switch (level) {
	case LEVEL4: return ((gpa >> 39) & mask);
	case LEVEL3: return ((gpa >> 30) & mask);
	case LEVEL2: return ((gpa >> 21) & mask);
	case LEVEL1: return ((gpa >> 12) & mask);
	default:
		panic("impossible level value");
	};
}

/* Get mask for addresses of entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_mask(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0xffffff8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0xffffffffc0000000ul);	/* entries cover 1G */
	case LEVEL2: return (0xffffffffffe00000ul);	/* entries cover 2M */
	case LEVEL1: return (0xfffffffffffff000ul);	/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/* Get length of GPA covered by entries at a given table level. */
static inline uint64_t
vmm_gpt_lvl_len(vmm_gpt_node_level_t level)
{
	ASSERT(level < MAX_GPT_LEVEL);

	switch (level) {
	case LEVEL4: return (0x8000000000ul);	/* entries cover 512G */
	case LEVEL3: return (0x40000000ul);	/* entries cover 1G */
	case LEVEL2: return (0x200000ul);	/* entries cover 2M */
	case LEVEL1: return (0x1000ul);		/* entries cover 4K */
	default:
		panic("impossible level value");
	};
}

/*
 * Get the ending GPA which this node could possibly cover given its base
 * address and level.
 */
static inline uint64_t
vmm_gpt_node_end(vmm_gpt_node_t *node)
{
	ASSERT(node->vgn_level > LEVEL4);
	return (node->vgn_gpa + vmm_gpt_lvl_len(node->vgn_level - 1));
}

/*
 * Is this node the last entry in its parent node, based solely on its GPA?
 */
static inline bool
vmm_gpt_node_is_last(vmm_gpt_node_t *node)
{
	return (node->vgn_index == MAX_NODE_IDX);
}

/*
 * How many table entries (if any) in this node are covered by the range of
 * [start, end).
 */
static uint16_t
vmm_gpt_node_entries_covered(vmm_gpt_node_t *node, uint64_t start, uint64_t end)
{
	const uint64_t node_end = vmm_gpt_node_end(node);

	/* Is this node covered at all by the region? */
	if (start >= node_end || end <= node->vgn_gpa) {
		return (0);
	}

	const uint64_t mask = vmm_gpt_lvl_mask(node->vgn_level);
	const uint64_t covered_start = MAX(node->vgn_gpa, start & mask);
	const uint64_t covered_end = MIN(node_end, end & mask);
	const uint64_t per_entry = vmm_gpt_lvl_len(node->vgn_level);

	return ((covered_end - covered_start) / per_entry);
}

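/*
 * For example, consider a LEVEL1 node with vgn_gpa == 0x200000, covering the
 * 2M range [0x200000, 0x400000) with 512 4K entries.  For the region
 * [0x250000, 0x300000), vmm_gpt_node_entries_covered() clamps the region to
 * the node, giving covered_start == 0x250000 and covered_end == 0x300000, and
 * reports (0x300000 - 0x250000) / 0x1000 == 176 entries covered.
 */
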
/*
 * Find the next node (by address) in the tree at the same level.
 *
 * Returns NULL if this is the last node in the tree or if `only_seq` was true
 * and there is an address gap between this node and the next.
 */
static vmm_gpt_node_t *
vmm_gpt_node_next(vmm_gpt_node_t *node, bool only_seq)
{
	ASSERT3P(node->vgn_parent, !=, NULL);
	ASSERT3U(node->vgn_level, >, LEVEL4);

	/*
	 * Next node sequentially would be the one at the address starting at
	 * the end of what is covered by this node.
	 */
	const uint64_t gpa_match = vmm_gpt_node_end(node);

	/* Try our next sibling */
	vmm_gpt_node_t *next = node->vgn_sib_next;
	if (next != NULL) {
		if (next->vgn_gpa == gpa_match || !only_seq) {
			return (next);
		}
	} else {
		/*
		 * If the next-sibling pointer is NULL on the node, it can mean
		 * one of two things:
		 *
		 * 1. This entry represents the space leading up to the
		 *    trailing boundary of what this node covers.
		 *
		 * 2. The node is not entirely populated, and there is a gap
		 *    between the last populated entry, and the trailing
		 *    boundary of the node.
		 *
		 * Either way, the proper course of action is to check the
		 * first child of our parent's next sibling.
		 */
		vmm_gpt_node_t *pibling = node->vgn_parent->vgn_sib_next;
		if (pibling != NULL) {
			next = pibling->vgn_children;
			if (next != NULL) {
				if (next->vgn_gpa == gpa_match || !only_seq) {
					return (next);
				}
			}
		}
	}

	return (NULL);
}

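/*
 * To illustrate `only_seq`: if two LEVEL1 nodes exist under the same parent
 * for the 2M ranges starting at 0x200000 and 0x600000, then calling
 * vmm_gpt_node_next() on the first with only_seq == true returns NULL, since
 * its next sibling does not begin at gpa_match == 0x400000.  With
 * only_seq == false, the sibling at 0x600000 is returned despite the gap.
 */
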
/*
 * Finds the child for the given GPA in the given parent node.
 * Returns a pointer to node, or NULL if it is not found.
 */
static vmm_gpt_node_t *
vmm_gpt_node_find_child(vmm_gpt_node_t *parent, uint64_t gpa)
{
	const uint16_t index = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	for (vmm_gpt_node_t *child = parent->vgn_children;
	    child != NULL && child->vgn_index <= index;
	    child = child->vgn_sib_next) {
		if (child->vgn_index == index)
			return (child);
	}

	return (NULL);
}

/*
 * Add a child node to the GPT at a position determined by GPA, parent, and (if
 * present) preceding sibling.
 *
 * If the `parent` node contains any children, `prev_sibling` must be populated
 * with a pointer to the node preceding (by GPA) the to-be-added child node.
 */
static void
vmm_gpt_node_add(vmm_gpt_t *gpt, vmm_gpt_node_t *parent,
    vmm_gpt_node_t *child, uint64_t gpa, vmm_gpt_node_t *prev_sibling)
{
	ASSERT3U(parent->vgn_level, <, LEVEL1);
	ASSERT3U(child->vgn_parent, ==, NULL);

	const uint16_t idx = vmm_gpt_lvl_index(parent->vgn_level, gpa);
	child->vgn_index = idx;
	child->vgn_level = parent->vgn_level + 1;
	child->vgn_gpa = gpa & vmm_gpt_lvl_mask(parent->vgn_level);

	/* Establish familial connections */
	child->vgn_parent = parent;
	if (prev_sibling != NULL) {
		ASSERT3U(prev_sibling->vgn_gpa, <, child->vgn_gpa);

		child->vgn_sib_next = prev_sibling->vgn_sib_next;
		if (child->vgn_sib_next != NULL) {
			child->vgn_sib_next->vgn_sib_prev = child;
		}
		child->vgn_sib_prev = prev_sibling;
		prev_sibling->vgn_sib_next = child;
	} else if (parent->vgn_children != NULL) {
		vmm_gpt_node_t *next_sibling = parent->vgn_children;

		ASSERT3U(next_sibling->vgn_gpa, >, child->vgn_gpa);
		ASSERT3U(next_sibling->vgn_sib_prev, ==, NULL);

		child->vgn_sib_next = next_sibling;
		child->vgn_sib_prev = NULL;
		next_sibling->vgn_sib_prev = child;
		parent->vgn_children = child;
	} else {
		parent->vgn_children = child;
		child->vgn_sib_next = NULL;
		child->vgn_sib_prev = NULL;
	}

	/* Configure PTE for child table */
	parent->vgn_entries[idx] =
	    gpt->vgpt_pte_ops->vpeo_map_table(child->vgn_host_pfn);
	parent->vgn_ref_cnt++;
}

/*
 * Remove a child node from its relatives (parent, siblings) and free it.
 */
static void
vmm_gpt_node_remove(vmm_gpt_node_t *child)
{
	ASSERT3P(child->vgn_children, ==, NULL);
	ASSERT3U(child->vgn_ref_cnt, ==, 0);
	ASSERT3P(child->vgn_parent, !=, NULL);

	/* Unlink child from its siblings and parent */
	vmm_gpt_node_t *parent = child->vgn_parent;
	vmm_gpt_node_t *prev = child->vgn_sib_prev;
	vmm_gpt_node_t *next = child->vgn_sib_next;
	if (prev != NULL) {
		ASSERT3P(prev->vgn_sib_next, ==, child);
		prev->vgn_sib_next = next;
	}
	if (next != NULL) {
		ASSERT3P(next->vgn_sib_prev, ==, child);
		next->vgn_sib_prev = prev;
	}
	if (prev == NULL) {
		ASSERT3P(parent->vgn_children, ==, child);
		parent->vgn_children = next;
	}
	child->vgn_parent = NULL;
	child->vgn_sib_next = NULL;
	child->vgn_sib_prev = NULL;
	parent->vgn_entries[child->vgn_index] = 0;
	parent->vgn_ref_cnt--;

	vmm_gpt_node_free(child);
}

/*
 * Walks the GPT for the given GPA, accumulating entries to the given depth.
 * If the walk terminates before the depth is reached, the remaining entries
 * are written with NULLs.
 */
void
vmm_gpt_walk(vmm_gpt_t *gpt, uint64_t gpa, uint64_t **entries,
    vmm_gpt_node_level_t depth)
{
	uint64_t *current_entries, entry;
	pfn_t pfn;

	ASSERT(gpt != NULL);
	current_entries = gpt->vgpt_root->vgn_entries;
	for (uint_t i = 0; i < depth; i++) {
		if (current_entries == NULL) {
			entries[i] = NULL;
			continue;
		}
		entries[i] = &current_entries[vmm_gpt_lvl_index(i, gpa)];
		entry = *entries[i];
		if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
			current_entries = NULL;
			continue;
		}
		pfn = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
		current_entries = (uint64_t *)hat_kpm_pfn2va(pfn);
	}
}

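/*
 * A typical use of the walk primitive (an illustrative sketch): each level
 * the walk reaches gets a pointer into the live table, while levels past the
 * first non-present entry are written as NULL.  A caller interested in the 4K
 * leaf PTE might do something like:
 *
 *	uint64_t *entries[MAX_GPT_LEVEL];
 *	pfn_t pfn;
 *	uint_t prot;
 *
 *	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
 *	if (vmm_gpt_is_mapped(gpt, entries[LEVEL1], &pfn, &prot)) {
 *		... `gpa` is backed by host page `pfn` with `prot` ...
 *	}
 *
 * (See vmm_gpt_is_mapped() below.)
 */
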
/*
 * Looks up an entry given GPA.
 */
uint64_t *
vmm_gpt_lookup(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);

	return (entries[LEVEL1]);
}

/*
 * Populate child table nodes for a given level between the provided interval
 * of [addr, addr + len).  Caller is expected to provide a pointer to the
 * parent node which would contain the child node for GPA at `addr`.  A
 * pointer to said child node will be returned when the operation is complete.
 */
static vmm_gpt_node_t *
vmm_gpt_populate_region_lvl(vmm_gpt_t *gpt, uint64_t addr, uint64_t len,
    vmm_gpt_node_t *node_start)
{
	const vmm_gpt_node_level_t lvl = node_start->vgn_level;
	const uint64_t end = addr + len;
	const uint64_t incr = vmm_gpt_lvl_len(lvl);
	uint64_t gpa = addr & vmm_gpt_lvl_mask(lvl);
	vmm_gpt_node_t *parent = node_start;

	/* Try to locate node at starting address */
	vmm_gpt_node_t *prev = NULL, *node = parent->vgn_children;
	while (node != NULL && node->vgn_gpa < gpa) {
		prev = node;
		node = node->vgn_sib_next;
	}

	/*
	 * If no node exists at the starting address, create one and link it
	 * into the parent.
	 */
	if (node == NULL || node->vgn_gpa > gpa) {
		/* Need to insert node for starting GPA */
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	vmm_gpt_node_t *front_node = node;
	prev = node;
	gpa += incr;

	/*
	 * With a node at the starting address, walk forward creating nodes in
	 * any of the gaps.
	 */
	for (; gpa < end; gpa += incr, prev = node) {
		node = vmm_gpt_node_next(prev, true);
		if (node != NULL) {
			ASSERT3U(node->vgn_gpa, ==, gpa);

			/* We may have crossed into a new parent */
			parent = node->vgn_parent;
			continue;
		}

		if (vmm_gpt_node_is_last(prev)) {
			/*
			 * The node preceding this was the last one in its
			 * containing parent, so move on to that parent's
			 * sibling.  We expect (demand) that it exist already.
			 */
			parent = vmm_gpt_node_next(parent, true);
			ASSERT(parent != NULL);

			/*
			 * Forget our previous sibling, since it is of no use
			 * for assigning the new node to the now-different
			 * parent.
			 */
			prev = NULL;
		}
		node = vmm_gpt_node_alloc();
		vmm_gpt_node_add(gpt, parent, node, gpa, prev);
	}

	return (front_node);
}

/*
 * Ensures that PTEs for the region of address space bounded by
 * [addr, addr + len) exist in the tree.
 */
void
vmm_gpt_populate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	/*
	 * Starting at the top of the tree, ensure that tables covering the
	 * requested region exist at each level.
	 */
	vmm_gpt_node_t *node = gpt->vgpt_root;
	for (uint_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		ASSERT3U(node->vgn_level, ==, lvl);

		node = vmm_gpt_populate_region_lvl(gpt, addr, len, node);
	}

	/*
	 * Establish reference counts for the soon-to-be memory PTEs which
	 * will be filling these LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	const uint64_t end = addr + len;
	while (gpa < end) {
		ASSERT(node != NULL);
		ASSERT3U(node->vgn_level, ==, LEVEL1);

		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT(covered != 0);
		ASSERT3U(node->vgn_ref_cnt, <, PTE_PER_TABLE);
		ASSERT3U(node->vgn_ref_cnt + covered, <=, PTE_PER_TABLE);

		node->vgn_ref_cnt += covered;

		vmm_gpt_node_t *next = vmm_gpt_node_next(node, true);
		if (next != NULL) {
			gpa = next->vgn_gpa;
			node = next;
		} else {
			/*
			 * We do not expect to find a subsequent node after
			 * filling the last node in the table, completing PTE
			 * accounting for the specified range.
			 */
			VERIFY3U(end, <=, vmm_gpt_node_end(node));
			break;
		}
	}
}

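/*
 * As a concrete illustration: populating the single 2M region
 * [0x200000, 0x400000) in an empty GPT creates one LEVEL3, one LEVEL2, and
 * one LEVEL1 node (the LEVEL4 root already exists), and leaves the LEVEL1
 * node with vgn_ref_cnt == 512, one reference for each 4K PTE which a later
 * vmm_gpt_map() call may fill in.
 */
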
/*
 * Format a PTE and install it in the provided PTE-pointer.
 */
bool
vmm_gpt_map_at(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t pfn, uint_t prot,
    uint8_t attr)
{
	uint64_t entry, old_entry;

	entry = gpt->vgpt_pte_ops->vpeo_map_page(pfn, prot, attr);
	old_entry = atomic_cas_64(ptep, 0, entry);
	if (old_entry != 0) {
		ASSERT3U(gpt->vgpt_pte_ops->vpeo_pte_pfn(entry), ==,
		    gpt->vgpt_pte_ops->vpeo_pte_pfn(old_entry));
		return (false);
	}

	return (true);
}

/*
 * Inserts an entry for a given GPA into the table.  The caller must
 * ensure that a conflicting PFN is not mapped at the requested location.
 * Racing operations to map the same PFN at one location are acceptable and
 * properly handled.
 */
bool
vmm_gpt_map(vmm_gpt_t *gpt, uint64_t gpa, pfn_t pfn, uint_t prot, uint8_t attr)
{
	uint64_t *entries[MAX_GPT_LEVEL];

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	ASSERT(entries[LEVEL1] != NULL);

	return (vmm_gpt_map_at(gpt, entries[LEVEL1], pfn, prot, attr));
}

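/*
 * The compare-and-swap in vmm_gpt_map_at() is what makes the "racing
 * operations to map the same PFN" case described above safe without a lock.
 * As an illustration, if two vCPUs fault on the same GPA and both call
 * vmm_gpt_map(gpt, gpa, pfn, prot, attr):
 *
 *	vCPU A: atomic_cas_64(ptep, 0, entry) observes 0, installs the entry,
 *		and vmm_gpt_map() returns true.
 *	vCPU B: atomic_cas_64(ptep, 0, entry) observes A's entry, leaves it in
 *		place, and vmm_gpt_map() returns false (the same PFN is
 *		asserted).
 */
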
/*
 * Cleans up the unused inner nodes in the GPT for a region of guest physical
 * address space of [addr, addr + len).  The region must map no pages.
 */
void
vmm_gpt_vacate_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	vmm_gpt_node_t *node, *starts[MAX_GPT_LEVEL] = {
		[LEVEL4] = gpt->vgpt_root,
	};

	for (vmm_gpt_node_level_t lvl = LEVEL4; lvl < LEVEL1; lvl++) {
		node = vmm_gpt_node_find_child(starts[lvl], addr);
		if (node == NULL) {
			break;
		}
		starts[lvl + 1] = node;
	}

	/*
	 * Starting at the bottom of the tree, ensure that PTEs for pages have
	 * been cleared for the region, and remove the corresponding reference
	 * counts from the containing LEVEL1 tables.
	 */
	uint64_t gpa = addr;
	node = starts[LEVEL1];
	while (gpa < end && node != NULL) {
		const uint16_t covered =
		    vmm_gpt_node_entries_covered(node, addr, end);

		ASSERT3U(node->vgn_ref_cnt, >=, covered);
		node->vgn_ref_cnt -= covered;

		node = vmm_gpt_node_next(node, false);
		if (node != NULL) {
			gpa = node->vgn_gpa;
		}
	}

	/*
	 * With the page PTE references eliminated, work up from the bottom of
	 * the table, removing nodes which have no remaining references.
	 *
	 * This stops short of LEVEL4, which is the root table of the GPT.  It
	 * is left standing to be cleaned up when the vmm_gpt_t is destroyed.
	 */
	for (vmm_gpt_node_level_t lvl = LEVEL1; lvl > LEVEL4; lvl--) {
		gpa = addr;
		node = starts[lvl];

		while (gpa < end && node != NULL) {
			vmm_gpt_node_t *next = vmm_gpt_node_next(node, false);

			if (node->vgn_ref_cnt == 0) {
				vmm_gpt_node_remove(node);
			}
			if (next != NULL) {
				gpa = next->vgn_gpa;
			}
			node = next;
		}
	}
}

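/*
 * Continuing the earlier 2M example: once every page in [0x200000, 0x400000)
 * has been unmapped, vacating that same region drops the LEVEL1 node's
 * vgn_ref_cnt back to 0, so the bottom-up pass removes it.  If that removal
 * leaves its LEVEL2 parent with no remaining references, the parent is
 * removed as well, and likewise for LEVEL3; only the LEVEL4 root is left for
 * vmm_gpt_free() to reclaim.
 */
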
/*
 * Remove a mapping from the table.  Returns false if the page was not mapped,
 * otherwise returns true.
 */
bool
vmm_gpt_unmap(vmm_gpt_t *gpt, uint64_t gpa)
{
	uint64_t *entries[MAX_GPT_LEVEL], entry;

	ASSERT(gpt != NULL);
	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
	if (entries[LEVEL1] == NULL)
		return (false);

	entry = *entries[LEVEL1];
	*entries[LEVEL1] = 0;
	return (gpt->vgpt_pte_ops->vpeo_pte_is_present(entry));
}

/*
 * Un-maps the region of guest physical address space bounded by
 * [addr, addr + len).  Returns the number of pages that are unmapped.
 */
size_t
vmm_gpt_unmap_region(vmm_gpt_t *gpt, uint64_t addr, uint64_t len)
{
	ASSERT0(addr & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	const uint64_t end = addr + len;
	size_t num_unmapped = 0;
	for (uint64_t gpa = addr; gpa < end; gpa += PAGESIZE) {
		if (vmm_gpt_unmap(gpt, gpa) != 0) {
			num_unmapped++;
		}
	}

	return (num_unmapped);
}

/*
 * Returns a value indicating whether or not this GPT maps the given
 * GPA.  If the GPA is mapped, *protp will be filled with the protection
 * bits of the entry.  Otherwise, it will be ignored.
 */
bool
vmm_gpt_is_mapped(vmm_gpt_t *gpt, uint64_t *ptep, pfn_t *pfnp, uint_t *protp)
{
	uint64_t entry;

	ASSERT(pfnp != NULL);
	ASSERT(protp != NULL);

	if (ptep == NULL) {
		return (false);
	}
	entry = *ptep;
	if (!gpt->vgpt_pte_ops->vpeo_pte_is_present(entry)) {
		return (false);
	}
	*pfnp = gpt->vgpt_pte_ops->vpeo_pte_pfn(entry);
	*protp = gpt->vgpt_pte_ops->vpeo_pte_prot(entry);
	return (true);
}

/*
 * Resets the accessed bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_accessed(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_accessed(entry, on));
}

/*
 * Resets the dirty bit on the page table entry pointed to by `entry`.
 * If `on` is true, the bit will be set, otherwise it will be cleared.
 * The old value of the bit is returned.
 */
uint_t
vmm_gpt_reset_dirty(vmm_gpt_t *gpt, uint64_t *entry, bool on)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_reset_dirty(entry, on));
}

/*
 * Query state from PTE pointed to by `entry`.
 */
bool
vmm_gpt_query(vmm_gpt_t *gpt, uint64_t *entry, vmm_gpt_query_t query)
{
	ASSERT(entry != NULL);
	return (gpt->vgpt_pte_ops->vpeo_query(entry, query));
}

/*
 * Get properly formatted PML4 (EPTP/nCR3) for GPT.
 */
uint64_t
vmm_gpt_get_pmtp(vmm_gpt_t *gpt, bool track_dirty)
{
	const pfn_t root_pfn = gpt->vgpt_root->vgn_host_pfn;
	return (gpt->vgpt_pte_ops->vpeo_get_pmtp(root_pfn, track_dirty));
}

/*
 * Does the GPT hardware support dirty-page-tracking?
 */
bool
vmm_gpt_can_track_dirty(vmm_gpt_t *gpt)
{
	return (gpt->vgpt_pte_ops->vpeo_hw_ad_supported());
}

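/*
 * When vmm_gpt_can_track_dirty() reports hardware A/D support, a caller can
 * harvest and clear dirty state one page at a time.  A hypothetical sketch,
 * assuming the PTE for `gpa` is present and the surrounding iteration and
 * bitmap bookkeeping are the caller's own:
 *
 *	uint64_t *ptep = vmm_gpt_lookup(gpt, gpa);
 *	if (ptep != NULL && vmm_gpt_reset_dirty(gpt, ptep, false) != 0) {
 *		... record `gpa` as dirtied since the last harvest ...
 *	}
 */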