// SPDX-License-Identifier: GPL-2.0-only
/*
 * tools/testing/selftests/kvm/lib/x86_64/processor.c
 *
 * Copyright (C) 2018, Google LLC.
 */

#include "linux/bitmap.h"
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include "sev.h"

#ifndef NUM_INTERRUPTS
#define NUM_INTERRUPTS 256
#endif

#define KERNEL_CS	0x8
#define KERNEL_DS	0x10
#define KERNEL_TSS	0x18

vm_vaddr_t exception_handlers;
bool host_cpu_is_amd;
bool host_cpu_is_intel;
bool is_forced_emulation_enabled;
uint64_t guest_tsc_khz;

static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
{
	fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
		"rcx: 0x%.16llx rdx: 0x%.16llx\n",
		indent, "",
		regs->rax, regs->rbx, regs->rcx, regs->rdx);
	fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
		"rsp: 0x%.16llx rbp: 0x%.16llx\n",
		indent, "",
		regs->rsi, regs->rdi, regs->rsp, regs->rbp);
	fprintf(stream, "%*sr8:  0x%.16llx r9:  0x%.16llx "
		"r10: 0x%.16llx r11: 0x%.16llx\n",
		indent, "",
		regs->r8, regs->r9, regs->r10, regs->r11);
	fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
		"r14: 0x%.16llx r15: 0x%.16llx\n",
		indent, "",
		regs->r12, regs->r13, regs->r14, regs->r15);
	fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
		indent, "",
		regs->rip, regs->rflags);
}

static void segment_dump(FILE *stream, struct kvm_segment *segment,
			 uint8_t indent)
{
	fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
		"selector: 0x%.4x type: 0x%.2x\n",
		indent, "", segment->base, segment->limit,
		segment->selector, segment->type);
	fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
		"db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
		indent, "", segment->present, segment->dpl,
		segment->db, segment->s, segment->l);
	fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
		"unusable: 0x%.2x padding: 0x%.2x\n",
		indent, "", segment->g, segment->avl,
		segment->unusable, segment->padding);
}

static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
			uint8_t indent)
{
	fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
		"padding: 0x%.4x 0x%.4x 0x%.4x\n",
		indent, "", dtable->base, dtable->limit,
		dtable->padding[0], dtable->padding[1], dtable->padding[2]);
}

static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent)
{
	unsigned int i;

	fprintf(stream, "%*scs:\n", indent, "");
	segment_dump(stream, &sregs->cs, indent + 2);
	fprintf(stream, "%*sds:\n", indent, "");
	segment_dump(stream, &sregs->ds, indent + 2);
	fprintf(stream, "%*ses:\n", indent, "");
	segment_dump(stream, &sregs->es, indent + 2);
	fprintf(stream, "%*sfs:\n", indent, "");
	segment_dump(stream, &sregs->fs, indent + 2);
	fprintf(stream, "%*sgs:\n", indent, "");
	segment_dump(stream, &sregs->gs, indent + 2);
	fprintf(stream, "%*sss:\n", indent, "");
	segment_dump(stream, &sregs->ss, indent + 2);
	fprintf(stream, "%*str:\n", indent, "");
	segment_dump(stream, &sregs->tr, indent + 2);
	fprintf(stream, "%*sldt:\n", indent, "");
	segment_dump(stream, &sregs->ldt, indent + 2);

	fprintf(stream, "%*sgdt:\n", indent, "");
	dtable_dump(stream, &sregs->gdt, indent + 2);
	fprintf(stream, "%*sidt:\n", indent, "");
	dtable_dump(stream, &sregs->idt, indent + 2);

	fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
		"cr3: 0x%.16llx cr4: 0x%.16llx\n",
		indent, "",
		sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
	fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
		"apic_base: 0x%.16llx\n",
		indent, "",
		sregs->cr8, sregs->efer, sregs->apic_base);

	fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
	for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
		fprintf(stream, "%*s%.16llx\n", indent + 2, "",
			sregs->interrupt_bitmap[i]);
	}
}

bool kvm_is_tdp_enabled(void)
{
	if (host_cpu_is_intel)
		return get_kvm_intel_param_bool("ept");
	else
		return get_kvm_amd_param_bool("npt");
}

void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);

	/* If needed, create page map l4 table. */
	if (!vm->pgd_created) {
		vm->pgd = vm_alloc_page_table(vm);
		vm->pgd_created = true;
	}
}

static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
			  uint64_t vaddr, int level)
{
	uint64_t pt_gpa = PTE_GET_PA(*parent_pte);
	uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
	int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;

	TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd,
		    "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
		    level + 1, vaddr);

	return &page_table[index];
}

static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
				       uint64_t *parent_pte,
				       uint64_t vaddr,
				       uint64_t paddr,
				       int current_level,
				       int target_level)
{
	uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);

	paddr = vm_untag_gpa(vm, paddr);

	if (!(*pte & PTE_PRESENT_MASK)) {
		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
		if (current_level == target_level)
			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
		else
			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
	} else {
		/*
		 * Entry already present.  Assert that the caller doesn't want
		 * a hugepage at this level, and that there isn't a hugepage at
		 * this level.
		 */
		TEST_ASSERT(current_level != target_level,
			    "Cannot create hugepage at level: %u, vaddr: 0x%lx",
			    current_level, vaddr);
		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
			    "Cannot create page table at level: %u, vaddr: 0x%lx",
			    current_level, vaddr);
	}
	return pte;
}

void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
{
	const uint64_t pg_size = PG_LEVEL_SIZE(level);
	uint64_t *pml4e, *pdpe, *pde;
	uint64_t *pte;

	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
		    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);

	TEST_ASSERT((vaddr % pg_size) == 0,
		    "Virtual address not aligned,\n"
		    "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size);
	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)),
		    "Invalid virtual address, vaddr: 0x%lx", vaddr);
	TEST_ASSERT((paddr % pg_size) == 0,
		    "Physical address not aligned,\n"
		    "  paddr: 0x%lx page size: 0x%lx", paddr, pg_size);
	TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
		    "Physical address beyond maximum supported,\n"
		    "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
		    paddr, vm->max_gfn, vm->page_size);
	TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr,
		    "Unexpected bits in paddr: %lx", paddr);

	/*
	 * Allocate upper level page tables, if not already present.  Return
	 * early if a hugepage was created.
	 */
	pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level);
	if (*pml4e & PTE_LARGE_MASK)
		return;

	pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level);
	if (*pdpe & PTE_LARGE_MASK)
		return;

	pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level);
	if (*pde & PTE_LARGE_MASK)
		return;

	/* Fill in page table entry. */
	pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
		    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);

	/*
	 * Neither SEV nor TDX supports shared page tables, so only the final
	 * leaf PTE needs to have the C/S-bit set manually.
	 */
	if (vm_is_gpa_protected(vm, paddr))
		*pte |= vm->arch.c_bit;
	else
		*pte |= vm->arch.s_bit;
}

void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
{
	__virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
}

void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
		    uint64_t nr_bytes, int level)
{
	uint64_t pg_size = PG_LEVEL_SIZE(level);
	uint64_t nr_pages = nr_bytes / pg_size;
	int i;

	TEST_ASSERT(nr_bytes % pg_size == 0,
		    "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx",
		    nr_bytes, pg_size);

	for (i = 0; i < nr_pages; i++) {
		__virt_pg_map(vm, vaddr, paddr, level);

		vaddr += pg_size;
		paddr += pg_size;
	}
}

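/*
 * Example (hypothetical test usage, not part of the original library): back a
 * 1 GiB region with 2 MiB hugepages instead of the default 4 KiB mappings.
 * 'gva' and 'gpa' are assumed to be 2 MiB aligned in both address spaces, as
 * enforced by the asserts in __virt_pg_map():
 *
 *	virt_map_level(vm, gva, gpa, 1ULL << 30, PG_LEVEL_2M);
 *
 * A single 4 KiB mapping is simply __virt_pg_map(vm, gva, gpa, PG_LEVEL_4K),
 * which is exactly what virt_arch_pg_map() does.
 */
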
static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
{
	if (*pte & PTE_LARGE_MASK) {
		TEST_ASSERT(*level == PG_LEVEL_NONE ||
			    *level == current_level,
			    "Unexpected hugepage at level %d", current_level);
		*level = current_level;
	}

	return *level == current_level;
}

uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
				    int *level)
{
	uint64_t *pml4e, *pdpe, *pde;

	TEST_ASSERT(!vm->arch.is_pt_protected,
		    "Walking page tables of protected guests is impossible");

	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM,
		    "Invalid PG_LEVEL_* '%d'", *level);

	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
		    (vaddr >> vm->page_shift)),
		    "Invalid virtual address, vaddr: 0x%lx",
		    vaddr);
	/*
	 * Based on the mode check above there are 48 bits in the vaddr, so
	 * shift 16 to sign extend the last bit (bit-47).
	 */
	TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16),
		    "Canonical check failed.  The virtual address is invalid.");

	pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G);
	if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G))
		return pml4e;

	pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G);
	if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G))
		return pdpe;

	pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M);
	if (vm_is_target_pte(pde, level, PG_LEVEL_2M))
		return pde;

	return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
}

uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
{
	int level = PG_LEVEL_4K;

	return __vm_get_page_table_entry(vm, vaddr, &level);
}

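/*
 * Example (hypothetical host-side test code, not part of the original
 * library): locate the leaf PTE for an assumed guest virtual address 'gva'
 * and clear its writable bit so the next guest write takes a #PF:
 *
 *	uint64_t *pte = vm_get_page_table_entry(vm, gva);
 *
 *	*pte &= ~PTE_WRITABLE_MASK;
 *
 * Note that such a modification only takes effect once any stale guest TLB
 * entry for 'gva' is flushed, e.g. by the guest executing INVLPG.
 */
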
void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
	uint64_t *pml4e, *pml4e_start;
	uint64_t *pdpe, *pdpe_start;
	uint64_t *pde, *pde_start;
	uint64_t *pte, *pte_start;

	if (!vm->pgd_created)
		return;

	fprintf(stream, "%*s                                          "
		"                no\n", indent, "");
	fprintf(stream, "%*s      index hvaddr         gpaddr         "
		"addr         w exec dirty\n",
		indent, "");
	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
		pml4e = &pml4e_start[n1];
		if (!(*pml4e & PTE_PRESENT_MASK))
			continue;
		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
			" %u\n",
			indent, "",
			pml4e - pml4e_start, pml4e,
			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));

		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
			pdpe = &pdpe_start[n2];
			if (!(*pdpe & PTE_PRESENT_MASK))
				continue;
			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
				"%u  %u\n",
				indent, "",
				pdpe - pdpe_start, pdpe,
				addr_hva2gpa(vm, pdpe),
				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
				!!(*pdpe & PTE_NX_MASK));

			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
				pde = &pde_start[n3];
				if (!(*pde & PTE_PRESENT_MASK))
					continue;
				fprintf(stream, "%*spde   0x%-3zx %p "
					"0x%-12lx 0x%-10llx %u  %u\n",
					indent, "", pde - pde_start, pde,
					addr_hva2gpa(vm, pde),
					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
					!!(*pde & PTE_NX_MASK));

				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
					pte = &pte_start[n4];
					if (!(*pte & PTE_PRESENT_MASK))
						continue;
					fprintf(stream, "%*spte   0x%-3zx %p "
						"0x%-12lx 0x%-10llx %u  %u "
						"    %u    0x%-10lx\n",
						indent, "",
						pte - pte_start, pte,
						addr_hva2gpa(vm, pte),
						PTE_GET_PFN(*pte),
						!!(*pte & PTE_WRITABLE_MASK),
						!!(*pte & PTE_NX_MASK),
						!!(*pte & PTE_DIRTY_MASK),
						((uint64_t) n1 << 27)
						| ((uint64_t) n2 << 18)
						| ((uint64_t) n3 << 9)
						| ((uint64_t) n4));
				}
			}
		}
	}
}

/*
 * Set Unusable Segment
 *
 * Input Args: None
 *
 * Output Args:
 *   segp - Pointer to segment register
 *
 * Return: None
 *
 * Sets the segment register pointed to by @segp to an unusable state.
 */
static void kvm_seg_set_unusable(struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->unusable = true;
}

static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
{
	void *gdt = addr_gva2hva(vm, vm->arch.gdt);
	struct desc64 *desc = gdt + (segp->selector >> 3) * 8;

	desc->limit0 = segp->limit & 0xFFFF;
	desc->base0 = segp->base & 0xFFFF;
	desc->base1 = segp->base >> 16;
	desc->type = segp->type;
	desc->s = segp->s;
	desc->dpl = segp->dpl;
	desc->p = segp->present;
	desc->limit1 = segp->limit >> 16;
	desc->avl = segp->avl;
	desc->l = segp->l;
	desc->db = segp->db;
	desc->g = segp->g;
	desc->base2 = segp->base >> 24;
	if (!segp->s)
		desc->base3 = segp->base >> 32;
}

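/*
 * Illustrative note (not from the original source): a segment selector indexes
 * the GDT in 8-byte units, which is why the descriptor slot above is computed
 * as (selector >> 3) * 8.  With the selectors defined at the top of this file:
 *
 *	KERNEL_CS  = 0x08 -> GDT index 1, byte offset  8
 *	KERNEL_DS  = 0x10 -> GDT index 2, byte offset 16
 *	KERNEL_TSS = 0x18 -> GDT index 3, byte offset 24
 *
 * The TSS descriptor (segp->s == 0, i.e. a system segment) is 16 bytes wide in
 * long mode, which is why base3 is only written for non-code/data segments.
 */
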
static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->selector = KERNEL_CS;
	segp->limit = 0xFFFFFFFFu;
	segp->s = 0x1; /* kTypeCodeData */
	segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
					  * | kFlagCodeReadable
					  */
	segp->g = true;
	segp->l = true;
	segp->present = 1;
}

static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->selector = KERNEL_DS;
	segp->limit = 0xFFFFFFFFu;
	segp->s = 0x1; /* kTypeCodeData */
	segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
					  * | kFlagDataWritable
					  */
	segp->g = true;
	segp->present = true;
}

vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
{
	int level = PG_LEVEL_NONE;
	uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level);

	TEST_ASSERT(*pte & PTE_PRESENT_MASK,
		    "Leaf PTE not PRESENT for gva: 0x%08lx", gva);

	/*
	 * No need for a hugepage mask on the PTE, x86-64 requires the "unused"
	 * address bits to be zero.
	 */
	return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level));
}

static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp)
{
	memset(segp, 0, sizeof(*segp));
	segp->base = base;
	segp->limit = 0x67;
	segp->selector = KERNEL_TSS;
	segp->type = 0xb;
	segp->present = 1;
}

static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
{
	struct kvm_sregs sregs;

	TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K);

	/* Set mode specific system register values. */
	vcpu_sregs_get(vcpu, &sregs);

	sregs.idt.base = vm->arch.idt;
	sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
	sregs.gdt.base = vm->arch.gdt;
	sregs.gdt.limit = getpagesize() - 1;

	sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
	sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);

	kvm_seg_set_unusable(&sregs.ldt);
	kvm_seg_set_kernel_code_64bit(&sregs.cs);
	kvm_seg_set_kernel_data_64bit(&sregs.ds);
	kvm_seg_set_kernel_data_64bit(&sregs.es);
	kvm_seg_set_kernel_data_64bit(&sregs.gs);
	kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr);

	sregs.cr3 = vm->pgd;
	vcpu_sregs_set(vcpu, &sregs);
}

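/*
 * Illustrative note (not from the original source): the combination programmed
 * above, CR0.PE + CR0.PG, CR4.PAE, and EFER.LME/LMA, drops the vCPU straight
 * into 64-bit long mode with 4-level paging, i.e. the 48-bit virtual address
 * layout that VM_MODE_PXXV48_4K and the page table helpers in this file
 * assume.  EFER.NX is also set so that PTE_NX_MASK is honored instead of
 * faulting as a reserved PTE bit.
 */
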
static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
			  int dpl, unsigned short selector)
{
	struct idt_entry *base =
		(struct idt_entry *)addr_gva2hva(vm, vm->arch.idt);
	struct idt_entry *e = &base[vector];

	memset(e, 0, sizeof(*e));
	e->offset0 = addr;
	e->selector = selector;
	e->ist = 0;
	e->type = 14;
	e->dpl = dpl;
	e->p = 1;
	e->offset1 = addr >> 16;
	e->offset2 = addr >> 32;
}

static bool kvm_fixup_exception(struct ex_regs *regs)
{
	if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10)
		return false;

	if (regs->vector == DE_VECTOR)
		return false;

	regs->rip = regs->r11;
	regs->r9 = regs->vector;
	regs->r10 = regs->error_code;
	return true;
}

void route_exception(struct ex_regs *regs)
{
	typedef void(*handler)(struct ex_regs *);
	handler *handlers = (handler *)exception_handlers;

	if (handlers && handlers[regs->vector]) {
		handlers[regs->vector](regs);
		return;
	}

	if (kvm_fixup_exception(regs))
		return;

	GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'",
		   regs->vector, regs->rip);
}

static void vm_init_descriptor_tables(struct kvm_vm *vm)
{
	extern void *idt_handlers;
	struct kvm_segment seg;
	int i;

	vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
	vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
	vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
	vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);

	/* Handlers have the same address in both address spaces. */
	for (i = 0; i < NUM_INTERRUPTS; i++)
		set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS);

	*(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;

	kvm_seg_set_kernel_code_64bit(&seg);
	kvm_seg_fill_gdt_64bit(vm, &seg);

	kvm_seg_set_kernel_data_64bit(&seg);
	kvm_seg_fill_gdt_64bit(vm, &seg);

	kvm_seg_set_tss_64bit(vm->arch.tss, &seg);
	kvm_seg_fill_gdt_64bit(vm, &seg);
}

void vm_install_exception_handler(struct kvm_vm *vm, int vector,
				  void (*handler)(struct ex_regs *))
{
	vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);

	handlers[vector] = (vm_vaddr_t)handler;
}

void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
{
	struct ucall uc;

	if (get_ucall(vcpu, &uc) == UCALL_ABORT)
		REPORT_GUEST_ASSERT(uc);
}

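/*
 * Example (hypothetical test code, not part of the original library): route a
 * vector to a guest-side handler via vm_install_exception_handler() above.
 * The handler runs inside the guest, so it may only touch guest-visible state;
 * 'ud_count' below is an assumed guest global, readable from the host after a
 * sync_global_from_guest() call.
 *
 *	static uint64_t ud_count;
 *
 *	static void guest_ud_handler(struct ex_regs *regs)
 *	{
 *		ud_count++;
 *		regs->rip += 2;		// skip the 2-byte UD2 that faulted
 *	}
 *
 *	vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
 *
 * Vectors without an installed handler fall through to route_exception(),
 * which either applies the KVM_EXCEPTION_MAGIC fixup or fails the test via
 * GUEST_FAIL().
 */
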
void kvm_arch_vm_post_create(struct kvm_vm *vm)
{
	int r;

	TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
		    "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");

	vm_create_irqchip(vm);
	vm_init_descriptor_tables(vm);

	sync_global_to_guest(vm, host_cpu_is_intel);
	sync_global_to_guest(vm, host_cpu_is_amd);
	sync_global_to_guest(vm, is_forced_emulation_enabled);

	if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
		struct kvm_sev_init init = { 0 };

		vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
	}

	r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
	TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
	guest_tsc_khz = r;
	sync_global_to_guest(vm, guest_tsc_khz);
}

void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
{
	struct kvm_regs regs;

	vcpu_regs_get(vcpu, &regs);
	regs.rip = (unsigned long) guest_code;
	vcpu_regs_set(vcpu, &regs);
}

struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
{
	struct kvm_mp_state mp_state;
	struct kvm_regs regs;
	vm_vaddr_t stack_vaddr;
	struct kvm_vcpu *vcpu;

	stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
				       DEFAULT_GUEST_STACK_VADDR_MIN,
				       MEM_REGION_DATA);

	stack_vaddr += DEFAULT_STACK_PGS * getpagesize();

	/*
	 * Align stack to match calling sequence requirements in section "The
	 * Stack Frame" of the System V ABI AMD64 Architecture Processor
	 * Supplement, which requires the value (%rsp + 8) to be a multiple of
	 * 16 when control is transferred to the function entry point.
	 *
	 * If this code is ever used to launch a vCPU with 32-bit entry point it
	 * may need to subtract 4 bytes instead of 8 bytes.
	 */
	TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE),
		    "__vm_vaddr_alloc() did not provide a page-aligned address");
	stack_vaddr -= 8;

	vcpu = __vm_vcpu_add(vm, vcpu_id);
	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
	vcpu_init_sregs(vm, vcpu);

	/* Setup guest general purpose registers */
	vcpu_regs_get(vcpu, &regs);
	regs.rflags = regs.rflags | 0x2;
	regs.rsp = stack_vaddr;
	vcpu_regs_set(vcpu, &regs);

	/* Setup the MP state */
	mp_state.mp_state = 0;
	vcpu_mp_state_set(vcpu, &mp_state);

	return vcpu;
}

struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id)
{
	struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);

	vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());

	return vcpu;
}

void vcpu_arch_free(struct kvm_vcpu *vcpu)
{
	if (vcpu->cpuid)
		free(vcpu->cpuid);
}

/* Do not use kvm_supported_cpuid directly except for validity checks. */
static void *kvm_supported_cpuid;

const struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
{
	int kvm_fd;

	if (kvm_supported_cpuid)
		return kvm_supported_cpuid;

	kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
	kvm_fd = open_kvm_dev_path_or_exit();

	kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID,
		  (struct kvm_cpuid2 *)kvm_supported_cpuid);

	close(kvm_fd);
	return kvm_supported_cpuid;
}

static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid,
			      uint32_t function, uint32_t index,
			      uint8_t reg, uint8_t lo, uint8_t hi)
{
	const struct kvm_cpuid_entry2 *entry;
	int i;

	for (i = 0; i < cpuid->nent; i++) {
		entry = &cpuid->entries[i];

		/*
		 * The output registers in kvm_cpuid_entry2 are in alphabetical
		 * order, but kvm_x86_cpu_feature matches that mess, so yay
		 * pointer shenanigans!
		 */
		if (entry->function == function && entry->index == index)
			return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo;
	}

	return 0;
}

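/*
 * Illustrative note (not from the original source): kvm_cpuid_entry2 lays out
 * its output registers as consecutive __u32 fields eax, ebx, ecx, edx, so
 * (&entry->eax)[reg] indexes them with the same 0-3 encoding used by
 * kvm_x86_cpu_feature/kvm_x86_cpu_property.  For example, reg == 2 with
 * lo == hi == 31 extracts bit 31 of ECX; for function 0x1 that is how a
 * feature such as X86_FEATURE_HYPERVISOR (CPUID.01H:ECX[31]) would be read.
 */
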
bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
		   struct kvm_x86_cpu_feature feature)
{
	return __kvm_cpu_has(cpuid, feature.function, feature.index,
			     feature.reg, feature.bit, feature.bit);
}

uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,
			    struct kvm_x86_cpu_property property)
{
	return __kvm_cpu_has(cpuid, property.function, property.index,
			     property.reg, property.lo_bit, property.hi_bit);
}

uint64_t kvm_get_feature_msr(uint64_t msr_index)
{
	struct {
		struct kvm_msrs header;
		struct kvm_msr_entry entry;
	} buffer = {};
	int r, kvm_fd;

	buffer.header.nmsrs = 1;
	buffer.entry.index = msr_index;
	kvm_fd = open_kvm_dev_path_or_exit();

	r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
	TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r));

	close(kvm_fd);
	return buffer.entry.data;
}

void __vm_xsave_require_permission(uint64_t xfeature, const char *name)
{
	int kvm_fd;
	u64 bitmask;
	long rc;
	struct kvm_device_attr attr = {
		.group = 0,
		.attr = KVM_X86_XCOMP_GUEST_SUPP,
		.addr = (unsigned long) &bitmask,
	};

	TEST_ASSERT(!kvm_supported_cpuid,
		    "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM");

	TEST_ASSERT(is_power_of_2(xfeature),
		    "Dynamic XFeatures must be enabled one at a time");

	kvm_fd = open_kvm_dev_path_or_exit();
	rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
	close(kvm_fd);

	if (rc == -1 && (errno == ENXIO || errno == EINVAL))
		__TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported");

	TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);

	__TEST_REQUIRE(bitmask & xfeature,
		       "Required XSAVE feature '%s' not supported", name);

	TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature)));

	rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
	TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
	TEST_ASSERT(bitmask & xfeature,
		    "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx",
		    name, xfeature, bitmask);
}

void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid)
{
	TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID");

	/* Allow overriding the default CPUID. */
	if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) {
		free(vcpu->cpuid);
		vcpu->cpuid = NULL;
	}

	if (!vcpu->cpuid)
		vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent);

	memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
	vcpu_set_cpuid(vcpu);
}

void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
			     struct kvm_x86_cpu_property property,
			     uint32_t value)
{
	struct kvm_cpuid_entry2 *entry;

	entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index);

	(&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit);
	(&entry->eax)[property.reg] |= value << property.lo_bit;

	vcpu_set_cpuid(vcpu);

	/* Sanity check that @value doesn't exceed the bounds in any way. */
	TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value);
}

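/*
 * Example (hypothetical test usage, not part of the original library): shrink
 * the guest's advertised physical address width before the first KVM_RUN,
 * e.g. to exercise reserved-bit behavior.  The new value must fit within the
 * property's bit range, otherwise the TEST_ASSERT_EQ() above fires:
 *
 *	vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, 40);
 */
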
void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function)
{
	struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function);

	entry->eax = 0;
	entry->ebx = 0;
	entry->ecx = 0;
	entry->edx = 0;
	vcpu_set_cpuid(vcpu);
}

void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu,
				     struct kvm_x86_cpu_feature feature,
				     bool set)
{
	struct kvm_cpuid_entry2 *entry;
	u32 *reg;

	entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index);
	reg = (&entry->eax) + feature.reg;

	if (set)
		*reg |= BIT(feature.bit);
	else
		*reg &= ~BIT(feature.bit);

	vcpu_set_cpuid(vcpu);
}

uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index)
{
	struct {
		struct kvm_msrs header;
		struct kvm_msr_entry entry;
	} buffer = {};

	buffer.header.nmsrs = 1;
	buffer.entry.index = msr_index;

	vcpu_msrs_get(vcpu, &buffer.header);

	return buffer.entry.data;
}

int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value)
{
	struct {
		struct kvm_msrs header;
		struct kvm_msr_entry entry;
	} buffer = {};

	memset(&buffer, 0, sizeof(buffer));
	buffer.header.nmsrs = 1;
	buffer.entry.index = msr_index;
	buffer.entry.data = msr_value;

	return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header);
}

void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
{
	va_list ap;
	struct kvm_regs regs;

	TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
		    "  num: %u",
		    num);

	va_start(ap, num);
	vcpu_regs_get(vcpu, &regs);

	if (num >= 1)
		regs.rdi = va_arg(ap, uint64_t);

	if (num >= 2)
		regs.rsi = va_arg(ap, uint64_t);

	if (num >= 3)
		regs.rdx = va_arg(ap, uint64_t);

	if (num >= 4)
		regs.rcx = va_arg(ap, uint64_t);

	if (num >= 5)
		regs.r8 = va_arg(ap, uint64_t);

	if (num >= 6)
		regs.r9 = va_arg(ap, uint64_t);

	vcpu_regs_set(vcpu, &regs);
	va_end(ap);
}

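/*
 * Example (hypothetical test usage, not part of the original library):
 * vcpu_args_set() above loads RDI, RSI, RDX, RCX, R8 and R9 in that order,
 * i.e. the System V AMD64 integer argument registers, so the values arrive as
 * ordinary C parameters of the guest entry point (assuming the usual
 * vm_vcpu_add() helper from the common kvm_util library):
 *
 *	static void guest_code(uint64_t a, uint64_t b) { ... }
 *
 *	vcpu = vm_vcpu_add(vm, 0, guest_code);
 *	vcpu_args_set(vcpu, 2, 0xdeadbeef, 42);
 */
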
void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
{
	struct kvm_regs regs;
	struct kvm_sregs sregs;

	fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id);

	fprintf(stream, "%*sregs:\n", indent + 2, "");
	vcpu_regs_get(vcpu, &regs);
	regs_dump(stream, &regs, indent + 4);

	fprintf(stream, "%*ssregs:\n", indent + 2, "");
	vcpu_sregs_get(vcpu, &sregs);
	sregs_dump(stream, &sregs, indent + 4);
}

static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs)
{
	struct kvm_msr_list *list;
	struct kvm_msr_list nmsrs;
	int kvm_fd, r;

	kvm_fd = open_kvm_dev_path_or_exit();

	nmsrs.nmsrs = 0;
	if (!feature_msrs)
		r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
	else
		r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs);

	TEST_ASSERT(r == -1 && errno == E2BIG,
		    "Expected -E2BIG, got rc: %i errno: %i (%s)",
		    r, errno, strerror(errno));

	list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0]));
	TEST_ASSERT(list, "-ENOMEM when allocating MSR index list");
	list->nmsrs = nmsrs.nmsrs;

	if (!feature_msrs)
		kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
	else
		kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);
	close(kvm_fd);

	TEST_ASSERT(list->nmsrs == nmsrs.nmsrs,
		    "Number of MSRs in list changed, was %d, now %d",
		    nmsrs.nmsrs, list->nmsrs);
	return list;
}

const struct kvm_msr_list *kvm_get_msr_index_list(void)
{
	static const struct kvm_msr_list *list;

	if (!list)
		list = __kvm_get_msr_index_list(false);
	return list;
}


const struct kvm_msr_list *kvm_get_feature_msr_index_list(void)
{
	static const struct kvm_msr_list *list;

	if (!list)
		list = __kvm_get_msr_index_list(true);
	return list;
}

bool kvm_msr_is_in_save_restore_list(uint32_t msr_index)
{
	const struct kvm_msr_list *list = kvm_get_msr_index_list();
	int i;

	for (i = 0; i < list->nmsrs; ++i) {
		if (list->indices[i] == msr_index)
			return true;
	}

	return false;
}

static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu,
				  struct kvm_x86_state *state)
{
	int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2);

	if (size) {
		state->xsave = malloc(size);
		vcpu_xsave2_get(vcpu, state->xsave);
	} else {
		state->xsave = malloc(sizeof(struct kvm_xsave));
		vcpu_xsave_get(vcpu, state->xsave);
	}
}

struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu)
{
	const struct kvm_msr_list *msr_list = kvm_get_msr_index_list();
	struct kvm_x86_state *state;
	int i;

	static int nested_size = -1;

	if (nested_size == -1) {
		nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
		TEST_ASSERT(nested_size <= sizeof(state->nested_),
			    "Nested state size too big, %i > %zi",
			    nested_size, sizeof(state->nested_));
	}

	/*
	 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
	 * guest state is consistent only after userspace re-enters the
	 * kernel with KVM_RUN.  Complete IO prior to migrating state
	 * to a new VM.
	 */
	vcpu_run_complete_io(vcpu);

	state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0]));
	TEST_ASSERT(state, "-ENOMEM when allocating kvm state");

	vcpu_events_get(vcpu, &state->events);
	vcpu_mp_state_get(vcpu, &state->mp_state);
	vcpu_regs_get(vcpu, &state->regs);
	vcpu_save_xsave_state(vcpu, state);

	if (kvm_has_cap(KVM_CAP_XCRS))
		vcpu_xcrs_get(vcpu, &state->xcrs);

	vcpu_sregs_get(vcpu, &state->sregs);

	if (nested_size) {
		state->nested.size = sizeof(state->nested_);

		vcpu_nested_state_get(vcpu, &state->nested);
		TEST_ASSERT(state->nested.size <= nested_size,
			    "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
			    state->nested.size, nested_size);
	} else {
		state->nested.size = 0;
	}

	state->msrs.nmsrs = msr_list->nmsrs;
	for (i = 0; i < msr_list->nmsrs; i++)
		state->msrs.entries[i].index = msr_list->indices[i];
	vcpu_msrs_get(vcpu, &state->msrs);

	vcpu_debugregs_get(vcpu, &state->debugregs);

	return state;
}

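/*
 * Example (hypothetical test flow, not part of the original library): the
 * typical state-migration sequence pairs vcpu_save_state() with
 * vcpu_load_state() on a freshly (re)created vCPU, then frees the snapshot:
 *
 *	state = vcpu_save_state(vcpu);
 *	// ... tear down and recreate the vCPU, e.g. via vm_arch_vcpu_recreate() ...
 *	vcpu_load_state(vcpu, state);
 *	kvm_x86_state_cleanup(state);
 *
 * Note that vcpu_load_state() below restores sregs and MSRs before the
 * remaining state, mirroring the dependencies KVM expects on the set side.
 */
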
void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state)
{
	vcpu_sregs_set(vcpu, &state->sregs);
	vcpu_msrs_set(vcpu, &state->msrs);

	if (kvm_has_cap(KVM_CAP_XCRS))
		vcpu_xcrs_set(vcpu, &state->xcrs);

	vcpu_xsave_set(vcpu, state->xsave);
	vcpu_events_set(vcpu, &state->events);
	vcpu_mp_state_set(vcpu, &state->mp_state);
	vcpu_debugregs_set(vcpu, &state->debugregs);
	vcpu_regs_set(vcpu, &state->regs);

	if (state->nested.size)
		vcpu_nested_state_set(vcpu, &state->nested);
}

void kvm_x86_state_cleanup(struct kvm_x86_state *state)
{
	free(state->xsave);
	free(state);
}

void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
{
	if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {
		*pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32;
		*va_bits = 32;
	} else {
		*pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
		*va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR);
	}
}

void kvm_init_vm_address_properties(struct kvm_vm *vm)
{
	if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
		vm->arch.sev_fd = open_sev_dev_path_or_exit();
		vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
		vm->gpa_tag_mask = vm->arch.c_bit;
	} else {
		vm->arch.sev_fd = -1;
	}
}

const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
					       uint32_t function, uint32_t index)
{
	int i;

	for (i = 0; i < cpuid->nent; i++) {
		if (cpuid->entries[i].function == function &&
		    cpuid->entries[i].index == index)
			return &cpuid->entries[i];
	}

	TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);

	return NULL;
}

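/*
 * Illustrative note (not from the original source): the X86_HYPERCALL() macro
 * below selects the vendor-appropriate opcode at runtime, VMCALL on Intel and
 * VMMCALL on AMD, keyed off host_cpu_is_amd, since each instruction is only
 * architecturally defined for its own vendor.  Example (hypothetical guest
 * usage, 'dest_apic_id' assumed):
 *
 *	uint64_t ret = kvm_hypercall(KVM_HC_SCHED_YIELD, dest_apic_id, 0, 0, 0);
 */
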
#define X86_HYPERCALL(inputs...)					\
({									\
	uint64_t r;							\
									\
	asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t"		\
		     "jnz 1f\n\t"					\
		     "vmcall\n\t"					\
		     "jmp 2f\n\t"					\
		     "1: vmmcall\n\t"					\
		     "2:"						\
		     : "=a"(r)						\
		     : [use_vmmcall] "r" (host_cpu_is_amd), inputs);	\
									\
	r;								\
})

uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
		       uint64_t a3)
{
	return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
}

uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
{
	return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1));
}

void xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
{
	GUEST_ASSERT(!__xen_hypercall(nr, a0, a1));
}

unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
	const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
	unsigned long ht_gfn, max_gfn, max_pfn;
	uint8_t maxphyaddr, guest_maxphyaddr;

	/*
	 * Use "guest MAXPHYADDR" from KVM if it's available.  Guest MAXPHYADDR
	 * enumerates the max _mappable_ GPA, which can be less than the raw
	 * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU
	 * doesn't support 5-level TDP.
	 */
	guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);
	guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits;
	TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits,
		    "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR");

	max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1;

	/* Avoid reserved HyperTransport region on AMD processors. */
	if (!host_cpu_is_amd)
		return max_gfn;

	/* On parts with <40 physical address bits, the area is fully hidden */
	if (vm->pa_bits < 40)
		return max_gfn;

	/* Before family 17h, the HyperTransport area is just below 1T. */
	ht_gfn = (1 << 28) - num_ht_pages;
	if (this_cpu_family() < 0x17)
		goto done;

	/*
	 * Otherwise it's at the top of the physical address space, possibly
	 * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX.  Use
	 * the old conservative value if MAXPHYADDR is not enumerated.
	 */
	if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))
		goto done;

	maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
	max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1;

	if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION))
		max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION);

	ht_gfn = max_pfn - num_ht_pages;
done:
	return min(max_gfn, ht_gfn - 1);
}

/* Returns true if kvm_intel was loaded with unrestricted_guest=1. */
bool vm_is_unrestricted_guest(struct kvm_vm *vm)
{
	/* Ensure that a KVM vendor-specific module is loaded. */
	if (vm == NULL)
		close(open_kvm_dev_path_or_exit());

	return get_kvm_intel_param_bool("unrestricted_guest");
}

void kvm_selftest_arch_init(void)
{
	host_cpu_is_intel = this_cpu_is_intel();
	host_cpu_is_amd = this_cpu_is_amd();
	is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();
}

bool sys_clocksource_is_based_on_tsc(void)
{
	char *clk_name = sys_get_cur_clocksource();
	bool ret = !strcmp(clk_name, "tsc\n") ||
		   !strcmp(clk_name, "hyperv_clocksource_tsc_page\n");

	free(clk_name);

	return ret;
}