1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * tools/testing/selftests/kvm/lib/x86_64/processor.c
4 *
5 * Copyright (C) 2018, Google LLC.
6 */
7
8 #include "linux/bitmap.h"
9 #include "test_util.h"
10 #include "kvm_util.h"
11 #include "processor.h"
12 #include "sev.h"
13
14 #ifndef NUM_INTERRUPTS
15 #define NUM_INTERRUPTS 256
16 #endif
17
18 #define KERNEL_CS 0x8
19 #define KERNEL_DS 0x10
20 #define KERNEL_TSS 0x18
21
22 vm_vaddr_t exception_handlers;
23 bool host_cpu_is_amd;
24 bool host_cpu_is_intel;
25 bool is_forced_emulation_enabled;
26 uint64_t guest_tsc_khz;
27
28 static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent)
29 {
30 fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
31 "rcx: 0x%.16llx rdx: 0x%.16llx\n",
32 indent, "",
33 regs->rax, regs->rbx, regs->rcx, regs->rdx);
34 fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
35 "rsp: 0x%.16llx rbp: 0x%.16llx\n",
36 indent, "",
37 regs->rsi, regs->rdi, regs->rsp, regs->rbp);
38 fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "
39 "r10: 0x%.16llx r11: 0x%.16llx\n",
40 indent, "",
41 regs->r8, regs->r9, regs->r10, regs->r11);
42 fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
43 "r14: 0x%.16llx r15: 0x%.16llx\n",
44 indent, "",
45 regs->r12, regs->r13, regs->r14, regs->r15);
46 fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
47 indent, "",
48 regs->rip, regs->rflags);
49 }
50
51 static void segment_dump(FILE *stream, struct kvm_segment *segment,
52 uint8_t indent)
53 {
54 fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
55 "selector: 0x%.4x type: 0x%.2x\n",
56 indent, "", segment->base, segment->limit,
57 segment->selector, segment->type);
58 fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
59 "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
60 indent, "", segment->present, segment->dpl,
61 segment->db, segment->s, segment->l);
62 fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
63 "unusable: 0x%.2x padding: 0x%.2x\n",
64 indent, "", segment->g, segment->avl,
65 segment->unusable, segment->padding);
66 }
67
68 static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
69 uint8_t indent)
70 {
71 fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
72 "padding: 0x%.4x 0x%.4x 0x%.4x\n",
73 indent, "", dtable->base, dtable->limit,
74 dtable->padding[0], dtable->padding[1], dtable->padding[2]);
75 }
76
77 static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent)
78 {
79 unsigned int i;
80
81 fprintf(stream, "%*scs:\n", indent, "");
82 segment_dump(stream, &sregs->cs, indent + 2);
83 fprintf(stream, "%*sds:\n", indent, "");
84 segment_dump(stream, &sregs->ds, indent + 2);
85 fprintf(stream, "%*ses:\n", indent, "");
86 segment_dump(stream, &sregs->es, indent + 2);
87 fprintf(stream, "%*sfs:\n", indent, "");
88 segment_dump(stream, &sregs->fs, indent + 2);
89 fprintf(stream, "%*sgs:\n", indent, "");
90 segment_dump(stream, &sregs->gs, indent + 2);
91 fprintf(stream, "%*sss:\n", indent, "");
92 segment_dump(stream, &sregs->ss, indent + 2);
93 fprintf(stream, "%*str:\n", indent, "");
94 segment_dump(stream, &sregs->tr, indent + 2);
95 fprintf(stream, "%*sldt:\n", indent, "");
96 segment_dump(stream, &sregs->ldt, indent + 2);
97
98 fprintf(stream, "%*sgdt:\n", indent, "");
99 dtable_dump(stream, &sregs->gdt, indent + 2);
100 fprintf(stream, "%*sidt:\n", indent, "");
101 dtable_dump(stream, &sregs->idt, indent + 2);
102
103 fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
104 "cr3: 0x%.16llx cr4: 0x%.16llx\n",
105 indent, "",
106 sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
107 fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
108 "apic_base: 0x%.16llx\n",
109 indent, "",
110 sregs->cr8, sregs->efer, sregs->apic_base);
111
112 fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
113 for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
114 fprintf(stream, "%*s%.16llx\n", indent + 2, "",
115 sregs->interrupt_bitmap[i]);
116 }
117 }
118
119 bool kvm_is_tdp_enabled(void)
120 {
121 if (host_cpu_is_intel)
122 return get_kvm_intel_param_bool("ept");
123 else
124 return get_kvm_amd_param_bool("npt");
125 }
126
127 void virt_arch_pgd_alloc(struct kvm_vm *vm)
128 {
129 TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
130 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
131
132 /* If needed, create page map l4 table. */
133 if (!vm->pgd_created) {
134 vm->pgd = vm_alloc_page_table(vm);
135 vm->pgd_created = true;
136 }
137 }
138
139 static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
140 uint64_t vaddr, int level)
141 {
142 uint64_t pt_gpa = PTE_GET_PA(*parent_pte);
143 uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
144 int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
145
146 TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd,
147 "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
148 level + 1, vaddr);
149
150 return &page_table[index];
151 }
152
153 static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
154 uint64_t *parent_pte,
155 uint64_t vaddr,
156 uint64_t paddr,
157 int current_level,
158 int target_level)
159 {
160 uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);
161
162 paddr = vm_untag_gpa(vm, paddr);
163
164 if (!(*pte & PTE_PRESENT_MASK)) {
165 *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
166 if (current_level == target_level)
167 *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
168 else
169 *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
170 } else {
171 /*
172 * Entry already present. Assert that the caller doesn't want
173 * a hugepage at this level, and that there isn't a hugepage at
174 * this level.
175 */
176 TEST_ASSERT(current_level != target_level,
177 "Cannot create hugepage at level: %u, vaddr: 0x%lx",
178 current_level, vaddr);
179 TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
180 "Cannot create page table at level: %u, vaddr: 0x%lx",
181 current_level, vaddr);
182 }
183 return pte;
184 }
185
186 void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
187 {
188 const uint64_t pg_size = PG_LEVEL_SIZE(level);
189 uint64_t *pml4e, *pdpe, *pde;
190 uint64_t *pte;
191
192 TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
193 "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
194
195 TEST_ASSERT((vaddr % pg_size) == 0,
196 "Virtual address not aligned,\n"
197 "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size);
198 TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)),
199 "Invalid virtual address, vaddr: 0x%lx", vaddr);
200 TEST_ASSERT((paddr % pg_size) == 0,
201 "Physical address not aligned,\n"
202 " paddr: 0x%lx page size: 0x%lx", paddr, pg_size);
203 TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
204 "Physical address beyond maximum supported,\n"
205 " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
206 paddr, vm->max_gfn, vm->page_size);
207 TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr,
208 "Unexpected bits in paddr: %lx", paddr);
209
210 /*
211 * Allocate upper level page tables, if not already present. Return
212 * early if a hugepage was created.
213 */
214 pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level);
215 if (*pml4e & PTE_LARGE_MASK)
216 return;
217
218 pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level);
219 if (*pdpe & PTE_LARGE_MASK)
220 return;
221
222 pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level);
223 if (*pde & PTE_LARGE_MASK)
224 return;
225
226 /* Fill in page table entry. */
227 pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
228 TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
229 "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
230 *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
231
232 /*
233 * Neither SEV nor TDX supports shared page tables, so only the final
234 * leaf PTE needs to have the C/S-bit set manually.
235 */
236 if (vm_is_gpa_protected(vm, paddr))
237 *pte |= vm->arch.c_bit;
238 else
239 *pte |= vm->arch.s_bit;
240 }
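
/*
 * Illustrative sketch (editor's addition, not used by this library): a test
 * that wants a single 2MiB hugepage mapping can call __virt_pg_map() with
 * PG_LEVEL_2M directly, provided the (hypothetical) GVA and GPA below are
 * both 2MiB-aligned and the GPA range is backed by a memslot the test has
 * already set up:
 *
 *	__virt_pg_map(vm, 0x80000000, 0x80000000, PG_LEVEL_2M);
 *
 * Mapping a longer hugepage-backed range is what virt_map_level() below is
 * for; it simply loops __virt_pg_map() over the region.
 */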
241
242 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
243 {
244 __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
245 }
246
247 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
248 uint64_t nr_bytes, int level)
249 {
250 uint64_t pg_size = PG_LEVEL_SIZE(level);
251 uint64_t nr_pages = nr_bytes / pg_size;
252 int i;
253
254 TEST_ASSERT(nr_bytes % pg_size == 0,
255 "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx",
256 nr_bytes, pg_size);
257
258 for (i = 0; i < nr_pages; i++) {
259 __virt_pg_map(vm, vaddr, paddr, level);
260
261 vaddr += pg_size;
262 paddr += pg_size;
263 }
264 }
265
266 static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
267 {
268 if (*pte & PTE_LARGE_MASK) {
269 TEST_ASSERT(*level == PG_LEVEL_NONE ||
270 *level == current_level,
271 "Unexpected hugepage at level %d", current_level);
272 *level = current_level;
273 }
274
275 return *level == current_level;
276 }
277
278 uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
279 int *level)
280 {
281 uint64_t *pml4e, *pdpe, *pde;
282
283 TEST_ASSERT(!vm->arch.is_pt_protected,
284 "Walking page tables of protected guests is impossible");
285
286 TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM,
287 "Invalid PG_LEVEL_* '%d'", *level);
288
289 TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
290 "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
291 TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
292 (vaddr >> vm->page_shift)),
293 "Invalid virtual address, vaddr: 0x%lx",
294 vaddr);
295 /*
296 * Based on the mode check above, there are 48 bits in the vaddr, so
297 * shift 16 to sign-extend the last bit (bit-47).
298 */
299 TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16),
300 "Canonical check failed. The virtual address is invalid.");
301
302 pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G);
303 if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G))
304 return pml4e;
305
306 pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G);
307 if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G))
308 return pdpe;
309
310 pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M);
311 if (vm_is_target_pte(pde, level, PG_LEVEL_2M))
312 return pde;
313
314 return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K);
315 }
316
317 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
318 {
319 int level = PG_LEVEL_4K;
320
321 return __vm_get_page_table_entry(vm, vaddr, &level);
322 }
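
/*
 * Illustrative sketch (editor's addition): tests can use the walkers above to
 * inspect or tweak a guest PTE, e.g. to verify that a previously mapped 4KiB
 * page ("gva" is assumed to come from the test) is present and then strip its
 * writable bit; any required TLB flushing is the test's responsibility:
 *
 *	uint64_t *pte = vm_get_page_table_entry(vm, gva);
 *
 *	TEST_ASSERT(*pte & PTE_PRESENT_MASK, "Expected a present leaf PTE");
 *	*pte &= ~PTE_WRITABLE_MASK;
 */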
323
324 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
325 {
326 uint64_t *pml4e, *pml4e_start;
327 uint64_t *pdpe, *pdpe_start;
328 uint64_t *pde, *pde_start;
329 uint64_t *pte, *pte_start;
330
331 if (!vm->pgd_created)
332 return;
333
334 fprintf(stream, "%*s "
335 " no\n", indent, "");
336 fprintf(stream, "%*s index hvaddr gpaddr "
337 "addr w exec dirty\n",
338 indent, "");
339 pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
340 for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
341 pml4e = &pml4e_start[n1];
342 if (!(*pml4e & PTE_PRESENT_MASK))
343 continue;
344 fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
345 " %u\n",
346 indent, "",
347 pml4e - pml4e_start, pml4e,
348 addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
349 !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
350
351 pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
352 for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
353 pdpe = &pdpe_start[n2];
354 if (!(*pdpe & PTE_PRESENT_MASK))
355 continue;
356 fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx "
357 "%u %u\n",
358 indent, "",
359 pdpe - pdpe_start, pdpe,
360 addr_hva2gpa(vm, pdpe),
361 PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
362 !!(*pdpe & PTE_NX_MASK));
363
364 pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
365 for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
366 pde = &pde_start[n3];
367 if (!(*pde & PTE_PRESENT_MASK))
368 continue;
369 fprintf(stream, "%*spde 0x%-3zx %p "
370 "0x%-12lx 0x%-10llx %u %u\n",
371 indent, "", pde - pde_start, pde,
372 addr_hva2gpa(vm, pde),
373 PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
374 !!(*pde & PTE_NX_MASK));
375
376 pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
377 for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
378 pte = &pte_start[n4];
379 if (!(*pte & PTE_PRESENT_MASK))
380 continue;
381 fprintf(stream, "%*spte 0x%-3zx %p "
382 "0x%-12lx 0x%-10llx %u %u "
383 " %u 0x%-10lx\n",
384 indent, "",
385 pte - pte_start, pte,
386 addr_hva2gpa(vm, pte),
387 PTE_GET_PFN(*pte),
388 !!(*pte & PTE_WRITABLE_MASK),
389 !!(*pte & PTE_NX_MASK),
390 !!(*pte & PTE_DIRTY_MASK),
391 ((uint64_t) n1 << 27)
392 | ((uint64_t) n2 << 18)
393 | ((uint64_t) n3 << 9)
394 | ((uint64_t) n4));
395 }
396 }
397 }
398 }
399 }
400
401 /*
402 * Set Unusable Segment
403 *
404 * Input Args: None
405 *
406 * Output Args:
407 * segp - Pointer to segment register
408 *
409 * Return: None
410 *
411 * Sets the segment register pointed to by @segp to an unusable state.
412 */
413 static void kvm_seg_set_unusable(struct kvm_segment *segp)
414 {
415 memset(segp, 0, sizeof(*segp));
416 segp->unusable = true;
417 }
418
419 static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
420 {
421 void *gdt = addr_gva2hva(vm, vm->arch.gdt);
422 struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
423
424 desc->limit0 = segp->limit & 0xFFFF;
425 desc->base0 = segp->base & 0xFFFF;
426 desc->base1 = segp->base >> 16;
427 desc->type = segp->type;
428 desc->s = segp->s;
429 desc->dpl = segp->dpl;
430 desc->p = segp->present;
431 desc->limit1 = segp->limit >> 16;
432 desc->avl = segp->avl;
433 desc->l = segp->l;
434 desc->db = segp->db;
435 desc->g = segp->g;
436 desc->base2 = segp->base >> 24;
437 if (!segp->s)
438 desc->base3 = segp->base >> 32;
439 }
440
441 static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp)
442 {
443 memset(segp, 0, sizeof(*segp));
444 segp->selector = KERNEL_CS;
445 segp->limit = 0xFFFFFFFFu;
446 segp->s = 0x1; /* kTypeCodeData */
447 segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
448 * | kFlagCodeReadable
449 */
450 segp->g = true;
451 segp->l = true;
452 segp->present = 1;
453 }
454
455 static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp)
456 {
457 memset(segp, 0, sizeof(*segp));
458 segp->selector = KERNEL_DS;
459 segp->limit = 0xFFFFFFFFu;
460 segp->s = 0x1; /* kTypeCodeData */
461 segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
462 * | kFlagDataWritable
463 */
464 segp->g = true;
465 segp->present = true;
466 }
467
468 vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
469 {
470 int level = PG_LEVEL_NONE;
471 uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level);
472
473 TEST_ASSERT(*pte & PTE_PRESENT_MASK,
474 "Leaf PTE not PRESENT for gva: 0x%08lx", gva);
475
476 /*
477 * No need for a hugepage mask on the PTE; x86-64 requires the "unused"
478 * address bits to be zero.
479 */
480 return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level));
481 }
482
483 static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp)
484 {
485 memset(segp, 0, sizeof(*segp));
486 segp->base = base;
487 segp->limit = 0x67;
488 segp->selector = KERNEL_TSS;
489 segp->type = 0xb;
490 segp->present = 1;
491 }
492
493 static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
494 {
495 struct kvm_sregs sregs;
496
497 TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K);
498
499 /* Set mode specific system register values. */
500 vcpu_sregs_get(vcpu, &sregs);
501
502 sregs.idt.base = vm->arch.idt;
503 sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
504 sregs.gdt.base = vm->arch.gdt;
505 sregs.gdt.limit = getpagesize() - 1;
506
507 sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
508 sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
509 sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
510
511 kvm_seg_set_unusable(&sregs.ldt);
512 kvm_seg_set_kernel_code_64bit(&sregs.cs);
513 kvm_seg_set_kernel_data_64bit(&sregs.ds);
514 kvm_seg_set_kernel_data_64bit(&sregs.es);
515 kvm_seg_set_kernel_data_64bit(&sregs.gs);
516 kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr);
517
518 sregs.cr3 = vm->pgd;
519 vcpu_sregs_set(vcpu, &sregs);
520 }
521
522 static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
523 int dpl, unsigned short selector)
524 {
525 struct idt_entry *base =
526 (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt);
527 struct idt_entry *e = &base[vector];
528
529 memset(e, 0, sizeof(*e));
530 e->offset0 = addr;
531 e->selector = selector;
532 e->ist = 0;
533 e->type = 14;
534 e->dpl = dpl;
535 e->p = 1;
536 e->offset1 = addr >> 16;
537 e->offset2 = addr >> 32;
538 }
539
540 static bool kvm_fixup_exception(struct ex_regs *regs)
541 {
542 if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10)
543 return false;
544
545 if (regs->vector == DE_VECTOR)
546 return false;
547
548 regs->rip = regs->r11;
549 regs->r9 = regs->vector;
550 regs->r10 = regs->error_code;
551 return true;
552 }
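
/*
 * Editor's note (derived from the fixup code above): guest code that expects
 * a fault, e.g. via the kvm_asm_safe() family of helpers, loads
 * KVM_EXCEPTION_MAGIC into r9, the faulting RIP into r10 and the fixup
 * target into r11.  On a match, the handler resumes at the fixup target and
 * hands back the vector in r9 and the error code in r10; #DE is deliberately
 * never fixed up.
 */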
553
554 void route_exception(struct ex_regs *regs)
555 {
556 typedef void(*handler)(struct ex_regs *);
557 handler *handlers = (handler *)exception_handlers;
558
559 if (handlers && handlers[regs->vector]) {
560 handlers[regs->vector](regs);
561 return;
562 }
563
564 if (kvm_fixup_exception(regs))
565 return;
566
567 GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'",
568 regs->vector, regs->rip);
569 }
570
571 static void vm_init_descriptor_tables(struct kvm_vm *vm)
572 {
573 extern void *idt_handlers;
574 struct kvm_segment seg;
575 int i;
576
577 vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
578 vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
579 vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
580 vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA);
581
582 /* Handlers have the same address in both address spaces. */
583 for (i = 0; i < NUM_INTERRUPTS; i++)
584 set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS);
585
586 *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
587
588 kvm_seg_set_kernel_code_64bit(&seg);
589 kvm_seg_fill_gdt_64bit(vm, &seg);
590
591 kvm_seg_set_kernel_data_64bit(&seg);
592 kvm_seg_fill_gdt_64bit(vm, &seg);
593
594 kvm_seg_set_tss_64bit(vm->arch.tss, &seg);
595 kvm_seg_fill_gdt_64bit(vm, &seg);
596 }
597
598 void vm_install_exception_handler(struct kvm_vm *vm, int vector,
599 void (*handler)(struct ex_regs *))
600 {
601 vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);
602
603 handlers[vector] = (vm_vaddr_t)handler;
604 }
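
/*
 * Illustrative sketch (editor's addition): a typical test installs a handler
 * before running the vCPU, e.g. to swallow an expected #UD by skipping the
 * 2-byte ud2 opcode:
 *
 *	static void guest_ud_handler(struct ex_regs *regs)
 *	{
 *		regs->rip += 2;
 *	}
 *
 *	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 *	vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
 *	vcpu_run(vcpu);
 *
 * route_exception() above then dispatches vector 6 to guest_ud_handler()
 * instead of failing the test.
 */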
605
606 void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
607 {
608 struct ucall uc;
609
610 if (get_ucall(vcpu, &uc) == UCALL_ABORT)
611 REPORT_GUEST_ASSERT(uc);
612 }
613
614 void kvm_arch_vm_post_create(struct kvm_vm *vm)
615 {
616 int r;
617
618 TEST_ASSERT(kvm_has_cap(KVM_CAP_GET_TSC_KHZ),
619 "Require KVM_GET_TSC_KHZ to provide udelay() to guest.");
620
621 vm_create_irqchip(vm);
622 vm_init_descriptor_tables(vm);
623
624 sync_global_to_guest(vm, host_cpu_is_intel);
625 sync_global_to_guest(vm, host_cpu_is_amd);
626 sync_global_to_guest(vm, is_forced_emulation_enabled);
627
628 if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
629 struct kvm_sev_init init = { 0 };
630
631 vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
632 }
633
634 r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
635 TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
636 guest_tsc_khz = r;
637 sync_global_to_guest(vm, guest_tsc_khz);
638 }
639
640 void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
641 {
642 struct kvm_regs regs;
643
644 vcpu_regs_get(vcpu, &regs);
645 regs.rip = (unsigned long) guest_code;
646 vcpu_regs_set(vcpu, &regs);
647 }
648
649 struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
650 {
651 struct kvm_mp_state mp_state;
652 struct kvm_regs regs;
653 vm_vaddr_t stack_vaddr;
654 struct kvm_vcpu *vcpu;
655
656 stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
657 DEFAULT_GUEST_STACK_VADDR_MIN,
658 MEM_REGION_DATA);
659
660 stack_vaddr += DEFAULT_STACK_PGS * getpagesize();
661
662 /*
663 * Align stack to match calling sequence requirements in section "The
664 * Stack Frame" of the System V ABI AMD64 Architecture Processor
665 * Supplement, which requires the value (%rsp + 8) to be a multiple of
666 * 16 when control is transferred to the function entry point.
667 *
668 * If this code is ever used to launch a vCPU with 32-bit entry point it
669 * may need to subtract 4 bytes instead of 8 bytes.
670 */
671 TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE),
672 "__vm_vaddr_alloc() did not provide a page-aligned address");
673 stack_vaddr -= 8;
674
675 vcpu = __vm_vcpu_add(vm, vcpu_id);
676 vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
677 vcpu_init_sregs(vm, vcpu);
678
679 /* Setup guest general purpose registers */
680 vcpu_regs_get(vcpu, &regs);
681 regs.rflags = regs.rflags | 0x2;
682 regs.rsp = stack_vaddr;
683 vcpu_regs_set(vcpu, &regs);
684
685 /* Setup the MP state */
686 mp_state.mp_state = 0;
687 vcpu_mp_state_set(vcpu, &mp_state);
688
689 return vcpu;
690 }
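
/*
 * Editor's note, a worked example for the stack alignment above: the
 * allocation is page-aligned, so the initial stack top is a multiple of 4096
 * and hence of 16.  Subtracting 8 leaves %rsp % 16 == 8, i.e.
 * (%rsp + 8) % 16 == 0, which is exactly what the psABI guarantees at a
 * function entry point after a CALL has pushed the 8-byte return address.
 */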
691
692 struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id)
693 {
694 struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id);
695
696 vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid());
697
698 return vcpu;
699 }
700
701 void vcpu_arch_free(struct kvm_vcpu *vcpu)
702 {
703 if (vcpu->cpuid)
704 free(vcpu->cpuid);
705 }
706
707 /* Do not use kvm_supported_cpuid directly except for validity checks. */
708 static void *kvm_supported_cpuid;
709
710 const struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
711 {
712 int kvm_fd;
713
714 if (kvm_supported_cpuid)
715 return kvm_supported_cpuid;
716
717 kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
718 kvm_fd = open_kvm_dev_path_or_exit();
719
720 kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID,
721 (struct kvm_cpuid2 *)kvm_supported_cpuid);
722
723 close(kvm_fd);
724 return kvm_supported_cpuid;
725 }
726
727 static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid,
728 uint32_t function, uint32_t index,
729 uint8_t reg, uint8_t lo, uint8_t hi)
730 {
731 const struct kvm_cpuid_entry2 *entry;
732 int i;
733
734 for (i = 0; i < cpuid->nent; i++) {
735 entry = &cpuid->entries[i];
736
737 /*
738 * The output registers in kvm_cpuid_entry2 are in alphabetical
739 * order, but kvm_x86_cpu_feature matches that mess, so yay
740 * pointer shenanigans!
741 */
742 if (entry->function == function && entry->index == index)
743 return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo;
744 }
745
746 return 0;
747 }
748
749 bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid,
750 struct kvm_x86_cpu_feature feature)
751 {
752 return __kvm_cpu_has(cpuid, feature.function, feature.index,
753 feature.reg, feature.bit, feature.bit);
754 }
755
756 uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid,
757 struct kvm_x86_cpu_property property)
758 {
759 return __kvm_cpu_has(cpuid, property.function, property.index,
760 property.reg, property.lo_bit, property.hi_bit);
761 }
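
/*
 * Illustrative sketch (editor's addition): the two helpers above back the
 * kvm_cpu_has() and kvm_cpu_property() convenience macros (assumed from
 * processor.h), e.g. to gate a test on XSAVE and report the supported
 * physical address width:
 *
 *	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));
 *
 *	if (kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))
 *		pr_info("MAXPHYADDR: %u\n",
 *			kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR));
 */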
762
763 uint64_t kvm_get_feature_msr(uint64_t msr_index)
764 {
765 struct {
766 struct kvm_msrs header;
767 struct kvm_msr_entry entry;
768 } buffer = {};
769 int r, kvm_fd;
770
771 buffer.header.nmsrs = 1;
772 buffer.entry.index = msr_index;
773 kvm_fd = open_kvm_dev_path_or_exit();
774
775 r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header);
776 TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r));
777
778 close(kvm_fd);
779 return buffer.entry.data;
780 }
781
782 void __vm_xsave_require_permission(uint64_t xfeature, const char *name)
783 {
784 int kvm_fd;
785 u64 bitmask;
786 long rc;
787 struct kvm_device_attr attr = {
788 .group = 0,
789 .attr = KVM_X86_XCOMP_GUEST_SUPP,
790 .addr = (unsigned long) &bitmask,
791 };
792
793 TEST_ASSERT(!kvm_supported_cpuid,
794 "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM");
795
796 TEST_ASSERT(is_power_of_2(xfeature),
797 "Dynamic XFeatures must be enabled one at a time");
798
799 kvm_fd = open_kvm_dev_path_or_exit();
800 rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr);
801 close(kvm_fd);
802
803 if (rc == -1 && (errno == ENXIO || errno == EINVAL))
804 __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported");
805
806 TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc);
807
808 __TEST_REQUIRE(bitmask & xfeature,
809 "Required XSAVE feature '%s' not supported", name);
810
811 TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature)));
812
813 rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask);
814 TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc);
815 TEST_ASSERT(bitmask & xfeature,
816 "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx",
817 name, xfeature, bitmask);
818 }
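
/*
 * Illustrative sketch (editor's addition): callers normally use the
 * vm_xsave_require_permission() wrapper (assumed to stringify the feature for
 * @name), and per the assertion above must do so before the first
 * kvm_get_supported_cpuid() call, e.g. for AMX tile data:
 *
 *	vm_xsave_require_permission(XFEATURE_MASK_XTILE_DATA);
 *	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 *
 * The XFEATURE_MASK_XTILE_DATA constant name is an assumption; use whatever
 * the local headers define for XSTATE component 18.
 */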
819
820 void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid)
821 {
822 TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID");
823
824 /* Allow overriding the default CPUID. */
825 if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) {
826 free(vcpu->cpuid);
827 vcpu->cpuid = NULL;
828 }
829
830 if (!vcpu->cpuid)
831 vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent);
832
833 memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
834 vcpu_set_cpuid(vcpu);
835 }
836
837 void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu,
838 struct kvm_x86_cpu_property property,
839 uint32_t value)
840 {
841 struct kvm_cpuid_entry2 *entry;
842
843 entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index);
844
845 (&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit);
846 (&entry->eax)[property.reg] |= value << property.lo_bit;
847
848 vcpu_set_cpuid(vcpu);
849
850 /* Sanity check that @value doesn't exceed the bounds in any way. */
851 TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value);
852 }
853
854 void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function)
855 {
856 struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function);
857
858 entry->eax = 0;
859 entry->ebx = 0;
860 entry->ecx = 0;
861 entry->edx = 0;
862 vcpu_set_cpuid(vcpu);
863 }
864
865 void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu,
866 struct kvm_x86_cpu_feature feature,
867 bool set)
868 {
869 struct kvm_cpuid_entry2 *entry;
870 u32 *reg;
871
872 entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index);
873 reg = (&entry->eax) + feature.reg;
874
875 if (set)
876 *reg |= BIT(feature.bit);
877 else
878 *reg &= ~BIT(feature.bit);
879
880 vcpu_set_cpuid(vcpu);
881 }
882
883 uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index)
884 {
885 struct {
886 struct kvm_msrs header;
887 struct kvm_msr_entry entry;
888 } buffer = {};
889
890 buffer.header.nmsrs = 1;
891 buffer.entry.index = msr_index;
892
893 vcpu_msrs_get(vcpu, &buffer.header);
894
895 return buffer.entry.data;
896 }
897
898 int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value)
899 {
900 struct {
901 struct kvm_msrs header;
902 struct kvm_msr_entry entry;
903 } buffer = {};
904
905 memset(&buffer, 0, sizeof(buffer));
906 buffer.header.nmsrs = 1;
907 buffer.entry.index = msr_index;
908 buffer.entry.data = msr_value;
909
910 return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header);
911 }
912
913 void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...)
914 {
915 va_list ap;
916 struct kvm_regs regs;
917
918 TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
919 " num: %u",
920 num);
921
922 va_start(ap, num);
923 vcpu_regs_get(vcpu, &regs);
924
925 if (num >= 1)
926 regs.rdi = va_arg(ap, uint64_t);
927
928 if (num >= 2)
929 regs.rsi = va_arg(ap, uint64_t);
930
931 if (num >= 3)
932 regs.rdx = va_arg(ap, uint64_t);
933
934 if (num >= 4)
935 regs.rcx = va_arg(ap, uint64_t);
936
937 if (num >= 5)
938 regs.r8 = va_arg(ap, uint64_t);
939
940 if (num >= 6)
941 regs.r9 = va_arg(ap, uint64_t);
942
943 vcpu_regs_set(vcpu, &regs);
944 va_end(ap);
945 }
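
/*
 * Illustrative sketch (editor's addition): the values set here land in %rdi,
 * %rsi, etc. per the SysV calling convention, so the guest receives them as
 * ordinary function parameters:
 *
 *	static void guest_code(uint64_t token, uint64_t nr_pages)
 *	{
 *		GUEST_ASSERT(token == 0x1234);
 *		GUEST_DONE();
 *	}
 *
 *	vcpu_args_set(vcpu, 2, 0x1234, nr_pages);
 *	vcpu_run(vcpu);
 */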
946
947 void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
948 {
949 struct kvm_regs regs;
950 struct kvm_sregs sregs;
951
952 fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id);
953
954 fprintf(stream, "%*sregs:\n", indent + 2, "");
955 vcpu_regs_get(vcpu, &regs);
956 regs_dump(stream, &regs, indent + 4);
957
958 fprintf(stream, "%*ssregs:\n", indent + 2, "");
959 vcpu_sregs_get(vcpu, &sregs);
960 sregs_dump(stream, &sregs, indent + 4);
961 }
962
963 static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs)
964 {
965 struct kvm_msr_list *list;
966 struct kvm_msr_list nmsrs;
967 int kvm_fd, r;
968
969 kvm_fd = open_kvm_dev_path_or_exit();
970
971 nmsrs.nmsrs = 0;
972 if (!feature_msrs)
973 r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
974 else
975 r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs);
976
977 TEST_ASSERT(r == -1 && errno == E2BIG,
978 "Expected -E2BIG, got rc: %i errno: %i (%s)",
979 r, errno, strerror(errno));
980
981 list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0]));
982 TEST_ASSERT(list, "-ENOMEM when allocating MSR index list");
983 list->nmsrs = nmsrs.nmsrs;
984
985 if (!feature_msrs)
986 kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
987 else
988 kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list);
989 close(kvm_fd);
990
991 TEST_ASSERT(list->nmsrs == nmsrs.nmsrs,
992 "Number of MSRs in list changed, was %d, now %d",
993 nmsrs.nmsrs, list->nmsrs);
994 return list;
995 }
996
997 const struct kvm_msr_list *kvm_get_msr_index_list(void)
998 {
999 static const struct kvm_msr_list *list;
1000
1001 if (!list)
1002 list = __kvm_get_msr_index_list(false);
1003 return list;
1004 }
1005
1006
1007 const struct kvm_msr_list *kvm_get_feature_msr_index_list(void)
1008 {
1009 static const struct kvm_msr_list *list;
1010
1011 if (!list)
1012 list = __kvm_get_msr_index_list(true);
1013 return list;
1014 }
1015
1016 bool kvm_msr_is_in_save_restore_list(uint32_t msr_index)
1017 {
1018 const struct kvm_msr_list *list = kvm_get_msr_index_list();
1019 int i;
1020
1021 for (i = 0; i < list->nmsrs; ++i) {
1022 if (list->indices[i] == msr_index)
1023 return true;
1024 }
1025
1026 return false;
1027 }
1028
1029 static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu,
1030 struct kvm_x86_state *state)
1031 {
1032 int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2);
1033
1034 if (size) {
1035 state->xsave = malloc(size);
1036 vcpu_xsave2_get(vcpu, state->xsave);
1037 } else {
1038 state->xsave = malloc(sizeof(struct kvm_xsave));
1039 vcpu_xsave_get(vcpu, state->xsave);
1040 }
1041 }
1042
1043 struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu)
1044 {
1045 const struct kvm_msr_list *msr_list = kvm_get_msr_index_list();
1046 struct kvm_x86_state *state;
1047 int i;
1048
1049 static int nested_size = -1;
1050
1051 if (nested_size == -1) {
1052 nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
1053 TEST_ASSERT(nested_size <= sizeof(state->nested_),
1054 "Nested state size too big, %i > %zi",
1055 nested_size, sizeof(state->nested_));
1056 }
1057
1058 /*
1059 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
1060 * guest state is consistent only after userspace re-enters the
1061 * kernel with KVM_RUN. Complete IO prior to migrating state
1062 * to a new VM.
1063 */
1064 vcpu_run_complete_io(vcpu);
1065
1066 state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0]));
1067 TEST_ASSERT(state, "-ENOMEM when allocating kvm state");
1068
1069 vcpu_events_get(vcpu, &state->events);
1070 vcpu_mp_state_get(vcpu, &state->mp_state);
1071 vcpu_regs_get(vcpu, &state->regs);
1072 vcpu_save_xsave_state(vcpu, state);
1073
1074 if (kvm_has_cap(KVM_CAP_XCRS))
1075 vcpu_xcrs_get(vcpu, &state->xcrs);
1076
1077 vcpu_sregs_get(vcpu, &state->sregs);
1078
1079 if (nested_size) {
1080 state->nested.size = sizeof(state->nested_);
1081
1082 vcpu_nested_state_get(vcpu, &state->nested);
1083 TEST_ASSERT(state->nested.size <= nested_size,
1084 "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
1085 state->nested.size, nested_size);
1086 } else {
1087 state->nested.size = 0;
1088 }
1089
1090 state->msrs.nmsrs = msr_list->nmsrs;
1091 for (i = 0; i < msr_list->nmsrs; i++)
1092 state->msrs.entries[i].index = msr_list->indices[i];
1093 vcpu_msrs_get(vcpu, &state->msrs);
1094
1095 vcpu_debugregs_get(vcpu, &state->debugregs);
1096
1097 return state;
1098 }
1099
1100 void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state)
1101 {
1102 vcpu_sregs_set(vcpu, &state->sregs);
1103 vcpu_msrs_set(vcpu, &state->msrs);
1104
1105 if (kvm_has_cap(KVM_CAP_XCRS))
1106 vcpu_xcrs_set(vcpu, &state->xcrs);
1107
1108 vcpu_xsave_set(vcpu, state->xsave);
1109 vcpu_events_set(vcpu, &state->events);
1110 vcpu_mp_state_set(vcpu, &state->mp_state);
1111 vcpu_debugregs_set(vcpu, &state->debugregs);
1112 vcpu_regs_set(vcpu, &state->regs);
1113
1114 if (state->nested.size)
1115 vcpu_nested_state_set(vcpu, &state->nested);
1116 }
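
/*
 * Illustrative sketch (editor's addition): the usual save/restore flow
 * migrates a vCPU to a fresh VM instance (helper names assumed from
 * kvm_util.h, mirroring the x86 state test):
 *
 *	state = vcpu_save_state(vcpu);
 *	kvm_vm_release(vm);
 *
 *	vcpu = vm_recreate_with_one_vcpu(vm);
 *	vcpu_load_state(vcpu, state);
 *	kvm_x86_state_cleanup(state);
 *	vcpu_run(vcpu);
 */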
1117
1118 void kvm_x86_state_cleanup(struct kvm_x86_state *state)
1119 {
1120 free(state->xsave);
1121 free(state);
1122 }
1123
1124 void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
1125 {
1126 if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) {
1127 *pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32;
1128 *va_bits = 32;
1129 } else {
1130 *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
1131 *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR);
1132 }
1133 }
1134
1135 void kvm_init_vm_address_properties(struct kvm_vm *vm)
1136 {
1137 if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) {
1138 vm->arch.sev_fd = open_sev_dev_path_or_exit();
1139 vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
1140 vm->gpa_tag_mask = vm->arch.c_bit;
1141 } else {
1142 vm->arch.sev_fd = -1;
1143 }
1144 }
1145
1146 const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
1147 uint32_t function, uint32_t index)
1148 {
1149 int i;
1150
1151 for (i = 0; i < cpuid->nent; i++) {
1152 if (cpuid->entries[i].function == function &&
1153 cpuid->entries[i].index == index)
1154 return &cpuid->entries[i];
1155 }
1156
1157 TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
1158
1159 return NULL;
1160 }
1161
1162 #define X86_HYPERCALL(inputs...) \
1163 ({ \
1164 uint64_t r; \
1165 \
1166 asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t" \
1167 "jnz 1f\n\t" \
1168 "vmcall\n\t" \
1169 "jmp 2f\n\t" \
1170 "1: vmmcall\n\t" \
1171 "2:" \
1172 : "=a"(r) \
1173 : [use_vmmcall] "r" (host_cpu_is_amd), inputs); \
1174 \
1175 r; \
1176 })
1177
1178 uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
1179 uint64_t a3)
1180 {
1181 return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
1182 }
1183
1184 uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
1185 {
1186 return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1));
1187 }
1188
1189 void xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
1190 {
1191 GUEST_ASSERT(!__xen_hypercall(nr, a0, a1));
1192 }
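
/*
 * Illustrative sketch (editor's addition): guest code issues a KVM hypercall
 * through the wrapper above and checks RAX; KVM is expected to return
 * -KVM_ENOSYS (from <linux/kvm_para.h>) for an unknown hypercall number, the
 * 0xdeadbeef below being deliberately bogus:
 *
 *	uint64_t ret = kvm_hypercall(0xdeadbeef, 0, 0, 0, 0);
 *
 *	GUEST_ASSERT(ret == (uint64_t)-KVM_ENOSYS);
 */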
1193
1194 unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
1195 {
1196 const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
1197 unsigned long ht_gfn, max_gfn, max_pfn;
1198 uint8_t maxphyaddr, guest_maxphyaddr;
1199
1200 /*
1201 * Use "guest MAXPHYADDR" from KVM if it's available. Guest MAXPHYADDR
1202 * enumerates the max _mappable_ GPA, which can be less than the raw
1203 * MAXPHYADDR, e.g. if MAXPHYADDR=52, KVM is using TDP, and the CPU
1204 * doesn't support 5-level TDP.
1205 */
1206 guest_maxphyaddr = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);
1207 guest_maxphyaddr = guest_maxphyaddr ?: vm->pa_bits;
1208 TEST_ASSERT(guest_maxphyaddr <= vm->pa_bits,
1209 "Guest MAXPHYADDR should never be greater than raw MAXPHYADDR");
1210
1211 max_gfn = (1ULL << (guest_maxphyaddr - vm->page_shift)) - 1;
1212
1213 /* Avoid reserved HyperTransport region on AMD processors. */
1214 if (!host_cpu_is_amd)
1215 return max_gfn;
1216
1217 /* On parts with <40 physical address bits, the area is fully hidden */
1218 if (vm->pa_bits < 40)
1219 return max_gfn;
1220
1221 /* Before family 17h, the HyperTransport area is just below 1T. */
1222 ht_gfn = (1 << 28) - num_ht_pages;
1223 if (this_cpu_family() < 0x17)
1224 goto done;
1225
1226 /*
1227 * Otherwise it's at the top of the physical address space, possibly
1228 * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use
1229 * the old conservative value if MAXPHYADDR is not enumerated.
1230 */
1231 if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR))
1232 goto done;
1233
1234 maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR);
1235 max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1;
1236
1237 if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION))
1238 max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION);
1239
1240 ht_gfn = max_pfn - num_ht_pages;
1241 done:
1242 return min(max_gfn, ht_gfn - 1);
1243 }
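
/*
 * Editor's note, a worked example for the HyperTransport carve-out above
 * (assuming 4KiB pages, page_shift = 12): num_ht_pages = 12 << 18 = 0x300000
 * pages, i.e. 12GiB.  On a pre-family-17h part, ht_gfn = (1 << 28) - 0x300000
 * = 0xfd00000, so the usable range ends 12GiB below the 1TiB boundary.  On
 * family 17h+ with MAXPHYADDR = 48 and no SME reduction, max_pfn =
 * (1 << 36) - 1 and ht_gfn = max_pfn - 0x300000, carving the hole out of the
 * top of the physical address space instead.
 */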
1244
1245 /* Returns true if kvm_intel was loaded with unrestricted_guest=1. */
1246 bool vm_is_unrestricted_guest(struct kvm_vm *vm)
1247 {
1248 /* Ensure that a KVM vendor-specific module is loaded. */
1249 if (vm == NULL)
1250 close(open_kvm_dev_path_or_exit());
1251
1252 return get_kvm_intel_param_bool("unrestricted_guest");
1253 }
1254
1255 void kvm_selftest_arch_init(void)
1256 {
1257 host_cpu_is_intel = this_cpu_is_intel();
1258 host_cpu_is_amd = this_cpu_is_amd();
1259 is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();
1260 }
1261
1262 bool sys_clocksource_is_based_on_tsc(void)
1263 {
1264 char *clk_name = sys_get_cur_clocksource();
1265 bool ret = !strcmp(clk_name, "tsc\n") ||
1266 !strcmp(clk_name, "hyperv_clocksource_tsc_page\n");
1267
1268 free(clk_name);
1269
1270 return ret;
1271 }
1272