xref: /linux/tools/testing/selftests/kvm/lib/x86_64/processor.c (revision df561f6688fef775baa341a0f5d960becd248b11)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * tools/testing/selftests/kvm/lib/x86_64/processor.c
4  *
5  * Copyright (C) 2018, Google LLC.
6  */
7 
8 #define _GNU_SOURCE /* for program_invocation_name */
9 
10 #include "test_util.h"
11 #include "kvm_util.h"
12 #include "../kvm_util_internal.h"
13 #include "processor.h"
14 
15 /* Minimum physical address used for virtual translation tables. */
16 #define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
17 
18 /* Virtual translation table structure declarations */
/*
 * Page map level 4 (PML4) entry: one 64-bit slot of the top-level table
 * located at vm->pgd.  virt_pg_map() stores in @address the guest physical
 * address of the next-level (page directory pointer) table shifted right
 * by vm->page_shift, and sets @writable and @present.  virt_dump() and
 * addr_gva2gpa() follow @address when @present is set.
 *
 * NOTE(review): the bit positions implied by the ignored_* names assume
 * the compiler lays bit-fields out from the least significant bit upward;
 * confirm for any new toolchain.
 */
struct pageMapL4Entry {
	uint64_t present:1;		/* entry refers to a next-level table */
	uint64_t writable:1;		/* set by virt_pg_map() */
	uint64_t user:1;		/* not set by virt_pg_map() */
	uint64_t write_through:1;	/* not set by virt_pg_map() */
	uint64_t cache_disable:1;	/* not set by virt_pg_map() */
	uint64_t accessed:1;		/* not set by virt_pg_map() */
	uint64_t ignored_06:1;
	uint64_t page_size:1;		/* not set by virt_pg_map() */
	uint64_t ignored_11_08:4;
	uint64_t address:40;		/* next-level table gpa >> vm->page_shift */
	uint64_t ignored_62_52:11;
	uint64_t execute_disable:1;	/* printed by virt_dump() */
};
33 
/*
 * Page directory pointer entry: one 64-bit slot of the second-level table
 * reached through a pageMapL4Entry.  virt_pg_map() stores in @address the
 * guest physical address of the page directory shifted right by
 * vm->page_shift, and sets @writable and @present.
 *
 * NOTE(review): the bit positions implied by the ignored_* names assume
 * the compiler lays bit-fields out from the least significant bit upward;
 * confirm for any new toolchain.
 */
struct pageDirectoryPointerEntry {
	uint64_t present:1;		/* entry refers to a page directory */
	uint64_t writable:1;		/* set by virt_pg_map() */
	uint64_t user:1;		/* not set by virt_pg_map() */
	uint64_t write_through:1;	/* not set by virt_pg_map() */
	uint64_t cache_disable:1;	/* not set by virt_pg_map() */
	uint64_t accessed:1;		/* not set by virt_pg_map() */
	uint64_t ignored_06:1;
	uint64_t page_size:1;		/* not set by virt_pg_map() */
	uint64_t ignored_11_08:4;
	uint64_t address:40;		/* page directory gpa >> vm->page_shift */
	uint64_t ignored_62_52:11;
	uint64_t execute_disable:1;	/* printed by virt_dump() */
};
48 
/*
 * Page directory entry: one 64-bit slot of the third-level table reached
 * through a pageDirectoryPointerEntry.  virt_pg_map() stores in @address
 * the guest physical address of the page table shifted right by
 * vm->page_shift, and sets @writable and @present.
 *
 * NOTE(review): the bit positions implied by the ignored_* names assume
 * the compiler lays bit-fields out from the least significant bit upward;
 * confirm for any new toolchain.
 */
struct pageDirectoryEntry {
	uint64_t present:1;		/* entry refers to a page table */
	uint64_t writable:1;		/* set by virt_pg_map() */
	uint64_t user:1;		/* not set by virt_pg_map() */
	uint64_t write_through:1;	/* not set by virt_pg_map() */
	uint64_t cache_disable:1;	/* not set by virt_pg_map() */
	uint64_t accessed:1;		/* not set by virt_pg_map() */
	uint64_t ignored_06:1;
	uint64_t page_size:1;		/* not set by virt_pg_map() */
	uint64_t ignored_11_08:4;
	uint64_t address:40;		/* page table gpa >> vm->page_shift */
	uint64_t ignored_62_52:11;
	uint64_t execute_disable:1;	/* printed by virt_dump() */
};
63 
/*
 * Page table entry: one 64-bit slot of the last-level table reached
 * through a pageDirectoryEntry.  virt_pg_map() stores in @address the
 * mapped guest physical address shifted right by vm->page_shift, and sets
 * @writable and @present.  addr_gva2gpa() multiplies @address by
 * vm->page_size to recover the physical page address.
 *
 * NOTE(review): the bit positions implied by the reserved_* / ignored_*
 * names assume the compiler lays bit-fields out from the least significant
 * bit upward; confirm for any new toolchain.
 */
struct pageTableEntry {
	uint64_t present:1;		/* mapping is valid */
	uint64_t writable:1;		/* set by virt_pg_map() */
	uint64_t user:1;		/* not set by virt_pg_map() */
	uint64_t write_through:1;	/* not set by virt_pg_map() */
	uint64_t cache_disable:1;	/* not set by virt_pg_map() */
	uint64_t accessed:1;		/* not set by virt_pg_map() */
	uint64_t dirty:1;		/* printed by virt_dump() */
	uint64_t reserved_07:1;
	uint64_t global:1;		/* not set by virt_pg_map() */
	uint64_t ignored_11_09:3;
	uint64_t address:40;		/* mapped page gpa >> vm->page_shift */
	uint64_t ignored_62_52:11;
	uint64_t execute_disable:1;	/* printed by virt_dump() */
};
79 
80 void regs_dump(FILE *stream, struct kvm_regs *regs,
81 	       uint8_t indent)
82 {
83 	fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
84 		"rcx: 0x%.16llx rdx: 0x%.16llx\n",
85 		indent, "",
86 		regs->rax, regs->rbx, regs->rcx, regs->rdx);
87 	fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
88 		"rsp: 0x%.16llx rbp: 0x%.16llx\n",
89 		indent, "",
90 		regs->rsi, regs->rdi, regs->rsp, regs->rbp);
91 	fprintf(stream, "%*sr8:  0x%.16llx r9:  0x%.16llx "
92 		"r10: 0x%.16llx r11: 0x%.16llx\n",
93 		indent, "",
94 		regs->r8, regs->r9, regs->r10, regs->r11);
95 	fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
96 		"r14: 0x%.16llx r15: 0x%.16llx\n",
97 		indent, "",
98 		regs->r12, regs->r13, regs->r14, regs->r15);
99 	fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
100 		indent, "",
101 		regs->rip, regs->rflags);
102 }
103 
104 /*
105  * Segment Dump
106  *
107  * Input Args:
108  *   stream  - Output FILE stream
109  *   segment - KVM segment
110  *   indent  - Left margin indent amount
111  *
112  * Output Args: None
113  *
114  * Return: None
115  *
116  * Dumps the state of the KVM segment given by @segment, to the FILE stream
117  * given by @stream.
118  */
119 static void segment_dump(FILE *stream, struct kvm_segment *segment,
120 			 uint8_t indent)
121 {
122 	fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
123 		"selector: 0x%.4x type: 0x%.2x\n",
124 		indent, "", segment->base, segment->limit,
125 		segment->selector, segment->type);
126 	fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
127 		"db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
128 		indent, "", segment->present, segment->dpl,
129 		segment->db, segment->s, segment->l);
130 	fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
131 		"unusable: 0x%.2x padding: 0x%.2x\n",
132 		indent, "", segment->g, segment->avl,
133 		segment->unusable, segment->padding);
134 }
135 
136 /*
137  * dtable Dump
138  *
139  * Input Args:
140  *   stream - Output FILE stream
141  *   dtable - KVM dtable
142  *   indent - Left margin indent amount
143  *
144  * Output Args: None
145  *
146  * Return: None
147  *
148  * Dumps the state of the KVM dtable given by @dtable, to the FILE stream
149  * given by @stream.
150  */
151 static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
152 			uint8_t indent)
153 {
154 	fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
155 		"padding: 0x%.4x 0x%.4x 0x%.4x\n",
156 		indent, "", dtable->base, dtable->limit,
157 		dtable->padding[0], dtable->padding[1], dtable->padding[2]);
158 }
159 
160 void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
161 		uint8_t indent)
162 {
163 	unsigned int i;
164 
165 	fprintf(stream, "%*scs:\n", indent, "");
166 	segment_dump(stream, &sregs->cs, indent + 2);
167 	fprintf(stream, "%*sds:\n", indent, "");
168 	segment_dump(stream, &sregs->ds, indent + 2);
169 	fprintf(stream, "%*ses:\n", indent, "");
170 	segment_dump(stream, &sregs->es, indent + 2);
171 	fprintf(stream, "%*sfs:\n", indent, "");
172 	segment_dump(stream, &sregs->fs, indent + 2);
173 	fprintf(stream, "%*sgs:\n", indent, "");
174 	segment_dump(stream, &sregs->gs, indent + 2);
175 	fprintf(stream, "%*sss:\n", indent, "");
176 	segment_dump(stream, &sregs->ss, indent + 2);
177 	fprintf(stream, "%*str:\n", indent, "");
178 	segment_dump(stream, &sregs->tr, indent + 2);
179 	fprintf(stream, "%*sldt:\n", indent, "");
180 	segment_dump(stream, &sregs->ldt, indent + 2);
181 
182 	fprintf(stream, "%*sgdt:\n", indent, "");
183 	dtable_dump(stream, &sregs->gdt, indent + 2);
184 	fprintf(stream, "%*sidt:\n", indent, "");
185 	dtable_dump(stream, &sregs->idt, indent + 2);
186 
187 	fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
188 		"cr3: 0x%.16llx cr4: 0x%.16llx\n",
189 		indent, "",
190 		sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
191 	fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
192 		"apic_base: 0x%.16llx\n",
193 		indent, "",
194 		sregs->cr8, sregs->efer, sregs->apic_base);
195 
196 	fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
197 	for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
198 		fprintf(stream, "%*s%.16llx\n", indent + 2, "",
199 			sregs->interrupt_bitmap[i]);
200 	}
201 }
202 
203 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
204 {
205 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
206 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
207 
208 	/* If needed, create page map l4 table. */
209 	if (!vm->pgd_created) {
210 		vm_paddr_t paddr = vm_phy_page_alloc(vm,
211 			KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
212 		vm->pgd = paddr;
213 		vm->pgd_created = true;
214 	}
215 }
216 
217 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
218 	uint32_t pgd_memslot)
219 {
220 	uint16_t index[4];
221 	struct pageMapL4Entry *pml4e;
222 
223 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
224 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
225 
226 	TEST_ASSERT((vaddr % vm->page_size) == 0,
227 		"Virtual address not on page boundary,\n"
228 		"  vaddr: 0x%lx vm->page_size: 0x%x",
229 		vaddr, vm->page_size);
230 	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
231 		(vaddr >> vm->page_shift)),
232 		"Invalid virtual address, vaddr: 0x%lx",
233 		vaddr);
234 	TEST_ASSERT((paddr % vm->page_size) == 0,
235 		"Physical address not on page boundary,\n"
236 		"  paddr: 0x%lx vm->page_size: 0x%x",
237 		paddr, vm->page_size);
238 	TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
239 		"Physical address beyond beyond maximum supported,\n"
240 		"  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
241 		paddr, vm->max_gfn, vm->page_size);
242 
243 	index[0] = (vaddr >> 12) & 0x1ffu;
244 	index[1] = (vaddr >> 21) & 0x1ffu;
245 	index[2] = (vaddr >> 30) & 0x1ffu;
246 	index[3] = (vaddr >> 39) & 0x1ffu;
247 
248 	/* Allocate page directory pointer table if not present. */
249 	pml4e = addr_gpa2hva(vm, vm->pgd);
250 	if (!pml4e[index[3]].present) {
251 		pml4e[index[3]].address = vm_phy_page_alloc(vm,
252 			KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
253 			>> vm->page_shift;
254 		pml4e[index[3]].writable = true;
255 		pml4e[index[3]].present = true;
256 	}
257 
258 	/* Allocate page directory table if not present. */
259 	struct pageDirectoryPointerEntry *pdpe;
260 	pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
261 	if (!pdpe[index[2]].present) {
262 		pdpe[index[2]].address = vm_phy_page_alloc(vm,
263 			KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
264 			>> vm->page_shift;
265 		pdpe[index[2]].writable = true;
266 		pdpe[index[2]].present = true;
267 	}
268 
269 	/* Allocate page table if not present. */
270 	struct pageDirectoryEntry *pde;
271 	pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
272 	if (!pde[index[1]].present) {
273 		pde[index[1]].address = vm_phy_page_alloc(vm,
274 			KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
275 			>> vm->page_shift;
276 		pde[index[1]].writable = true;
277 		pde[index[1]].present = true;
278 	}
279 
280 	/* Fill in page table entry. */
281 	struct pageTableEntry *pte;
282 	pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
283 	pte[index[0]].address = paddr >> vm->page_shift;
284 	pte[index[0]].writable = true;
285 	pte[index[0]].present = 1;
286 }
287 
/*
 * Virtual Translation Tables Dump
 *
 * Input Args:
 *   stream - Output FILE stream
 *   vm     - Virtual Machine
 *   indent - Left margin indent amount
 *
 * Output Args: None
 *
 * Return: None
 *
 * Walks all four levels of @vm's translation tables starting at vm->pgd
 * and prints one line to @stream for every present entry.  Nothing is
 * printed when the top-level table has not been created yet.
 *
 * NOTE(review): the entry index is a pointer difference printed with
 * "%zx"; confirm that is acceptable for every supported toolchain.
 */
void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
	struct pageMapL4Entry *pml4e, *pml4e_start;
	struct pageDirectoryPointerEntry *pdpe, *pdpe_start;
	struct pageDirectoryEntry *pde, *pde_start;
	struct pageTableEntry *pte, *pte_start;

	/* No tables exist until virt_pgd_alloc() has run. */
	if (!vm->pgd_created)
		return;

	/* Two-line column header ("no" sits above "exec"). */
	fprintf(stream, "%*s                                          "
		"                no\n", indent, "");
	fprintf(stream, "%*s      index hvaddr         gpaddr         "
		"addr         w exec dirty\n",
		indent, "");
	pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm,
		vm->pgd);
	/* Each table level holds 0x200 entries; absent entries are skipped. */
	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
		pml4e = &pml4e_start[n1];
		if (!pml4e->present)
			continue;
		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
			" %u\n",
			indent, "",
			pml4e - pml4e_start, pml4e,
			addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address,
			pml4e->writable, pml4e->execute_disable);

		/* Descend into the page directory pointer table. */
		pdpe_start = addr_gpa2hva(vm, pml4e->address
			* vm->page_size);
		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
			pdpe = &pdpe_start[n2];
			if (!pdpe->present)
				continue;
			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
				"%u  %u\n",
				indent, "",
				pdpe - pdpe_start, pdpe,
				addr_hva2gpa(vm, pdpe),
				(uint64_t) pdpe->address, pdpe->writable,
				pdpe->execute_disable);

			/* Descend into the page directory. */
			pde_start = addr_gpa2hva(vm,
				pdpe->address * vm->page_size);
			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
				pde = &pde_start[n3];
				if (!pde->present)
					continue;
				fprintf(stream, "%*spde   0x%-3zx %p "
					"0x%-12lx 0x%-10lx %u  %u\n",
					indent, "", pde - pde_start, pde,
					addr_hva2gpa(vm, pde),
					(uint64_t) pde->address, pde->writable,
					pde->execute_disable);

				/* Descend into the page table. */
				pte_start = addr_gpa2hva(vm,
					pde->address * vm->page_size);
				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
					pte = &pte_start[n4];
					if (!pte->present)
						continue;
					/*
					 * The last column combines the four
					 * 9-bit table indexes into one value.
					 */
					fprintf(stream, "%*spte   0x%-3zx %p "
						"0x%-12lx 0x%-10lx %u  %u "
						"    %u    0x%-10lx\n",
						indent, "",
						pte - pte_start, pte,
						addr_hva2gpa(vm, pte),
						(uint64_t) pte->address,
						pte->writable,
						pte->execute_disable,
						pte->dirty,
						((uint64_t) n1 << 27)
							| ((uint64_t) n2 << 18)
							| ((uint64_t) n3 << 9)
							| ((uint64_t) n4));
				}
			}
		}
	}
}
368 
369 /*
370  * Set Unusable Segment
371  *
372  * Input Args: None
373  *
374  * Output Args:
375  *   segp - Pointer to segment register
376  *
377  * Return: None
378  *
379  * Sets the segment register pointed to by @segp to an unusable state.
380  */
381 static void kvm_seg_set_unusable(struct kvm_segment *segp)
382 {
383 	memset(segp, 0, sizeof(*segp));
384 	segp->unusable = true;
385 }
386 
387 static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
388 {
389 	void *gdt = addr_gva2hva(vm, vm->gdt);
390 	struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
391 
392 	desc->limit0 = segp->limit & 0xFFFF;
393 	desc->base0 = segp->base & 0xFFFF;
394 	desc->base1 = segp->base >> 16;
395 	desc->s = segp->s;
396 	desc->type = segp->type;
397 	desc->dpl = segp->dpl;
398 	desc->p = segp->present;
399 	desc->limit1 = segp->limit >> 16;
400 	desc->l = segp->l;
401 	desc->db = segp->db;
402 	desc->g = segp->g;
403 	desc->base2 = segp->base >> 24;
404 	if (!segp->s)
405 		desc->base3 = segp->base >> 32;
406 }
407 
408 
409 /*
410  * Set Long Mode Flat Kernel Code Segment
411  *
412  * Input Args:
413  *   vm - VM whose GDT is being filled, or NULL to only write segp
414  *   selector - selector value
415  *
416  * Output Args:
417  *   segp - Pointer to KVM segment
418  *
419  * Return: None
420  *
421  * Sets up the KVM segment pointed to by @segp, to be a code segment
422  * with the selector value given by @selector.
423  */
424 static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
425 	struct kvm_segment *segp)
426 {
427 	memset(segp, 0, sizeof(*segp));
428 	segp->selector = selector;
429 	segp->limit = 0xFFFFFFFFu;
430 	segp->s = 0x1; /* kTypeCodeData */
431 	segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed
432 					  * | kFlagCodeReadable
433 					  */
434 	segp->g = true;
435 	segp->l = true;
436 	segp->present = 1;
437 	if (vm)
438 		kvm_seg_fill_gdt_64bit(vm, segp);
439 }
440 
441 /*
442  * Set Long Mode Flat Kernel Data Segment
443  *
444  * Input Args:
445  *   vm - VM whose GDT is being filled, or NULL to only write segp
446  *   selector - selector value
447  *
448  * Output Args:
449  *   segp - Pointer to KVM segment
450  *
451  * Return: None
452  *
453  * Sets up the KVM segment pointed to by @segp, to be a data segment
454  * with the selector value given by @selector.
455  */
456 static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
457 	struct kvm_segment *segp)
458 {
459 	memset(segp, 0, sizeof(*segp));
460 	segp->selector = selector;
461 	segp->limit = 0xFFFFFFFFu;
462 	segp->s = 0x1; /* kTypeCodeData */
463 	segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed
464 					  * | kFlagDataWritable
465 					  */
466 	segp->g = true;
467 	segp->present = true;
468 	if (vm)
469 		kvm_seg_fill_gdt_64bit(vm, segp);
470 }
471 
472 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
473 {
474 	uint16_t index[4];
475 	struct pageMapL4Entry *pml4e;
476 	struct pageDirectoryPointerEntry *pdpe;
477 	struct pageDirectoryEntry *pde;
478 	struct pageTableEntry *pte;
479 
480 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
481 		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
482 
483 	index[0] = (gva >> 12) & 0x1ffu;
484 	index[1] = (gva >> 21) & 0x1ffu;
485 	index[2] = (gva >> 30) & 0x1ffu;
486 	index[3] = (gva >> 39) & 0x1ffu;
487 
488 	if (!vm->pgd_created)
489 		goto unmapped_gva;
490 	pml4e = addr_gpa2hva(vm, vm->pgd);
491 	if (!pml4e[index[3]].present)
492 		goto unmapped_gva;
493 
494 	pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
495 	if (!pdpe[index[2]].present)
496 		goto unmapped_gva;
497 
498 	pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
499 	if (!pde[index[1]].present)
500 		goto unmapped_gva;
501 
502 	pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
503 	if (!pte[index[0]].present)
504 		goto unmapped_gva;
505 
506 	return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu);
507 
508 unmapped_gva:
509 	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
510 	exit(EXIT_FAILURE);
511 }
512 
513 static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot,
514 			  int pgd_memslot)
515 {
516 	if (!vm->gdt)
517 		vm->gdt = vm_vaddr_alloc(vm, getpagesize(),
518 			KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
519 
520 	dt->base = vm->gdt;
521 	dt->limit = getpagesize();
522 }
523 
524 static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
525 				int selector, int gdt_memslot,
526 				int pgd_memslot)
527 {
528 	if (!vm->tss)
529 		vm->tss = vm_vaddr_alloc(vm, getpagesize(),
530 			KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
531 
532 	memset(segp, 0, sizeof(*segp));
533 	segp->base = vm->tss;
534 	segp->limit = 0x67;
535 	segp->selector = selector;
536 	segp->type = 0xb;
537 	segp->present = 1;
538 	kvm_seg_fill_gdt_64bit(vm, segp);
539 }
540 
/*
 * VCPU Setup
 *
 * Input Args:
 *   vm          - Virtual Machine
 *   vcpuid      - VCPU ID
 *   pgd_memslot - Memory region slot for translation tables
 *   gdt_memslot - Memory region slot for the GDT and TSS pages
 *
 * Output Args: None
 *
 * Return: None
 *
 * Reads the vCPU's current system registers, rewrites the parts needed to
 * run in the VM's guest mode (descriptor tables, control registers, EFER,
 * segments and CR3) and writes the result back to the vCPU.  Fails the
 * test when the guest mode is not handled.
 */
static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
{
	struct kvm_sregs sregs;

	/* Set mode specific system register values. */
	vcpu_sregs_get(vm, vcpuid, &sregs);

	sregs.idt.limit = 0;

	/* The GDT must exist before the segment helpers below write into it. */
	kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);

	switch (vm->mode) {
	case VM_MODE_PXXV48_4K:
		/* cr0 is overwritten; cr4 and efer keep their other bits. */
		sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
		sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
		sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);

		/* Selectors: 0x8 code, 0x10 data (shared by ds and es), 0x18 TSS. */
		kvm_seg_set_unusable(&sregs.ldt);
		kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs);
		kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds);
		kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es);
		kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot);
		break;

	default:
		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
	}

	/* Point the vCPU at the VM's top-level translation table. */
	sregs.cr3 = vm->pgd;
	vcpu_sregs_set(vm, vcpuid, &sregs);
}
572 
573 void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
574 {
575 	struct kvm_mp_state mp_state;
576 	struct kvm_regs regs;
577 	vm_vaddr_t stack_vaddr;
578 	stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(),
579 				     DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
580 
581 	/* Create VCPU */
582 	vm_vcpu_add(vm, vcpuid);
583 	vcpu_setup(vm, vcpuid, 0, 0);
584 
585 	/* Setup guest general purpose registers */
586 	vcpu_regs_get(vm, vcpuid, &regs);
587 	regs.rflags = regs.rflags | 0x2;
588 	regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize());
589 	regs.rip = (unsigned long) guest_code;
590 	vcpu_regs_set(vm, vcpuid, &regs);
591 
592 	/* Setup the MP state */
593 	mp_state.mp_state = 0;
594 	vcpu_set_mp_state(vm, vcpuid, &mp_state);
595 }
596 
597 /*
598  * Allocate an instance of struct kvm_cpuid2
599  *
600  * Input Args: None
601  *
602  * Output Args: None
603  *
604  * Return: A pointer to the allocated struct. The caller is responsible
605  * for freeing this struct.
606  *
607  * Since kvm_cpuid2 uses a 0-length array to allow a the size of the
608  * array to be decided at allocation time, allocation is slightly
609  * complicated. This function uses a reasonable default length for
610  * the array and performs the appropriate allocation.
611  */
612 static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
613 {
614 	struct kvm_cpuid2 *cpuid;
615 	int nent = 100;
616 	size_t size;
617 
618 	size = sizeof(*cpuid);
619 	size += nent * sizeof(struct kvm_cpuid_entry2);
620 	cpuid = malloc(size);
621 	if (!cpuid) {
622 		perror("malloc");
623 		abort();
624 	}
625 
626 	cpuid->nent = nent;
627 
628 	return cpuid;
629 }
630 
631 /*
632  * KVM Supported CPUID Get
633  *
634  * Input Args: None
635  *
636  * Output Args:
637  *
638  * Return: The supported KVM CPUID
639  *
640  * Get the guest CPUID supported by KVM.
641  */
642 struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
643 {
644 	static struct kvm_cpuid2 *cpuid;
645 	int ret;
646 	int kvm_fd;
647 
648 	if (cpuid)
649 		return cpuid;
650 
651 	cpuid = allocate_kvm_cpuid2();
652 	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
653 	if (kvm_fd < 0)
654 		exit(KSFT_SKIP);
655 
656 	ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid);
657 	TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
658 		    ret, errno);
659 
660 	close(kvm_fd);
661 	return cpuid;
662 }
663 
664 /*
665  * Locate a cpuid entry.
666  *
667  * Input Args:
668  *   function: The function of the cpuid entry to find.
669  *   index: The index of the cpuid entry.
670  *
671  * Output Args: None
672  *
673  * Return: A pointer to the cpuid entry. Never returns NULL.
674  */
675 struct kvm_cpuid_entry2 *
676 kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
677 {
678 	struct kvm_cpuid2 *cpuid;
679 	struct kvm_cpuid_entry2 *entry = NULL;
680 	int i;
681 
682 	cpuid = kvm_get_supported_cpuid();
683 	for (i = 0; i < cpuid->nent; i++) {
684 		if (cpuid->entries[i].function == function &&
685 		    cpuid->entries[i].index == index) {
686 			entry = &cpuid->entries[i];
687 			break;
688 		}
689 	}
690 
691 	TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
692 		    function, index);
693 	return entry;
694 }
695 
696 /*
697  * VM VCPU CPUID Set
698  *
699  * Input Args:
700  *   vm - Virtual Machine
701  *   vcpuid - VCPU id
702  *   cpuid - The CPUID values to set.
703  *
704  * Output Args: None
705  *
706  * Return: void
707  *
708  * Set the VCPU's CPUID.
709  */
710 void vcpu_set_cpuid(struct kvm_vm *vm,
711 		uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
712 {
713 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
714 	int rc;
715 
716 	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
717 
718 	rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
719 	TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
720 		    rc, errno);
721 
722 }
723 
724 struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
725 				 void *guest_code)
726 {
727 	struct kvm_vm *vm;
728 	/*
729 	 * For x86 the maximum page table size for a memory region
730 	 * will be when only 4K pages are used.  In that case the
731 	 * total extra size for page tables (for extra N pages) will
732 	 * be: N/512+N/512^2+N/512^3+... which is definitely smaller
733 	 * than N/512*2.
734 	 */
735 	uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
736 
737 	/* Create VM */
738 	vm = vm_create(VM_MODE_DEFAULT,
739 		       DEFAULT_GUEST_PHY_PAGES + extra_pg_pages,
740 		       O_RDWR);
741 
742 	/* Setup guest code */
743 	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
744 
745 	/* Setup IRQ Chip */
746 	vm_create_irqchip(vm);
747 
748 	/* Add the first vCPU. */
749 	vm_vcpu_add_default(vm, vcpuid, guest_code);
750 
751 	return vm;
752 }
753 
754 /*
755  * VCPU Get MSR
756  *
757  * Input Args:
758  *   vm - Virtual Machine
759  *   vcpuid - VCPU ID
760  *   msr_index - Index of MSR
761  *
762  * Output Args: None
763  *
764  * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
765  *
766  * Get value of MSR for VCPU.
767  */
768 uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
769 {
770 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
771 	struct {
772 		struct kvm_msrs header;
773 		struct kvm_msr_entry entry;
774 	} buffer = {};
775 	int r;
776 
777 	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
778 	buffer.header.nmsrs = 1;
779 	buffer.entry.index = msr_index;
780 	r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header);
781 	TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n"
782 		"  rc: %i errno: %i", r, errno);
783 
784 	return buffer.entry.data;
785 }
786 
787 /*
788  * _VCPU Set MSR
789  *
790  * Input Args:
791  *   vm - Virtual Machine
792  *   vcpuid - VCPU ID
793  *   msr_index - Index of MSR
794  *   msr_value - New value of MSR
795  *
796  * Output Args: None
797  *
798  * Return: The result of KVM_SET_MSRS.
799  *
800  * Sets the value of an MSR for the given VCPU.
801  */
802 int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
803 		  uint64_t msr_value)
804 {
805 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
806 	struct {
807 		struct kvm_msrs header;
808 		struct kvm_msr_entry entry;
809 	} buffer = {};
810 	int r;
811 
812 	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
813 	memset(&buffer, 0, sizeof(buffer));
814 	buffer.header.nmsrs = 1;
815 	buffer.entry.index = msr_index;
816 	buffer.entry.data = msr_value;
817 	r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
818 	return r;
819 }
820 
/*
 * VCPU Set MSR
 *
 * Input Args:
 *   vm - Virtual Machine
 *   vcpuid - VCPU ID
 *   msr_index - Index of MSR
 *   msr_value - New value of MSR
 *
 * Output Args: None
 *
 * Return: On success, nothing. On failure a TEST_ASSERT is produced.
 *
 * Writes one MSR of the given VCPU via _vcpu_set_msr() and asserts that
 * exactly one MSR was reported as set.
 */
void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
	uint64_t msr_value)
{
	int r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value);

	TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
		"  rc: %i errno: %i", r, errno);
}
845 
846 void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
847 {
848 	va_list ap;
849 	struct kvm_regs regs;
850 
851 	TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
852 		    "  num: %u\n",
853 		    num);
854 
855 	va_start(ap, num);
856 	vcpu_regs_get(vm, vcpuid, &regs);
857 
858 	if (num >= 1)
859 		regs.rdi = va_arg(ap, uint64_t);
860 
861 	if (num >= 2)
862 		regs.rsi = va_arg(ap, uint64_t);
863 
864 	if (num >= 3)
865 		regs.rdx = va_arg(ap, uint64_t);
866 
867 	if (num >= 4)
868 		regs.rcx = va_arg(ap, uint64_t);
869 
870 	if (num >= 5)
871 		regs.r8 = va_arg(ap, uint64_t);
872 
873 	if (num >= 6)
874 		regs.r9 = va_arg(ap, uint64_t);
875 
876 	vcpu_regs_set(vm, vcpuid, &regs);
877 	va_end(ap);
878 }
879 
880 void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
881 {
882 	struct kvm_regs regs;
883 	struct kvm_sregs sregs;
884 
885 	fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
886 
887 	fprintf(stream, "%*sregs:\n", indent + 2, "");
888 	vcpu_regs_get(vm, vcpuid, &regs);
889 	regs_dump(stream, &regs, indent + 4);
890 
891 	fprintf(stream, "%*ssregs:\n", indent + 2, "");
892 	vcpu_sregs_get(vm, vcpuid, &sregs);
893 	sregs_dump(stream, &sregs, indent + 4);
894 }
895 
/*
 * Snapshot of one vCPU's state, filled in by vcpu_save_state() and applied
 * to a vCPU by vcpu_load_state().  Each member is the buffer handed to the
 * KVM ioctl pair named next to it.
 */
struct kvm_x86_state {
	struct kvm_vcpu_events events;		/* KVM_{GET,SET}_VCPU_EVENTS */
	struct kvm_mp_state mp_state;		/* KVM_{GET,SET}_MP_STATE */
	struct kvm_regs regs;			/* KVM_{GET,SET}_REGS */
	struct kvm_xsave xsave;			/* KVM_{GET,SET}_XSAVE */
	struct kvm_xcrs xcrs;			/* KVM_{GET,SET}_XCRS, only with KVM_CAP_XCRS */
	struct kvm_sregs sregs;			/* KVM_{GET,SET}_SREGS */
	struct kvm_debugregs debugregs;		/* KVM_{GET,SET}_DEBUGREGS */
	/*
	 * KVM_{GET,SET}_NESTED_STATE.  nested_ fixes the space available to
	 * the nested state; vcpu_save_state() asserts that the size reported
	 * for KVM_CAP_NESTED_STATE fits in it.
	 */
	union {
		struct kvm_nested_state nested;
		char nested_[16384];
	};
	/*
	 * KVM_{GET,SET}_MSRS.  Kept last: vcpu_save_state() allocates room
	 * for the MSR entries directly behind this struct.
	 */
	struct kvm_msrs msrs;
};
910 
911 static int kvm_get_num_msrs_fd(int kvm_fd)
912 {
913 	struct kvm_msr_list nmsrs;
914 	int r;
915 
916 	nmsrs.nmsrs = 0;
917 	r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
918 	TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
919 		r);
920 
921 	return nmsrs.nmsrs;
922 }
923 
924 static int kvm_get_num_msrs(struct kvm_vm *vm)
925 {
926 	return kvm_get_num_msrs_fd(vm->kvm_fd);
927 }
928 
929 struct kvm_msr_list *kvm_get_msr_index_list(void)
930 {
931 	struct kvm_msr_list *list;
932 	int nmsrs, r, kvm_fd;
933 
934 	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
935 	if (kvm_fd < 0)
936 		exit(KSFT_SKIP);
937 
938 	nmsrs = kvm_get_num_msrs_fd(kvm_fd);
939 	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
940 	list->nmsrs = nmsrs;
941 	r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
942 	close(kvm_fd);
943 
944 	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
945 		r);
946 
947 	return list;
948 }
949 
950 struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
951 {
952 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
953 	struct kvm_msr_list *list;
954 	struct kvm_x86_state *state;
955 	int nmsrs, r, i;
956 	static int nested_size = -1;
957 
958 	if (nested_size == -1) {
959 		nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
960 		TEST_ASSERT(nested_size <= sizeof(state->nested_),
961 			    "Nested state size too big, %i > %zi",
962 			    nested_size, sizeof(state->nested_));
963 	}
964 
965 	/*
966 	 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
967 	 * guest state is consistent only after userspace re-enters the
968 	 * kernel with KVM_RUN.  Complete IO prior to migrating state
969 	 * to a new VM.
970 	 */
971 	vcpu_run_complete_io(vm, vcpuid);
972 
973 	nmsrs = kvm_get_num_msrs(vm);
974 	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
975 	list->nmsrs = nmsrs;
976 	r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
977         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
978                 r);
979 
980 	state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
981 	r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
982         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
983                 r);
984 
985 	r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
986         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
987                 r);
988 
989 	r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
990         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
991                 r);
992 
993 	r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave);
994         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
995                 r);
996 
997 	if (kvm_check_cap(KVM_CAP_XCRS)) {
998 		r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
999 		TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i",
1000 			    r);
1001 	}
1002 
1003 	r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
1004         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
1005                 r);
1006 
1007 	if (nested_size) {
1008 		state->nested.size = sizeof(state->nested_);
1009 		r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
1010 		TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
1011 			r);
1012 		TEST_ASSERT(state->nested.size <= nested_size,
1013 			"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
1014 			state->nested.size, nested_size);
1015 	} else
1016 		state->nested.size = 0;
1017 
1018 	state->msrs.nmsrs = nmsrs;
1019 	for (i = 0; i < nmsrs; i++)
1020 		state->msrs.entries[i].index = list->indices[i];
1021 	r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
1022         TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
1023                 r, r == nmsrs ? -1 : list->indices[r]);
1024 
1025 	r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
1026         TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
1027                 r);
1028 
1029 	free(list);
1030 	return state;
1031 }
1032 
1033 void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state)
1034 {
1035 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
1036 	int r;
1037 
1038 	r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave);
1039         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
1040                 r);
1041 
1042 	if (kvm_check_cap(KVM_CAP_XCRS)) {
1043 		r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs);
1044 		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i",
1045 			    r);
1046 	}
1047 
1048 	r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
1049         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
1050                 r);
1051 
1052 	r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
1053         TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
1054                 r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index);
1055 
1056 	r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
1057         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
1058                 r);
1059 
1060 	r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
1061         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
1062                 r);
1063 
1064 	r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
1065         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
1066                 r);
1067 
1068 	r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
1069         TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
1070                 r);
1071 
1072 	if (state->nested.size) {
1073 		r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
1074 		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
1075 			r);
1076 	}
1077 }
1078 
/*
 * Is Intel CPU
 *
 * Input Args: None
 *
 * Output Args: None
 *
 * Return: true when the CPU this runs on reports the vendor string
 * "GenuineIntel", false otherwise.
 *
 * Executes CPUID with EAX = 0 and ECX = 0 and compares the EBX, EDX and
 * ECX outputs, in that order, against the three 4-byte pieces of
 * "GenuineIntel".
 */
bool is_intel_cpu(void)
{
	uint32_t eax, ebx, ecx, edx;
	uint32_t chunk[3];
	const int leaf = 0;

	__asm__ __volatile__(
		"cpuid"
		: /* output */ "=a"(eax), "=b"(ebx),
		  "=c"(ecx), "=d"(edx)
		: /* input */ "0"(leaf), "2"(0));

	/*
	 * Copy the vendor string into correctly typed and aligned storage
	 * rather than reading the character array through a uint32_t
	 * pointer.
	 */
	memcpy(chunk, "GenuineIntel", sizeof(chunk));
	return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
}
1094 
1095 uint32_t kvm_get_cpuid_max_basic(void)
1096 {
1097 	return kvm_get_supported_cpuid_entry(0)->eax;
1098 }
1099 
1100 uint32_t kvm_get_cpuid_max_extended(void)
1101 {
1102 	return kvm_get_supported_cpuid_entry(0x80000000)->eax;
1103 }
1104 
1105 void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
1106 {
1107 	struct kvm_cpuid_entry2 *entry;
1108 	bool pae;
1109 
1110 	/* SDM 4.1.4 */
1111 	if (kvm_get_cpuid_max_extended() < 0x80000008) {
1112 		pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
1113 		*pa_bits = pae ? 36 : 32;
1114 		*va_bits = 32;
1115 	} else {
1116 		entry = kvm_get_supported_cpuid_entry(0x80000008);
1117 		*pa_bits = entry->eax & 0xff;
1118 		*va_bits = (entry->eax >> 8) & 0xff;
1119 	}
1120 }
1121