xref: /illumos-gate/usr/src/lib/libvmm/libvmm.c (revision 95adbecaacb2fe97eb8c9e49e7c1d6910c577bba)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * Library for native code to access bhyve VMs, without the need to use
18  * FreeBSD compat headers
19  */
20 
21 #include <sys/param.h>
22 #include <sys/list.h>
23 #include <sys/stddef.h>
24 #include <sys/mman.h>
25 #include <sys/kdi_regs.h>
26 #include <sys/sysmacros.h>
27 #include <sys/controlregs.h>
28 #include <sys/note.h>
29 #include <sys/debug.h>
30 #include <errno.h>
31 #include <stdlib.h>
32 #include <strings.h>
33 #include <unistd.h>
34 #include <assert.h>
35 
36 #include <machine/vmm.h>
37 #include <vmmapi.h>
38 
39 #include <libvmm.h>
40 
41 typedef struct vmm_memseg vmm_memseg_t;
42 
43 #define	VMM_MEMSEG_DEVMEM	0x1
44 
45 struct vmm_memseg {
46 	list_node_t vms_list;
47 	int vms_segid;
48 	int vms_prot;
49 	int vms_flags;
50 	uintptr_t vms_gpa;
51 	off_t vms_segoff;
52 	size_t vms_seglen;
53 	size_t vms_maplen;
54 	char vms_name[64];
55 };
56 
57 struct vmm {
58 	struct vmctx *vmm_ctx;
59 	list_t vmm_memlist;
60 	char *vmm_mem;
61 	size_t vmm_memsize;
62 	size_t vmm_ncpu;
63 };
64 
65 
66 /*
67  * This code relies on two assumptions:
68  * - CPUs are never removed from the "active set", not even when suspended.
69  *   A CPU being active just means that it has been used by the guest OS.
70  * - The CPU numbering is consecutive.
71  */
72 static void
73 vmm_update_ncpu(vmm_t *vmm)
74 {
75 	cpuset_t cpuset;
76 
77 	assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0);
78 
79 	for (vmm->vmm_ncpu = 0;
80 	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
81 	    vmm->vmm_ncpu++)
82 		;
83 }
84 
85 vmm_t *
86 vmm_open_vm(const char *name)
87 {
88 	vmm_t *vmm = NULL;
89 
90 	vmm = malloc(sizeof (vmm_t));
91 	if (vmm == NULL)
92 		return (NULL);
93 
94 	bzero(vmm, sizeof (vmm_t));
95 	vmm->vmm_mem = MAP_FAILED;
96 
97 	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
98 	    offsetof(vmm_memseg_t, vms_list));
99 
100 	vmm->vmm_ctx = vm_open(name);
101 	if (vmm->vmm_ctx == NULL) {
102 		free(vmm);
103 		return (NULL);
104 	}
105 
106 	vmm_update_ncpu(vmm);
107 
108 	/*
109 	 * If we open a VM that has just been created we may see a state
110 	 * where it has no CPUs configured yet. We'll just wait for 10ms
111 	 * and retry until we get a non-zero CPU count.
112 	 */
113 	if (vmm->vmm_ncpu == 0) {
114 		do {
115 			(void) usleep(10000);
116 			vmm_update_ncpu(vmm);
117 		} while (vmm->vmm_ncpu == 0);
118 	}
119 
120 	return (vmm);
121 }
122 
123 void
124 vmm_close_vm(vmm_t *vmm)
125 {
126 	vmm_unmap(vmm);
127 
128 	list_destroy(&vmm->vmm_memlist);
129 
130 	if (vmm->vmm_ctx != NULL)
131 		vm_close(vmm->vmm_ctx);
132 
133 	free(vmm);
134 }
135 
136 static vmm_memseg_t *
137 vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
138 {
139 	vmm_memseg_t ms, *ret;
140 	int error, flags;
141 
142 	bzero(&ms, sizeof (vmm_memseg_t));
143 	ms.vms_gpa = gpa;
144 	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
145 	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
146 	if (error)
147 		return (NULL);
148 
149 	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
150 	    ms.vms_name, sizeof (ms.vms_name));
151 	if (error)
152 		return (NULL);
153 
154 	/*
155 	 * Regular memory segments don't have a name, but devmem segments do.
156 	 * We can use that information to set the DEVMEM flag if necessary.
157 	 */
158 	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;
159 
160 	ret = malloc(sizeof (vmm_memseg_t));
161 	if (ret == NULL)
162 		return (NULL);
163 
164 	*ret = ms;
165 
166 	return (ret);
167 }
168 
169 int
170 vmm_map(vmm_t *vmm, boolean_t writable)
171 {
172 	uintptr_t last_gpa = 0;
173 	vmm_memseg_t *ms;
174 	int prot_write = writable ? PROT_WRITE : 0;
175 
176 	if (vmm->vmm_mem != MAP_FAILED) {
177 		errno = EINVAL;
178 		return (-1);
179 	}
180 
181 	assert(list_is_empty(&vmm->vmm_memlist));
182 
183 	for (;;) {
184 		ms = vmm_get_memseg(vmm, last_gpa);
185 
186 		if (ms == NULL)
187 			break;
188 
189 		last_gpa = ms->vms_gpa + ms->vms_maplen;
190 		list_insert_tail(&vmm->vmm_memlist, ms);
191 	}
192 
193 	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
194 	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);
195 
196 	if (vmm->vmm_mem == MAP_FAILED)
197 		goto fail;
198 
199 	for (ms = list_head(&vmm->vmm_memlist);
200 	    ms != NULL;
201 	    ms = list_next(&vmm->vmm_memlist, ms)) {
202 		off_t mapoff = ms->vms_gpa;
203 
204 		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) &&
205 		    vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid, &mapoff)
206 		    != 0)
207 			goto fail;
208 
209 		vmm->vmm_memsize += ms->vms_maplen;
210 
211 		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
212 		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
213 		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
214 			goto fail;
215 	}
216 
217 	return (0);
218 
219 fail:
220 	vmm_unmap(vmm);
221 
222 	return (-1);
223 }
224 
225 void
226 vmm_unmap(vmm_t *vmm)
227 {
228 	while (!list_is_empty(&vmm->vmm_memlist)) {
229 		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);
230 
231 		if (vmm->vmm_mem != MAP_FAILED) {
232 			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
233 			    ms->vms_maplen);
234 		}
235 
236 		free(ms);
237 	}
238 
239 	if (vmm->vmm_mem != MAP_FAILED)
240 		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);
241 
242 	vmm->vmm_mem = MAP_FAILED;
243 	vmm->vmm_memsize = 0;
244 }
245 
246 ssize_t
247 vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
248 {
249 	ssize_t count = 0;
250 	vmm_memseg_t *ms;
251 	ssize_t res = len;
252 
253 	for (ms = list_head(&vmm->vmm_memlist);
254 	    ms != NULL && len != 0;
255 	    ms = list_next(&vmm->vmm_memlist, ms)) {
256 
257 		if (addr >= ms->vms_gpa &&
258 		    addr < ms->vms_gpa + ms->vms_maplen) {
259 			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
260 
261 			if (res < 0)
262 				res = 0;
263 
264 			bcopy(vmm->vmm_mem + addr, buf, len - res);
265 			count += len - res;
266 			addr += len - res;
267 			len = res;
268 		}
269 	}
270 
271 	if (res)
272 		errno = EFAULT;
273 	else
274 		errno = 0;
275 
276 	return (count);
277 }
278 
279 ssize_t
280 vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
281 {
282 	ssize_t count = 0;
283 	vmm_memseg_t *ms;
284 	ssize_t res = len;
285 
286 	for (ms = list_head(&vmm->vmm_memlist);
287 	    ms != NULL;
288 	    ms = list_next(&vmm->vmm_memlist, ms)) {
289 		if (addr >= ms->vms_gpa &&
290 		    addr < ms->vms_gpa + ms->vms_maplen) {
291 			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
292 
293 			if (res < 0)
294 				res = 0;
295 
296 			bcopy(buf, vmm->vmm_mem + addr, len - res);
297 			count += len - res;
298 			addr += len - res;
299 			len = res;
300 		}
301 	}
302 
303 	if (res)
304 		errno = EFAULT;
305 	else
306 		errno = 0;
307 
308 	return (count);
309 }
310 
311 size_t
312 vmm_ncpu(vmm_t *vmm)
313 {
314 	return (vmm->vmm_ncpu);
315 }
316 
317 size_t
318 vmm_memsize(vmm_t *vmm)
319 {
320 	return (vmm->vmm_memsize);
321 }
322 
323 int
324 vmm_cont(vmm_t *vmm)
325 {
326 	return (vm_resume_cpu(vmm->vmm_ctx, -1));
327 }
328 
329 int
330 vmm_step(vmm_t *vmm, int vcpu)
331 {
332 	cpuset_t cpuset;
333 	int ret;
334 
335 	if (vcpu >= vmm->vmm_ncpu) {
336 		errno = EINVAL;
337 		return (-1);
338 	}
339 
340 	ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
341 	if (ret != 0)
342 		return (-1);
343 
344 	assert(vm_resume_cpu(vmm->vmm_ctx, vcpu) == 0);
345 
346 	do {
347 		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
348 	} while (!CPU_ISSET(vcpu, &cpuset));
349 
350 	(void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);
351 
352 	return (ret);
353 }
354 
355 int
356 vmm_stop(vmm_t *vmm)
357 {
358 	int ret = vm_suspend_cpu(vmm->vmm_ctx, -1);
359 
360 	if (ret == 0)
361 		vmm_update_ncpu(vmm);
362 
363 	return (ret);
364 }
365 
366 /*
367  * Mapping of KDI-defined registers to vmmapi-defined registers.
368  * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
369  * causes an error in vm_{get,set}_register_set().
370  *
371  * This array must be kept in sync with the definitions in kdi_regs.h.
372  */
373 static int vmm_kdi_regmap[] = {
374 	VM_REG_LAST,		/* KDIREG_SAVFP */
375 	VM_REG_LAST,		/* KDIREG_SAVPC */
376 	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
377 	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
378 	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
379 	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
380 	VM_REG_GUEST_R8,	/* KDIREG_R8 */
381 	VM_REG_GUEST_R9,	/* KDIREG_R9 */
382 	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
383 	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
384 	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
385 	VM_REG_GUEST_R10,	/* KDIREG_R10 */
386 	VM_REG_GUEST_R11,	/* KDIREG_R11 */
387 	VM_REG_GUEST_R12,	/* KDIREG_R12 */
388 	VM_REG_GUEST_R13,	/* KDIREG_R13 */
389 	VM_REG_GUEST_R14,	/* KDIREG_R14 */
390 	VM_REG_GUEST_R15,	/* KDIREG_R15 */
391 	VM_REG_LAST,		/* KDIREG_FSBASE */
392 	VM_REG_LAST,		/* KDIREG_GSBASE */
393 	VM_REG_LAST,		/* KDIREG_KGSBASE */
394 	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
395 	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
396 	VM_REG_GUEST_DS,	/* KDIREG_DS */
397 	VM_REG_GUEST_ES,	/* KDIREG_ES */
398 	VM_REG_GUEST_FS,	/* KDIREG_FS */
399 	VM_REG_GUEST_GS,	/* KDIREG_GS */
400 	VM_REG_LAST,		/* KDIREG_TRAPNO */
401 	VM_REG_LAST,		/* KDIREG_ERR */
402 	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
403 	VM_REG_GUEST_CS,	/* KDIREG_CS */
404 	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
405 	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
406 	VM_REG_GUEST_SS		/* KDIREG_SS */
407 };
408 CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);
409 
410 /*
411  * Mapping of libvmm-defined registers to vmmapi-defined registers.
412  *
413  * This array must be kept in sync with the definitions in libvmm.h
414  */
415 static int vmm_sys_regmap[] = {
416 	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
417 	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
418 	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
419 	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
420 	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
421 	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
422 	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
423 	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
424 	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
425 	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
426 	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
427 	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
428 	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
429 	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
430 	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
431 	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
432 };
433 
434 /*
435  * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
436  *
437  * This array must be kept in sync with the definitions in libvmm.h
438  */
439 static int vmm_descmap[] = {
440 	VM_REG_GUEST_GDTR,
441 	VM_REG_GUEST_LDTR,
442 	VM_REG_GUEST_IDTR,
443 	VM_REG_GUEST_TR,
444 	VM_REG_GUEST_CS,
445 	VM_REG_GUEST_DS,
446 	VM_REG_GUEST_ES,
447 	VM_REG_GUEST_FS,
448 	VM_REG_GUEST_GS,
449 	VM_REG_GUEST_SS
450 };
451 
452 static int
453 vmm_mapreg(int reg)
454 {
455 	errno = 0;
456 
457 	if (reg < 0)
458 		goto fail;
459 
460 	if (reg < KDIREG_NGREG)
461 		return (vmm_kdi_regmap[reg]);
462 
463 	if (reg >= VMM_REG_OFFSET &&
464 	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
465 		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);
466 
467 fail:
468 	errno = EINVAL;
469 	return (VM_REG_LAST);
470 }
471 
472 static int
473 vmm_mapdesc(int desc)
474 {
475 	errno = 0;
476 
477 	if (desc >= VMM_DESC_OFFSET &&
478 	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
479 		return (vmm_descmap[desc - VMM_DESC_OFFSET]);
480 
481 	errno = EINVAL;
482 	return (VM_REG_LAST);
483 }
484 
485 int
486 vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val)
487 {
488 	reg = vmm_mapreg(reg);
489 
490 	if (reg == VM_REG_LAST)
491 		return (-1);
492 
493 	return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val));
494 }
495 
496 int
497 vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val)
498 {
499 	reg = vmm_mapreg(reg);
500 
501 	if (reg == VM_REG_LAST)
502 		return (-1);
503 
504 	return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val));
505 }
506 
507 int
508 vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
509     uint64_t *regvals)
510 {
511 	int *vm_regnums;
512 	int i;
513 	int ret = -1;
514 
515 	vm_regnums = malloc(sizeof (int) * nregs);
516 	if (vm_regnums == NULL)
517 		return (ret);
518 
519 	for (i = 0; i != nregs; i++) {
520 		vm_regnums[i] = vmm_mapreg(regnums[i]);
521 		if (vm_regnums[i] == VM_REG_LAST)
522 			goto fail;
523 	}
524 
525 	ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
526 	    regvals);
527 
528 fail:
529 	free(vm_regnums);
530 	return (ret);
531 }
532 
533 int
534 vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
535     uint64_t *regvals)
536 {
537 	int *vm_regnums;
538 	int i;
539 	int ret = -1;
540 
541 	vm_regnums = malloc(sizeof (int) * nregs);
542 	if (vm_regnums == NULL)
543 		return (ret);
544 
545 	for (i = 0; i != nregs; i++) {
546 		vm_regnums[i] = vmm_mapreg(regnums[i]);
547 		if (vm_regnums[i] == VM_REG_LAST)
548 			goto fail;
549 	}
550 
551 	ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
552 	    regvals);
553 
554 fail:
555 	free(vm_regnums);
556 	return (ret);
557 }
558 
559 int
560 vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
561 {
562 	desc = vmm_mapdesc(desc);
563 	if (desc == VM_REG_LAST)
564 		return (-1);
565 
566 	return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base, &vd->vd_lim,
567 	    &vd->vd_acc));
568 }
569 
570 int
571 vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
572 {
573 	desc = vmm_mapdesc(desc);
574 	if (desc == VM_REG_LAST)
575 		return (-1);
576 
577 	return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base, vd->vd_lim,
578 	    vd->vd_acc));
579 }
580 
581 /*
582  * Structure to hold MMU state during address translation.
583  * The contents of vmm_mmu_regnum[] must be kept in sync with this.
584  */
585 typedef struct vmm_mmu {
586 	uint64_t vm_cr0;
587 	uint64_t vm_cr3;
588 	uint64_t vm_cr4;
589 	uint64_t vm_efer;
590 } vmm_mmu_t;
591 
592 static const int vmm_mmu_regnum[] = {
593 	VMM_REG_CR0,
594 	VMM_REG_CR3,
595 	VMM_REG_CR4,
596 	VMM_REG_EFER
597 };
598 
/* Page table entry bits tested by vmm_pte2paddr(). */
#define	X86_PTE_P		(1ULL << 0)
#define	X86_PTE_PS		(1ULL << 7)

/* Base page geometry and the physical address bits (51:12) of an entry. */
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)
#define	X86_PTE_PHYSMASK	(((1ULL << 52) - 1) & ~(X86_PAGE_SIZE - 1))

/* Bits of a descriptor's access field (vd_acc). */
#define	X86_SEG_CODE_DATA	0x00010ULL
#define	X86_SEG_PRESENT		0x00080ULL
#define	X86_SEG_LONG		0x02000ULL
#define	X86_SEG_BIG		0x04000ULL
#define	X86_SEG_GRANULARITY	0x08000ULL
#define	X86_SEG_UNUSABLE	0x10000ULL

/*
 * A segment is usable when, of the bits in X86_SEG_USABLE_MASK, exactly
 * those in X86_SEG_USABLE are set.
 */
#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)
615 
616 /*
617  * vmm_pte2paddr:
618  *
619  * Recursively calculate the physical address from a virtual address,
620  * starting at the given PTE level using the given PTE.
621  */
622 static int
623 vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
624     uint64_t vaddr, uint64_t *paddr)
625 {
626 	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
627 	int off_bits = ia32 ? 10 : 9;
628 	boolean_t hugepage = B_FALSE;
629 	uint64_t offset;
630 	uint64_t off_mask, off_shift;
631 
632 	if (level < 4 && (pte & X86_PTE_P) == 0) {
633 		errno = EFAULT;
634 		return (-1);
635 	}
636 
637 	off_shift = X86_PAGE_SHIFT + off_bits * level;
638 	off_mask = (1ULL << off_shift) - 1;
639 
640 	offset = vaddr & off_mask;
641 
642 	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
643 		hugepage = B_TRUE;
644 	} else {
645 		if (level > 0) {
646 			offset >>= off_shift - off_bits;
647 			offset <<= X86_PAGE_SHIFT - off_bits;
648 		}
649 		off_mask = 0xfff;
650 	}
651 
652 	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;
653 
654 	if (level == 0 || hugepage)
655 		return (0);
656 
657 	pte = 0;
658 	if (vmm_pread(vmm, &pte,  pte_size, *paddr) != pte_size)
659 		return (-1);
660 	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
661 }
662 
663 static vmm_mode_t
664 vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu)
665 {
666 	if ((mmu->vm_cr0 & CR0_PE) == 0)
667 		return (VMM_MODE_REAL);
668 	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
669 		return (VMM_MODE_PROT);
670 	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
671 		return (VMM_MODE_PAE);
672 	else
673 		return (VMM_MODE_LONG);
674 }
675 
676 vmm_mode_t
677 vmm_vcpu_mode(vmm_t *vmm, int vcpu)
678 {
679 	vmm_mmu_t mmu = { 0 };
680 
681 	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
682 	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
683 		return (VMM_MODE_UNKNOWN);
684 
685 	return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu));
686 }
687 
688 vmm_isa_t
689 vmm_vcpu_isa(vmm_t *vmm, int vcpu)
690 {
691 	vmm_desc_t cs;
692 
693 	if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0)
694 		return (VMM_ISA_UNKNOWN);
695 
696 	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
697 	case 0x0:		/* 16b code segment */
698 		return (VMM_ISA_16);
699 	case X86_SEG_LONG:	/* 64b code segment */
700 		return (VMM_ISA_64);
701 	case X86_SEG_BIG:	/* 32b code segment */
702 		return (VMM_ISA_32);
703 	}
704 
705 	return (VMM_ISA_UNKNOWN);
706 }
707 
708 /*
709  * vmm_vtol:
710  *
711  * Translate a virtual address to a physical address on a certain vCPU,
712  * using the specified segment register or descriptor according to the mode.
713  *
714  */
715 int
716 vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr)
717 {
718 	vmm_desc_t desc;
719 	uint64_t limit;
720 
721 	if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0)
722 		return (-1);
723 
724 	switch (vmm_vcpu_mode(vmm, vcpu)) {
725 	case VMM_MODE_REAL:
726 		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
727 			goto fault;
728 		/* FALLTHRU */
729 	case VMM_MODE_PROT:
730 	case VMM_MODE_PAE:
731 		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
732 			/* unusable, system segment, or not present */
733 			goto fault;
734 
735 		limit = desc.vd_lim;
736 		if (desc.vd_acc & X86_SEG_GRANULARITY)
737 			limit *= 4096;
738 
739 		if (vaddr > limit)
740 			goto fault;
741 		/* FALLTHRU */
742 	case VMM_MODE_LONG:
743 		*laddr = desc.vd_base + vaddr;
744 		return (0);
745 
746 	default:
747 	fault:
748 		errno = EFAULT;
749 		return (-1);
750 	}
751 
752 }
753 
754 /*
755  * vmm_vtop:
756  *
757  * Translate a virtual address to a guest physical address on a certain vCPU,
758  * according to the mode the vCPU is in.
759  */
760 int
761 vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr)
762 {
763 	vmm_mmu_t mmu = { 0 };
764 	int ret = 0;
765 
766 	if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0)
767 		return (-1);
768 
769 	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
770 	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
771 		return (-1);
772 
773 	if ((mmu.vm_cr0 & CR0_PG) == 0) {
774 		/* no paging, physical equals virtual */
775 		*paddr = vaddr;
776 		return (0);
777 	}
778 
779 	switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) {
780 	case VMM_MODE_PROT:
781 		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
782 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
783 		break;
784 	case VMM_MODE_PAE:
785 		/* protected mode with PAE: 3-level paging, 64bit PTEs */
786 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
787 		break;
788 	case VMM_MODE_LONG:
789 		/* long mode: 4-level paging, 64bit PTEs */
790 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
791 		break;
792 	default:
793 		ret = -1;
794 	}
795 
796 	return (ret);
797 }
798 
799 ssize_t
800 vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr)
801 {
802 	ssize_t res = 0;
803 	uint64_t paddr;
804 	size_t plen;
805 	uint64_t boundary;
806 
807 	while (len != 0) {
808 		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
809 			errno = EFAULT;
810 			return (0);
811 		}
812 
813 		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
814 		if (addr + len > boundary)
815 			plen = boundary - addr;
816 		else
817 			plen = len;
818 
819 		if (vmm_pread(vmm, buf, plen, paddr) != plen)
820 			return (0);
821 		len -= plen;
822 		addr += plen;
823 		buf += plen;
824 		res += plen;
825 	}
826 
827 	return (res);
828 }
829 
830 ssize_t
831 vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len,
832     uintptr_t addr)
833 {
834 	ssize_t res = 0;
835 	uint64_t paddr;
836 	size_t plen;
837 	uint64_t boundary;
838 
839 	while (len != 0) {
840 		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
841 			errno = EFAULT;
842 			return (0);
843 		}
844 
845 		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
846 		if (addr + len > boundary)
847 			plen = boundary - addr;
848 		else
849 			plen = len;
850 
851 		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
852 			return (0);
853 		len -= plen;
854 		addr += plen;
855 		buf += plen;
856 		res += plen;
857 	}
858 
859 	return (res);
860 }
861