xref: /illumos-gate/usr/src/lib/libvmm/libvmm.c (revision ab26215b1a80ead55969e925a597044ad4185a34)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2020 Oxide Computer Company
15  */
16 
17 /*
18  * Library for native code to access bhyve VMs, without the need to use
19  * FreeBSD compat headers
20  */
21 
22 #include <sys/param.h>
23 #include <sys/list.h>
24 #include <sys/stddef.h>
25 #include <sys/mman.h>
26 #include <sys/kdi_regs.h>
27 #include <sys/sysmacros.h>
28 #include <sys/controlregs.h>
29 #include <sys/note.h>
30 #include <sys/debug.h>
31 #include <errno.h>
32 #include <stdlib.h>
33 #include <strings.h>
34 #include <unistd.h>
35 #include <assert.h>
36 
37 #include <machine/vmm.h>
38 #include <vmmapi.h>
39 
40 #include <libvmm.h>
41 
42 typedef struct vmm_memseg vmm_memseg_t;
43 
44 #define	VMM_MEMSEG_DEVMEM	0x1
45 
46 struct vmm_memseg {
47 	list_node_t vms_list;
48 	int vms_segid;
49 	int vms_prot;
50 	int vms_flags;
51 	uintptr_t vms_gpa;
52 	off_t vms_segoff;
53 	size_t vms_seglen;
54 	size_t vms_maplen;
55 	char vms_name[64];
56 };
57 
58 struct vmm {
59 	struct vmctx *vmm_ctx;
60 	list_t vmm_memlist;
61 	char *vmm_mem;
62 	size_t vmm_memsize;
63 	size_t vmm_ncpu;
64 };
65 
66 
67 /*
68  * This code relies on two assumptions:
69  * - CPUs are never removed from the "active set", not even when suspended.
70  *   A CPU being active just means that it has been used by the guest OS.
71  * - The CPU numbering is consecutive.
72  */
73 static void
74 vmm_update_ncpu(vmm_t *vmm)
75 {
76 	cpuset_t cpuset;
77 
78 	assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0);
79 
80 	for (vmm->vmm_ncpu = 0;
81 	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
82 	    vmm->vmm_ncpu++)
83 		;
84 }
85 
86 vmm_t *
87 vmm_open_vm(const char *name)
88 {
89 	vmm_t *vmm = NULL;
90 
91 	vmm = malloc(sizeof (vmm_t));
92 	if (vmm == NULL)
93 		return (NULL);
94 
95 	bzero(vmm, sizeof (vmm_t));
96 	vmm->vmm_mem = MAP_FAILED;
97 
98 	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
99 	    offsetof(vmm_memseg_t, vms_list));
100 
101 	vmm->vmm_ctx = vm_open(name);
102 	if (vmm->vmm_ctx == NULL) {
103 		free(vmm);
104 		return (NULL);
105 	}
106 
107 	vmm_update_ncpu(vmm);
108 
109 	/*
110 	 * If we open a VM that has just been created we may see a state
111 	 * where it has no CPUs configured yet. We'll just wait for 10ms
112 	 * and retry until we get a non-zero CPU count.
113 	 */
114 	if (vmm->vmm_ncpu == 0) {
115 		do {
116 			(void) usleep(10000);
117 			vmm_update_ncpu(vmm);
118 		} while (vmm->vmm_ncpu == 0);
119 	}
120 
121 	return (vmm);
122 }
123 
124 void
125 vmm_close_vm(vmm_t *vmm)
126 {
127 	vmm_unmap(vmm);
128 
129 	list_destroy(&vmm->vmm_memlist);
130 
131 	if (vmm->vmm_ctx != NULL)
132 		vm_close(vmm->vmm_ctx);
133 
134 	free(vmm);
135 }
136 
137 static vmm_memseg_t *
138 vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
139 {
140 	vmm_memseg_t ms, *ret;
141 	int error, flags;
142 
143 	bzero(&ms, sizeof (vmm_memseg_t));
144 	ms.vms_gpa = gpa;
145 	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
146 	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
147 	if (error)
148 		return (NULL);
149 
150 	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
151 	    ms.vms_name, sizeof (ms.vms_name));
152 	if (error)
153 		return (NULL);
154 
155 	/*
156 	 * Regular memory segments don't have a name, but devmem segments do.
157 	 * We can use that information to set the DEVMEM flag if necessary.
158 	 */
159 	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;
160 
161 	ret = malloc(sizeof (vmm_memseg_t));
162 	if (ret == NULL)
163 		return (NULL);
164 
165 	*ret = ms;
166 
167 	return (ret);
168 }
169 
170 int
171 vmm_map(vmm_t *vmm, boolean_t writable)
172 {
173 	uintptr_t last_gpa = 0;
174 	vmm_memseg_t *ms;
175 	int prot_write = writable ? PROT_WRITE : 0;
176 
177 	if (vmm->vmm_mem != MAP_FAILED) {
178 		errno = EINVAL;
179 		return (-1);
180 	}
181 
182 	assert(list_is_empty(&vmm->vmm_memlist));
183 
184 	for (;;) {
185 		ms = vmm_get_memseg(vmm, last_gpa);
186 
187 		if (ms == NULL)
188 			break;
189 
190 		last_gpa = ms->vms_gpa + ms->vms_maplen;
191 		list_insert_tail(&vmm->vmm_memlist, ms);
192 	}
193 
194 	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
195 	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);
196 
197 	if (vmm->vmm_mem == MAP_FAILED)
198 		goto fail;
199 
200 	for (ms = list_head(&vmm->vmm_memlist);
201 	    ms != NULL;
202 	    ms = list_next(&vmm->vmm_memlist, ms)) {
203 		off_t mapoff;
204 
205 		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) == 0) {
206 			/*
207 			 * sysmem segments will be located at an offset
208 			 * equivalent to their GPA.
209 			 */
210 			mapoff = ms->vms_gpa;
211 		} else {
212 			/*
213 			 * devmem segments are located in a special region away
214 			 * from the normal GPA space.
215 			 */
216 			if (vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid,
217 			    &mapoff) != 0) {
218 				goto fail;
219 			}
220 		}
221 
222 		/*
223 		 * While 'mapoff' points to the front of the segment, the actual
224 		 * mapping may be at some offset beyond that.
225 		 */
226 		VERIFY(ms->vms_segoff >= 0);
227 		mapoff += ms->vms_segoff;
228 
229 		vmm->vmm_memsize += ms->vms_maplen;
230 
231 		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
232 		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
233 		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
234 			goto fail;
235 	}
236 
237 	return (0);
238 
239 fail:
240 	vmm_unmap(vmm);
241 
242 	return (-1);
243 }
244 
245 void
246 vmm_unmap(vmm_t *vmm)
247 {
248 	while (!list_is_empty(&vmm->vmm_memlist)) {
249 		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);
250 
251 		if (vmm->vmm_mem != MAP_FAILED) {
252 			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
253 			    ms->vms_maplen);
254 		}
255 
256 		free(ms);
257 	}
258 
259 	if (vmm->vmm_mem != MAP_FAILED)
260 		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);
261 
262 	vmm->vmm_mem = MAP_FAILED;
263 	vmm->vmm_memsize = 0;
264 }
265 
266 ssize_t
267 vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
268 {
269 	ssize_t count = 0;
270 	vmm_memseg_t *ms;
271 	ssize_t res = len;
272 
273 	for (ms = list_head(&vmm->vmm_memlist);
274 	    ms != NULL && len != 0;
275 	    ms = list_next(&vmm->vmm_memlist, ms)) {
276 
277 		if (addr >= ms->vms_gpa &&
278 		    addr < ms->vms_gpa + ms->vms_maplen) {
279 			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
280 
281 			if (res < 0)
282 				res = 0;
283 
284 			bcopy(vmm->vmm_mem + addr, buf, len - res);
285 			count += len - res;
286 			addr += len - res;
287 			len = res;
288 		}
289 	}
290 
291 	if (res)
292 		errno = EFAULT;
293 	else
294 		errno = 0;
295 
296 	return (count);
297 }
298 
299 ssize_t
300 vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
301 {
302 	ssize_t count = 0;
303 	vmm_memseg_t *ms;
304 	ssize_t res = len;
305 
306 	for (ms = list_head(&vmm->vmm_memlist);
307 	    ms != NULL;
308 	    ms = list_next(&vmm->vmm_memlist, ms)) {
309 		if (addr >= ms->vms_gpa &&
310 		    addr < ms->vms_gpa + ms->vms_maplen) {
311 			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
312 
313 			if (res < 0)
314 				res = 0;
315 
316 			bcopy(buf, vmm->vmm_mem + addr, len - res);
317 			count += len - res;
318 			addr += len - res;
319 			len = res;
320 		}
321 	}
322 
323 	if (res)
324 		errno = EFAULT;
325 	else
326 		errno = 0;
327 
328 	return (count);
329 }
330 
331 size_t
332 vmm_ncpu(vmm_t *vmm)
333 {
334 	return (vmm->vmm_ncpu);
335 }
336 
337 size_t
338 vmm_memsize(vmm_t *vmm)
339 {
340 	return (vmm->vmm_memsize);
341 }
342 
343 int
344 vmm_cont(vmm_t *vmm)
345 {
346 	return (vm_resume_cpu(vmm->vmm_ctx, -1));
347 }
348 
349 int
350 vmm_step(vmm_t *vmm, int vcpu)
351 {
352 	cpuset_t cpuset;
353 	int ret;
354 
355 	if (vcpu >= vmm->vmm_ncpu) {
356 		errno = EINVAL;
357 		return (-1);
358 	}
359 
360 	ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
361 	if (ret != 0)
362 		return (-1);
363 
364 	assert(vm_resume_cpu(vmm->vmm_ctx, vcpu) == 0);
365 
366 	do {
367 		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
368 	} while (!CPU_ISSET(vcpu, &cpuset));
369 
370 	(void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);
371 
372 	return (ret);
373 }
374 
375 int
376 vmm_stop(vmm_t *vmm)
377 {
378 	int ret = vm_suspend_cpu(vmm->vmm_ctx, -1);
379 
380 	if (ret == 0)
381 		vmm_update_ncpu(vmm);
382 
383 	return (ret);
384 }
385 
386 /*
387  * Mapping of KDI-defined registers to vmmapi-defined registers.
388  * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
389  * causes an error in vm_{get,set}_register_set().
390  *
391  * This array must be kept in sync with the definitions in kdi_regs.h.
392  */
393 static int vmm_kdi_regmap[] = {
394 	VM_REG_LAST,		/* KDIREG_SAVFP */
395 	VM_REG_LAST,		/* KDIREG_SAVPC */
396 	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
397 	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
398 	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
399 	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
400 	VM_REG_GUEST_R8,	/* KDIREG_R8 */
401 	VM_REG_GUEST_R9,	/* KDIREG_R9 */
402 	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
403 	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
404 	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
405 	VM_REG_GUEST_R10,	/* KDIREG_R10 */
406 	VM_REG_GUEST_R11,	/* KDIREG_R11 */
407 	VM_REG_GUEST_R12,	/* KDIREG_R12 */
408 	VM_REG_GUEST_R13,	/* KDIREG_R13 */
409 	VM_REG_GUEST_R14,	/* KDIREG_R14 */
410 	VM_REG_GUEST_R15,	/* KDIREG_R15 */
411 	VM_REG_LAST,		/* KDIREG_FSBASE */
412 	VM_REG_LAST,		/* KDIREG_GSBASE */
413 	VM_REG_LAST,		/* KDIREG_KGSBASE */
414 	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
415 	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
416 	VM_REG_GUEST_DS,	/* KDIREG_DS */
417 	VM_REG_GUEST_ES,	/* KDIREG_ES */
418 	VM_REG_GUEST_FS,	/* KDIREG_FS */
419 	VM_REG_GUEST_GS,	/* KDIREG_GS */
420 	VM_REG_LAST,		/* KDIREG_TRAPNO */
421 	VM_REG_LAST,		/* KDIREG_ERR */
422 	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
423 	VM_REG_GUEST_CS,	/* KDIREG_CS */
424 	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
425 	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
426 	VM_REG_GUEST_SS		/* KDIREG_SS */
427 };
428 CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);
429 
430 /*
431  * Mapping of libvmm-defined registers to vmmapi-defined registers.
432  *
433  * This array must be kept in sync with the definitions in libvmm.h
434  */
435 static int vmm_sys_regmap[] = {
436 	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
437 	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
438 	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
439 	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
440 	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
441 	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
442 	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
443 	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
444 	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
445 	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
446 	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
447 	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
448 	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
449 	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
450 	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
451 	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
452 };
453 
454 /*
455  * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
456  *
457  * This array must be kept in sync with the definitions in libvmm.h
458  */
459 static int vmm_descmap[] = {
460 	VM_REG_GUEST_GDTR,
461 	VM_REG_GUEST_LDTR,
462 	VM_REG_GUEST_IDTR,
463 	VM_REG_GUEST_TR,
464 	VM_REG_GUEST_CS,
465 	VM_REG_GUEST_DS,
466 	VM_REG_GUEST_ES,
467 	VM_REG_GUEST_FS,
468 	VM_REG_GUEST_GS,
469 	VM_REG_GUEST_SS
470 };
471 
472 static int
473 vmm_mapreg(int reg)
474 {
475 	errno = 0;
476 
477 	if (reg < 0)
478 		goto fail;
479 
480 	if (reg < KDIREG_NGREG)
481 		return (vmm_kdi_regmap[reg]);
482 
483 	if (reg >= VMM_REG_OFFSET &&
484 	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
485 		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);
486 
487 fail:
488 	errno = EINVAL;
489 	return (VM_REG_LAST);
490 }
491 
492 static int
493 vmm_mapdesc(int desc)
494 {
495 	errno = 0;
496 
497 	if (desc >= VMM_DESC_OFFSET &&
498 	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
499 		return (vmm_descmap[desc - VMM_DESC_OFFSET]);
500 
501 	errno = EINVAL;
502 	return (VM_REG_LAST);
503 }
504 
505 int
506 vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val)
507 {
508 	reg = vmm_mapreg(reg);
509 
510 	if (reg == VM_REG_LAST)
511 		return (-1);
512 
513 	return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val));
514 }
515 
516 int
517 vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val)
518 {
519 	reg = vmm_mapreg(reg);
520 
521 	if (reg == VM_REG_LAST)
522 		return (-1);
523 
524 	return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val));
525 }
526 
527 int
528 vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
529     uint64_t *regvals)
530 {
531 	int *vm_regnums;
532 	int i;
533 	int ret = -1;
534 
535 	vm_regnums = malloc(sizeof (int) * nregs);
536 	if (vm_regnums == NULL)
537 		return (ret);
538 
539 	for (i = 0; i != nregs; i++) {
540 		vm_regnums[i] = vmm_mapreg(regnums[i]);
541 		if (vm_regnums[i] == VM_REG_LAST)
542 			goto fail;
543 	}
544 
545 	ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
546 	    regvals);
547 
548 fail:
549 	free(vm_regnums);
550 	return (ret);
551 }
552 
553 int
554 vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
555     uint64_t *regvals)
556 {
557 	int *vm_regnums;
558 	int i;
559 	int ret = -1;
560 
561 	vm_regnums = malloc(sizeof (int) * nregs);
562 	if (vm_regnums == NULL)
563 		return (ret);
564 
565 	for (i = 0; i != nregs; i++) {
566 		vm_regnums[i] = vmm_mapreg(regnums[i]);
567 		if (vm_regnums[i] == VM_REG_LAST)
568 			goto fail;
569 	}
570 
571 	ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
572 	    regvals);
573 
574 fail:
575 	free(vm_regnums);
576 	return (ret);
577 }
578 
579 int
580 vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
581 {
582 	desc = vmm_mapdesc(desc);
583 	if (desc == VM_REG_LAST)
584 		return (-1);
585 
586 	return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base, &vd->vd_lim,
587 	    &vd->vd_acc));
588 }
589 
590 int
591 vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
592 {
593 	desc = vmm_mapdesc(desc);
594 	if (desc == VM_REG_LAST)
595 		return (-1);
596 
597 	return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base, vd->vd_lim,
598 	    vd->vd_acc));
599 }
600 
/*
 * Structure to hold MMU state during address translation.
 *
 * It is filled by handing it to vmm_get_regset() as an array of uint64_t,
 * so the members have to appear in exactly the order in which the registers
 * are listed in vmm_mmu_regnum[].
 */
typedef struct vmm_mmu {
	uint64_t	vm_cr0;		/* VMM_REG_CR0 */
	uint64_t	vm_cr3;		/* VMM_REG_CR3 */
	uint64_t	vm_cr4;		/* VMM_REG_CR4 */
	uint64_t	vm_efer;	/* VMM_REG_EFER */
} vmm_mmu_t;
611 
612 static const int vmm_mmu_regnum[] = {
613 	VMM_REG_CR0,
614 	VMM_REG_CR3,
615 	VMM_REG_CR4,
616 	VMM_REG_EFER
617 };
618 
/* Page table entry bits tested by vmm_pte2paddr(). */
#define	X86_PTE_P		(1ULL << 0)
#define	X86_PTE_PS		(1ULL << 7)

/* Physical address field of a page table entry, and the base page size. */
#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)

/* Bits of a descriptor's access rights, as found in vd_acc. */
#define	X86_SEG_CODE_DATA	0x00010ULL
#define	X86_SEG_PRESENT		0x00080ULL
#define	X86_SEG_LONG		0x02000ULL
#define	X86_SEG_BIG		0x04000ULL
#define	X86_SEG_GRANULARITY	0x08000ULL
#define	X86_SEG_UNUSABLE	0x10000ULL

/*
 * A segment is taken to be usable when, out of the bits in
 * X86_SEG_USABLE_MASK, exactly those in X86_SEG_USABLE are set.
 */
#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)
635 
636 /*
637  * vmm_pte2paddr:
638  *
639  * Recursively calculate the physical address from a virtual address,
640  * starting at the given PTE level using the given PTE.
641  */
642 static int
643 vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
644     uint64_t vaddr, uint64_t *paddr)
645 {
646 	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
647 	int off_bits = ia32 ? 10 : 9;
648 	boolean_t hugepage = B_FALSE;
649 	uint64_t offset;
650 	uint64_t off_mask, off_shift;
651 
652 	if (level < 4 && (pte & X86_PTE_P) == 0) {
653 		errno = EFAULT;
654 		return (-1);
655 	}
656 
657 	off_shift = X86_PAGE_SHIFT + off_bits * level;
658 	off_mask = (1ULL << off_shift) - 1;
659 
660 	offset = vaddr & off_mask;
661 
662 	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
663 		hugepage = B_TRUE;
664 	} else {
665 		if (level > 0) {
666 			offset >>= off_shift - off_bits;
667 			offset <<= X86_PAGE_SHIFT - off_bits;
668 		}
669 		off_mask = 0xfff;
670 	}
671 
672 	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;
673 
674 	if (level == 0 || hugepage)
675 		return (0);
676 
677 	pte = 0;
678 	if (vmm_pread(vmm, &pte,  pte_size, *paddr) != pte_size)
679 		return (-1);
680 	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
681 }
682 
683 static vmm_mode_t
684 vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu)
685 {
686 	if ((mmu->vm_cr0 & CR0_PE) == 0)
687 		return (VMM_MODE_REAL);
688 	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
689 		return (VMM_MODE_PROT);
690 	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
691 		return (VMM_MODE_PAE);
692 	else
693 		return (VMM_MODE_LONG);
694 }
695 
696 vmm_mode_t
697 vmm_vcpu_mode(vmm_t *vmm, int vcpu)
698 {
699 	vmm_mmu_t mmu = { 0 };
700 
701 	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
702 	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
703 		return (VMM_MODE_UNKNOWN);
704 
705 	return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu));
706 }
707 
708 vmm_isa_t
709 vmm_vcpu_isa(vmm_t *vmm, int vcpu)
710 {
711 	vmm_desc_t cs;
712 
713 	if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0)
714 		return (VMM_ISA_UNKNOWN);
715 
716 	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
717 	case 0x0:		/* 16b code segment */
718 		return (VMM_ISA_16);
719 	case X86_SEG_LONG:	/* 64b code segment */
720 		return (VMM_ISA_64);
721 	case X86_SEG_BIG:	/* 32b code segment */
722 		return (VMM_ISA_32);
723 	}
724 
725 	return (VMM_ISA_UNKNOWN);
726 }
727 
728 /*
729  * vmm_vtol:
730  *
731  * Translate a virtual address to a physical address on a certain vCPU,
732  * using the specified segment register or descriptor according to the mode.
733  *
734  */
735 int
736 vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr)
737 {
738 	vmm_desc_t desc;
739 	uint64_t limit;
740 
741 	if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0)
742 		return (-1);
743 
744 	switch (vmm_vcpu_mode(vmm, vcpu)) {
745 	case VMM_MODE_REAL:
746 		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
747 			goto fault;
748 		/* FALLTHRU */
749 	case VMM_MODE_PROT:
750 	case VMM_MODE_PAE:
751 		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
752 			/* unusable, system segment, or not present */
753 			goto fault;
754 
755 		limit = desc.vd_lim;
756 		if (desc.vd_acc & X86_SEG_GRANULARITY)
757 			limit *= 4096;
758 
759 		if (vaddr > limit)
760 			goto fault;
761 		/* FALLTHRU */
762 	case VMM_MODE_LONG:
763 		*laddr = desc.vd_base + vaddr;
764 		return (0);
765 
766 	default:
767 	fault:
768 		errno = EFAULT;
769 		return (-1);
770 	}
771 
772 }
773 
774 /*
775  * vmm_vtop:
776  *
777  * Translate a virtual address to a guest physical address on a certain vCPU,
778  * according to the mode the vCPU is in.
779  */
780 int
781 vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr)
782 {
783 	vmm_mmu_t mmu = { 0 };
784 	int ret = 0;
785 
786 	if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0)
787 		return (-1);
788 
789 	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
790 	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
791 		return (-1);
792 
793 	if ((mmu.vm_cr0 & CR0_PG) == 0) {
794 		/* no paging, physical equals virtual */
795 		*paddr = vaddr;
796 		return (0);
797 	}
798 
799 	switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) {
800 	case VMM_MODE_PROT:
801 		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
802 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
803 		break;
804 	case VMM_MODE_PAE:
805 		/* protected mode with PAE: 3-level paging, 64bit PTEs */
806 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
807 		break;
808 	case VMM_MODE_LONG:
809 		/* long mode: 4-level paging, 64bit PTEs */
810 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
811 		break;
812 	default:
813 		ret = -1;
814 	}
815 
816 	return (ret);
817 }
818 
819 ssize_t
820 vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr)
821 {
822 	ssize_t res = 0;
823 	uint64_t paddr;
824 	size_t plen;
825 	uint64_t boundary;
826 
827 	while (len != 0) {
828 		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
829 			errno = EFAULT;
830 			return (0);
831 		}
832 
833 		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
834 		if (addr + len > boundary)
835 			plen = boundary - addr;
836 		else
837 			plen = len;
838 
839 		if (vmm_pread(vmm, buf, plen, paddr) != plen)
840 			return (0);
841 		len -= plen;
842 		addr += plen;
843 		buf += plen;
844 		res += plen;
845 	}
846 
847 	return (res);
848 }
849 
850 ssize_t
851 vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len,
852     uintptr_t addr)
853 {
854 	ssize_t res = 0;
855 	uint64_t paddr;
856 	size_t plen;
857 	uint64_t boundary;
858 
859 	while (len != 0) {
860 		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
861 			errno = EFAULT;
862 			return (0);
863 		}
864 
865 		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
866 		if (addr + len > boundary)
867 			plen = boundary - addr;
868 		else
869 			plen = len;
870 
871 		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
872 			return (0);
873 		len -= plen;
874 		addr += plen;
875 		buf += plen;
876 		res += plen;
877 	}
878 
879 	return (res);
880 }
881