xref: /illumos-gate/usr/src/lib/libvmm/libvmm.c (revision 2d9a5a52c758e1dbaee1569f0d91634a0f5cbe39)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2020 Oxide Computer Company
15  * Copyright 2023 OmniOS Community Edition (OmniOSce) Association.
16  */
17 
18 /*
19  * Library for native code to access bhyve VMs, without the need to use
20  * FreeBSD compat headers
21  */
22 
23 #include <sys/param.h>
24 #include <sys/list.h>
25 #include <sys/stddef.h>
26 #include <sys/mman.h>
27 #include <sys/kdi_regs.h>
28 #include <sys/sysmacros.h>
29 #include <sys/controlregs.h>
30 #include <sys/note.h>
31 #include <sys/debug.h>
32 #include <errno.h>
33 #include <stdlib.h>
34 #include <strings.h>
35 #include <unistd.h>
36 #include <assert.h>
37 
38 #include <machine/vmm.h>
39 #include <vmmapi.h>
40 
41 #include <libvmm.h>
42 
43 typedef struct vmm_memseg vmm_memseg_t;
44 
45 #define	VMM_MEMSEG_DEVMEM	0x1
46 
47 struct vmm_memseg {
48 	list_node_t vms_list;
49 	int vms_segid;
50 	int vms_prot;
51 	int vms_flags;
52 	uintptr_t vms_gpa;
53 	off_t vms_segoff;
54 	size_t vms_seglen;
55 	size_t vms_maplen;
56 	char vms_name[64];
57 };
58 
/*
 * Handle for an opened VM, created by vmm_open_vm() and released by
 * vmm_close_vm().
 */
struct vmm {
	/* vmmapi context returned by vm_open() */
	struct vmctx *vmm_ctx;
	/* list of vmm_memseg_t, populated by vmm_map() */
	list_t vmm_memlist;
	/* base of the local mapping of guest memory, MAP_FAILED if unmapped */
	char *vmm_mem;
	/* sum of the lengths of all segments mapped by vmm_map() */
	size_t vmm_memsize;
	/* number of CPUs as last determined by vmm_update_ncpu() */
	size_t vmm_ncpu;
	/* array of per-CPU handles returned by vm_vcpu_open() */
	struct vcpu **vmm_vcpu;
};
67 
68 
69 /*
70  * This code relies on two assumptions:
71  * - CPUs are never removed from the "active set", not even when suspended.
72  *   A CPU being active just means that it has been used by the guest OS.
73  * - The CPU numbering is consecutive.
74  */
75 static void
76 vmm_update_ncpu(vmm_t *vmm)
77 {
78 	cpuset_t cpuset;
79 
80 	assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0);
81 
82 	for (vmm->vmm_ncpu = 0;
83 	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
84 	    vmm->vmm_ncpu++)
85 		;
86 }
87 
88 vmm_t *
89 vmm_open_vm(const char *name)
90 {
91 	vmm_t *vmm = NULL;
92 	int _errno;
93 	int i;
94 
95 	vmm = malloc(sizeof (vmm_t));
96 	if (vmm == NULL)
97 		return (NULL);
98 
99 	bzero(vmm, sizeof (vmm_t));
100 	vmm->vmm_mem = MAP_FAILED;
101 
102 	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
103 	    offsetof(vmm_memseg_t, vms_list));
104 
105 	vmm->vmm_ctx = vm_open(name);
106 	if (vmm->vmm_ctx == NULL) {
107 		list_destroy(&vmm->vmm_memlist);
108 		free(vmm);
109 		return (NULL);
110 	}
111 
112 	vmm_update_ncpu(vmm);
113 
114 	/*
115 	 * If we open a VM that has just been created we may see a state
116 	 * where it has no CPUs configured yet. We'll just wait for 10ms
117 	 * and retry until we get a non-zero CPU count.
118 	 */
119 	if (vmm->vmm_ncpu == 0) {
120 		do {
121 			(void) usleep(10000);
122 			vmm_update_ncpu(vmm);
123 		} while (vmm->vmm_ncpu == 0);
124 	}
125 
126 	vmm->vmm_vcpu = calloc(vmm->vmm_ncpu, sizeof (struct vcpu *));
127 	if (vmm->vmm_vcpu == NULL)
128 		goto fail;
129 	for (i = 0; i < vmm->vmm_ncpu; i++) {
130 		vmm->vmm_vcpu[i] = vm_vcpu_open(vmm->vmm_ctx, i);
131 		if (vmm->vmm_vcpu[i] == NULL) {
132 			_errno = errno;
133 			while (i-- >= 0)
134 				vm_vcpu_close(vmm->vmm_vcpu[i]);
135 			free(vmm->vmm_vcpu);
136 			errno = _errno;
137 			goto fail;
138 		}
139 	}
140 
141 	return (vmm);
142 
143 fail:
144 	_errno = errno;
145 	vmm_close_vm(vmm);
146 	errno = _errno;
147 
148 	return (NULL);
149 }
150 
151 void
152 vmm_close_vm(vmm_t *vmm)
153 {
154 	uint_t i;
155 
156 	vmm_unmap(vmm);
157 
158 	for (i = 0; i < vmm->vmm_ncpu; i++)
159 		vm_vcpu_close(vmm->vmm_vcpu[i]);
160 	free(vmm->vmm_vcpu);
161 
162 	list_destroy(&vmm->vmm_memlist);
163 
164 	if (vmm->vmm_ctx != NULL)
165 		vm_close(vmm->vmm_ctx);
166 
167 	free(vmm);
168 }
169 
170 static vmm_memseg_t *
171 vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
172 {
173 	vmm_memseg_t ms, *ret;
174 	int error, flags;
175 
176 	bzero(&ms, sizeof (vmm_memseg_t));
177 	ms.vms_gpa = gpa;
178 	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
179 	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
180 	if (error)
181 		return (NULL);
182 
183 	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
184 	    ms.vms_name, sizeof (ms.vms_name));
185 	if (error)
186 		return (NULL);
187 
188 	/*
189 	 * Regular memory segments don't have a name, but devmem segments do.
190 	 * We can use that information to set the DEVMEM flag if necessary.
191 	 */
192 	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;
193 
194 	ret = malloc(sizeof (vmm_memseg_t));
195 	if (ret == NULL)
196 		return (NULL);
197 
198 	*ret = ms;
199 
200 	return (ret);
201 }
202 
203 int
204 vmm_map(vmm_t *vmm, boolean_t writable)
205 {
206 	uintptr_t last_gpa = 0;
207 	vmm_memseg_t *ms;
208 	int prot_write = writable ? PROT_WRITE : 0;
209 
210 	if (vmm->vmm_mem != MAP_FAILED) {
211 		errno = EINVAL;
212 		return (-1);
213 	}
214 
215 	assert(list_is_empty(&vmm->vmm_memlist));
216 
217 	for (;;) {
218 		ms = vmm_get_memseg(vmm, last_gpa);
219 
220 		if (ms == NULL)
221 			break;
222 
223 		last_gpa = ms->vms_gpa + ms->vms_maplen;
224 		list_insert_tail(&vmm->vmm_memlist, ms);
225 	}
226 
227 	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
228 	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);
229 
230 	if (vmm->vmm_mem == MAP_FAILED)
231 		goto fail;
232 
233 	for (ms = list_head(&vmm->vmm_memlist);
234 	    ms != NULL;
235 	    ms = list_next(&vmm->vmm_memlist, ms)) {
236 		off_t mapoff;
237 
238 		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) == 0) {
239 			/*
240 			 * sysmem segments will be located at an offset
241 			 * equivalent to their GPA.
242 			 */
243 			mapoff = ms->vms_gpa;
244 		} else {
245 			/*
246 			 * devmem segments are located in a special region away
247 			 * from the normal GPA space.
248 			 */
249 			if (vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid,
250 			    &mapoff) != 0) {
251 				goto fail;
252 			}
253 		}
254 
255 		/*
256 		 * While 'mapoff' points to the front of the segment, the actual
257 		 * mapping may be at some offset beyond that.
258 		 */
259 		VERIFY(ms->vms_segoff >= 0);
260 		mapoff += ms->vms_segoff;
261 
262 		vmm->vmm_memsize += ms->vms_maplen;
263 
264 		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
265 		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
266 		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
267 			goto fail;
268 	}
269 
270 	return (0);
271 
272 fail:
273 	vmm_unmap(vmm);
274 
275 	return (-1);
276 }
277 
278 void
279 vmm_unmap(vmm_t *vmm)
280 {
281 	while (!list_is_empty(&vmm->vmm_memlist)) {
282 		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);
283 
284 		if (vmm->vmm_mem != MAP_FAILED) {
285 			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
286 			    ms->vms_maplen);
287 		}
288 
289 		free(ms);
290 	}
291 
292 	if (vmm->vmm_mem != MAP_FAILED)
293 		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);
294 
295 	vmm->vmm_mem = MAP_FAILED;
296 	vmm->vmm_memsize = 0;
297 }
298 
299 ssize_t
300 vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
301 {
302 	ssize_t count = 0;
303 	vmm_memseg_t *ms;
304 	ssize_t res = len;
305 
306 	for (ms = list_head(&vmm->vmm_memlist);
307 	    ms != NULL && len != 0;
308 	    ms = list_next(&vmm->vmm_memlist, ms)) {
309 
310 		if (addr >= ms->vms_gpa &&
311 		    addr < ms->vms_gpa + ms->vms_maplen) {
312 			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
313 
314 			if (res < 0)
315 				res = 0;
316 
317 			bcopy(vmm->vmm_mem + addr, buf, len - res);
318 			count += len - res;
319 			addr += len - res;
320 			len = res;
321 		}
322 	}
323 
324 	if (res)
325 		errno = EFAULT;
326 	else
327 		errno = 0;
328 
329 	return (count);
330 }
331 
332 ssize_t
333 vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
334 {
335 	ssize_t count = 0;
336 	vmm_memseg_t *ms;
337 	ssize_t res = len;
338 
339 	for (ms = list_head(&vmm->vmm_memlist);
340 	    ms != NULL;
341 	    ms = list_next(&vmm->vmm_memlist, ms)) {
342 		if (addr >= ms->vms_gpa &&
343 		    addr < ms->vms_gpa + ms->vms_maplen) {
344 			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
345 
346 			if (res < 0)
347 				res = 0;
348 
349 			bcopy(buf, vmm->vmm_mem + addr, len - res);
350 			count += len - res;
351 			addr += len - res;
352 			len = res;
353 		}
354 	}
355 
356 	if (res)
357 		errno = EFAULT;
358 	else
359 		errno = 0;
360 
361 	return (count);
362 }
363 
364 size_t
365 vmm_ncpu(vmm_t *vmm)
366 {
367 	return (vmm->vmm_ncpu);
368 }
369 
370 size_t
371 vmm_memsize(vmm_t *vmm)
372 {
373 	return (vmm->vmm_memsize);
374 }
375 
376 int
377 vmm_cont(vmm_t *vmm)
378 {
379 	return (vm_resume_all_cpus(vmm->vmm_ctx));
380 }
381 
382 int
383 vmm_step(vmm_t *vmm, int vcpuid)
384 {
385 	cpuset_t cpuset;
386 	int ret;
387 
388 	if (vcpuid >= vmm->vmm_ncpu) {
389 		errno = EINVAL;
390 		return (-1);
391 	}
392 
393 	ret = vm_set_capability(vmm->vmm_vcpu[vcpuid], VM_CAP_MTRAP_EXIT, 1);
394 	if (ret != 0)
395 		return (-1);
396 
397 	assert(vm_resume_cpu(vmm->vmm_vcpu[vcpuid]) == 0);
398 
399 	do {
400 		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
401 	} while (!CPU_ISSET(vcpuid, &cpuset));
402 
403 	(void) vm_set_capability(vmm->vmm_vcpu[vcpuid], VM_CAP_MTRAP_EXIT, 0);
404 
405 	return (ret);
406 }
407 
408 int
409 vmm_stop(vmm_t *vmm)
410 {
411 	int ret = vm_suspend_all_cpus(vmm->vmm_ctx);
412 
413 	if (ret == 0)
414 		vmm_update_ncpu(vmm);
415 
416 	return (ret);
417 }
418 
419 /*
420  * Mapping of KDI-defined registers to vmmapi-defined registers.
421  * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
422  * causes an error in vm_{get,set}_register_set().
423  *
424  * This array must be kept in sync with the definitions in kdi_regs.h.
425  */
426 static int vmm_kdi_regmap[] = {
427 	VM_REG_LAST,		/* KDIREG_SAVFP */
428 	VM_REG_LAST,		/* KDIREG_SAVPC */
429 	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
430 	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
431 	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
432 	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
433 	VM_REG_GUEST_R8,	/* KDIREG_R8 */
434 	VM_REG_GUEST_R9,	/* KDIREG_R9 */
435 	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
436 	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
437 	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
438 	VM_REG_GUEST_R10,	/* KDIREG_R10 */
439 	VM_REG_GUEST_R11,	/* KDIREG_R11 */
440 	VM_REG_GUEST_R12,	/* KDIREG_R12 */
441 	VM_REG_GUEST_R13,	/* KDIREG_R13 */
442 	VM_REG_GUEST_R14,	/* KDIREG_R14 */
443 	VM_REG_GUEST_R15,	/* KDIREG_R15 */
444 	VM_REG_LAST,		/* KDIREG_FSBASE */
445 	VM_REG_LAST,		/* KDIREG_GSBASE */
446 	VM_REG_LAST,		/* KDIREG_KGSBASE */
447 	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
448 	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
449 	VM_REG_GUEST_DS,	/* KDIREG_DS */
450 	VM_REG_GUEST_ES,	/* KDIREG_ES */
451 	VM_REG_GUEST_FS,	/* KDIREG_FS */
452 	VM_REG_GUEST_GS,	/* KDIREG_GS */
453 	VM_REG_LAST,		/* KDIREG_TRAPNO */
454 	VM_REG_LAST,		/* KDIREG_ERR */
455 	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
456 	VM_REG_GUEST_CS,	/* KDIREG_CS */
457 	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
458 	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
459 	VM_REG_GUEST_SS		/* KDIREG_SS */
460 };
461 CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);
462 
463 /*
464  * Mapping of libvmm-defined registers to vmmapi-defined registers.
465  *
466  * This array must be kept in sync with the definitions in libvmm.h
467  */
468 static int vmm_sys_regmap[] = {
469 	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
470 	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
471 	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
472 	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
473 	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
474 	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
475 	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
476 	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
477 	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
478 	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
479 	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
480 	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
481 	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
482 	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
483 	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
484 	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
485 };
486 
487 /*
488  * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
489  *
490  * This array must be kept in sync with the definitions in libvmm.h
491  */
492 static int vmm_descmap[] = {
493 	VM_REG_GUEST_GDTR,
494 	VM_REG_GUEST_LDTR,
495 	VM_REG_GUEST_IDTR,
496 	VM_REG_GUEST_TR,
497 	VM_REG_GUEST_CS,
498 	VM_REG_GUEST_DS,
499 	VM_REG_GUEST_ES,
500 	VM_REG_GUEST_FS,
501 	VM_REG_GUEST_GS,
502 	VM_REG_GUEST_SS
503 };
504 
505 static int
506 vmm_mapreg(int reg)
507 {
508 	errno = 0;
509 
510 	if (reg < 0)
511 		goto fail;
512 
513 	if (reg < KDIREG_NGREG)
514 		return (vmm_kdi_regmap[reg]);
515 
516 	if (reg >= VMM_REG_OFFSET &&
517 	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
518 		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);
519 
520 fail:
521 	errno = EINVAL;
522 	return (VM_REG_LAST);
523 }
524 
525 static int
526 vmm_mapdesc(int desc)
527 {
528 	errno = 0;
529 
530 	if (desc >= VMM_DESC_OFFSET &&
531 	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
532 		return (vmm_descmap[desc - VMM_DESC_OFFSET]);
533 
534 	errno = EINVAL;
535 	return (VM_REG_LAST);
536 }
537 
538 int
539 vmm_getreg(vmm_t *vmm, int vcpuid, int reg, uint64_t *val)
540 {
541 	reg = vmm_mapreg(reg);
542 
543 	if (reg == VM_REG_LAST)
544 		return (-1);
545 
546 	return (vm_get_register(vmm->vmm_vcpu[vcpuid], reg, val));
547 }
548 
549 int
550 vmm_setreg(vmm_t *vmm, int vcpuid, int reg, uint64_t val)
551 {
552 	reg = vmm_mapreg(reg);
553 
554 	if (reg == VM_REG_LAST)
555 		return (-1);
556 
557 	return (vm_set_register(vmm->vmm_vcpu[vcpuid], reg, val));
558 }
559 
560 int
561 vmm_get_regset(vmm_t *vmm, int vcpuid, size_t nregs, const int *regnums,
562     uint64_t *regvals)
563 {
564 	int *vm_regnums;
565 	int i;
566 	int ret = -1;
567 
568 	vm_regnums = malloc(sizeof (int) * nregs);
569 	if (vm_regnums == NULL)
570 		return (ret);
571 
572 	for (i = 0; i != nregs; i++) {
573 		vm_regnums[i] = vmm_mapreg(regnums[i]);
574 		if (vm_regnums[i] == VM_REG_LAST)
575 			goto fail;
576 	}
577 
578 	ret = vm_get_register_set(vmm->vmm_vcpu[vcpuid], nregs, vm_regnums,
579 	    regvals);
580 
581 fail:
582 	free(vm_regnums);
583 	return (ret);
584 }
585 
586 int
587 vmm_set_regset(vmm_t *vmm, int vcpuid, size_t nregs, const int *regnums,
588     uint64_t *regvals)
589 {
590 	int *vm_regnums;
591 	int i;
592 	int ret = -1;
593 
594 	vm_regnums = malloc(sizeof (int) * nregs);
595 	if (vm_regnums == NULL)
596 		return (ret);
597 
598 	for (i = 0; i != nregs; i++) {
599 		vm_regnums[i] = vmm_mapreg(regnums[i]);
600 		if (vm_regnums[i] == VM_REG_LAST)
601 			goto fail;
602 	}
603 
604 	ret = vm_set_register_set(vmm->vmm_vcpu[vcpuid], nregs, vm_regnums,
605 	    regvals);
606 
607 fail:
608 	free(vm_regnums);
609 	return (ret);
610 }
611 
612 int
613 vmm_get_desc(vmm_t *vmm, int vcpuid, int desc, vmm_desc_t *vd)
614 {
615 	desc = vmm_mapdesc(desc);
616 	if (desc == VM_REG_LAST)
617 		return (-1);
618 
619 	return (vm_get_desc(vmm->vmm_vcpu[vcpuid], desc, &vd->vd_base,
620 	    &vd->vd_lim,
621 	    &vd->vd_acc));
622 }
623 
624 int
625 vmm_set_desc(vmm_t *vmm, int vcpuid, int desc, vmm_desc_t *vd)
626 {
627 	desc = vmm_mapdesc(desc);
628 	if (desc == VM_REG_LAST)
629 		return (-1);
630 
631 	return (vm_set_desc(vmm->vmm_vcpu[vcpuid], desc, vd->vd_base,
632 	    vd->vd_lim, vd->vd_acc));
633 }
634 
635 /*
636  * Structure to hold MMU state during address translation.
637  * The contents of vmm_mmu_regnum[] must be kept in sync with this.
638  */
639 typedef struct vmm_mmu {
640 	uint64_t vm_cr0;
641 	uint64_t vm_cr3;
642 	uint64_t vm_cr4;
643 	uint64_t vm_efer;
644 } vmm_mmu_t;
645 
646 static const int vmm_mmu_regnum[] = {
647 	VMM_REG_CR0,
648 	VMM_REG_CR3,
649 	VMM_REG_CR4,
650 	VMM_REG_EFER
651 };
652 
/*
 * Page table entry bits tested by vmm_pte2paddr(): an entry without
 * X86_PTE_P fails the translation, and X86_PTE_PS on a level 1 or 2 entry
 * ends the walk at that entry.
 */
#define	X86_PTE_P		0x001ULL
#define	X86_PTE_PS		0x080ULL

/* Mask selecting the address bits of a page table entry, and page geometry. */
#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)

/*
 * Bits of the access field (vd_acc) of a descriptor, as tested by
 * vmm_vcpu_isa() and vmm_vtol().
 */
#define	X86_SEG_CODE_DATA	(1ULL << 4)
#define	X86_SEG_PRESENT		(1ULL << 7)
#define	X86_SEG_LONG		(1ULL << 13)
#define	X86_SEG_BIG		(1ULL << 14)
#define	X86_SEG_GRANULARITY	(1ULL << 15)
#define	X86_SEG_UNUSABLE	(1ULL << 16)

/*
 * vmm_vtol() accepts a segment only if, of the bits in X86_SEG_USABLE_MASK,
 * exactly those in X86_SEG_USABLE are set.
 */
#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)
669 
/*
 * vmm_pte2paddr:
 *
 * Recursively calculate the physical address from a virtual address,
 * starting at the given PTE level using the given PTE.
 *
 * 'ia32' selects 32-bit entries with 10 index bits per level; otherwise
 * entries are 64-bit with 9 index bits per level. At each level the address
 * of the next entry is formed from the address bits of 'pte' and the index
 * taken from 'vaddr', and the entry is fetched from guest physical memory
 * with vmm_pread(). The walk ends at level 0, or earlier at a level 1 or 2
 * entry that has X86_PTE_PS set.
 *
 * Returns 0 with the result in *paddr. Returns -1 with errno set to EFAULT
 * if an entry examined below level 4 does not have X86_PTE_P set, and -1 if
 * an entry cannot be read in full.
 *
 * NOTE(review): vmm_vtop() starts the walk with CR3 as 'pte' at level 2 or 3
 * for the non-long modes, so the X86_PTE_P (and, at level 2, X86_PTE_PS)
 * tests below are applied to CR3 itself in those modes -- confirm that this
 * is intended.
 */
static int
vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
    uint64_t vaddr, uint64_t *paddr)
{
	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
	int off_bits = ia32 ? 10 : 9;
	boolean_t hugepage = B_FALSE;
	uint64_t offset;
	uint64_t off_mask, off_shift;

	if (level < 4 && (pte & X86_PTE_P) == 0) {
		errno = EFAULT;
		return (-1);
	}

	/* Number of low bits of vaddr that are translated at or below level. */
	off_shift = X86_PAGE_SHIFT + off_bits * level;
	off_mask = (1ULL << off_shift) - 1;

	offset = vaddr & off_mask;

	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
		/* Keep the whole offset: it is the offset into the page. */
		hugepage = B_TRUE;
	} else {
		if (level > 0) {
			/*
			 * Reduce the offset to the table index for this level
			 * and scale it by the size of a table entry.
			 */
			offset >>= off_shift - off_bits;
			offset <<= X86_PAGE_SHIFT - off_bits;
		}
		off_mask = 0xfff;
	}

	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;

	if (level == 0 || hugepage)
		return (0);

	/* Clear first: a 32-bit entry only fills part of the variable. */
	pte = 0;
	if (vmm_pread(vmm, &pte,  pte_size, *paddr) != pte_size)
		return (-1);
	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
}
716 
717 static vmm_mode_t
718 vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpuid __unused, vmm_mmu_t *mmu)
719 {
720 	if ((mmu->vm_cr0 & CR0_PE) == 0)
721 		return (VMM_MODE_REAL);
722 	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
723 		return (VMM_MODE_PROT);
724 	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
725 		return (VMM_MODE_PAE);
726 	else
727 		return (VMM_MODE_LONG);
728 }
729 
730 vmm_mode_t
731 vmm_vcpu_mode(vmm_t *vmm, int vcpuid)
732 {
733 	vmm_mmu_t mmu = { 0 };
734 
735 	if (vmm_get_regset(vmm, vcpuid, ARRAY_SIZE(vmm_mmu_regnum),
736 	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
737 		return (VMM_MODE_UNKNOWN);
738 
739 	return (vmm_vcpu_mmu_mode(vmm, vcpuid, &mmu));
740 }
741 
742 vmm_isa_t
743 vmm_vcpu_isa(vmm_t *vmm, int vcpuid)
744 {
745 	vmm_desc_t cs;
746 
747 	if (vmm_get_desc(vmm, vcpuid, VMM_DESC_CS, &cs) != 0)
748 		return (VMM_ISA_UNKNOWN);
749 
750 	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
751 	case 0x0:		/* 16b code segment */
752 		return (VMM_ISA_16);
753 	case X86_SEG_LONG:	/* 64b code segment */
754 		return (VMM_ISA_64);
755 	case X86_SEG_BIG:	/* 32b code segment */
756 		return (VMM_ISA_32);
757 	}
758 
759 	return (VMM_ISA_UNKNOWN);
760 }
761 
762 /*
763  * vmm_vtol:
764  *
765  * Translate a virtual address to a physical address on a certain vCPU,
766  * using the specified segment register or descriptor according to the mode.
767  *
768  */
769 int
770 vmm_vtol(vmm_t *vmm, int vcpuid, int seg, uint64_t vaddr, uint64_t *laddr)
771 {
772 	vmm_desc_t desc;
773 	uint64_t limit;
774 
775 	if (vmm_get_desc(vmm, vcpuid, seg, &desc) != 0)
776 		return (-1);
777 
778 	switch (vmm_vcpu_mode(vmm, vcpuid)) {
779 	case VMM_MODE_REAL:
780 		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
781 			goto fault;
782 		/* FALLTHRU */
783 	case VMM_MODE_PROT:
784 	case VMM_MODE_PAE:
785 		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
786 			/* unusable, system segment, or not present */
787 			goto fault;
788 
789 		limit = desc.vd_lim;
790 		if (desc.vd_acc & X86_SEG_GRANULARITY)
791 			limit *= 4096;
792 
793 		if (vaddr > limit)
794 			goto fault;
795 		/* FALLTHRU */
796 	case VMM_MODE_LONG:
797 		*laddr = desc.vd_base + vaddr;
798 		return (0);
799 
800 	default:
801 	fault:
802 		errno = EFAULT;
803 		return (-1);
804 	}
805 
806 }
807 
808 /*
809  * vmm_vtop:
810  *
811  * Translate a virtual address to a guest physical address on a certain vCPU,
812  * according to the mode the vCPU is in.
813  */
814 int
815 vmm_vtop(vmm_t *vmm, int vcpuid, int seg, uint64_t vaddr, uint64_t *paddr)
816 {
817 	vmm_mmu_t mmu = { 0 };
818 	int ret = 0;
819 
820 	if (vmm_vtol(vmm, vcpuid, seg, vaddr, &vaddr) != 0)
821 		return (-1);
822 
823 	if (vmm_get_regset(vmm, vcpuid, ARRAY_SIZE(vmm_mmu_regnum),
824 	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
825 		return (-1);
826 
827 	if ((mmu.vm_cr0 & CR0_PG) == 0) {
828 		/* no paging, physical equals virtual */
829 		*paddr = vaddr;
830 		return (0);
831 	}
832 
833 	switch (vmm_vcpu_mmu_mode(vmm, vcpuid, &mmu)) {
834 	case VMM_MODE_PROT:
835 		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
836 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
837 		break;
838 	case VMM_MODE_PAE:
839 		/* protected mode with PAE: 3-level paging, 64bit PTEs */
840 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
841 		break;
842 	case VMM_MODE_LONG:
843 		/* long mode: 4-level paging, 64bit PTEs */
844 		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
845 		break;
846 	default:
847 		ret = -1;
848 	}
849 
850 	return (ret);
851 }
852 
853 ssize_t
854 vmm_vread(vmm_t *vmm, int vcpuid, int seg, void *buf, size_t len, uintptr_t
855     addr)
856 {
857 	ssize_t res = 0;
858 	uint64_t paddr;
859 	size_t plen;
860 	uint64_t boundary;
861 
862 	while (len != 0) {
863 		if (vmm_vtop(vmm, vcpuid, seg, addr, &paddr) != 0) {
864 			errno = EFAULT;
865 			return (0);
866 		}
867 
868 		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
869 		if (addr + len > boundary)
870 			plen = boundary - addr;
871 		else
872 			plen = len;
873 
874 		if (vmm_pread(vmm, buf, plen, paddr) != plen)
875 			return (0);
876 		len -= plen;
877 		addr += plen;
878 		buf += plen;
879 		res += plen;
880 	}
881 
882 	return (res);
883 }
884 
885 ssize_t
886 vmm_vwrite(vmm_t *vmm, int vcpuid, int seg, const void *buf, size_t len,
887     uintptr_t addr)
888 {
889 	ssize_t res = 0;
890 	uint64_t paddr;
891 	size_t plen;
892 	uint64_t boundary;
893 
894 	while (len != 0) {
895 		if (vmm_vtop(vmm, vcpuid, seg, addr, &paddr) != 0) {
896 			errno = EFAULT;
897 			return (0);
898 		}
899 
900 		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
901 		if (addr + len > boundary)
902 			plen = boundary - addr;
903 		else
904 			plen = len;
905 
906 		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
907 			return (0);
908 		len -= plen;
909 		addr += plen;
910 		buf += plen;
911 		res += plen;
912 	}
913 
914 	return (res);
915 }
916