/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 * Copyright 2023 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Library for native code to access bhyve VMs, without the need to use
 * FreeBSD compat headers.
 */
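/*
 * Typical usage (illustrative sketch only; the VM name and address below
 * are hypothetical and error handling is omitted):
 *
 *	vmm_t *vmm = vmm_open_vm("myvm");
 *	if (vmm_map(vmm, B_FALSE) == 0) {
 *		uint64_t word;
 *		(void) vmm_pread(vmm, &word, sizeof (word), 0x1000);
 *		vmm_unmap(vmm);
 *	}
 *	vmm_close_vm(vmm);
 */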

#include <sys/param.h>
#include <sys/list.h>
#include <sys/stddef.h>
#include <sys/mman.h>
#include <sys/kdi_regs.h>
#include <sys/sysmacros.h>
#include <sys/controlregs.h>
#include <sys/note.h>
#include <sys/debug.h>
#include <errno.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <libvmm.h>

typedef struct vmm_memseg vmm_memseg_t;

#define	VMM_MEMSEG_DEVMEM	0x1

struct vmm_memseg {
	list_node_t vms_list;
	int vms_segid;
	int vms_prot;
	int vms_flags;
	uintptr_t vms_gpa;
	off_t vms_segoff;
	size_t vms_seglen;
	size_t vms_maplen;
	char vms_name[64];
};

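/*
 * Per-VM handle returned by vmm_open_vm(). The memory segment list and the
 * local guest memory mapping (vmm_mem, vmm_memsize) are only populated once
 * vmm_map() has been called; vmm_vcpu holds one vmmapi vCPU handle per
 * active vCPU.
 */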
struct vmm {
	struct vmctx *vmm_ctx;
	list_t vmm_memlist;
	char *vmm_mem;
	size_t vmm_memsize;
	size_t vmm_ncpu;
	struct vcpu **vmm_vcpu;
};


/*
 * This code relies on two assumptions:
 * - CPUs are never removed from the "active set", not even when suspended.
 *   A CPU being active just means that it has been used by the guest OS.
 * - The CPU numbering is consecutive.
 */
static void
vmm_update_ncpu(vmm_t *vmm)
{
	cpuset_t cpuset;

	VERIFY0(vm_active_cpus(vmm->vmm_ctx, &cpuset));

	for (vmm->vmm_ncpu = 0;
	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
	    vmm->vmm_ncpu++)
		;
}

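/*
 * vmm_open_vm:
 *
 * Open the named VM and a handle for each of its vCPUs. The returned handle
 * must be released with vmm_close_vm(). Returns NULL with errno set on
 * failure.
 */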
vmm_t *
vmm_open_vm(const char *name)
{
	vmm_t *vmm = NULL;
	int _errno;
	int i;

	vmm = malloc(sizeof (vmm_t));
	if (vmm == NULL)
		return (NULL);

	bzero(vmm, sizeof (vmm_t));
	vmm->vmm_mem = MAP_FAILED;

	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
	    offsetof(vmm_memseg_t, vms_list));

	vmm->vmm_ctx = vm_open(name);
	if (vmm->vmm_ctx == NULL) {
		list_destroy(&vmm->vmm_memlist);
		free(vmm);
		return (NULL);
	}

	vmm_update_ncpu(vmm);

	/*
	 * If we open a VM that has just been created we may see a state
	 * where it has no CPUs configured yet. We'll just wait for 10ms
	 * and retry until we get a non-zero CPU count.
	 */
	if (vmm->vmm_ncpu == 0) {
		do {
			(void) usleep(10000);
			vmm_update_ncpu(vmm);
		} while (vmm->vmm_ncpu == 0);
	}

	vmm->vmm_vcpu = calloc(vmm->vmm_ncpu, sizeof (struct vcpu *));
	if (vmm->vmm_vcpu == NULL)
		goto fail;
	for (i = 0; i < vmm->vmm_ncpu; i++) {
		vmm->vmm_vcpu[i] = vm_vcpu_open(vmm->vmm_ctx, i);
		if (vmm->vmm_vcpu[i] == NULL) {
			_errno = errno;
			/* Close only the vCPU handles already opened. */
			while (i-- > 0)
				vm_vcpu_close(vmm->vmm_vcpu[i]);
			free(vmm->vmm_vcpu);
			vmm->vmm_vcpu = NULL;
			errno = _errno;
			goto fail;
		}
	}

	return (vmm);

fail:
	_errno = errno;
	vmm_close_vm(vmm);
	errno = _errno;

	return (NULL);
}

void
vmm_close_vm(vmm_t *vmm)
{
	uint_t i;

	vmm_unmap(vmm);

	if (vmm->vmm_vcpu != NULL) {
		for (i = 0; i < vmm->vmm_ncpu; i++)
			vm_vcpu_close(vmm->vmm_vcpu[i]);
		free(vmm->vmm_vcpu);
	}

	list_destroy(&vmm->vmm_memlist);

	if (vmm->vmm_ctx != NULL)
		vm_close(vmm->vmm_ctx);

	free(vmm);
}

static vmm_memseg_t *
vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
{
	vmm_memseg_t ms, *ret;
	int error, flags;

	bzero(&ms, sizeof (vmm_memseg_t));
	ms.vms_gpa = gpa;
	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
	if (error)
		return (NULL);

	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
	    ms.vms_name, sizeof (ms.vms_name));
	if (error)
		return (NULL);

	/*
	 * Regular memory segments don't have a name, but devmem segments do.
	 * We can use that information to set the DEVMEM flag if necessary.
	 */
	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;

	ret = malloc(sizeof (vmm_memseg_t));
	if (ret == NULL)
		return (NULL);

	*ret = ms;

	return (ret);
}

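/*
 * vmm_map:
 *
 * Discover all memory segments of the VM and map them, read-only or
 * read/write, into a single contiguous region of our address space, with
 * sysmem segments placed at offsets matching their GPAs. A VM can only be
 * mapped once until vmm_unmap() is called. Returns 0 on success, -1 on
 * failure.
 */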
int
vmm_map(vmm_t *vmm, boolean_t writable)
{
	uintptr_t last_gpa = 0;
	vmm_memseg_t *ms;
	int prot_write = writable ? PROT_WRITE : 0;

	if (vmm->vmm_mem != MAP_FAILED) {
		errno = EINVAL;
		return (-1);
	}

	assert(list_is_empty(&vmm->vmm_memlist));

	for (;;) {
		ms = vmm_get_memseg(vmm, last_gpa);

		if (ms == NULL)
			break;

		last_gpa = ms->vms_gpa + ms->vms_maplen;
		list_insert_tail(&vmm->vmm_memlist, ms);
	}

	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);

	if (vmm->vmm_mem == MAP_FAILED)
		goto fail;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		off_t mapoff;

		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) == 0) {
			/*
			 * sysmem segments will be located at an offset
			 * equivalent to their GPA.
			 */
			mapoff = ms->vms_gpa;
		} else {
			/*
			 * devmem segments are located in a special region away
			 * from the normal GPA space.
			 */
			if (vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid,
			    &mapoff) != 0) {
				goto fail;
			}
		}

		/*
		 * While 'mapoff' points to the front of the segment, the
		 * actual mapping may be at some offset beyond that.
		 */
		VERIFY(ms->vms_segoff >= 0);
		mapoff += ms->vms_segoff;

		vmm->vmm_memsize += ms->vms_maplen;

		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
			goto fail;
	}

	return (0);

fail:
	vmm_unmap(vmm);

	return (-1);
}

void
vmm_unmap(vmm_t *vmm)
{
	while (!list_is_empty(&vmm->vmm_memlist)) {
		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);

		if (vmm->vmm_mem != MAP_FAILED) {
			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
			    ms->vms_maplen);
		}

		free(ms);
	}

	if (vmm->vmm_mem != MAP_FAILED)
		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);

	vmm->vmm_mem = MAP_FAILED;
	vmm->vmm_memsize = 0;
}

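/*
 * vmm_pread:
 *
 * Read up to 'len' bytes of guest-physical memory at 'addr' into 'buf'.
 * Returns the number of bytes copied; errno is set to EFAULT if any part of
 * the range is not covered by a mapped segment. vmm_pwrite() below is the
 * symmetric write operation.
 */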
ssize_t
vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL && len != 0;
	    ms = list_next(&vmm->vmm_memlist, ms)) {

		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);

			if (res < 0)
				res = 0;

			bcopy(vmm->vmm_mem + addr, buf, len - res);
			count += len - res;
			addr += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}

ssize_t
vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);

			if (res < 0)
				res = 0;

			bcopy(buf, vmm->vmm_mem + addr, len - res);
			count += len - res;
			addr += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}

size_t
vmm_ncpu(vmm_t *vmm)
{
	return (vmm->vmm_ncpu);
}

size_t
vmm_memsize(vmm_t *vmm)
{
	return (vmm->vmm_memsize);
}

int
vmm_cont(vmm_t *vmm)
{
	return (vm_resume_all_cpus(vmm->vmm_ctx));
}

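/*
 * vmm_step:
 *
 * Single-step one vCPU by temporarily enabling the MTRAP exit capability,
 * resuming the vCPU and waiting for it to show up in the debug CPU set
 * again.
 */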
int
vmm_step(vmm_t *vmm, int vcpuid)
{
	cpuset_t cpuset;
	int ret;

	if (vcpuid >= vmm->vmm_ncpu) {
		errno = EINVAL;
		return (-1);
	}

	ret = vm_set_capability(vmm->vmm_vcpu[vcpuid], VM_CAP_MTRAP_EXIT, 1);
	if (ret != 0)
		return (-1);

	VERIFY0(vm_resume_cpu(vmm->vmm_vcpu[vcpuid]));

	do {
		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
	} while (!CPU_ISSET(vcpuid, &cpuset));

	(void) vm_set_capability(vmm->vmm_vcpu[vcpuid], VM_CAP_MTRAP_EXIT, 0);

	return (ret);
}

int
vmm_stop(vmm_t *vmm)
{
	int ret = vm_suspend_all_cpus(vmm->vmm_ctx);

	if (ret == 0)
		vmm_update_ncpu(vmm);

	return (ret);
}

/*
 * Mapping of KDI-defined registers to vmmapi-defined registers.
 * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
 * causes an error in vm_{get,set}_register_set().
 *
 * This array must be kept in sync with the definitions in kdi_regs.h.
 */
static int vmm_kdi_regmap[] = {
	VM_REG_LAST,		/* KDIREG_SAVFP */
	VM_REG_LAST,		/* KDIREG_SAVPC */
	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
	VM_REG_GUEST_R8,	/* KDIREG_R8 */
	VM_REG_GUEST_R9,	/* KDIREG_R9 */
	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
	VM_REG_GUEST_R10,	/* KDIREG_R10 */
	VM_REG_GUEST_R11,	/* KDIREG_R11 */
	VM_REG_GUEST_R12,	/* KDIREG_R12 */
	VM_REG_GUEST_R13,	/* KDIREG_R13 */
	VM_REG_GUEST_R14,	/* KDIREG_R14 */
	VM_REG_GUEST_R15,	/* KDIREG_R15 */
	VM_REG_LAST,		/* KDIREG_FSBASE */
	VM_REG_LAST,		/* KDIREG_GSBASE */
	VM_REG_LAST,		/* KDIREG_KGSBASE */
	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
	VM_REG_GUEST_DS,	/* KDIREG_DS */
	VM_REG_GUEST_ES,	/* KDIREG_ES */
	VM_REG_GUEST_FS,	/* KDIREG_FS */
	VM_REG_GUEST_GS,	/* KDIREG_GS */
	VM_REG_LAST,		/* KDIREG_TRAPNO */
	VM_REG_LAST,		/* KDIREG_ERR */
	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
	VM_REG_GUEST_CS,	/* KDIREG_CS */
	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
	VM_REG_GUEST_SS		/* KDIREG_SS */
};
CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);

/*
 * Mapping of libvmm-defined registers to vmmapi-defined registers.
 *
 * This array must be kept in sync with the definitions in libvmm.h.
 */
static int vmm_sys_regmap[] = {
	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
};

/*
 * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
 *
 * This array must be kept in sync with the definitions in libvmm.h.
 */
static int vmm_descmap[] = {
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_SS
};

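/*
 * vmm_mapreg:
 *
 * Translate a KDI or libvmm register number into the corresponding vmmapi
 * register number. Returns VM_REG_LAST and sets errno to EINVAL if there is
 * no valid mapping.
 */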
static int
vmm_mapreg(int reg)
{
	errno = 0;

	if (reg < 0)
		goto fail;

	if (reg < KDIREG_NGREG)
		return (vmm_kdi_regmap[reg]);

	if (reg >= VMM_REG_OFFSET &&
	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);

fail:
	errno = EINVAL;
	return (VM_REG_LAST);
}

static int
vmm_mapdesc(int desc)
{
	errno = 0;

	if (desc >= VMM_DESC_OFFSET &&
	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
		return (vmm_descmap[desc - VMM_DESC_OFFSET]);

	errno = EINVAL;
	return (VM_REG_LAST);
}

int
vmm_getreg(vmm_t *vmm, int vcpuid, int reg, uint64_t *val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_get_register(vmm->vmm_vcpu[vcpuid], reg, val));
}

int
vmm_setreg(vmm_t *vmm, int vcpuid, int reg, uint64_t val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_set_register(vmm->vmm_vcpu[vcpuid], reg, val));
}

int
vmm_get_regset(vmm_t *vmm, int vcpuid, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	int i;
	int ret = -1;

	vm_regnums = malloc(sizeof (int) * nregs);
	if (vm_regnums == NULL)
		return (ret);

	for (i = 0; i != nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_get_register_set(vmm->vmm_vcpu[vcpuid], nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}

int
vmm_set_regset(vmm_t *vmm, int vcpuid, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	int i;
	int ret = -1;

	vm_regnums = malloc(sizeof (int) * nregs);
	if (vm_regnums == NULL)
		return (ret);

	for (i = 0; i != nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_set_register_set(vmm->vmm_vcpu[vcpuid], nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}

int
vmm_get_desc(vmm_t *vmm, int vcpuid, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_get_desc(vmm->vmm_vcpu[vcpuid], desc, &vd->vd_base,
	    &vd->vd_lim, &vd->vd_acc));
}

int
vmm_set_desc(vmm_t *vmm, int vcpuid, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_set_desc(vmm->vmm_vcpu[vcpuid], desc, vd->vd_base,
	    vd->vd_lim, vd->vd_acc));
}

/*
 * Structure to hold MMU state during address translation.
 * The contents of vmm_mmu_regnum[] must be kept in sync with this.
 */
typedef struct vmm_mmu {
	uint64_t vm_cr0;
	uint64_t vm_cr3;
	uint64_t vm_cr4;
	uint64_t vm_efer;
} vmm_mmu_t;

static const int vmm_mmu_regnum[] = {
	VMM_REG_CR0,
	VMM_REG_CR3,
	VMM_REG_CR4,
	VMM_REG_EFER
};

#define	X86_PTE_P		0x001ULL
#define	X86_PTE_PS		0x080ULL

#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)

#define	X86_SEG_CODE_DATA	(1ULL << 4)
#define	X86_SEG_PRESENT		(1ULL << 7)
#define	X86_SEG_LONG		(1ULL << 13)
#define	X86_SEG_BIG		(1ULL << 14)
#define	X86_SEG_GRANULARITY	(1ULL << 15)
#define	X86_SEG_UNUSABLE	(1ULL << 16)

#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)

/*
 * vmm_pte2paddr:
 *
 * Recursively calculate the physical address from a virtual address,
 * starting at the given PTE level using the given PTE.
 */
static int
vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
    uint64_t vaddr, uint64_t *paddr)
{
	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
	int off_bits = ia32 ? 10 : 9;
	boolean_t hugepage = B_FALSE;
	uint64_t offset;
	uint64_t off_mask, off_shift;

	if (level < 4 && (pte & X86_PTE_P) == 0) {
		errno = EFAULT;
		return (-1);
	}

	off_shift = X86_PAGE_SHIFT + off_bits * level;
	off_mask = (1ULL << off_shift) - 1;

	offset = vaddr & off_mask;

	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
		hugepage = B_TRUE;
	} else {
		if (level > 0) {
			offset >>= off_shift - off_bits;
			offset <<= X86_PAGE_SHIFT - off_bits;
		}
		off_mask = 0xfff;
	}

	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;

	if (level == 0 || hugepage)
		return (0);

	pte = 0;
	if (vmm_pread(vmm, &pte, pte_size, *paddr) != pte_size)
		return (-1);
	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
}

static vmm_mode_t
vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpuid __unused, vmm_mmu_t *mmu)
{
	if ((mmu->vm_cr0 & CR0_PE) == 0)
		return (VMM_MODE_REAL);
	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
		return (VMM_MODE_PROT);
	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
		return (VMM_MODE_PAE);
	else
		return (VMM_MODE_LONG);
}

vmm_mode_t
vmm_vcpu_mode(vmm_t *vmm, int vcpuid)
{
	vmm_mmu_t mmu = { 0 };

	if (vmm_get_regset(vmm, vcpuid, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (VMM_MODE_UNKNOWN);

	return (vmm_vcpu_mmu_mode(vmm, vcpuid, &mmu));
}

vmm_isa_t
vmm_vcpu_isa(vmm_t *vmm, int vcpuid)
{
	vmm_desc_t cs;

	if (vmm_get_desc(vmm, vcpuid, VMM_DESC_CS, &cs) != 0)
		return (VMM_ISA_UNKNOWN);

	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
	case 0x0:		/* 16b code segment */
		return (VMM_ISA_16);
	case X86_SEG_LONG:	/* 64b code segment */
		return (VMM_ISA_64);
	case X86_SEG_BIG:	/* 32b code segment */
		return (VMM_ISA_32);
	}

	return (VMM_ISA_UNKNOWN);
}

/*
 * vmm_vtol:
 *
 * Translate a virtual address into a linear address on a certain vCPU,
 * using the specified segment register or descriptor according to the mode.
 */
int
vmm_vtol(vmm_t *vmm, int vcpuid, int seg, uint64_t vaddr, uint64_t *laddr)
{
	vmm_desc_t desc;
	uint64_t limit;

	if (vmm_get_desc(vmm, vcpuid, seg, &desc) != 0)
		return (-1);

	switch (vmm_vcpu_mode(vmm, vcpuid)) {
	case VMM_MODE_REAL:
		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_PROT:
	case VMM_MODE_PAE:
		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
			/* unusable, system segment, or not present */
			goto fault;

		limit = desc.vd_lim;
		if (desc.vd_acc & X86_SEG_GRANULARITY)
			limit *= 4096;

		if (vaddr > limit)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_LONG:
		*laddr = desc.vd_base + vaddr;
		return (0);

	default:
	fault:
		errno = EFAULT;
		return (-1);
	}
}

/*
 * vmm_vtop:
 *
 * Translate a virtual address to a guest physical address on a certain vCPU,
 * according to the mode the vCPU is in.
 */
int
vmm_vtop(vmm_t *vmm, int vcpuid, int seg, uint64_t vaddr, uint64_t *paddr)
{
	vmm_mmu_t mmu = { 0 };
	int ret = 0;

	if (vmm_vtol(vmm, vcpuid, seg, vaddr, &vaddr) != 0)
		return (-1);

	if (vmm_get_regset(vmm, vcpuid, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (-1);

	if ((mmu.vm_cr0 & CR0_PG) == 0) {
		/* no paging, physical equals virtual */
		*paddr = vaddr;
		return (0);
	}

	switch (vmm_vcpu_mmu_mode(vmm, vcpuid, &mmu)) {
	case VMM_MODE_PROT:
		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
		break;
	case VMM_MODE_PAE:
		/* protected mode with PAE: 3-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
		break;
	case VMM_MODE_LONG:
		/* long mode: 4-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
		break;
	default:
		ret = -1;
	}

	return (ret);
}

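/*
 * vmm_vread:
 *
 * Read guest-virtual memory by translating each page-sized chunk through
 * vmm_vtop() and copying it with vmm_pread(). Returns the number of bytes
 * read, or 0 with errno set to EFAULT on a translation or read failure.
 * vmm_vwrite() below is the symmetric write operation.
 */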
ssize_t
vmm_vread(vmm_t *vmm, int vcpuid, int seg, void *buf, size_t len,
    uintptr_t addr)
{
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpuid, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pread(vmm, buf, plen, paddr) != plen)
			return (0);
		len -= plen;
		addr += plen;
		buf += plen;
		res += plen;
	}

	return (res);
}

ssize_t
vmm_vwrite(vmm_t *vmm, int vcpuid, int seg, const void *buf, size_t len,
    uintptr_t addr)
{
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpuid, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
			return (0);
		len -= plen;
		addr += plen;
		buf += plen;
		res += plen;
	}

	return (res);
}