/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Library for native code to access bhyve VMs, without the need to use
 * FreeBSD compat headers
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/list.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/controlregs.h>
#include <sys/kdi_regs.h>

#include <assert.h>
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <libvmm.h>

typedef struct vmm_memseg vmm_memseg_t;

#define	VMM_MEMSEG_DEVMEM	0x1

struct vmm_memseg {
	list_node_t vms_list;
	int vms_segid;
	int vms_prot;
	int vms_flags;
	uintptr_t vms_gpa;
	off_t vms_segoff;
	size_t vms_seglen;
	size_t vms_maplen;
	char vms_name[64];
};

struct vmm {
	struct vmctx *vmm_ctx;
	list_t vmm_memlist;
	char *vmm_mem;
	size_t vmm_memsize;
	size_t vmm_ncpu;
};

/*
 * This code relies on two assumptions:
 * - CPUs are never removed from the "active set", not even when suspended.
 *   A CPU being active just means that it has been used by the guest OS.
 * - The CPU numbering is consecutive.
 */
static void
vmm_update_ncpu(vmm_t *vmm)
{
	cpuset_t cpuset;

	assert(vm_active_cpus(vmm->vmm_ctx, &cpuset) == 0);

	for (vmm->vmm_ncpu = 0;
	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
	    vmm->vmm_ncpu++)
		;
}

vmm_t *
vmm_open_vm(const char *name)
{
	vmm_t *vmm = NULL;

	vmm = malloc(sizeof (vmm_t));
	if (vmm == NULL)
		return (NULL);

	bzero(vmm, sizeof (vmm_t));
	vmm->vmm_mem = MAP_FAILED;

	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
	    offsetof(vmm_memseg_t, vms_list));

	vmm->vmm_ctx = vm_open(name);
	if (vmm->vmm_ctx == NULL) {
		free(vmm);
		return (NULL);
	}

	vmm_update_ncpu(vmm);

	/*
	 * If we open a VM that has just been created we may see a state
	 * where it has no CPUs configured yet. We'll just wait for 10ms
	 * and retry until we get a non-zero CPU count.
	 */
	if (vmm->vmm_ncpu == 0) {
		do {
			(void) usleep(10000);
			vmm_update_ncpu(vmm);
		} while (vmm->vmm_ncpu == 0);
	}

	return (vmm);
}

void
vmm_close_vm(vmm_t *vmm)
{
	vmm_unmap(vmm);

	list_destroy(&vmm->vmm_memlist);

	if (vmm->vmm_ctx != NULL)
		vm_close(vmm->vmm_ctx);

	free(vmm);
}
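
/*
 * Example usage (illustrative sketch only, not a library entry point): a
 * consumer such as a debugger would typically open a VM by name, map its
 * memory read-only, read guest physical memory, and then tear everything
 * down again. The VM name "testvm" and the guest physical address are
 * hypothetical; vmm_close_vm() unmaps the memory itself via vmm_unmap().
 *
 *	vmm_t *vmm = vmm_open_vm("testvm");
 *	uint64_t word;
 *
 *	if (vmm == NULL)
 *		return;
 *	if (vmm_map(vmm, B_FALSE) == 0)
 *		(void) vmm_pread(vmm, &word, sizeof (word), 0x1000);
 *	vmm_close_vm(vmm);
 */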

static vmm_memseg_t *
vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
{
	vmm_memseg_t ms, *ret;
	int error, flags;

	bzero(&ms, sizeof (vmm_memseg_t));
	ms.vms_gpa = gpa;
	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
	if (error)
		return (NULL);

	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
	    ms.vms_name, sizeof (ms.vms_name));
	if (error)
		return (NULL);

	/*
	 * Regular memory segments don't have a name, but devmem segments do.
	 * We can use that information to set the DEVMEM flag if necessary.
	 */
	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;

	ret = malloc(sizeof (vmm_memseg_t));
	if (ret == NULL)
		return (NULL);

	*ret = ms;

	return (ret);
}

int
vmm_map(vmm_t *vmm, boolean_t writable)
{
	uintptr_t last_gpa = 0;
	vmm_memseg_t *ms;
	int prot_write = writable ? PROT_WRITE : 0;

	if (vmm->vmm_mem != MAP_FAILED) {
		errno = EINVAL;
		return (-1);
	}

	assert(list_is_empty(&vmm->vmm_memlist));

	for (;;) {
		ms = vmm_get_memseg(vmm, last_gpa);
		if (ms == NULL)
			break;

		last_gpa = ms->vms_gpa + ms->vms_maplen;
		list_insert_tail(&vmm->vmm_memlist, ms);
	}

	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);

	if (vmm->vmm_mem == MAP_FAILED)
		goto fail;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		off_t mapoff = ms->vms_gpa;

		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) &&
		    vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid,
		    &mapoff) != 0)
			goto fail;

		vmm->vmm_memsize += ms->vms_maplen;

		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
			goto fail;
	}

	return (0);

fail:
	vmm_unmap(vmm);

	return (-1);
}

void
vmm_unmap(vmm_t *vmm)
{
	while (!list_is_empty(&vmm->vmm_memlist)) {
		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);

		if (vmm->vmm_mem != MAP_FAILED) {
			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
			    ms->vms_maplen);
		}

		free(ms);
	}

	if (vmm->vmm_mem != MAP_FAILED)
		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);

	vmm->vmm_mem = MAP_FAILED;
	vmm->vmm_memsize = 0;
}

ssize_t
vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist); ms != NULL && len != 0;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);

			if (res < 0)
				res = 0;

			bcopy(vmm->vmm_mem + addr, buf, len - res);
			count += len - res;
			addr += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}

ssize_t
vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist); ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);

			if (res < 0)
				res = 0;

			bcopy(buf, vmm->vmm_mem + addr, len - res);
			count += len - res;
			addr += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}

size_t
vmm_ncpu(vmm_t *vmm)
{
	return (vmm->vmm_ncpu);
}

size_t
vmm_memsize(vmm_t *vmm)
{
	return (vmm->vmm_memsize);
}

int
vmm_cont(vmm_t *vmm)
{
	return (vm_resume_cpu(vmm->vmm_ctx, -1));
}

int
vmm_step(vmm_t *vmm, int vcpu)
{
	cpuset_t cpuset;
	int ret;

	if (vcpu >= vmm->vmm_ncpu) {
		errno = EINVAL;
		return (-1);
	}

	ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
	if (ret != 0)
		return (-1);

	assert(vm_resume_cpu(vmm->vmm_ctx, vcpu) == 0);

	do {
		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
	} while (!CPU_ISSET(vcpu, &cpuset));

	(void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);

	return (ret);
}

int
vmm_stop(vmm_t *vmm)
{
	int ret = vm_suspend_cpu(vmm->vmm_ctx, -1);

	if (ret == 0)
		vmm_update_ncpu(vmm);

	return (ret);
}
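
/*
 * Example usage (illustrative sketch only): single-stepping one vCPU of an
 * already opened VM. The vCPU number is hypothetical and error handling is
 * omitted for brevity. vmm_stop() suspends all vCPUs, vmm_step() executes a
 * single instruction on the given vCPU, and vmm_cont() resumes the whole VM.
 *
 *	if (vmm_stop(vmm) == 0) {
 *		(void) vmm_step(vmm, 0);
 *		(void) vmm_cont(vmm);
 *	}
 */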

/*
 * Mapping of KDI-defined registers to vmmapi-defined registers.
 * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
 * causes an error in vm_{get,set}_register_set().
 *
 * This array must be kept in sync with the definitions in kdi_regs.h.
 */
static int vmm_kdi_regmap[] = {
	VM_REG_LAST,		/* KDIREG_SAVFP */
	VM_REG_LAST,		/* KDIREG_SAVPC */
	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
	VM_REG_GUEST_R8,	/* KDIREG_R8 */
	VM_REG_GUEST_R9,	/* KDIREG_R9 */
	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
	VM_REG_GUEST_R10,	/* KDIREG_R10 */
	VM_REG_GUEST_R11,	/* KDIREG_R11 */
	VM_REG_GUEST_R12,	/* KDIREG_R12 */
	VM_REG_GUEST_R13,	/* KDIREG_R13 */
	VM_REG_GUEST_R14,	/* KDIREG_R14 */
	VM_REG_GUEST_R15,	/* KDIREG_R15 */
	VM_REG_LAST,		/* KDIREG_FSBASE */
	VM_REG_LAST,		/* KDIREG_GSBASE */
	VM_REG_LAST,		/* KDIREG_KGSBASE */
	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
	VM_REG_GUEST_DS,	/* KDIREG_DS */
	VM_REG_GUEST_ES,	/* KDIREG_ES */
	VM_REG_GUEST_FS,	/* KDIREG_FS */
	VM_REG_GUEST_GS,	/* KDIREG_GS */
	VM_REG_LAST,		/* KDIREG_TRAPNO */
	VM_REG_LAST,		/* KDIREG_ERR */
	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
	VM_REG_GUEST_CS,	/* KDIREG_CS */
	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
	VM_REG_GUEST_SS		/* KDIREG_SS */
};
CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);

/*
 * Mapping of libvmm-defined registers to vmmapi-defined registers.
 *
 * This array must be kept in sync with the definitions in libvmm.h
 */
static int vmm_sys_regmap[] = {
	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
};
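
/*
 * Example usage (illustrative sketch only): reading one register from each
 * namespace through vmm_getreg() further below. KDIREG_RIP comes from
 * kdi_regs.h and is resolved via vmm_kdi_regmap, while VMM_REG_CR3 comes
 * from libvmm.h and is resolved via vmm_sys_regmap. The vCPU number is
 * hypothetical.
 *
 *	uint64_t rip, cr3;
 *
 *	if (vmm_getreg(vmm, 0, KDIREG_RIP, &rip) == 0 &&
 *	    vmm_getreg(vmm, 0, VMM_REG_CR3, &cr3) == 0)
 *		...
 */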

/*
 * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
 *
 * This array must be kept in sync with the definitions in libvmm.h
 */
static int vmm_descmap[] = {
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_SS
};

static int
vmm_mapreg(int reg)
{
	errno = 0;

	if (reg < 0)
		goto fail;

	if (reg < KDIREG_NGREG)
		return (vmm_kdi_regmap[reg]);

	if (reg >= VMM_REG_OFFSET &&
	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);

fail:
	errno = EINVAL;
	return (VM_REG_LAST);
}

static int
vmm_mapdesc(int desc)
{
	errno = 0;

	if (desc >= VMM_DESC_OFFSET &&
	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
		return (vmm_descmap[desc - VMM_DESC_OFFSET]);

	errno = EINVAL;
	return (VM_REG_LAST);
}

int
vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val));
}

int
vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val));
}

int
vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	int i;
	int ret = -1;

	vm_regnums = malloc(sizeof (int) * nregs);
	if (vm_regnums == NULL)
		return (ret);

	for (i = 0; i != nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}

int
vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	int i;
	int ret = -1;

	vm_regnums = malloc(sizeof (int) * nregs);
	if (vm_regnums == NULL)
		return (ret);

	for (i = 0; i != nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}

int
vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base,
	    &vd->vd_lim, &vd->vd_acc));
}

int
vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base,
	    vd->vd_lim, vd->vd_acc));
}

/*
 * Structure to hold MMU state during address translation.
 * The contents of vmm_mmu_regnum[] must be kept in sync with this.
 */
typedef struct vmm_mmu {
	uint64_t vm_cr0;
	uint64_t vm_cr3;
	uint64_t vm_cr4;
	uint64_t vm_efer;
} vmm_mmu_t;

static const int vmm_mmu_regnum[] = {
	VMM_REG_CR0,
	VMM_REG_CR3,
	VMM_REG_CR4,
	VMM_REG_EFER
};

#define	X86_PTE_P		0x001ULL
#define	X86_PTE_PS		0x080ULL
#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)

#define	X86_SEG_CODE_DATA	(1ULL << 4)
#define	X86_SEG_PRESENT		(1ULL << 7)
#define	X86_SEG_LONG		(1ULL << 13)
#define	X86_SEG_BIG		(1ULL << 14)
#define	X86_SEG_GRANULARITY	(1ULL << 15)
#define	X86_SEG_UNUSABLE	(1ULL << 16)
#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)
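
/*
 * Illustrative overview of how vmm_pte2paddr() below decomposes a long mode
 * (4-level, 64bit PTE) virtual address: at each level the low off_shift bits
 * of the address are the remaining offset, and the 9 bits above the
 * next-lower level select the table entry.
 *
 *	level 4: bits 47:39 index the PML4 (the pte argument is %cr3)
 *	level 3: bits 38:30 index the PDPT
 *	level 2: bits 29:21 index the page directory
 *	level 1: bits 20:12 index the page table
 *	level 0: bits 11:0 are the offset into the 4 KB page
 *
 * A set PS bit in the entry handed to level 2 or level 1 ends the walk
 * early with a 1 GB or 2 MB page, and the remaining low bits of the address
 * become the offset into that large page.
 */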

/*
 * vmm_pte2paddr:
 *
 * Recursively calculate the physical address from a virtual address,
 * starting at the given PTE level using the given PTE.
 */
static int
vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
    uint64_t vaddr, uint64_t *paddr)
{
	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
	int off_bits = ia32 ? 10 : 9;
	boolean_t hugepage = B_FALSE;
	uint64_t offset;
	uint64_t off_mask, off_shift;

	if (level < 4 && (pte & X86_PTE_P) == 0) {
		errno = EFAULT;
		return (-1);
	}

	off_shift = X86_PAGE_SHIFT + off_bits * level;
	off_mask = (1ULL << off_shift) - 1;

	offset = vaddr & off_mask;

	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
		hugepage = B_TRUE;
	} else {
		if (level > 0) {
			offset >>= off_shift - off_bits;
			offset <<= X86_PAGE_SHIFT - off_bits;
		}
		off_mask = 0xfff;
	}

	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;

	if (level == 0 || hugepage)
		return (0);

	pte = 0;
	if (vmm_pread(vmm, &pte, pte_size, *paddr) != pte_size)
		return (-1);

	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
}

static vmm_mode_t
vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu)
{
	if ((mmu->vm_cr0 & CR0_PE) == 0)
		return (VMM_MODE_REAL);
	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
		return (VMM_MODE_PROT);
	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
		return (VMM_MODE_PAE);
	else
		return (VMM_MODE_LONG);
}

vmm_mode_t
vmm_vcpu_mode(vmm_t *vmm, int vcpu)
{
	vmm_mmu_t mmu = { 0 };

	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (VMM_MODE_UNKNOWN);

	return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu));
}

vmm_isa_t
vmm_vcpu_isa(vmm_t *vmm, int vcpu)
{
	vmm_desc_t cs;

	if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0)
		return (VMM_ISA_UNKNOWN);

	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
	case 0x0:		/* 16b code segment */
		return (VMM_ISA_16);
	case X86_SEG_LONG:	/* 64b code segment */
		return (VMM_ISA_64);
	case X86_SEG_BIG:	/* 32b code segment */
		return (VMM_ISA_32);
	}

	return (VMM_ISA_UNKNOWN);
}

/*
 * vmm_vtol:
 *
 * Translate a virtual address to a linear address on a certain vCPU, using
 * the specified segment register or descriptor according to the mode.
 */
int
vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr)
{
	vmm_desc_t desc;
	uint64_t limit;

	if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0)
		return (-1);

	switch (vmm_vcpu_mode(vmm, vcpu)) {
	case VMM_MODE_REAL:
		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_PROT:
	case VMM_MODE_PAE:
		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
			/* unusable, system segment, or not present */
			goto fault;

		limit = desc.vd_lim;
		if (desc.vd_acc & X86_SEG_GRANULARITY)
			limit *= 4096;

		if (vaddr > limit)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_LONG:
		*laddr = desc.vd_base + vaddr;
		return (0);

	default:
	fault:
		errno = EFAULT;
		return (-1);
	}
}
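
/*
 * Example usage (illustrative sketch only): a consumer can check the
 * execution mode and ISA of a vCPU before deciding how wide its registers
 * and pointers are. The vCPU number is hypothetical.
 *
 *	boolean_t is64 = (vmm_vcpu_mode(vmm, 0) == VMM_MODE_LONG &&
 *	    vmm_vcpu_isa(vmm, 0) == VMM_ISA_64);
 */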

/*
 * vmm_vtop:
 *
 * Translate a virtual address to a guest physical address on a certain vCPU,
 * according to the mode the vCPU is in.
 */
int
vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr)
{
	vmm_mmu_t mmu = { 0 };
	int ret = 0;

	if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0)
		return (-1);

	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (-1);

	if ((mmu.vm_cr0 & CR0_PG) == 0) {
		/* no paging, physical equals virtual */
		*paddr = vaddr;
		return (0);
	}

	switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) {
	case VMM_MODE_PROT:
		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
		break;
	case VMM_MODE_PAE:
		/* protected mode with PAE: 3-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
		break;
	case VMM_MODE_LONG:
		/* long mode: 4-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
		break;
	default:
		ret = -1;
	}

	return (ret);
}

ssize_t
vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr)
{
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pread(vmm, buf, plen, paddr) != plen)
			return (0);

		len -= plen;
		addr += plen;
		buf += plen;
		res += plen;
	}

	return (res);
}

ssize_t
vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len,
    uintptr_t addr)
{
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
			return (0);

		len -= plen;
		addr += plen;
		buf += plen;
		res += plen;
	}

	return (res);
}
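
/*
 * Example usage (illustrative sketch only): reading the instruction bytes a
 * vCPU is about to execute, by fetching %rip and then reading guest virtual
 * memory through the %cs segment. The vCPU number and buffer size are
 * hypothetical.
 *
 *	uint64_t rip;
 *	uint8_t insn[16];
 *
 *	if (vmm_getreg(vmm, 0, KDIREG_RIP, &rip) == 0)
 *		(void) vmm_vread(vmm, 0, VMM_DESC_CS, insn, sizeof (insn),
 *		    rip);
 */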