/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

/*
 * Library for native code to access bhyve VMs without the need to use
 * FreeBSD compat headers.
 */

#include <sys/param.h>
#include <sys/list.h>
#include <sys/stddef.h>
#include <sys/mman.h>
#include <sys/kdi_regs.h>
#include <sys/sysmacros.h>
#include <sys/controlregs.h>
#include <sys/note.h>
#include <sys/debug.h>
#include <errno.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>

#include <machine/vmm.h>
#include <vmmapi.h>

#include <libvmm.h>

typedef struct vmm_memseg vmm_memseg_t;

#define	VMM_MEMSEG_DEVMEM	0x1

struct vmm_memseg {
	list_node_t vms_list;
	int vms_segid;
	int vms_prot;
	int vms_flags;
	uintptr_t vms_gpa;
	off_t vms_segoff;
	size_t vms_seglen;
	size_t vms_maplen;
	char vms_name[64];
};

struct vmm {
	struct vmctx *vmm_ctx;
	list_t vmm_memlist;
	char *vmm_mem;
	size_t vmm_memsize;
	size_t vmm_ncpu;
};


/*
 * This code relies on two assumptions:
 * - CPUs are never removed from the "active set", not even when suspended.
 *   A CPU being active just means that it has been used by the guest OS.
 * - The CPU numbering is consecutive.
 */
static void
vmm_update_ncpu(vmm_t *vmm)
{
	cpuset_t cpuset;

	VERIFY0(vm_active_cpus(vmm->vmm_ctx, &cpuset));

	for (vmm->vmm_ncpu = 0;
	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
	    vmm->vmm_ncpu++)
		;
}

vmm_t *
vmm_open_vm(const char *name)
{
	vmm_t *vmm = NULL;

	vmm = malloc(sizeof (vmm_t));
	if (vmm == NULL)
		return (NULL);

	bzero(vmm, sizeof (vmm_t));
	vmm->vmm_mem = MAP_FAILED;

	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
	    offsetof(vmm_memseg_t, vms_list));

	vmm->vmm_ctx = vm_open(name);
	if (vmm->vmm_ctx == NULL) {
		free(vmm);
		return (NULL);
	}

	vmm_update_ncpu(vmm);

	/*
	 * If we open a VM that has just been created we may see a state
	 * where it has no CPUs configured yet. Sleep for 10ms at a time
	 * and retry until we get a non-zero CPU count.
	 */
	if (vmm->vmm_ncpu == 0) {
		do {
			(void) usleep(10000);
			vmm_update_ncpu(vmm);
		} while (vmm->vmm_ncpu == 0);
	}

	return (vmm);
}

void
vmm_close_vm(vmm_t *vmm)
{
	vmm_unmap(vmm);

	list_destroy(&vmm->vmm_memlist);

	if (vmm->vmm_ctx != NULL)
		vm_close(vmm->vmm_ctx);

	free(vmm);
}
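
/*
 * Example (illustrative only, not part of the library): open a handle
 * for an existing VM, report its vCPU count, and close it again. The
 * VM name "testvm" is a placeholder.
 *
 *	vmm_t *vmm = vmm_open_vm("testvm");
 *
 *	if (vmm != NULL) {
 *		(void) printf("%zu vCPUs\n", vmm_ncpu(vmm));
 *		vmm_close_vm(vmm);
 *	}
 */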

static vmm_memseg_t *
vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
{
	vmm_memseg_t ms, *ret;
	int error, flags;

	bzero(&ms, sizeof (vmm_memseg_t));
	ms.vms_gpa = gpa;
	error = vm_mmap_getnext(vmm->vmm_ctx, &ms.vms_gpa, &ms.vms_segid,
	    &ms.vms_segoff, &ms.vms_maplen, &ms.vms_prot, &flags);
	if (error)
		return (NULL);

	error = vm_get_memseg(vmm->vmm_ctx, ms.vms_segid, &ms.vms_seglen,
	    ms.vms_name, sizeof (ms.vms_name));
	if (error)
		return (NULL);

	/*
	 * Regular memory segments don't have a name, but devmem segments do.
	 * We can use that information to set the DEVMEM flag if necessary.
	 */
	ms.vms_flags = ms.vms_name[0] != '\0' ? VMM_MEMSEG_DEVMEM : 0;

	ret = malloc(sizeof (vmm_memseg_t));
	if (ret == NULL)
		return (NULL);

	*ret = ms;

	return (ret);
}

int
vmm_map(vmm_t *vmm, boolean_t writable)
{
	uintptr_t last_gpa = 0;
	vmm_memseg_t *ms;
	int prot_write = writable ? PROT_WRITE : 0;

	if (vmm->vmm_mem != MAP_FAILED) {
		errno = EINVAL;
		return (-1);
	}

	assert(list_is_empty(&vmm->vmm_memlist));

	for (;;) {
		ms = vmm_get_memseg(vmm, last_gpa);

		if (ms == NULL)
			break;

		last_gpa = ms->vms_gpa + ms->vms_maplen;
		list_insert_tail(&vmm->vmm_memlist, ms);
	}

	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);

	if (vmm->vmm_mem == MAP_FAILED)
		goto fail;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		off_t mapoff;

		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) == 0) {
			/*
			 * sysmem segments will be located at an offset
			 * equivalent to their GPA.
			 */
			mapoff = ms->vms_gpa;
		} else {
			/*
			 * devmem segments are located in a special region away
			 * from the normal GPA space.
			 */
			if (vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid,
			    &mapoff) != 0) {
				goto fail;
			}
		}

		/*
		 * While 'mapoff' points to the front of the segment, the actual
		 * mapping may be at some offset beyond that.
		 */
		VERIFY(ms->vms_segoff >= 0);
		mapoff += ms->vms_segoff;

		vmm->vmm_memsize += ms->vms_maplen;

		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
			goto fail;
	}

	return (0);

fail:
	vmm_unmap(vmm);

	return (-1);
}

void
vmm_unmap(vmm_t *vmm)
{
	while (!list_is_empty(&vmm->vmm_memlist)) {
		vmm_memseg_t *ms = list_remove_head(&vmm->vmm_memlist);

		if (vmm->vmm_mem != MAP_FAILED) {
			(void) munmap(vmm->vmm_mem + ms->vms_gpa,
			    ms->vms_maplen);
		}

		free(ms);
	}

	if (vmm->vmm_mem != MAP_FAILED)
		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);

	vmm->vmm_mem = MAP_FAILED;
	vmm->vmm_memsize = 0;
}
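
/*
 * vmm_pread() copies up to 'len' bytes of guest physical memory,
 * starting at GPA 'addr', into 'buf'; vmm_pwrite() is the symmetric
 * write operation. Both return the number of bytes copied and set
 * errno to EFAULT if the requested range is not fully backed by
 * mapped segments. A successful vmm_map() call is a prerequisite.
 */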

ssize_t
vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL && len != 0;
	    ms = list_next(&vmm->vmm_memlist, ms)) {

		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);

			if (res < 0)
				res = 0;

			bcopy(vmm->vmm_mem + addr, buf, len - res);
			count += len - res;
			addr += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}

ssize_t
vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL && len != 0;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);

			if (res < 0)
				res = 0;

			bcopy(buf, vmm->vmm_mem + addr, len - res);
			count += len - res;
			addr += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}
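
/*
 * Example (illustrative only): map guest memory read-only, read the
 * first page of guest physical memory into a local buffer, and tear
 * the mapping down again. use_page() stands in for whatever a caller
 * would do with the data.
 *
 *	uint8_t page[4096];
 *
 *	if (vmm_map(vmm, B_FALSE) == 0) {
 *		if (vmm_pread(vmm, page, sizeof (page), 0) ==
 *		    sizeof (page))
 *			use_page(page);
 *		vmm_unmap(vmm);
 *	}
 */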

size_t
vmm_ncpu(vmm_t *vmm)
{
	return (vmm->vmm_ncpu);
}

size_t
vmm_memsize(vmm_t *vmm)
{
	return (vmm->vmm_memsize);
}

int
vmm_cont(vmm_t *vmm)
{
	return (vm_resume_cpu(vmm->vmm_ctx, -1));
}

int
vmm_step(vmm_t *vmm, int vcpu)
{
	cpuset_t cpuset;
	int ret;

	if (vcpu < 0 || (size_t)vcpu >= vmm->vmm_ncpu) {
		errno = EINVAL;
		return (-1);
	}

	ret = vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 1);
	if (ret != 0)
		return (-1);

	VERIFY0(vm_resume_cpu(vmm->vmm_ctx, vcpu));

	do {
		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
	} while (!CPU_ISSET(vcpu, &cpuset));

	(void) vm_set_capability(vmm->vmm_ctx, vcpu, VM_CAP_MTRAP_EXIT, 0);

	return (ret);
}

int
vmm_stop(vmm_t *vmm)
{
	int ret = vm_suspend_cpu(vmm->vmm_ctx, -1);

	if (ret == 0)
		vmm_update_ncpu(vmm);

	return (ret);
}
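
/*
 * Example (illustrative only): suspend all vCPUs, single-step vCPU 0,
 * and then let the guest run again.
 *
 *	if (vmm_stop(vmm) == 0) {
 *		(void) vmm_step(vmm, 0);
 *		(void) vmm_cont(vmm);
 *	}
 */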

/*
 * Mapping of KDI-defined registers to vmmapi-defined registers.
 * Registers not known to vmmapi use VM_REG_LAST, which is invalid and
 * causes an error in vm_{get,set}_register_set().
 *
 * This array must be kept in sync with the definitions in kdi_regs.h.
 */
static int vmm_kdi_regmap[] = {
	VM_REG_LAST,		/* KDIREG_SAVFP */
	VM_REG_LAST,		/* KDIREG_SAVPC */
	VM_REG_GUEST_RDI,	/* KDIREG_RDI */
	VM_REG_GUEST_RSI,	/* KDIREG_RSI */
	VM_REG_GUEST_RDX,	/* KDIREG_RDX */
	VM_REG_GUEST_RCX,	/* KDIREG_RCX */
	VM_REG_GUEST_R8,	/* KDIREG_R8 */
	VM_REG_GUEST_R9,	/* KDIREG_R9 */
	VM_REG_GUEST_RAX,	/* KDIREG_RAX */
	VM_REG_GUEST_RBX,	/* KDIREG_RBX */
	VM_REG_GUEST_RBP,	/* KDIREG_RBP */
	VM_REG_GUEST_R10,	/* KDIREG_R10 */
	VM_REG_GUEST_R11,	/* KDIREG_R11 */
	VM_REG_GUEST_R12,	/* KDIREG_R12 */
	VM_REG_GUEST_R13,	/* KDIREG_R13 */
	VM_REG_GUEST_R14,	/* KDIREG_R14 */
	VM_REG_GUEST_R15,	/* KDIREG_R15 */
	VM_REG_LAST,		/* KDIREG_FSBASE */
	VM_REG_LAST,		/* KDIREG_GSBASE */
	VM_REG_LAST,		/* KDIREG_KGSBASE */
	VM_REG_GUEST_CR2,	/* KDIREG_CR2 */
	VM_REG_GUEST_CR3,	/* KDIREG_CR3 */
	VM_REG_GUEST_DS,	/* KDIREG_DS */
	VM_REG_GUEST_ES,	/* KDIREG_ES */
	VM_REG_GUEST_FS,	/* KDIREG_FS */
	VM_REG_GUEST_GS,	/* KDIREG_GS */
	VM_REG_LAST,		/* KDIREG_TRAPNO */
	VM_REG_LAST,		/* KDIREG_ERR */
	VM_REG_GUEST_RIP,	/* KDIREG_RIP */
	VM_REG_GUEST_CS,	/* KDIREG_CS */
	VM_REG_GUEST_RFLAGS,	/* KDIREG_RFLAGS */
	VM_REG_GUEST_RSP,	/* KDIREG_RSP */
	VM_REG_GUEST_SS		/* KDIREG_SS */
};
CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);

/*
 * Mapping of libvmm-defined registers to vmmapi-defined registers.
 *
 * This array must be kept in sync with the definitions in libvmm.h.
 */
static int vmm_sys_regmap[] = {
	VM_REG_GUEST_CR0,	/* VMM_REG_CR0 */
	VM_REG_GUEST_CR2,	/* VMM_REG_CR2 */
	VM_REG_GUEST_CR3,	/* VMM_REG_CR3 */
	VM_REG_GUEST_CR4,	/* VMM_REG_CR4 */
	VM_REG_GUEST_DR0,	/* VMM_REG_DR0 */
	VM_REG_GUEST_DR1,	/* VMM_REG_DR1 */
	VM_REG_GUEST_DR2,	/* VMM_REG_DR2 */
	VM_REG_GUEST_DR3,	/* VMM_REG_DR3 */
	VM_REG_GUEST_DR6,	/* VMM_REG_DR6 */
	VM_REG_GUEST_DR7,	/* VMM_REG_DR7 */
	VM_REG_GUEST_EFER,	/* VMM_REG_EFER */
	VM_REG_GUEST_PDPTE0,	/* VMM_REG_PDPTE0 */
	VM_REG_GUEST_PDPTE1,	/* VMM_REG_PDPTE1 */
	VM_REG_GUEST_PDPTE2,	/* VMM_REG_PDPTE2 */
	VM_REG_GUEST_PDPTE3,	/* VMM_REG_PDPTE3 */
	VM_REG_GUEST_INTR_SHADOW, /* VMM_REG_INTR_SHADOW */
};

/*
 * Mapping of libvmm-defined descriptors to vmmapi-defined descriptors.
 *
 * This array must be kept in sync with the definitions in libvmm.h.
 */
static int vmm_descmap[] = {
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_SS
};

static int
vmm_mapreg(int reg)
{
	errno = 0;

	if (reg < 0)
		goto fail;

	if (reg < KDIREG_NGREG)
		return (vmm_kdi_regmap[reg]);

	if (reg >= VMM_REG_OFFSET &&
	    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
		return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);

fail:
	errno = EINVAL;
	return (VM_REG_LAST);
}

static int
vmm_mapdesc(int desc)
{
	errno = 0;

	if (desc >= VMM_DESC_OFFSET &&
	    desc < VMM_DESC_OFFSET + ARRAY_SIZE(vmm_descmap))
		return (vmm_descmap[desc - VMM_DESC_OFFSET]);

	errno = EINVAL;
	return (VM_REG_LAST);
}

int
vmm_getreg(vmm_t *vmm, int vcpu, int reg, uint64_t *val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_get_register(vmm->vmm_ctx, vcpu, reg, val));
}

int
vmm_setreg(vmm_t *vmm, int vcpu, int reg, uint64_t val)
{
	reg = vmm_mapreg(reg);

	if (reg == VM_REG_LAST)
		return (-1);

	return (vm_set_register(vmm->vmm_ctx, vcpu, reg, val));
}
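
/*
 * Example (illustrative only): read the instruction pointer of vCPU 0,
 * using the KDI register numbering from kdi_regs.h.
 *
 *	uint64_t rip;
 *
 *	if (vmm_getreg(vmm, 0, KDIREG_RIP, &rip) == 0)
 *		(void) printf("vCPU 0 RIP: %#lx\n", rip);
 */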

int
vmm_get_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	int i;
	int ret = -1;

	vm_regnums = malloc(sizeof (int) * nregs);
	if (vm_regnums == NULL)
		return (ret);

	for (i = 0; i != nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_get_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}

int
vmm_set_regset(vmm_t *vmm, int vcpu, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	int i;
	int ret = -1;

	vm_regnums = malloc(sizeof (int) * nregs);
	if (vm_regnums == NULL)
		return (ret);

	for (i = 0; i != nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_set_register_set(vmm->vmm_ctx, vcpu, nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}
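
/*
 * Example (illustrative only): fetch several registers of vCPU 0 with
 * a single call; regvals[] is filled in the order given by regnums[].
 *
 *	static const int regnums[] = { KDIREG_RIP, KDIREG_RSP };
 *	uint64_t regvals[ARRAY_SIZE(regnums)];
 *
 *	if (vmm_get_regset(vmm, 0, ARRAY_SIZE(regnums), regnums,
 *	    regvals) == 0)
 *		(void) printf("RIP %#lx RSP %#lx\n", regvals[0],
 *		    regvals[1]);
 */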

int
vmm_get_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_get_desc(vmm->vmm_ctx, vcpu, desc, &vd->vd_base, &vd->vd_lim,
	    &vd->vd_acc));
}

int
vmm_set_desc(vmm_t *vmm, int vcpu, int desc, vmm_desc_t *vd)
{
	desc = vmm_mapdesc(desc);
	if (desc == VM_REG_LAST)
		return (-1);

	return (vm_set_desc(vmm->vmm_ctx, vcpu, desc, vd->vd_base, vd->vd_lim,
	    vd->vd_acc));
}

/*
 * Structure to hold MMU state during address translation.
 * The contents of vmm_mmu_regnum[] must be kept in sync with this.
 */
typedef struct vmm_mmu {
	uint64_t vm_cr0;
	uint64_t vm_cr3;
	uint64_t vm_cr4;
	uint64_t vm_efer;
} vmm_mmu_t;

static const int vmm_mmu_regnum[] = {
	VMM_REG_CR0,
	VMM_REG_CR3,
	VMM_REG_CR4,
	VMM_REG_EFER
};

#define	X86_PTE_P		0x001ULL
#define	X86_PTE_PS		0x080ULL

#define	X86_PTE_PHYSMASK	0x000ffffffffff000ULL
#define	X86_PAGE_SHIFT		12
#define	X86_PAGE_SIZE		(1ULL << X86_PAGE_SHIFT)

#define	X86_SEG_CODE_DATA	(1ULL << 4)
#define	X86_SEG_PRESENT		(1ULL << 7)
#define	X86_SEG_LONG		(1ULL << 13)
#define	X86_SEG_BIG		(1ULL << 14)
#define	X86_SEG_GRANULARITY	(1ULL << 15)
#define	X86_SEG_UNUSABLE	(1ULL << 16)

#define	X86_SEG_USABLE		(X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define	X86_SEG_USABLE_MASK	(X86_SEG_UNUSABLE | X86_SEG_USABLE)

/*
 * vmm_pte2paddr:
 *
 * Recursively walk the page tables to translate a linear address into a
 * guest physical address, starting at the given paging level with the
 * given PTE (the value of CR3 serves as the PTE at the topmost level).
 */
static int
vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
    uint64_t vaddr, uint64_t *paddr)
{
	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
	int off_bits = ia32 ? 10 : 9;
	boolean_t hugepage = B_FALSE;
	uint64_t offset;
	uint64_t off_mask, off_shift;

	if (level < 4 && (pte & X86_PTE_P) == 0) {
		errno = EFAULT;
		return (-1);
	}

	off_shift = X86_PAGE_SHIFT + off_bits * level;
	off_mask = (1ULL << off_shift) - 1;

	offset = vaddr & off_mask;

	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
		hugepage = B_TRUE;
	} else {
		if (level > 0) {
			offset >>= off_shift - off_bits;
			offset <<= X86_PAGE_SHIFT - off_bits;
		}
		off_mask = 0xfff;
	}

	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;

	if (level == 0 || hugepage)
		return (0);

	pte = 0;
	if (vmm_pread(vmm, &pte,  pte_size, *paddr) != pte_size)
		return (-1);
	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
}
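
/*
 * For reference, the paging modes handled above split a linear address
 * into per-level table indices plus a 12-bit page offset:
 *
 *	long mode (level 4):	9 | 9 | 9 | 9 | 12	64bit PTEs
 *	PAE (level 3):		2 | 9 | 9 | 12		64bit PTEs
 *	non-PAE (level 2):	10 | 10 | 12		32bit PTEs
 *
 * At each level the index, scaled by the PTE size, selects an entry in
 * the table addressed by the previous PTE (or by CR3 at the top level).
 * A set X86_PTE_PS bit at level 1 or 2 indicates a large page, which
 * ends the walk with the remaining low bits as the offset into it.
 */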

static vmm_mode_t
vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpu, vmm_mmu_t *mmu)
{
	if ((mmu->vm_cr0 & CR0_PE) == 0)
		return (VMM_MODE_REAL);
	else if ((mmu->vm_cr4 & CR4_PAE) == 0)
		return (VMM_MODE_PROT);
	else if ((mmu->vm_efer & AMD_EFER_LME) == 0)
		return (VMM_MODE_PAE);
	else
		return (VMM_MODE_LONG);
}

vmm_mode_t
vmm_vcpu_mode(vmm_t *vmm, int vcpu)
{
	vmm_mmu_t mmu = { 0 };

	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (VMM_MODE_UNKNOWN);

	return (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu));
}

vmm_isa_t
vmm_vcpu_isa(vmm_t *vmm, int vcpu)
{
	vmm_desc_t cs;

	if (vmm_get_desc(vmm, vcpu, VMM_DESC_CS, &cs) != 0)
		return (VMM_ISA_UNKNOWN);

	switch (cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG)) {
	case 0x0:		/* 16b code segment */
		return (VMM_ISA_16);
	case X86_SEG_LONG:	/* 64b code segment */
		return (VMM_ISA_64);
	case X86_SEG_BIG:	/* 32b code segment */
		return (VMM_ISA_32);
	}

	return (VMM_ISA_UNKNOWN);
}
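
/*
 * Example (illustrative only): check the execution state of vCPU 0
 * before deciding how to decode its instruction stream; decode_64bit()
 * stands in for caller-supplied disassembly logic.
 *
 *	if (vmm_vcpu_mode(vmm, 0) == VMM_MODE_LONG &&
 *	    vmm_vcpu_isa(vmm, 0) == VMM_ISA_64)
 *		decode_64bit(vmm);
 */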

/*
 * vmm_vtol:
 *
 * Translate a virtual address into a linear address on a certain vCPU,
 * using the specified segment register or descriptor according to the mode.
 */
int
vmm_vtol(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *laddr)
{
	vmm_desc_t desc;
	uint64_t limit;

	if (vmm_get_desc(vmm, vcpu, seg, &desc) != 0)
		return (-1);

	switch (vmm_vcpu_mode(vmm, vcpu)) {
	case VMM_MODE_REAL:
		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_PROT:
	case VMM_MODE_PAE:
		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
			/* unusable, system segment, or not present */
			goto fault;

		limit = desc.vd_lim;
		if (desc.vd_acc & X86_SEG_GRANULARITY)
			limit *= 4096;

		if (vaddr > limit)
			goto fault;
		/* FALLTHRU */
	case VMM_MODE_LONG:
		*laddr = desc.vd_base + vaddr;
		return (0);

	default:
	fault:
		errno = EFAULT;
		return (-1);
	}
}

/*
 * vmm_vtop:
 *
 * Translate a virtual address to a guest physical address on a certain vCPU,
 * according to the mode the vCPU is in.
 */
int
vmm_vtop(vmm_t *vmm, int vcpu, int seg, uint64_t vaddr, uint64_t *paddr)
{
	vmm_mmu_t mmu = { 0 };
	int ret = 0;

	if (vmm_vtol(vmm, vcpu, seg, vaddr, &vaddr) != 0)
		return (-1);

	if (vmm_get_regset(vmm, vcpu, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (-1);

	if ((mmu.vm_cr0 & CR0_PG) == 0) {
		/* no paging, physical equals virtual */
		*paddr = vaddr;
		return (0);
	}

	switch (vmm_vcpu_mmu_mode(vmm, vcpu, &mmu)) {
	case VMM_MODE_PROT:
		/* protected mode, no PAE: 2-level paging, 32bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr, paddr);
		break;
	case VMM_MODE_PAE:
		/* protected mode with PAE: 3-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr, paddr);
		break;
	case VMM_MODE_LONG:
		/* long mode: 4-level paging, 64bit PTEs */
		ret = vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr, paddr);
		break;
	default:
		ret = -1;
	}

	return (ret);
}
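
/*
 * Example (illustrative only): translate a guest virtual address in the
 * data segment of vCPU 0 into a guest physical address. This assumes
 * VMM_DESC_DS is defined in libvmm.h alongside VMM_DESC_CS.
 *
 *	uint64_t paddr;
 *
 *	if (vmm_vtop(vmm, 0, VMM_DESC_DS, vaddr, &paddr) == 0)
 *		(void) printf("%#lx -> %#lx\n", vaddr, paddr);
 */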

ssize_t
vmm_vread(vmm_t *vmm, int vcpu, int seg, void *buf, size_t len, uintptr_t addr)
{
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pread(vmm, buf, plen, paddr) != plen)
			return (0);
		len -= plen;
		addr += plen;
		buf += plen;
		res += plen;
	}

	return (res);
}

ssize_t
vmm_vwrite(vmm_t *vmm, int vcpu, int seg, const void *buf, size_t len,
    uintptr_t addr)
{
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpu, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pwrite(vmm, buf, plen, paddr) != plen)
			return (0);
		len -= plen;
		addr += plen;
		buf += plen;
		res += plen;
	}

	return (res);
}
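
/*
 * Example (illustrative only): read a guest data structure through its
 * virtual address, letting vmm_vread() handle segmentation, paging, and
 * page boundary crossings. This requires a prior successful vmm_map()
 * call and, as above, assumes VMM_DESC_DS is defined in libvmm.h.
 *
 *	struct foo foo;
 *
 *	if (vmm_vread(vmm, 0, VMM_DESC_DS, &foo, sizeof (foo),
 *	    foo_vaddr) != sizeof (foo)) {
 *		(void) fprintf(stderr, "vmm_vread: %s\n",
 *		    strerror(errno));
 *	}
 */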