/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)
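/*
 * Illustrative summary of the guest-physical layout implied by the
 * constants above (amd64 values shown; on other architectures
 * VM_LOWMEM_LIMIT is 0 and there is no lowmem region):
 *
 *	[0, VM_LOWMEM_LIMIT)			"lowmem", below 3GB
 *	[VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE)	left unmapped for guest MMIO
 *	[VM_HIGHMEM_BASE, ...)			"highmem", above 4GB
 */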
#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_open(void)
{
	if (modfind("vmm") < 0)
		(void)kldload("vmm");
	return (open("/dev/vmmctl", O_RDWR, 0));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	fd = vm_ctl_open();
	if (fd < 0)
		return (-1);

	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = vm_ctl_open()) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}
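/*
 * Illustrative lifecycle sketch (not part of the library); error handling
 * is omitted and the VM name "testvm" is hypothetical:
 *
 *	struct vmctx *ctx = vm_openf("testvm", VMMAPI_OPEN_CREATE);
 *	struct vcpu *vcpu = vm_vcpu_open(ctx, 0);
 *	...
 *	vm_vcpu_close(vcpu);
 *	vm_close(ctx);		// or vm_destroy(ctx) to also remove the VM
 */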
int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->lowmem_size;
	*highmem_size = ctx->highmem_size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}
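/*
 * Illustrative sketch (not part of the library): walking the existing
 * guest mappings with vm_mmap_getnext() above, assuming each iteration
 * resumes at the address just past the mapping returned previously.
 *
 *	vm_paddr_t gpa = 0;
 *	vm_ooffset_t segoff;
 *	size_t len;
 *	int segid, prot, flags;
 *
 *	while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &len, &prot,
 *	    &flags) == 0) {
 *		printf("segid %d gpa %#jx len %#zx\n", segid,
 *		    (uintmax_t)gpa, len);
 *		gpa += len;
 *	}
 */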
static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name,
    int ds_policy, domainset_t *ds_mask, size_t ds_size)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (ds_mask == NULL) {
		memseg.ds_policy = DOMAINSET_POLICY_INVALID;
	} else {
		memseg.ds_policy = ds_policy;
		memseg.ds_mask = ds_mask;
		memseg.ds_mask_size = ds_size;
	}
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len,
    size_t segoff, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}
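/*
 * Note: vm_alloc_memseg() only creates a segment object in the kernel;
 * pairing it with vm_mmap_memseg() (as map_memory_segment() above does)
 * is what makes the memory appear in the guest physical address space.
 */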
/*
 * Allocates and maps virtual machine memory segments according
 * to the NUMA topology specified by the 'doms' array.
 *
 * The domains are laid out sequentially in the guest's physical address space.
 * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and
 * left unmapped.
 */
int
vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms,
    struct vm_mem_domain *doms, int ndoms)
{
	size_t low_len, len, totalsize;
	struct vm_mem_domain *dom;
	struct vm_memseg memseg;
	char *baseaddr, *ptr;
	int error, i, segid;
	vm_paddr_t gpa;

	/* Sanity checks. */
	assert(vms == VM_MMAP_ALL);
	if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) {
		errno = EINVAL;
		return (-1);
	}

	/* Calculate total memory size. */
	totalsize = 0;
	for (i = 0; i < ndoms; i++)
		totalsize += doms[i].size;

	if (totalsize > VM_LOWMEM_LIMIT)
		totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);
	baseaddr = ptr + VM_MMAP_GUARD_SIZE;

	/*
	 * Allocate and map memory segments for the virtual machine.
	 */
	gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE;
	ctx->lowmem_size = 0;
	ctx->highmem_size = 0;
	for (i = 0; i < ndoms; i++) {
		segid = VM_SYSMEM + i;
		dom = &doms[i];

		/*
		 * Check if the memory segment already exists.
		 * If 'ndoms' is greater than one, refuse to proceed if the
		 * memseg already exists. If only one domain was requested, use
		 * the existing segment to preserve the behaviour of the previous
		 * implementation.
		 *
		 * Splitting existing memory segments is tedious and
		 * error-prone, which is why we don't support NUMA
		 * domains for bhyveload(8)-loaded VMs.
		 */
		error = vm_get_memseg(ctx, segid, &len, memseg.name,
		    sizeof(memseg.name));
		if (error == 0 && len != 0) {
			if (ndoms != 1) {
				errno = EEXIST;
				return (-1);
			} else
				doms[0].size = len;
		} else {
			error = vm_alloc_memseg(ctx, segid, dom->size, NULL,
			    dom->ds_policy, dom->ds_mask, dom->ds_size);
			if (error)
				return (error);
		}

		/*
		 * If a domain is split by VM_LOWMEM_LIMIT then break
		 * its segment mapping into two parts, one below VM_LOWMEM_LIMIT
		 * and one above VM_HIGHMEM_BASE.
		 */
		if (gpa <= VM_LOWMEM_LIMIT &&
		    gpa + dom->size > VM_LOWMEM_LIMIT) {
			low_len = VM_LOWMEM_LIMIT - gpa;
			error = map_memory_segment(ctx, segid, gpa, low_len, 0,
			    baseaddr);
			if (error)
				return (error);
			ctx->lowmem_size = VM_LOWMEM_LIMIT;
			/* Map the remainder. */
			gpa = VM_HIGHMEM_BASE;
			len = dom->size - low_len;
			error = map_memory_segment(ctx, segid, gpa, len,
			    low_len, baseaddr);
			if (error)
				return (error);
		} else {
			len = dom->size;
			error = map_memory_segment(ctx, segid, gpa, len, 0,
			    baseaddr);
			if (error)
				return (error);
		}
		if (gpa <= VM_LOWMEM_LIMIT)
			ctx->lowmem_size += len;
		else
			ctx->highmem_size += len;
		gpa += len;
	}
	ctx->baseaddr = baseaddr;

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	struct vm_mem_domain dom0;

	memset(&dom0, 0, sizeof(dom0));
	dom0.ds_policy = DOMAINSET_POLICY_INVALID;
	dom0.size = memsize;

	return (vm_setup_memory_domains(ctx, vms, &dom0, 1));
}
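/*
 * Illustrative sketch (not part of the library): give a VM 2GB of guest
 * memory and obtain a host pointer to a guest-physical address. The sizes
 * and addresses are arbitrary and error handling is omitted.
 *
 *	vm_set_memflags(ctx, VM_MEM_F_INCORE);		// optional
 *	vm_setup_memory(ctx, 2 * GB, VM_MMAP_ALL);
 *	void *hva = vm_map_gpa(ctx, 0x100000, 4096);
 *	memset(hva, 0, 4096);
 */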
/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->lowmem_size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->highmem_size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->lowmem_size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->highmem_size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{
	return (ctx->lowmem_size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{
	return (ctx->highmem_size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
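/*
 * Illustrative sketch (not part of the library): create a named device
 * memory segment and map it into the guest, in the style of bhyve's
 * bootrom handling. The segment id, name, guest address and length are
 * illustrative only; error handling is omitted.
 *
 *	char *romptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", romlen);
 *	if (romptr != MAP_FAILED)
 *		(void)vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, romlen,
 *		    PROT_READ | PROT_EXEC);
 */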
int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}
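/*
 * Illustrative sketch (not part of the library): enable a capability given
 * by name, for example from a command-line option. The name shown is
 * hypothetical; valid names are those listed in vm_capstrmap[].
 *
 *	int captype = vm_capability_name2type("hlt_exit");
 *	if (captype >= 0)
 *		(void)vm_set_capability(vcpu, captype, 1);
 */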
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif
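/*
 * Illustrative sketch (not part of the library): read 'len' bytes from a
 * guest linear address into a host buffer on amd64 using vm_copy_setup()
 * above and vm_copyin() below, assuming 'paging' describes the vcpu's
 * current paging mode as obtained during exit handling. Error handling is
 * omitted.
 *
 *	struct iovec iov[8];
 *	int fault;
 *
 *	if (vm_copy_setup(vcpu, &paging, gla, len, PROT_READ, iov,
 *	    nitems(iov), &fault) == 0 && !fault)
 *		vm_copyin(iov, buf, len);
 */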
void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif
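/*
 * Illustrative sketch (not part of the library): freeze all vcpus, check
 * which ones have been activated, then resume them. Error handling is
 * omitted.
 *
 *	cpuset_t active;
 *
 *	(void)vm_suspend_all_cpus(ctx);
 *	if (vm_active_cpus(ctx, &active) == 0 && CPU_ISSET(0, &active))
 *		... vcpu 0 has been activated ...
 *	(void)vm_resume_all_cpus(ctx);
 */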
#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}
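/*
 * Illustrative sketch (not part of the library): a process that has
 * finished setup can restrict the VM descriptor with vm_limit_rights()
 * and then enter capability mode via caph_enter() from
 * capsicum_helpers(3). Error handling is omitted.
 *
 *	if (vm_limit_rights(ctx) == 0 && caph_enter() == 0)
 *		... run the vcpu loop with vm_run(vcpu, &vmrun) ...
 */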