/* xref: /freebsd/lib/libvmmapi/vmmapi.c (revision 8b06bdc91ddff995beed7bdcb6e5541c5ca227ef) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

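/*
 * Open the /dev/vmm/<name> device node for an existing virtual machine and
 * return its file descriptor, or -1 (with errno set by open(2)) on failure.
 */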
static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0)
		kldload("vmm");
	return (CREATE(name));
}

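/*
 * Example (illustrative sketch only, not part of the library): the typical
 * lifecycle a caller such as bhyve(8) follows.  The VM name "example-vm" is
 * hypothetical and error handling is omitted for brevity.
 *
 *	vm_create("example-vm");
 *	struct vmctx *ctx = vm_open("example-vm");
 *	vm_setup_memory(ctx, 1024 * MB, VM_MMAP_ALL);
 *	struct vcpu *vcpu = vm_vcpu_open(ctx, 0);
 *	...run the guest, e.g. via vm_run()...
 *	vm_vcpu_close(vcpu);
 *	vm_destroy(ctx);
 */

/*
 * Allocate a vmctx for an existing VM and open its device node.  On failure
 * the context is freed and NULL is returned, with errno preserved from the
 * failed open.
 */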
struct vmctx *
vm_open(const char *name)
{
	struct vmctx *vm;
	int saved_errno;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = -1;
	vm->memflags = 0;
	vm->lowmem_limit = 3 * GB;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);

	if ((vm->fd = vm_device_open(vm->name)) < 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	free(vm);
	errno = saved_errno;
	return (NULL);
}

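/*
 * vm_close() only releases the library context and its file descriptor;
 * vm_destroy() additionally tears down the VM itself via the hw.vmm.destroy
 * sysctl.
 */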
void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	close(vm->fd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}

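/*
 * Allocate a handle for the given vcpu.  The handle simply pairs the VM
 * context with a vcpu id; no ioctl is issued until the vcpu is used.
 */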
struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

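/*
 * Parse a memory-size option string.  A bare number smaller than one
 * megabyte is taken to be in units of MB for backward compatibility
 * (e.g. "512" means 512 MB); anything else, including suffixed forms
 * such as "2G", is handed to expand_number(3).
 */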
int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx)
{

	return (ctx->lowmem_limit);
}

void
vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
{

	ctx->lowmem_limit = limit;
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->lowmem;
	*highmem_size = ctx->highmem;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

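/*
 * Map a range of system memory into the guest at [gpa, gpa+len) and mirror
 * it into the host process at base+gpa so the caller can access guest RAM
 * directly.
 */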
static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

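/*
 * Allocate and map guest system memory.  Memory up to the lowmem limit
 * (3 GB by default) becomes the 'lowmem' region starting at guest physical
 * address 0; any remainder becomes the 'highmem' region starting at 4 GB.
 * Both are mapped inside one contiguous host VA reservation bracketed by
 * unmapped guard regions, with ctx->baseaddr pointing at guest address 0.
 */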
int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then
	 * create another 'highmem' segment above 4GB for the remainder.
	 */
	if (memsize > ctx->lowmem_limit) {
		ctx->lowmem = ctx->lowmem_limit;
		ctx->highmem = memsize - ctx->lowmem_limit;
		objsize = 4*GB + ctx->highmem;
	} else {
		ctx->lowmem = memsize;
		ctx->highmem = 0;
		objsize = ctx->lowmem;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->highmem > 0) {
		gpa = 4*GB;
		len = ctx->highmem;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->lowmem > 0) {
		gpa = 0;
		len = ctx->lowmem;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}

/*
 * Return a non-NULL host pointer if [gaddr, gaddr+len) is entirely contained
 * in the lowmem or highmem region.
 *
 * In particular, return NULL if [gaddr, gaddr+len) falls within the guest
 * MMIO region; the instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{

	if (ctx->lowmem > 0) {
		if (gaddr < ctx->lowmem && len <= ctx->lowmem &&
		    gaddr + len <= ctx->lowmem)
			return (ctx->baseaddr + gaddr);
	}

	if (ctx->highmem > 0) {
		if (gaddr >= 4*GB) {
			if (gaddr < 4*GB + ctx->highmem &&
			    len <= ctx->highmem &&
			    gaddr + len <= 4*GB + ctx->highmem)
				return (ctx->baseaddr + gaddr);
		}
	}

	return (NULL);
}

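/*
 * Inverse of vm_map_gpa(): translate a host pointer inside the mapped guest
 * memory back into a guest physical address.  Returns (vm_paddr_t)-1 if the
 * pointer does not fall within the lowmem or highmem region.
 */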
vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;

	offaddr = (char *)addr - ctx->baseaddr;

	if (ctx->lowmem > 0)
		if (offaddr <= ctx->lowmem)
			return (offaddr);

	if (ctx->highmem > 0)
		if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->lowmem);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->highmem);
}

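/*
 * Create a device memory segment and map it into the host address space.
 * The segment is exposed as /dev/vmm.io/<vmname>.<name>; the returned
 * pointer refers to a mapping bracketed by guard regions, or MAP_FAILED
 * on error.
 */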
void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

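/*
 * Fetch all statistics for a vcpu by issuing VM_STATS in statbuf-sized
 * chunks and accumulating the results in a thread-local buffer.  The
 * returned buffer is owned by the library, is reused by later calls on
 * the same thread, and must not be freed by the caller.
 */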
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
	     int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

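/*
 * Build an iovec of host pointers covering the guest linear address range
 * [gla, gla+len), splitting at page boundaries.  On a translation fault,
 * *fault is set and 0 is returned; EFAULT is returned if a translated
 * address lies outside guest RAM.
 *
 * Example (sketch, names hypothetical): read an 8-byte guest value at
 * linear address 'gla', where 'paging' describes the vcpu's paging mode:
 *
 *	struct iovec iov[2];
 *	uint64_t val;
 *	int fault;
 *
 *	if (vm_copy_setup(vcpu, &paging, gla, sizeof(val), PROT_READ,
 *	    iov, nitems(iov), &fault) == 0 && !fault)
 *		vm_copyin(iov, &val, sizeof(val));
 */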
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty.  This is used by the instruction
	 * emulation code shared with the kernel.  The in-kernel
	 * version of this is non-empty.
	 */
}

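/*
 * Copy data between a host buffer and guest memory described by an iovec
 * previously built with vm_copy_setup(): vm_copyin() reads from the guest,
 * vm_copyout() writes to it.
 */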
void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}

int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

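/*
 * Enter Capsicum-friendly mode: restrict the VM device descriptor to the
 * ioctl and mmap rights the library needs, and limit the permitted ioctl
 * commands to the vmm command list.
 */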
int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code.  Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}