xref: /freebsd/lib/libvmmapi/vmmapi.c (revision c57c26179033f64c2011a2d2a904ee3fa62e826a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/capsicum.h>
31 #include <sys/sysctl.h>
32 #include <sys/ioctl.h>
33 #include <sys/mman.h>
34 #include <sys/linker.h>
35 #include <sys/module.h>
36 #include <sys/_iovec.h>
37 #include <sys/cpuset.h>
38 
39 #include <capsicum_helpers.h>
40 #include <errno.h>
41 #include <stdbool.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <assert.h>
45 #include <string.h>
46 #include <fcntl.h>
47 #include <unistd.h>
48 
49 #include <libutil.h>
50 
51 #include <vm/vm.h>
52 #include <machine/vmm.h>
53 #include <machine/vmm_dev.h>
54 #ifdef WITH_VMMAPI_SNAPSHOT
55 #include <machine/vmm_snapshot.h>
56 #endif
57 
58 #include "vmmapi.h"
59 #include "internal.h"
60 
/* Byte-count multipliers; the UL suffix makes each product unsigned long. */
#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

/*
 * VM_LOWMEM_LIMIT caps the size of the 'lowmem' segment; guest memory
 * beyond it is placed in the 'highmem' segment at VM_HIGHMEM_BASE.
 */
#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

/* Protection sets passed to mmap(2) and to the guest memory mapping ioctl. */
#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

/*
 * Write the VM name 'x' to the hw.vmm.create / hw.vmm.destroy sysctl.
 * The length passed is strlen(x), i.e. without the terminating NUL, and
 * 'x' is evaluated twice.
 */
#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
83 
/*
 * Open the device node "/dev/vmm/<name>" for reading and writing.
 *
 * Returns the file descriptor on success.  Returns -1 with errno set if
 * the path buffer cannot be allocated (ENOMEM) or if open(2) fails; in the
 * latter case errno is the value left by open(2).
 */
static int
vm_device_open(const char *name)
{
	static const char prefix[] = "/dev/vmm/";
	char *vmfile;
	size_t len;
	int fd, saved_errno;

	/* sizeof(prefix) already includes room for the terminating NUL. */
	len = sizeof(prefix) + strlen(name);
	vmfile = malloc(len);
	if (vmfile == NULL) {
		errno = ENOMEM;
		return (-1);
	}
	(void)snprintf(vmfile, len, "%s%s", prefix, name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	/* Do not let free(3) disturb the errno reported by open(2). */
	saved_errno = errno;
	free(vmfile);
	errno = saved_errno;
	return (fd);
}
101 
/*
 * Create a virtual machine called 'name'.
 *
 * If modfind(2) does not find the "vmm" module, a kldload(2) of it is
 * attempted first; the outcome of that attempt is not checked.  The return
 * value is that of the hw.vmm.create sysctl request.
 */
int
vm_create(const char *name)
{
	int modid;

	/* Try to load vmm(4) module before creating a guest. */
	modid = modfind("vmm");
	if (modid < 0)
		kldload("vmm");

	return (CREATE(name));
}
110 
111 struct vmctx *
112 vm_open(const char *name)
113 {
114 	struct vmctx *vm;
115 	int saved_errno;
116 
117 	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
118 	assert(vm != NULL);
119 
120 	vm->fd = -1;
121 	vm->memflags = 0;
122 	vm->name = (char *)(vm + 1);
123 	strcpy(vm->name, name);
124 	memset(vm->memsegs, 0, sizeof(vm->memsegs));
125 
126 	if ((vm->fd = vm_device_open(vm->name)) < 0)
127 		goto err;
128 
129 	return (vm);
130 err:
131 	saved_errno = errno;
132 	free(vm);
133 	errno = saved_errno;
134 	return (NULL);
135 }
136 
137 void
138 vm_close(struct vmctx *vm)
139 {
140 	assert(vm != NULL);
141 
142 	close(vm->fd);
143 	free(vm);
144 }
145 
146 void
147 vm_destroy(struct vmctx *vm)
148 {
149 	assert(vm != NULL);
150 
151 	if (vm->fd >= 0)
152 		close(vm->fd);
153 	DESTROY(vm->name);
154 
155 	free(vm);
156 }
157 
158 struct vcpu *
159 vm_vcpu_open(struct vmctx *ctx, int vcpuid)
160 {
161 	struct vcpu *vcpu;
162 
163 	vcpu = malloc(sizeof(*vcpu));
164 	vcpu->ctx = ctx;
165 	vcpu->vcpuid = vcpuid;
166 	return (vcpu);
167 }
168 
/*
 * Free a vcpu handle.  Only the handle's own memory is released; the VM
 * context it refers to is left untouched.
 */
void
vm_vcpu_close(struct vcpu *vcpu)
{

	free(vcpu);
}
174 
175 int
176 vcpu_id(struct vcpu *vcpu)
177 {
178 	return (vcpu->vcpuid);
179 }
180 
181 int
182 vm_parse_memsize(const char *opt, size_t *ret_memsize)
183 {
184 	char *endptr;
185 	size_t optval;
186 	int error;
187 
188 	optval = strtoul(opt, &endptr, 0);
189 	if (*opt != '\0' && *endptr == '\0') {
190 		/*
191 		 * For the sake of backward compatibility if the memory size
192 		 * specified on the command line is less than a megabyte then
193 		 * it is interpreted as being in units of MB.
194 		 */
195 		if (optval < MB)
196 			optval *= MB;
197 		*ret_memsize = optval;
198 		error = 0;
199 	} else
200 		error = expand_number(opt, ret_memsize);
201 
202 	return (error);
203 }
204 
205 uint32_t
206 vm_get_lowmem_limit(struct vmctx *ctx __unused)
207 {
208 
209 	return (VM_LOWMEM_LIMIT);
210 }
211 
212 void
213 vm_set_memflags(struct vmctx *ctx, int flags)
214 {
215 
216 	ctx->memflags = flags;
217 }
218 
219 int
220 vm_get_memflags(struct vmctx *ctx)
221 {
222 
223 	return (ctx->memflags);
224 }
225 
226 /*
227  * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
228  */
229 int
230 vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
231     size_t len, int prot)
232 {
233 	struct vm_memmap memmap;
234 	int error, flags;
235 
236 	memmap.gpa = gpa;
237 	memmap.segid = segid;
238 	memmap.segoff = off;
239 	memmap.len = len;
240 	memmap.prot = prot;
241 	memmap.flags = 0;
242 
243 	if (ctx->memflags & VM_MEM_F_WIRED)
244 		memmap.flags |= VM_MEMMAP_F_WIRED;
245 
246 	/*
247 	 * If this mapping already exists then don't create it again. This
248 	 * is the common case for SYSMEM mappings created by bhyveload(8).
249 	 */
250 	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
251 	if (error == 0 && gpa == memmap.gpa) {
252 		if (segid != memmap.segid || off != memmap.segoff ||
253 		    prot != memmap.prot || flags != memmap.flags) {
254 			errno = EEXIST;
255 			return (-1);
256 		} else {
257 			return (0);
258 		}
259 	}
260 
261 	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
262 	return (error);
263 }
264 
265 int
266 vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
267     size_t *lowmem_size, size_t *highmem_size)
268 {
269 
270 	*guest_baseaddr = ctx->baseaddr;
271 	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
272 	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
273 	return (0);
274 }
275 
276 int
277 vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
278 {
279 	struct vm_munmap munmap;
280 	int error;
281 
282 	munmap.gpa = gpa;
283 	munmap.len = len;
284 
285 	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
286 	return (error);
287 }
288 
289 int
290 vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
291     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
292 {
293 	struct vm_memmap memmap;
294 	int error;
295 
296 	bzero(&memmap, sizeof(struct vm_memmap));
297 	memmap.gpa = *gpa;
298 	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
299 	if (error == 0) {
300 		*gpa = memmap.gpa;
301 		*segid = memmap.segid;
302 		*segoff = memmap.segoff;
303 		*len = memmap.len;
304 		*prot = memmap.prot;
305 		*flags = memmap.flags;
306 	}
307 	return (error);
308 }
309 
/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * Two segments are identical when their lengths match and their names are
 * either both absent (NULL) or both present and equal.  This is slightly
 * complicated by the fact that only device memory segments are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{
	if (len != len2)
		return (-1);

	/* Two unnamed segments of equal length match. */
	if (str == NULL && str2 == NULL)
		return (0);

	/* A named segment never matches an unnamed one. */
	if (str == NULL || str2 == NULL)
		return (-1);

	return (strcmp(str, str2) == 0 ? 0 : -1);
}
326 
327 static int
328 vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
329 {
330 	struct vm_memseg memseg;
331 	size_t n;
332 	int error;
333 
334 	/*
335 	 * If the memory segment has already been created then just return.
336 	 * This is the usual case for the SYSMEM segment created by userspace
337 	 * loaders like bhyveload(8).
338 	 */
339 	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
340 	    sizeof(memseg.name));
341 	if (error)
342 		return (error);
343 
344 	if (memseg.len != 0) {
345 		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
346 			errno = EINVAL;
347 			return (-1);
348 		} else {
349 			return (0);
350 		}
351 	}
352 
353 	bzero(&memseg, sizeof(struct vm_memseg));
354 	memseg.segid = segid;
355 	memseg.len = len;
356 	if (name != NULL) {
357 		n = strlcpy(memseg.name, name, sizeof(memseg.name));
358 		if (n >= sizeof(memseg.name)) {
359 			errno = ENAMETOOLONG;
360 			return (-1);
361 		}
362 	}
363 
364 	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
365 	return (error);
366 }
367 
368 int
369 vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
370     size_t bufsize)
371 {
372 	struct vm_memseg memseg;
373 	size_t n;
374 	int error;
375 
376 	bzero(&memseg, sizeof(memseg));
377 	memseg.segid = segid;
378 	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
379 	if (error == 0) {
380 		*lenp = memseg.len;
381 		n = strlcpy(namebuf, memseg.name, bufsize);
382 		if (n >= bufsize) {
383 			errno = ENAMETOOLONG;
384 			error = -1;
385 		}
386 	}
387 	return (error);
388 }
389 
390 static int
391 setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
392 {
393 	char *ptr;
394 	int error, flags;
395 
396 	/* Map 'len' bytes starting at 'gpa' in the guest address space */
397 	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
398 	if (error)
399 		return (error);
400 
401 	flags = MAP_SHARED | MAP_FIXED;
402 	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
403 		flags |= MAP_NOCORE;
404 
405 	/* mmap into the process address space on the host */
406 	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
407 	if (ptr == MAP_FAILED)
408 		return (-1);
409 
410 	return (0);
411 }
412 
413 int
414 vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
415 {
416 	size_t objsize, len;
417 	vm_paddr_t gpa;
418 	char *baseaddr, *ptr;
419 	int error;
420 
421 	assert(vms == VM_MMAP_ALL);
422 
423 	/*
424 	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
425 	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
426 	 */
427 	if (memsize > VM_LOWMEM_LIMIT) {
428 		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
429 		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
430 		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
431 	} else {
432 		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
433 		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
434 		objsize = memsize;
435 	}
436 
437 	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
438 	if (error)
439 		return (error);
440 
441 	/*
442 	 * Stake out a contiguous region covering the guest physical memory
443 	 * and the adjoining guard regions.
444 	 */
445 	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
446 	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
447 	if (ptr == MAP_FAILED)
448 		return (-1);
449 
450 	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
451 	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
452 		gpa = VM_HIGHMEM_BASE;
453 		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
454 		error = setup_memory_segment(ctx, gpa, len, baseaddr);
455 		if (error)
456 			return (error);
457 	}
458 
459 	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
460 		gpa = 0;
461 		len = ctx->memsegs[VM_MEMSEG_LOW].size;
462 		error = setup_memory_segment(ctx, gpa, len, baseaddr);
463 		if (error)
464 			return (error);
465 	}
466 
467 	ctx->baseaddr = baseaddr;
468 
469 	return (0);
470 }
471 
472 /*
473  * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
474  * the lowmem or highmem regions.
475  *
476  * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
477  * The instruction emulation code depends on this behavior.
478  */
479 void *
480 vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
481 {
482 	vm_size_t lowsize, highsize;
483 
484 	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
485 	if (lowsize > 0) {
486 		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
487 			return (ctx->baseaddr + gaddr);
488 	}
489 
490 	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
491 	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
492 		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
493 		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
494 			return (ctx->baseaddr + gaddr);
495 	}
496 
497 	return (NULL);
498 }
499 
500 vm_paddr_t
501 vm_rev_map_gpa(struct vmctx *ctx, void *addr)
502 {
503 	vm_paddr_t offaddr;
504 	vm_size_t lowsize, highsize;
505 
506 	offaddr = (char *)addr - ctx->baseaddr;
507 
508 	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
509 	if (lowsize > 0)
510 		if (offaddr <= lowsize)
511 			return (offaddr);
512 
513 	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
514 	if (highsize > 0)
515 		if (offaddr >= VM_HIGHMEM_BASE &&
516 		    offaddr < VM_HIGHMEM_BASE + highsize)
517 			return (offaddr);
518 
519 	return ((vm_paddr_t)-1);
520 }
521 
522 const char *
523 vm_get_name(struct vmctx *ctx)
524 {
525 
526 	return (ctx->name);
527 }
528 
529 size_t
530 vm_get_lowmem_size(struct vmctx *ctx)
531 {
532 
533 	return (ctx->memsegs[VM_MEMSEG_LOW].size);
534 }
535 
536 vm_paddr_t
537 vm_get_highmem_base(struct vmctx *ctx __unused)
538 {
539 
540 	return (VM_HIGHMEM_BASE);
541 }
542 
543 size_t
544 vm_get_highmem_size(struct vmctx *ctx)
545 {
546 
547 	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
548 }
549 
550 void *
551 vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
552 {
553 	char pathname[MAXPATHLEN];
554 	size_t len2;
555 	char *base, *ptr;
556 	int fd, error, flags;
557 
558 	fd = -1;
559 	ptr = MAP_FAILED;
560 	if (name == NULL || strlen(name) == 0) {
561 		errno = EINVAL;
562 		goto done;
563 	}
564 
565 	error = vm_alloc_memseg(ctx, segid, len, name);
566 	if (error)
567 		goto done;
568 
569 	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
570 	strlcat(pathname, ctx->name, sizeof(pathname));
571 	strlcat(pathname, ".", sizeof(pathname));
572 	strlcat(pathname, name, sizeof(pathname));
573 
574 	fd = open(pathname, O_RDWR);
575 	if (fd < 0)
576 		goto done;
577 
578 	/*
579 	 * Stake out a contiguous region covering the device memory and the
580 	 * adjoining guard regions.
581 	 */
582 	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
583 	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
584 	    0);
585 	if (base == MAP_FAILED)
586 		goto done;
587 
588 	flags = MAP_SHARED | MAP_FIXED;
589 	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
590 		flags |= MAP_NOCORE;
591 
592 	/* mmap the devmem region in the host address space */
593 	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
594 done:
595 	if (fd >= 0)
596 		close(fd);
597 	return (ptr);
598 }
599 
600 int
601 vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
602 {
603 	/*
604 	 * XXX: fragile, handle with care
605 	 * Assumes that the first field of the ioctl data
606 	 * is the vcpuid.
607 	 */
608 	*(int *)arg = vcpu->vcpuid;
609 	return (ioctl(vcpu->ctx->fd, cmd, arg));
610 }
611 
612 int
613 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
614 {
615 	int error;
616 	struct vm_register vmreg;
617 
618 	bzero(&vmreg, sizeof(vmreg));
619 	vmreg.regnum = reg;
620 	vmreg.regval = val;
621 
622 	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
623 	return (error);
624 }
625 
626 int
627 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
628 {
629 	int error;
630 	struct vm_register vmreg;
631 
632 	bzero(&vmreg, sizeof(vmreg));
633 	vmreg.regnum = reg;
634 
635 	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
636 	*ret_val = vmreg.regval;
637 	return (error);
638 }
639 
640 int
641 vm_set_register_set(struct vcpu *vcpu, unsigned int count,
642     const int *regnums, uint64_t *regvals)
643 {
644 	int error;
645 	struct vm_register_set vmregset;
646 
647 	bzero(&vmregset, sizeof(vmregset));
648 	vmregset.count = count;
649 	vmregset.regnums = regnums;
650 	vmregset.regvals = regvals;
651 
652 	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
653 	return (error);
654 }
655 
656 int
657 vm_get_register_set(struct vcpu *vcpu, unsigned int count,
658     const int *regnums, uint64_t *regvals)
659 {
660 	int error;
661 	struct vm_register_set vmregset;
662 
663 	bzero(&vmregset, sizeof(vmregset));
664 	vmregset.count = count;
665 	vmregset.regnums = regnums;
666 	vmregset.regvals = regvals;
667 
668 	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
669 	return (error);
670 }
671 
672 int
673 vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
674 {
675 	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
676 }
677 
678 int
679 vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
680 {
681 	struct vm_suspend vmsuspend;
682 
683 	bzero(&vmsuspend, sizeof(vmsuspend));
684 	vmsuspend.how = how;
685 	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
686 }
687 
688 int
689 vm_reinit(struct vmctx *ctx)
690 {
691 
692 	return (ioctl(ctx->fd, VM_REINIT, 0));
693 }
694 
695 int
696 vm_capability_name2type(const char *capname)
697 {
698 	int i;
699 
700 	for (i = 0; i < VM_CAP_MAX; i++) {
701 		if (vm_capstrmap[i] != NULL &&
702 		    strcmp(vm_capstrmap[i], capname) == 0)
703 			return (i);
704 	}
705 
706 	return (-1);
707 }
708 
709 const char *
710 vm_capability_type2name(int type)
711 {
712 	if (type >= 0 && type < VM_CAP_MAX)
713 		return (vm_capstrmap[type]);
714 
715 	return (NULL);
716 }
717 
718 int
719 vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
720 {
721 	int error;
722 	struct vm_capability vmcap;
723 
724 	bzero(&vmcap, sizeof(vmcap));
725 	vmcap.captype = cap;
726 
727 	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
728 	*retval = vmcap.capval;
729 	return (error);
730 }
731 
732 int
733 vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
734 {
735 	struct vm_capability vmcap;
736 
737 	bzero(&vmcap, sizeof(vmcap));
738 	vmcap.captype = cap;
739 	vmcap.capval = val;
740 
741 	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
742 }
743 
744 uint64_t *
745 vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
746 	     int *ret_entries)
747 {
748 	static _Thread_local uint64_t *stats_buf;
749 	static _Thread_local u_int stats_count;
750 	uint64_t *new_stats;
751 	struct vm_stats vmstats;
752 	u_int count, index;
753 	bool have_stats;
754 
755 	have_stats = false;
756 	count = 0;
757 	for (index = 0;; index += nitems(vmstats.statbuf)) {
758 		vmstats.index = index;
759 		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
760 			break;
761 		if (stats_count < index + vmstats.num_entries) {
762 			new_stats = realloc(stats_buf,
763 			    (index + vmstats.num_entries) * sizeof(uint64_t));
764 			if (new_stats == NULL) {
765 				errno = ENOMEM;
766 				return (NULL);
767 			}
768 			stats_count = index + vmstats.num_entries;
769 			stats_buf = new_stats;
770 		}
771 		memcpy(stats_buf + index, vmstats.statbuf,
772 		    vmstats.num_entries * sizeof(uint64_t));
773 		count += vmstats.num_entries;
774 		have_stats = true;
775 
776 		if (vmstats.num_entries != nitems(vmstats.statbuf))
777 			break;
778 	}
779 	if (have_stats) {
780 		if (ret_entries)
781 			*ret_entries = count;
782 		if (ret_tv)
783 			*ret_tv = vmstats.tv;
784 		return (stats_buf);
785 	} else
786 		return (NULL);
787 }
788 
789 const char *
790 vm_get_stat_desc(struct vmctx *ctx, int index)
791 {
792 	static struct vm_stat_desc statdesc;
793 
794 	statdesc.index = index;
795 	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
796 		return (statdesc.desc);
797 	else
798 		return (NULL);
799 }
800 
801 #ifdef __amd64__
802 int
803 vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
804 {
805 	int error, i;
806 	struct vm_gpa_pte gpapte;
807 
808 	bzero(&gpapte, sizeof(gpapte));
809 	gpapte.gpa = gpa;
810 
811 	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
812 
813 	if (error == 0) {
814 		*num = gpapte.ptenum;
815 		for (i = 0; i < gpapte.ptenum; i++)
816 			pte[i] = gpapte.pte[i];
817 	}
818 
819 	return (error);
820 }
821 
822 int
823 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
824     uint64_t gla, int prot, uint64_t *gpa, int *fault)
825 {
826 	struct vm_gla2gpa gg;
827 	int error;
828 
829 	bzero(&gg, sizeof(struct vm_gla2gpa));
830 	gg.prot = prot;
831 	gg.gla = gla;
832 	gg.paging = *paging;
833 
834 	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
835 	if (error == 0) {
836 		*fault = gg.fault;
837 		*gpa = gg.gpa;
838 	}
839 	return (error);
840 }
841 #endif
842 
843 int
844 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
845     uint64_t gla, int prot, uint64_t *gpa, int *fault)
846 {
847 	struct vm_gla2gpa gg;
848 	int error;
849 
850 	bzero(&gg, sizeof(struct vm_gla2gpa));
851 	gg.prot = prot;
852 	gg.gla = gla;
853 	gg.paging = *paging;
854 
855 	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
856 	if (error == 0) {
857 		*fault = gg.fault;
858 		*gpa = gg.gpa;
859 	}
860 	return (error);
861 }
862 
/*
 * Fallback min() for when no earlier header has defined one.  Each argument
 * may be evaluated twice, so avoid operands with side effects.
 */
#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif
866 
867 #ifdef __amd64__
868 int
869 vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
870     uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
871     int *fault)
872 {
873 	void *va;
874 	uint64_t gpa, off;
875 	int error, i, n;
876 
877 	for (i = 0; i < iovcnt; i++) {
878 		iov[i].iov_base = 0;
879 		iov[i].iov_len = 0;
880 	}
881 
882 	while (len) {
883 		assert(iovcnt > 0);
884 		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
885 		if (error || *fault)
886 			return (error);
887 
888 		off = gpa & PAGE_MASK;
889 		n = MIN(len, PAGE_SIZE - off);
890 
891 		va = vm_map_gpa(vcpu->ctx, gpa, n);
892 		if (va == NULL)
893 			return (EFAULT);
894 
895 		iov->iov_base = va;
896 		iov->iov_len = n;
897 		iov++;
898 		iovcnt--;
899 
900 		gla += n;
901 		len -= n;
902 	}
903 	return (0);
904 }
905 #endif
906 
/*
 * Counterpart of vm_copy_setup().  Both arguments are ignored and nothing
 * is done.
 */
void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty.  This is used by the instruction
	 * emulation code shared with the kernel.  The in-kernel
	 * version of this is non-empty.
	 */
}
916 
/*
 * Gather 'len' bytes from the buffers described by 'iov' into 'vp'.
 *
 * Entries are consumed in order, taking at most iov_len bytes from each.
 * An entry with a zero length trips an assertion.  The caller must supply
 * enough entries to cover 'len'; no entry count is checked here.
 */
void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	char *dst;
	size_t chunk;

	for (dst = vp; len != 0; iov++, dst += chunk, len -= chunk) {
		assert(iov->iov_len);
		chunk = min(len, iov->iov_len);
		memmove(dst, iov->iov_base, chunk);
	}
}
936 
/*
 * Scatter 'len' bytes from 'vp' into the buffers described by 'iov'.
 *
 * Entries are filled in order, placing at most iov_len bytes in each.  An
 * entry with a zero length trips an assertion.  The caller must supply
 * enough entries to cover 'len'; no entry count is checked here.
 */
void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	size_t chunk;

	for (src = vp; len != 0; iov++, src += chunk, len -= chunk) {
		assert(iov->iov_len);
		chunk = min(len, iov->iov_len);
		memmove(iov->iov_base, src, chunk);
	}
}
956 
957 static int
958 vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
959 {
960 	struct vm_cpuset vm_cpuset;
961 	int error;
962 
963 	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
964 	vm_cpuset.which = which;
965 	vm_cpuset.cpusetsize = sizeof(cpuset_t);
966 	vm_cpuset.cpus = cpus;
967 
968 	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
969 	return (error);
970 }
971 
972 int
973 vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
974 {
975 
976 	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
977 }
978 
979 int
980 vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
981 {
982 
983 	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
984 }
985 
986 int
987 vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
988 {
989 
990 	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
991 }
992 
993 int
994 vm_activate_cpu(struct vcpu *vcpu)
995 {
996 	struct vm_activate_cpu ac;
997 	int error;
998 
999 	bzero(&ac, sizeof(struct vm_activate_cpu));
1000 	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
1001 	return (error);
1002 }
1003 
1004 int
1005 vm_suspend_all_cpus(struct vmctx *ctx)
1006 {
1007 	struct vm_activate_cpu ac;
1008 	int error;
1009 
1010 	bzero(&ac, sizeof(struct vm_activate_cpu));
1011 	ac.vcpuid = -1;
1012 	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
1013 	return (error);
1014 }
1015 
1016 int
1017 vm_suspend_cpu(struct vcpu *vcpu)
1018 {
1019 	struct vm_activate_cpu ac;
1020 	int error;
1021 
1022 	bzero(&ac, sizeof(struct vm_activate_cpu));
1023 	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
1024 	return (error);
1025 }
1026 
1027 int
1028 vm_resume_cpu(struct vcpu *vcpu)
1029 {
1030 	struct vm_activate_cpu ac;
1031 	int error;
1032 
1033 	bzero(&ac, sizeof(struct vm_activate_cpu));
1034 	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
1035 	return (error);
1036 }
1037 
1038 int
1039 vm_resume_all_cpus(struct vmctx *ctx)
1040 {
1041 	struct vm_activate_cpu ac;
1042 	int error;
1043 
1044 	bzero(&ac, sizeof(struct vm_activate_cpu));
1045 	ac.vcpuid = -1;
1046 	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
1047 	return (error);
1048 }
1049 
1050 #ifdef __amd64__
1051 int
1052 vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
1053 {
1054 	struct vm_intinfo vmii;
1055 	int error;
1056 
1057 	bzero(&vmii, sizeof(struct vm_intinfo));
1058 	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
1059 	if (error == 0) {
1060 		*info1 = vmii.info1;
1061 		*info2 = vmii.info2;
1062 	}
1063 	return (error);
1064 }
1065 
1066 int
1067 vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
1068 {
1069 	struct vm_intinfo vmii;
1070 	int error;
1071 
1072 	bzero(&vmii, sizeof(struct vm_intinfo));
1073 	vmii.info1 = info1;
1074 	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
1075 	return (error);
1076 }
1077 #endif
1078 
1079 #ifdef WITH_VMMAPI_SNAPSHOT
1080 int
1081 vm_restart_instruction(struct vcpu *vcpu)
1082 {
1083 	int arg;
1084 
1085 	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
1086 }
1087 
1088 int
1089 vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
1090 {
1091 
1092 	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
1093 #ifdef SNAPSHOT_DEBUG
1094 		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
1095 		    __func__, meta->dev_name, errno);
1096 #endif
1097 		return (-1);
1098 	}
1099 	return (0);
1100 }
1101 
1102 int
1103 vm_restore_time(struct vmctx *ctx)
1104 {
1105 	int dummy;
1106 
1107 	dummy = 0;
1108 	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
1109 }
1110 #endif
1111 
1112 int
1113 vm_set_topology(struct vmctx *ctx,
1114     uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
1115 {
1116 	struct vm_cpu_topology topology;
1117 
1118 	bzero(&topology, sizeof (struct vm_cpu_topology));
1119 	topology.sockets = sockets;
1120 	topology.cores = cores;
1121 	topology.threads = threads;
1122 	topology.maxcpus = maxcpus;
1123 	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
1124 }
1125 
1126 int
1127 vm_get_topology(struct vmctx *ctx,
1128     uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
1129 {
1130 	struct vm_cpu_topology topology;
1131 	int error;
1132 
1133 	bzero(&topology, sizeof (struct vm_cpu_topology));
1134 	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
1135 	if (error == 0) {
1136 		*sockets = topology.sockets;
1137 		*cores = topology.cores;
1138 		*threads = topology.threads;
1139 		*maxcpus = topology.maxcpus;
1140 	}
1141 	return (error);
1142 }
1143 
1144 int
1145 vm_limit_rights(struct vmctx *ctx)
1146 {
1147 	cap_rights_t rights;
1148 
1149 	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
1150 	if (caph_rights_limit(ctx->fd, &rights) != 0)
1151 		return (-1);
1152 	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
1153 		return (-1);
1154 	return (0);
1155 }
1156 
1157 /*
1158  * Avoid using in new code.  Operations on the fd should be wrapped here so that
1159  * capability rights can be kept in sync.
1160  */
1161 int
1162 vm_get_device_fd(struct vmctx *ctx)
1163 {
1164 
1165 	return (ctx->fd);
1166 }
1167 
1168 /* Legacy interface, do not use. */
1169 const cap_ioctl_t *
1170 vm_get_ioctls(size_t *len)
1171 {
1172 	cap_ioctl_t *cmds;
1173 	size_t sz;
1174 
1175 	if (len == NULL) {
1176 		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
1177 		cmds = malloc(sz);
1178 		if (cmds == NULL)
1179 			return (NULL);
1180 		bcopy(vm_ioctl_cmds, cmds, sz);
1181 		return (cmds);
1182 	}
1183 
1184 	*len = vm_ioctl_ncmds;
1185 	return (NULL);
1186 }
1187