xref: /freebsd/lib/libvmmapi/vmmapi.c (revision ff50e9d53ff836bd6276c9f5a355e0ab03a99c61)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0)
		kldload("vmm");
	return (CREATE(name));
}

struct vmctx *
vm_open(const char *name)
{
	struct vmctx *vm;
	int saved_errno;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->fd = vm_device_open(vm->name)) < 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	free(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	close(vm->fd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}
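
/*
 * Example (not part of the library): a minimal sketch of the guest
 * lifecycle, assuming a hypothetical guest named "testvm" and err(3)
 * for error reporting.
 *
 *	struct vmctx *ctx;
 *
 *	if (vm_create("testvm") != 0)
 *		err(1, "vm_create");
 *	if ((ctx = vm_open("testvm")) == NULL)
 *		err(1, "vm_open");
 *	... configure and run the guest ...
 *	vm_destroy(ctx);	// closes the device fd and frees the context
 */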

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	assert(vcpu != NULL);
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility, if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}
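
/*
 * Example (illustrative only): vm_parse_memsize("256", &len) yields
 * 256 MB via the compatibility rule above, "268435456" parses directly
 * as 256 MB worth of bytes, and suffixed forms such as "2G" fall
 * through to expand_number(3).
 */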

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range
 * [gpa, gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
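
/*
 * Example (not part of the library): a minimal memory-setup sketch for
 * a hypothetical 8 GB guest; on amd64 the first 3 GB land in the lowmem
 * segment and the remainder is mapped above VM_HIGHMEM_BASE.
 *
 *	size_t memsize;
 *
 *	if (vm_parse_memsize("8G", &memsize) != 0)
 *		errx(1, "invalid memory size");
 *	if (vm_setup_memory(ctx, memsize, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 */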

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular, returns NULL if [gaddr, gaddr+len) falls in the guest MMIO
 * region. The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}
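
/*
 * Example (illustrative only): translate a guest physical address to a
 * host pointer before touching guest RAM directly; the address 0x1000
 * here is arbitrary.
 *
 *	char *p;
 *
 *	if ((p = vm_map_gpa(ctx, 0x1000, PAGE_SIZE)) == NULL)
 *		errx(1, "gpa not backed by guest RAM");
 */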

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
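
/*
 * Example (not part of the library): back a hypothetical 16 MB boot ROM
 * with a named device-memory segment, assuming the VM_BOOTROM segment
 * id from <machine/vmm.h>.
 *
 *	void *rom;
 *
 *	if ((rom = vm_create_devmem(ctx, VM_BOOTROM, "bootrom",
 *	    16 * MB)) == MAP_FAILED)
 *		err(1, "vm_create_devmem");
 */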

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}
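
/*
 * For example, struct vm_register in <machine/vmm_dev.h> carries the
 * vcpu id as its first int field, so vcpu_ioctl() can stamp it into the
 * request before issuing VM_SET_REGISTER or VM_GET_REGISTER below.
 */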

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
	     int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}
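
/*
 * Example (illustrative only): print every statistic for a vcpu,
 * pairing each counter with its kernel-provided description.
 *
 *	struct timeval tv;
 *	uint64_t *stats;
 *	int i, num;
 *
 *	if ((stats = vm_get_stats(vcpu, &tv, &num)) != NULL)
 *		for (i = 0; i < num; i++) {
 *			const char *desc = vm_get_stat_desc(ctx, i);
 *			printf("%s: %ju\n", desc != NULL ? desc : "unknown",
 *			    (uintmax_t)stats[i]);
 *		}
 */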

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
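
/*
 * Note: a zero return only means the ioctl succeeded; when *fault is
 * set the translation failed and *gpa is not valid, so callers must
 * check both.  The _nofault variant below performs the same walk
 * without arming a fault for injection into the guest.
 */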

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
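
/*
 * Example (illustrative only): safely read a guest structure that may
 * straddle a page boundary, assuming 'paging' was filled in from the
 * vcpu state and 'gla' points at a hypothetical struct foo.
 *
 *	struct iovec iov[2];
 *	struct foo f;
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, sizeof(f), PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, &f, sizeof(f));
 */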

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty.  This is used by the instruction
	 * emulation code shared with the kernel.  The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}

int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof(struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof(struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}
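
/*
 * Example (not part of the library): a minimal capsicum sketch; limit
 * the vmm descriptor first, then enter capability mode once all other
 * descriptors have been acquired.
 *
 *	if (vm_limit_rights(ctx) != 0)
 *		err(1, "vm_limit_rights");
 *	if (caph_enter() != 0)
 *		err(1, "caph_enter");
 */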

/*
 * Avoid using this in new code.  Operations on the fd should instead be
 * wrapped by this library so that capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}