/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_open(void)
{
	if (modfind("vmm") < 0)
		(void)kldload("vmm");
	return (open("/dev/vmmctl", O_RDWR, 0));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	fd = vm_ctl_open();
	if (fd < 0)
		return (-1);

	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = vm_ctl_open()) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}

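/*
 * Illustrative sketch only, not part of the library: a minimal caller that
 * opens a VM, creating it if it does not exist yet and reinitializing it if
 * it does.  The VM name "example-vm" and the helper name are placeholders
 * invented for this example.
 */
#if 0
static struct vmctx *
example_open_vm(void)
{
	struct vmctx *ctx;

	ctx = vm_openf("example-vm", VMMAPI_OPEN_CREATE | VMMAPI_OPEN_REINIT);
	if (ctx == NULL)
		err(1, "vm_openf");
	return (ctx);
}
#endif
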
void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

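/*
 * Illustrative sketch only, not part of the library: obtaining and releasing
 * a handle for vCPU 0.  As the code above shows, vm_vcpu_open() merely
 * records the context pointer and vcpu id; it does not touch the kernel.
 */
#if 0
static void
example_vcpu_handle(struct vmctx *ctx)
{
	struct vcpu *vcpu;

	vcpu = vm_vcpu_open(ctx, 0);
	assert(vcpu_id(vcpu) == 0);
	vm_vcpu_close(vcpu);
}
#endif
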
int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility, if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

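/*
 * Illustrative sketch only, not part of the library: per the backward
 * compatibility rule above, a bare "512" parses as 512 MB, while suffixed
 * values such as "2G" are handled by expand_number(3).
 */
#if 0
static void
example_parse_memsize(void)
{
	size_t memsize;

	if (vm_parse_memsize("512", &memsize) == 0)
		assert(memsize == 512 * MB);
	if (vm_parse_memsize("2G", &memsize) == 0)
		assert(memsize == 2 * GB);
}
#endif
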
uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}

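/*
 * Illustrative sketch only, not part of the library: giving the guest 4 GB
 * of memory.  On amd64, where VM_LOWMEM_LIMIT is 3 GB, this ends up as a
 * 3 GB low segment plus a 1 GB segment starting at VM_HIGHMEM_BASE.
 */
#if 0
static void
example_setup_memory(struct vmctx *ctx)
{

	if (vm_setup_memory(ctx, 4 * GB, VM_MMAP_ALL) != 0)
		err(1, "vm_setup_memory");
}
#endif
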
/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular, return NULL if [gaddr, gaddr+len) falls in the guest MMIO
 * region.  The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

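/*
 * Illustrative sketch only, not part of the library: translating a guest
 * physical address to a host pointer and back.  The address 0x1000 is an
 * arbitrary example that is assumed to lie inside the low memory segment.
 */
#if 0
static void
example_translate(struct vmctx *ctx)
{
	void *va;

	va = vm_map_gpa(ctx, 0x1000, PAGE_SIZE);
	if (va == NULL)
		errx(1, "GPA 0x1000 is not backed by guest memory");
	assert(vm_rev_map_gpa(ctx, va) == 0x1000);
}
#endif
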
const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

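/*
 * Illustrative sketch only, not part of the library: reading and writing a
 * single guest register.  VM_REG_GUEST_RAX is the amd64 name; other
 * architectures use their own VM_REG_* constants from <machine/vmm.h>.
 */
#if 0
static void
example_registers(struct vcpu *vcpu)
{
	uint64_t rax;

	if (vm_get_register(vcpu, VM_REG_GUEST_RAX, &rax) != 0)
		err(1, "vm_get_register");
	if (vm_set_register(vcpu, VM_REG_GUEST_RAX, rax + 1) != 0)
		err(1, "vm_set_register");
}
#endif
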
int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

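/*
 * Illustrative sketch only, not part of the library: looking a capability up
 * by name and enabling it on a vCPU.  The name "hlt_exit" is just an example
 * here; the valid names come from the per-architecture vm_capstrmap[] table.
 */
#if 0
static void
example_capability(struct vcpu *vcpu)
{
	int cap, val;

	cap = vm_capability_name2type("hlt_exit");
	if (cap < 0)
		errx(1, "unknown capability");
	if (vm_get_capability(vcpu, cap, &val) == 0 && val == 0)
		(void)vm_set_capability(vcpu, cap, 1);
}
#endif
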
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
	     int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

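/*
 * Illustrative sketch only, not part of the library: printing every
 * statistic reported for a vCPU alongside its description.
 */
#if 0
static void
example_dump_stats(struct vcpu *vcpu)
{
	struct timeval tv;
	const char *desc;
	uint64_t *stats;
	int i, nentries;

	stats = vm_get_stats(vcpu, &tv, &nentries);
	if (stats == NULL)
		return;
	for (i = 0; i < nentries; i++) {
		desc = vm_get_stat_desc(vcpu->ctx, i);
		printf("%-48s %ju\n", desc != NULL ? desc : "<unknown>",
		    (uintmax_t)stats[i]);
	}
}
#endif
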
#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty.  This is used by the instruction
	 * emulation code shared with the kernel.  The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

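/*
 * Illustrative sketch only, not part of the library (and amd64-only, since
 * vm_copy_setup() is): reading a guest buffer identified by a guest linear
 * address through the iovec interface above.  The helper name and the
 * 8-entry iovec array size are choices made for this example.
 */
#if 0
static int
example_read_guest(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct iovec iov[8];
	int error, fault;

	error = vm_copy_setup(vcpu, paging, gla, len, PROT_READ, iov,
	    nitems(iov), &fault);
	if (error != 0 || fault != 0)
		return (error != 0 ? error : EFAULT);
	vm_copyin(iov, buf, len);
	vm_copy_teardown(iov, nitems(iov));
	return (0);
}
#endif
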
static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

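/*
 * Illustrative sketch only, not part of the library: one way a caller might
 * combine the values returned by vm_get_topology() to derive the number of
 * vCPUs the configured topology implies.
 */
#if 0
static int
example_vcpu_count(struct vmctx *ctx)
{
	uint16_t sockets, cores, threads, maxcpus;

	if (vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus) != 0)
		err(1, "vm_get_topology");
	return (sockets * cores * threads);
}
#endif
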
int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code.  Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}