/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_open(void)
{
	if (modfind("vmm") < 0)
		(void)kldload("vmm");
	return (open("/dev/vmmctl", O_RDWR, 0));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	fd = vm_ctl_open();
	if (fd < 0)
		return (-1);

	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = vm_ctl_open()) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}
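
/*
 * Illustrative usage sketch (not part of the library; the VM name
 * "testvm" is hypothetical and error handling is abbreviated):
 *
 *	struct vmctx *ctx;
 *
 *	ctx = vm_openf("testvm", VMMAPI_OPEN_CREATE);
 *	if (ctx == NULL)
 *		err(1, "vm_openf");
 *	...
 *	vm_close(ctx);	(or vm_destroy(ctx) to remove the VM entirely)
 */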

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility, if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}
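
/*
 * Examples of the rule above (a sketch; values follow from the code):
 *
 *	size_t sz;
 *
 *	vm_parse_memsize("512", &sz);	sz == 512 * MB (< 1MB, so MB units)
 *	vm_parse_memsize("2g", &sz);	sz == 2 * GB (expand_number suffix)
 */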

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
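
/*
 * Typical call sequence (a sketch; 'memsize' would come from
 * vm_parse_memsize() above):
 *
 *	if (vm_setup_memory(ctx, memsize, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 */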

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular, return NULL if [gaddr, gaddr+len) falls within the guest
 * MMIO region. The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}
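
/*
 * For example, with 2GB of guest memory all of it lands in the lowmem
 * segment, so vm_map_gpa(ctx, 0x1000, PAGE_SIZE) returns
 * ctx->baseaddr + 0x1000, while any gaddr in [2GB, 4GB) falls outside
 * guest RAM and returns NULL.
 */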

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
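
/*
 * Sketch of how a device model might use this (the segment id, name,
 * size, and guest address below are hypothetical):
 *
 *	void *base;
 *
 *	base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", fb_size);
 *	if (base == MAP_FAILED)
 *		err(1, "vm_create_devmem");
 *	if (vm_mmap_memseg(ctx, fb_gpa, VM_FRAMEBUFFER, 0, fb_size,
 *	    PROT_READ | PROT_WRITE) != 0)
 *		err(1, "vm_mmap_memseg");
 */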

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care.
	 * Assumes that the first field of the ioctl data is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}
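
/*
 * For example, struct vm_register below is laid out with the vcpuid as
 * its leading int, which is why the cast above works for VM_SET_REGISTER
 * and friends; the same convention is assumed to hold for every structure
 * passed through vcpu_ioctl().
 */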

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}
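
/*
 * Usage sketch (the vcpu comes from vm_vcpu_open(); the register name is
 * amd64-specific):
 *
 *	uint64_t rip;
 *
 *	if (vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip) != 0)
 *		err(1, "vm_get_register");
 *	if (vm_set_register(vcpu, VM_REG_GUEST_RIP, rip + 2) != 0)
 *		err(1, "vm_set_register");
 */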

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}
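
/*
 * Sketch of dumping every statistic for a vcpu, pairing this with
 * vm_get_stat_desc() below:
 *
 *	uint64_t *stats;
 *	struct timeval tv;
 *	int i, num;
 *
 *	stats = vm_get_stats(vcpu, &tv, &num);
 *	if (stats != NULL)
 *		for (i = 0; i < num; i++)
 *			printf("%s: %ju\n", vm_get_stat_desc(ctx, i),
 *			    (uintmax_t)stats[i]);
 */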

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif
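
/*
 * Sketch of reading guest memory through a guest linear address (amd64;
 * 'paging' must describe the vcpu's current paging mode and 'gla' is a
 * hypothetical guest linear address):
 *
 *	struct iovec iov[4];
 *	char buf[128];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, sizeof(buf), PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, buf, sizeof(buf));
 */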

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}
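
/*
 * A minimal sketch of sandboxing with Capsicum: limit the descriptor's
 * rights, then enter capability mode (caph_enter() is from
 * capsicum_helpers.h):
 *
 *	if (vm_limit_rights(ctx) != 0)
 *		err(1, "vm_limit_rights");
 *	if (caph_enter() != 0)
 *		err(1, "caph_enter");
 */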

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}