/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory.  This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0) {
		error = kldload("vmm");
		if (error != 0)
			return (-1);
	}

	fd = open("/dev/vmmctl", O_RDWR, 0);
	if (fd < 0)
		return (fd);
	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = open("/dev/vmmctl", O_RDWR, 0)) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}
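
/*
 * Example usage (a sketch, not part of the library; the guest name
 * "testvm" is arbitrary): open a VM by name, creating it on first use.
 *
 *	struct vmctx *ctx;
 *
 *	ctx = vm_openf("testvm", VMMAPI_OPEN_CREATE);
 *	if (ctx == NULL)
 *		err(1, "vm_openf");
 *	...
 *	vm_close(ctx);
 */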

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	if (vcpu == NULL)
		return (NULL);
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}
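
/*
 * Example (illustrative only): with the backward-compatibility rule above,
 * a plain number smaller than a megabyte is scaled up, while suffixed
 * values are handled by expand_number(3).
 *
 *	size_t sz;
 *
 *	vm_parse_memsize("256", &sz);	(sz == 256 * MB)
 *	vm_parse_memsize("2g", &sz);	(sz == 2 * GB)
 */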

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again.  This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
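
/*
 * Example (a sketch; the 6 GB size is arbitrary): on amd64, where
 * VM_LOWMEM_LIMIT is 3 GB, a 6 GB guest gets 3 GB mapped at guest
 * physical address 0 and the remaining 3 GB at VM_HIGHMEM_BASE (4 GB),
 * leaving the [3 GB, 4 GB) hole for MMIO.
 *
 *	if (vm_setup_memory(ctx, 6 * GB, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 */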

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls within the guest
 * MMIO region.  The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}
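
/*
 * Example (a sketch; the 0x1000 address is arbitrary): translate a guest
 * physical address before touching guest memory from the host.
 *
 *	uint32_t *p;
 *
 *	p = vm_map_gpa(ctx, 0x1000, sizeof(*p));
 *	if (p == NULL)
 *		errx(1, "gpa not backed by guest RAM");
 *	*p = 0xdeadbeef;
 */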

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}
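
/*
 * Illustration of the contract above (a hypothetical structure, not one
 * defined by this library): any per-vcpu ioctl argument passed through
 * vcpu_ioctl() must begin with an int vcpuid field, which vcpu_ioctl()
 * overwrites before issuing the ioctl.
 *
 *	struct my_vcpu_arg {
 *		int		vcpuid;		(must be the first field)
 *		uint64_t	payload;
 *	};
 */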

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}
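
/*
 * Example (a sketch): dump every statistic for a vcpu, pairing each value
 * with its description via vm_get_stat_desc() below.
 *
 *	uint64_t *stats;
 *	int i, num;
 *
 *	stats = vm_get_stats(vcpu, NULL, &num);
 *	if (stats != NULL)
 *		for (i = 0; i < num; i++)
 *			printf("%s: %ju\n", vm_get_stat_desc(ctx, i),
 *			    (uintmax_t)stats[i]);
 */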

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty.  This is used by the instruction
	 * emulation code shared with the kernel.  The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}
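
/*
 * Example (a sketch, amd64 only): read 'size' bytes from guest linear
 * address 'gla' into 'buf'.  'paging' is assumed to have been filled in
 * by the caller to describe the vcpu's current paging mode.
 *
 *	struct iovec iov[8];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, size, PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, buf, size);
 */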

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof(struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof(struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}
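
/*
 * Example (a sketch): limit the VM device descriptor and then enter
 * capability mode; caph_enter() is from capsicum_helpers(3).
 *
 *	if (vm_limit_rights(ctx) != 0)
 *		err(1, "vm_limit_rights");
 *	if (caph_enter() != 0)
 *		err(1, "cap_enter");
 */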

/*
 * Avoid using in new code.  Operations on the fd should be wrapped here so
 * that capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}