/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0) {
		error = kldload("vmm");
		if (error != 0)
			return (-1);
	}

	fd = open("/dev/vmmctl", O_RDWR, 0);
	if (fd < 0)
		return (fd);
	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = open("/dev/vmmctl", O_RDWR, 0)) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}
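/*
 * Illustrative lifecycle sketch (not part of the original source; error
 * handling and the setup of 'vmrun' are omitted, and "testvm" is just a
 * placeholder name): a typical consumer creates or opens the VM, sizes
 * guest memory, opens vCPUs and then drives them with vm_run().
 *
 *	struct vmctx *ctx;
 *	struct vcpu *vcpu;
 *	struct vm_run vmrun;
 *
 *	ctx = vm_openf("testvm", VMMAPI_OPEN_CREATE);
 *	(void)vm_setup_memory(ctx, 4 * GB, VM_MMAP_ALL);
 *	vcpu = vm_vcpu_open(ctx, 0);
 *	...
 *	(void)vm_run(vcpu, &vmrun);
 *	...
 *	vm_vcpu_close(vcpu);
 *	vm_close(ctx);
 */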
int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range
 * [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}
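/*
 * Illustrative sketch (not from the original source): vm_mmap_getnext()
 * reports the memory map entry at the lowest guest physical address that is
 * greater than or equal to *gpa, so a caller can walk all mappings by
 * advancing the probe address past each returned range:
 *
 *	vm_paddr_t gpa = 0;
 *
 *	while (vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot,
 *	    &flags) == 0) {
 *		... inspect the mapping at [gpa, gpa + len) ...
 *		gpa += len;
 *	}
 */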
/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}
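/*
 * Worked example (illustrative, amd64 values): with VM_LOWMEM_LIMIT of 3 GB
 * and VM_HIGHMEM_BASE of 4 GB, a request for an 8 GB guest is split by
 * vm_setup_memory() into a 3 GB "lowmem" segment at guest physical address 0
 * and a 5 GB "highmem" segment starting at 4 GB; the [3 GB, 4 GB) hole is
 * left unmapped for device MMIO.  The backing system memory segment is sized
 * VM_HIGHMEM_BASE + highmem (9 GB here) so that segment offsets equal guest
 * physical addresses.
 */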
int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}
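/*
 * Illustrative example (not from the original source): because guest
 * physical addresses are mapped at a fixed offset from ctx->baseaddr,
 * vm_map_gpa(ctx, 0x1000, PAGE_SIZE) simply yields ctx->baseaddr + 0x1000
 * when the range lies entirely inside lowmem or highmem, and
 * vm_rev_map_gpa() performs the inverse translation.  A request that
 * overlaps the MMIO hole between lowmem and VM_HIGHMEM_BASE returns NULL.
 */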
const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}
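/*
 * Usage sketch (illustrative, not part of the original source): register
 * numbers come from the per-platform vm_reg_name enumeration in
 * <machine/vmm.h>; on amd64, for example, the guest instruction pointer can
 * be read with:
 *
 *	uint64_t rip;
 *
 *	if (vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip) == 0)
 *		printf("vcpu %d rip %#lx\n", vcpu_id(vcpu), rip);
 */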
int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}
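/*
 * Note for callers (not in the original source): the buffer returned by
 * vm_get_stats() is thread-local storage owned by the library and is reused
 * or reallocated on the next call from the same thread, so it must not be
 * freed.  A typical consumer pairs it with vm_get_stat_desc() by index:
 *
 *	int i, nstats;
 *	uint64_t *stats = vm_get_stats(vcpu, NULL, &nstats);
 *
 *	for (i = 0; stats != NULL && i < nstats; i++)
 *		printf("%s: %ju\n", vm_get_stat_desc(ctx, i),
 *		    (uintmax_t)stats[i]);
 */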
#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}
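/*
 * Illustrative flow (amd64, not part of the original source): instruction
 * emulation first translates a guest linear address into host-mapped pages
 * with vm_copy_setup() and then moves bytes through the resulting iovec.
 * For example, to read a small 'len'-byte operand at 'gla' into 'buf':
 *
 *	struct iovec iov[2];
 *	int fault;
 *
 *	if (vm_copy_setup(vcpu, &paging, gla, len, PROT_READ, iov,
 *	    nitems(iov), &fault) == 0 && !fault)
 *		vm_copyin(iov, buf, len);
 */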
static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}
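/*
 * Sandboxing note (illustrative, beyond what the code above states): a
 * consumer that has finished opening its descriptors can call
 * vm_limit_rights(ctx) and then enter capability mode, e.g. with
 * caph_enter() from capsicum_helpers(3); after that, only the ioctls in
 * vm_ioctl_cmds and read/write mmap(2) operations remain possible on the
 * VM descriptor.
 */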
/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}