/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_open(void)
{
	if (modfind("vmm") < 0)
		(void)kldload("vmm");
	return (open("/dev/vmmctl", O_RDWR, 0));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	fd = vm_ctl_open();
	if (fd < 0)
		return (-1);

	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = vm_ctl_open()) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}
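
/*
 * Example (informal, following the parsing rules above): each of these
 * requests 512MB of guest memory.  A bare value smaller than one megabyte
 * is scaled to MB for backward compatibility; suffixed values are handled
 * by expand_number(3).
 *
 *	size_t memsize;
 *
 *	vm_parse_memsize("512", &memsize);		(scaled to 512MB)
 *	vm_parse_memsize("512M", &memsize);		(via expand_number)
 *	vm_parse_memsize("536870912", &memsize);	(exact byte count)
 */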

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
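
/*
 * Sketch of the resulting layout (amd64 values): a guest configured with more
 * than VM_LOWMEM_LIMIT of memory gets a "lowmem" segment at [0, 3GB) and a
 * "highmem" segment starting at VM_HIGHMEM_BASE (4GB), leaving the [3GB, 4GB)
 * hole for device MMIO.  The host-side reservation is a single contiguous
 * range:
 *
 *	guard | lowmem | PROT_NONE hole | highmem | guard
 *	      ^
 *	      ctx->baseaddr
 *
 * where each guard region is VM_MMAP_GUARD_SIZE bytes and anything not
 * overlaid by a MAP_FIXED mapping stays inaccessible thanks to MAP_GUARD.
 */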

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}
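
/*
 * Example (sketch): device models typically translate a guest physical
 * address once and treat a NULL return as a buffer that is not backed by
 * guest RAM (unmapped or MMIO):
 *
 *	void *buf;
 *
 *	buf = vm_map_gpa(ctx, gpa, size);
 *	if (buf == NULL)
 *		return (EFAULT);
 */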

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}
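
/*
 * Example (sketch): callers such as bhyve enable optional exit behaviour via
 * vm_set_capability(); VM_CAP_HALT_EXIT below is one of the vm_cap_type
 * values declared in <machine/vmm.h>:
 *
 *	if (vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1) != 0)
 *		warnx("HLT exiting not supported");
 */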

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif
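
/*
 * Example (sketch, amd64): the instruction emulation path pairs
 * vm_copy_setup() with vm_copyin()/vm_copyout() below to access a guest
 * virtual buffer that may straddle a page boundary.  'paging', 'gla', 'len'
 * and 'buf' are supplied by the caller:
 *
 *	struct iovec iov[2];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, len, PROT_READ, iov,
 *	    nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, buf, len);
 */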

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}
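
/*
 * A vcpuid of -1 in the ioctl argument is how the *_all_cpus() helpers here
 * request the operation for every vCPU.  For example (sketch), a debugger
 * front end can freeze and thaw the whole guest around an inspection:
 *
 *	vm_suspend_all_cpus(ctx);
 *	... read state with vm_get_register() ...
 *	vm_resume_all_cpus(ctx);
 */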

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}