/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)
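/*
 * Illustrative summary of the guest-physical layout implied by the
 * constants above (amd64 values shown; on other architectures
 * VM_LOWMEM_LIMIT is 0 and there is no lowmem region):
 *
 *	[0, VM_LOWMEM_LIMIT)			"lowmem", below 3GB
 *	[VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE)	left unmapped for guest MMIO
 *	[VM_HIGHMEM_BASE, ...)			"highmem", above 4GB
 */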
#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_open(void)
{
	if (modfind("vmm") < 0)
		(void)kldload("vmm");
	return (open("/dev/vmmctl", O_RDWR, 0));
}

static int
vm_ctl_create(const char *name, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	fd = vm_ctl_open();
	if (fd < 0)
		return (-1);

	error = vm_ctl_create(name, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = vm_ctl_open()) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}
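/*
 * Illustrative lifecycle sketch (not part of the library); error handling
 * is omitted and the VM name "testvm" is hypothetical:
 *
 *	struct vmctx *ctx = vm_openf("testvm", VMMAPI_OPEN_CREATE);
 *	struct vcpu *vcpu = vm_vcpu_open(ctx, 0);
 *	...
 *	vm_vcpu_close(vcpu);
 *	vm_close(ctx);		// or vm_destroy(ctx) to also remove the VM
 */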
int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->lowmem_size;
	*highmem_size = ctx->highmem_size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}
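/*
 * Illustrative sketch (not part of the library): walking the existing
 * guest mappings with vm_mmap_getnext() above, assuming each iteration
 * resumes at the address just past the mapping returned previously.
 *
 *	vm_paddr_t gpa = 0;
 *	vm_ooffset_t segoff;
 *	size_t len;
 *	int segid, prot, flags;
 *
 *	while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &len, &prot,
 *	    &flags) == 0) {
 *		printf("segid %d gpa %#jx len %#zx\n", segid,
 *		    (uintmax_t)gpa, len);
 *		gpa += len;
 *	}
 */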
static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name,
    int ds_policy, domainset_t *ds_mask, size_t ds_size)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (ds_mask == NULL) {
		memseg.ds_policy = DOMAINSET_POLICY_INVALID;
	} else {
		memseg.ds_policy = ds_policy;
		memseg.ds_mask = ds_mask;
		memseg.ds_mask_size = ds_size;
	}
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len,
    size_t segoff, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}
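/*
 * Note: vm_alloc_memseg() only creates a segment object in the kernel;
 * pairing it with vm_mmap_memseg() (as map_memory_segment() above does)
 * is what makes the memory appear in the guest physical address space.
 */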
/*
 * Allocates and maps virtual machine memory segments according
 * to the NUMA topology specified by the 'doms' array.
 *
 * The domains are laid out sequentially in the guest's physical address space.
 * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and
 * left unmapped.
 */
int
vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms,
    struct vm_mem_domain *doms, int ndoms)
{
	size_t low_len, len, totalsize;
	struct vm_mem_domain *dom;
	struct vm_memseg memseg;
	char *baseaddr, *ptr;
	int error, i, segid;
	vm_paddr_t gpa;

	/* Sanity checks. */
	assert(vms == VM_MMAP_ALL);
	if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) {
		errno = EINVAL;
		return (-1);
	}

	/* Calculate total memory size. */
	totalsize = 0;
	for (i = 0; i < ndoms; i++)
		totalsize += doms[i].size;

	if (totalsize > VM_LOWMEM_LIMIT)
		totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);
	baseaddr = ptr + VM_MMAP_GUARD_SIZE;

	/*
	 * Allocate and map memory segments for the virtual machine.
	 */
	gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE;
	ctx->lowmem_size = 0;
	ctx->highmem_size = 0;
	for (i = 0; i < ndoms; i++) {
		segid = VM_SYSMEM + i;
		dom = &doms[i];

		/*
		 * Check if the memory segment already exists.
		 * If 'ndoms' is greater than one, refuse to proceed if the
		 * memseg already exists. If only one domain was requested, use
		 * the existing segment to preserve the behaviour of the previous
		 * implementation.
		 *
		 * Splitting existing memory segments is tedious and
		 * error-prone, which is why we don't support NUMA
		 * domains for bhyveload(8)-loaded VMs.
		 */
		error = vm_get_memseg(ctx, segid, &len, memseg.name,
		    sizeof(memseg.name));
		if (error == 0 && len != 0) {
			if (ndoms != 1) {
				errno = EEXIST;
				return (-1);
			} else
				doms[0].size = len;
		} else {
			error = vm_alloc_memseg(ctx, segid, dom->size, NULL,
			    dom->ds_policy, dom->ds_mask, dom->ds_size);
			if (error)
				return (error);
		}

		/*
		 * If a domain is split by VM_LOWMEM_LIMIT then break
		 * its segment mapping into two parts, one below VM_LOWMEM_LIMIT
		 * and one above VM_HIGHMEM_BASE.
		 */
		if (gpa <= VM_LOWMEM_LIMIT &&
		    gpa + dom->size > VM_LOWMEM_LIMIT) {
			low_len = VM_LOWMEM_LIMIT - gpa;
			error = map_memory_segment(ctx, segid, gpa, low_len, 0,
			    baseaddr);
			if (error)
				return (error);
			ctx->lowmem_size = VM_LOWMEM_LIMIT;
			/* Map the remainder. */
			gpa = VM_HIGHMEM_BASE;
			len = dom->size - low_len;
			error = map_memory_segment(ctx, segid, gpa, len,
			    low_len, baseaddr);
			if (error)
				return (error);
		} else {
			len = dom->size;
			error = map_memory_segment(ctx, segid, gpa, len, 0,
			    baseaddr);
			if (error)
				return (error);
		}
		if (gpa <= VM_LOWMEM_LIMIT)
			ctx->lowmem_size += len;
		else
			ctx->highmem_size += len;
		gpa += len;
	}
	ctx->baseaddr = baseaddr;

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	struct vm_mem_domain dom0;

	memset(&dom0, 0, sizeof(dom0));
	dom0.ds_policy = DOMAINSET_POLICY_INVALID;
	dom0.size = memsize;

	return (vm_setup_memory_domains(ctx, vms, &dom0, 1));
}
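/*
 * Illustrative sketch (not part of the library): give a VM 2GB of guest
 * memory and obtain a host pointer to a guest-physical address. The sizes
 * and addresses are arbitrary and error handling is omitted.
 *
 *	vm_set_memflags(ctx, VM_MEM_F_INCORE);		// optional
 *	vm_setup_memory(ctx, 2 * GB, VM_MMAP_ALL);
 *	void *hva = vm_map_gpa(ctx, 0x100000, 4096);
 *	memset(hva, 0, 4096);
 */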
/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->lowmem_size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->highmem_size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->lowmem_size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->highmem_size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{
	return (ctx->lowmem_size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{
	return (ctx->highmem_size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
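/*
 * Illustrative sketch (not part of the library): create a named device
 * memory segment and map it into the guest, in the style of bhyve's
 * bootrom handling. The segment id, name, guest address and length are
 * illustrative only; error handling is omitted.
 *
 *	char *romptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", romlen);
 *	if (romptr != MAP_FAILED)
 *		(void)vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, romlen,
 *		    PROT_READ | PROT_EXEC);
 */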
int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}
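/*
 * Illustrative sketch (not part of the library): enable a capability given
 * by name, for example from a command-line option. The name shown is
 * hypothetical; valid names are those listed in vm_capstrmap[].
 *
 *	int captype = vm_capability_name2type("hlt_exit");
 *	if (captype >= 0)
 *		(void)vm_set_capability(vcpu, captype, 1);
 */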
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif
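/*
 * Illustrative sketch (not part of the library): read 'len' bytes from a
 * guest linear address into a host buffer on amd64 using vm_copy_setup()
 * above and vm_copyin() below, assuming 'paging' describes the vcpu's
 * current paging mode as obtained during exit handling. Error handling is
 * omitted.
 *
 *	struct iovec iov[8];
 *	int fault;
 *
 *	if (vm_copy_setup(vcpu, &paging, gla, len, PROT_READ, iov,
 *	    nitems(iov), &fault) == 0 && !fault)
 *		vm_copyin(iov, buf, len);
 */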
void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif
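/*
 * Illustrative sketch (not part of the library): freeze all vcpus, check
 * which ones have been activated, then resume them. Error handling is
 * omitted.
 *
 *	cpuset_t active;
 *
 *	(void)vm_suspend_all_cpus(ctx);
 *	if (vm_active_cpus(ctx, &active) == 0 && CPU_ISSET(0, &active))
 *		... vcpu 0 has been activated ...
 *	(void)vm_resume_all_cpus(ctx);
 */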
#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}
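/*
 * Illustrative sketch (not part of the library): a process that has
 * finished setup can restrict the VM descriptor with vm_limit_rights()
 * and then enter capability mode via caph_enter() from
 * capsicum_helpers(3). Error handling is omitted.
 *
 *	if (vm_limit_rights(ctx) == 0 && caph_enter() == 0)
 *		... run the vcpu loop with vm_run(vcpu, &vmrun) ...
 */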