1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * Copyright (c) 2026 Hans Rosenfeld 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/param.h> 31 #include <sys/capsicum.h> 32 #include <sys/cpuset.h> 33 #include <sys/domainset.h> 34 #include <sys/sysctl.h> 35 #include <sys/ioctl.h> 36 #include <sys/mman.h> 37 #include <sys/linker.h> 38 #include <sys/module.h> 39 #include <sys/_iovec.h> 40 41 #include <capsicum_helpers.h> 42 #include <err.h> 43 #include <errno.h> 44 #include <stdbool.h> 45 #include <stdio.h> 46 #include <stdlib.h> 47 #include <assert.h> 48 #include <string.h> 49 #include <fcntl.h> 50 #include <unistd.h> 51 52 #include <libutil.h> 53 54 #include <vm/vm.h> 55 #include <machine/vmm.h> 56 #ifdef WITH_VMMAPI_SNAPSHOT 57 #include <machine/vmm_snapshot.h> 58 #endif 59 60 #include <dev/vmm/vmm_dev.h> 61 62 #include "vmmapi.h" 63 #include "internal.h" 64 65 #define MB (1024 * 1024UL) 66 #define GB (1024 * 1024 * 1024UL) 67 68 #ifdef __amd64__ 69 #define VM_LOWMEM_LIMIT (3 * GB) 70 #else 71 #define VM_LOWMEM_LIMIT 0 72 #endif 73 #define VM_HIGHMEM_BASE (4 * GB) 74 75 /* 76 * Size of the guard region before and after the virtual address space 77 * mapping the guest physical memory. This must be a multiple of the 78 * superpage size for performance reasons. 79 */ 80 #define VM_MMAP_GUARD_SIZE (4 * MB) 81 82 #define PROT_RW (PROT_READ | PROT_WRITE) 83 #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) 84 85 static int 86 vm_device_open(const char *name) 87 { 88 char devpath[PATH_MAX]; 89 90 assert(strlen(name) <= VM_MAX_NAMELEN); 91 (void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name); 92 return (open(devpath, O_RDWR)); 93 } 94 95 static int 96 vm_ctl_open(void) 97 { 98 if (modfind("vmm") < 0) 99 (void)kldload("vmm"); 100 return (open("/dev/vmmctl", O_RDWR, 0)); 101 } 102 103 static int 104 vm_ctl_create(const char *name, int flags, int ctlfd) 105 { 106 struct vmmctl_vm_create vmc; 107 108 memset(&vmc, 0, sizeof(vmc)); 109 if ((flags & VMMAPI_OPEN_CREATE_DESTROY_ON_CLOSE) != 0) 110 vmc.flags |= VMMCTL_CREATE_DESTROY_ON_CLOSE; 111 if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) { 112 errno = ENAMETOOLONG; 113 return (-1); 114 } 115 return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc)); 116 } 117 118 int 119 vm_create(const char *name) 120 { 121 int error, fd; 122 123 fd = vm_ctl_open(); 124 if (fd < 0) 125 return (-1); 126 127 error = vm_ctl_create(name, 0, fd); 128 if (error != 0) { 129 error = errno; 130 (void)close(fd); 131 errno = error; 132 return (-1); 133 } 134 (void)close(fd); 135 return (0); 136 } 137 138 struct vmctx * 139 vm_open(const char *name) 140 { 141 return (vm_openf(name, 0)); 142 } 143 144 struct vmctx * 145 vm_openf(const char *name, int flags) 146 { 147 struct vmctx *vm; 148 int saved_errno; 149 bool created; 150 151 created = false; 152 153 vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); 154 assert(vm != NULL); 155 156 vm->fd = vm->ctlfd = -1; 157 vm->memflags = 0; 158 vm->name = (char *)(vm + 1); 159 strcpy(vm->name, name); 160 memset(vm->memsegs, 0, sizeof(vm->memsegs)); 161 162 if ((vm->ctlfd = vm_ctl_open()) < 0) 163 goto err; 164 165 vm->fd = vm_device_open(vm->name); 166 if (vm->fd < 0 && errno == ENOENT) { 167 if (flags & VMMAPI_OPEN_CREATE) { 168 if (vm_ctl_create(vm->name, flags, vm->ctlfd) != 0) 169 goto err; 170 vm->fd = vm_device_open(vm->name); 171 created = true; 172 } 173 } 174 if (vm->fd < 0) 175 goto err; 176 177 if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0) 178 goto err; 179 180 return (vm); 181 err: 182 saved_errno = errno; 183 if (created) 184 vm_destroy(vm); 185 else 186 vm_close(vm); 187 errno = saved_errno; 188 return (NULL); 189 } 190 191 void 192 vm_close(struct vmctx *vm) 193 { 194 assert(vm != NULL); 195 196 if (vm->fd >= 0) 197 (void)close(vm->fd); 198 if (vm->ctlfd >= 0) 199 (void)close(vm->ctlfd); 200 free(vm); 201 } 202 203 void 204 vm_destroy(struct vmctx *vm) 205 { 206 struct vmmctl_vm_destroy vmd; 207 208 memset(&vmd, 0, sizeof(vmd)); 209 (void)strlcpy(vmd.name, vm->name, sizeof(vmd.name)); 210 if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0) 211 warn("ioctl(VMMCTL_VM_DESTROY)"); 212 213 vm_close(vm); 214 } 215 216 struct vcpu * 217 vm_vcpu_open(struct vmctx *ctx, int vcpuid) 218 { 219 struct vcpu *vcpu; 220 221 vcpu = malloc(sizeof(*vcpu)); 222 if (vcpu == NULL) 223 return (vcpu); 224 225 vcpu->ctx = ctx; 226 vcpu->vcpuid = vcpuid; 227 return (vcpu); 228 } 229 230 void 231 vm_vcpu_close(struct vcpu *vcpu) 232 { 233 free(vcpu); 234 } 235 236 int 237 vcpu_id(struct vcpu *vcpu) 238 { 239 return (vcpu->vcpuid); 240 } 241 242 int 243 vm_parse_memsize(const char *opt, size_t *ret_memsize) 244 { 245 char *endptr; 246 size_t optval; 247 int error; 248 249 optval = strtoul(opt, &endptr, 0); 250 if (*opt != '\0' && *endptr == '\0') { 251 /* 252 * For the sake of backward compatibility if the memory size 253 * specified on the command line is less than a megabyte then 254 * it is interpreted as being in units of MB. 255 */ 256 if (optval < MB) 257 optval *= MB; 258 *ret_memsize = optval; 259 error = 0; 260 } else 261 error = expand_number(opt, ret_memsize); 262 263 return (error); 264 } 265 266 uint32_t 267 vm_get_lowmem_limit(struct vmctx *ctx __unused) 268 { 269 270 return (VM_LOWMEM_LIMIT); 271 } 272 273 void 274 vm_set_memflags(struct vmctx *ctx, int flags) 275 { 276 277 ctx->memflags = flags; 278 } 279 280 int 281 vm_get_memflags(struct vmctx *ctx) 282 { 283 284 return (ctx->memflags); 285 } 286 287 /* 288 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). 289 */ 290 int 291 vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, 292 size_t len, int prot) 293 { 294 struct vm_memmap memmap; 295 int error, flags; 296 297 memmap.gpa = gpa; 298 memmap.segid = segid; 299 memmap.segoff = off; 300 memmap.len = len; 301 memmap.prot = prot; 302 memmap.flags = 0; 303 304 if (ctx->memflags & VM_MEM_F_WIRED) 305 memmap.flags |= VM_MEMMAP_F_WIRED; 306 307 /* 308 * If this mapping already exists then don't create it again. This 309 * is the common case for SYSMEM mappings created by bhyveload(8). 310 */ 311 error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); 312 if (error == 0 && gpa == memmap.gpa) { 313 if (segid != memmap.segid || off != memmap.segoff || 314 prot != memmap.prot || flags != memmap.flags) { 315 errno = EEXIST; 316 return (-1); 317 } else { 318 return (0); 319 } 320 } 321 322 error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); 323 return (error); 324 } 325 326 int 327 vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, 328 size_t *lowmem_size, size_t *highmem_size) 329 { 330 331 *guest_baseaddr = ctx->baseaddr; 332 *lowmem_size = ctx->lowmem_size; 333 *highmem_size = ctx->highmem_size; 334 return (0); 335 } 336 337 int 338 vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len) 339 { 340 struct vm_munmap munmap; 341 int error; 342 343 munmap.gpa = gpa; 344 munmap.len = len; 345 346 error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap); 347 return (error); 348 } 349 350 int 351 vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, 352 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) 353 { 354 struct vm_memmap memmap; 355 int error; 356 357 bzero(&memmap, sizeof(struct vm_memmap)); 358 memmap.gpa = *gpa; 359 error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); 360 if (error == 0) { 361 *gpa = memmap.gpa; 362 *segid = memmap.segid; 363 *segoff = memmap.segoff; 364 *len = memmap.len; 365 *prot = memmap.prot; 366 *flags = memmap.flags; 367 } 368 return (error); 369 } 370 371 /* 372 * Return 0 if the segments are identical and non-zero otherwise. 373 * 374 * This is slightly complicated by the fact that only device memory segments 375 * are named. 376 */ 377 static int 378 cmpseg(size_t len, const char *str, size_t len2, const char *str2) 379 { 380 381 if (len == len2) { 382 if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) 383 return (0); 384 } 385 return (-1); 386 } 387 388 static int 389 vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name, 390 int ds_policy, domainset_t *ds_mask, size_t ds_size) 391 { 392 struct vm_memseg memseg; 393 size_t n; 394 int error; 395 396 /* 397 * If the memory segment has already been created then just return. 398 * This is the usual case for the SYSMEM segment created by userspace 399 * loaders like bhyveload(8). 400 */ 401 error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, 402 sizeof(memseg.name)); 403 if (error) 404 return (error); 405 406 if (memseg.len != 0) { 407 if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { 408 errno = EINVAL; 409 return (-1); 410 } else { 411 return (0); 412 } 413 } 414 415 bzero(&memseg, sizeof(struct vm_memseg)); 416 memseg.segid = segid; 417 memseg.len = len; 418 if (ds_mask == NULL) { 419 memseg.ds_policy = DOMAINSET_POLICY_INVALID; 420 } else { 421 memseg.ds_policy = ds_policy; 422 memseg.ds_mask = ds_mask; 423 memseg.ds_mask_size = ds_size; 424 } 425 if (name != NULL) { 426 n = strlcpy(memseg.name, name, sizeof(memseg.name)); 427 if (n >= sizeof(memseg.name)) { 428 errno = ENAMETOOLONG; 429 return (-1); 430 } 431 } 432 433 error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); 434 return (error); 435 } 436 437 int 438 vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, 439 size_t bufsize) 440 { 441 struct vm_memseg memseg; 442 size_t n; 443 int error; 444 445 bzero(&memseg, sizeof(memseg)); 446 memseg.segid = segid; 447 error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); 448 if (error == 0) { 449 *lenp = memseg.len; 450 n = strlcpy(namebuf, memseg.name, bufsize); 451 if (n >= bufsize) { 452 errno = ENAMETOOLONG; 453 error = -1; 454 } 455 } 456 return (error); 457 } 458 459 static int 460 map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, 461 size_t segoff, char *base) 462 { 463 char *ptr; 464 int error, flags; 465 466 /* Map 'len' bytes starting at 'gpa' in the guest address space */ 467 error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL); 468 if (error) 469 return (error); 470 471 flags = MAP_SHARED | MAP_FIXED; 472 if ((ctx->memflags & VM_MEM_F_INCORE) == 0) 473 flags |= MAP_NOCORE; 474 475 /* mmap into the process address space on the host */ 476 ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); 477 if (ptr == MAP_FAILED) 478 return (-1); 479 480 return (0); 481 } 482 483 /* 484 * Allocates and maps virtual machine memory segments according 485 * to the NUMA topology specified by the 'doms' array. 486 * 487 * The domains are laid out sequentially in the guest's physical address space. 488 * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and 489 * left unmapped. 490 */ 491 int 492 vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms, 493 struct vm_mem_domain *doms, int ndoms) 494 { 495 size_t low_len, len, totalsize; 496 struct vm_mem_domain *dom; 497 struct vm_memseg memseg; 498 char *baseaddr, *ptr; 499 int error, i, segid; 500 vm_paddr_t gpa; 501 502 /* Sanity checks. */ 503 assert(vms == VM_MMAP_ALL); 504 if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) { 505 errno = EINVAL; 506 return (-1); 507 } 508 509 /* Calculate total memory size. */ 510 totalsize = 0; 511 for (i = 0; i < ndoms; i++) 512 totalsize += doms[i].size; 513 514 if (totalsize > VM_LOWMEM_LIMIT) 515 totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT); 516 517 /* 518 * Stake out a contiguous region covering the guest physical memory 519 * and the adjoining guard regions. 520 */ 521 len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE; 522 ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); 523 if (ptr == MAP_FAILED) 524 return (-1); 525 baseaddr = ptr + VM_MMAP_GUARD_SIZE; 526 527 /* 528 * Allocate and map memory segments for the virtual machine. 529 */ 530 gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE; 531 ctx->lowmem_size = 0; 532 ctx->highmem_size = 0; 533 for (i = 0; i < ndoms; i++) { 534 segid = VM_SYSMEM + i; 535 dom = &doms[i]; 536 537 /* 538 * Check if the memory segment already exists. 539 * If 'ndoms' is greater than one, refuse to proceed if the 540 * memseg already exists. If only one domain was requested, use 541 * the existing segment to preserve the behaviour of the previous 542 * implementation. 543 * 544 * Splitting existing memory segments is tedious and 545 * error-prone, which is why we don't support NUMA 546 * domains for bhyveload(8)-loaded VMs. 547 */ 548 error = vm_get_memseg(ctx, segid, &len, memseg.name, 549 sizeof(memseg.name)); 550 if (error == 0 && len != 0) { 551 if (ndoms != 1) { 552 errno = EEXIST; 553 return (-1); 554 } else 555 doms[0].size = len; 556 } else { 557 error = vm_alloc_memseg(ctx, segid, dom->size, NULL, 558 dom->ds_policy, dom->ds_mask, dom->ds_size); 559 if (error) 560 return (error); 561 } 562 563 /* 564 * If a domain is split by VM_LOWMEM_LIMIT then break 565 * its segment mapping into two parts, one below VM_LOWMEM_LIMIT 566 * and one above VM_HIGHMEM_BASE. 567 */ 568 if (gpa <= VM_LOWMEM_LIMIT && 569 gpa + dom->size > VM_LOWMEM_LIMIT) { 570 low_len = VM_LOWMEM_LIMIT - gpa; 571 error = map_memory_segment(ctx, segid, gpa, low_len, 0, 572 baseaddr); 573 if (error) 574 return (error); 575 ctx->lowmem_size = VM_LOWMEM_LIMIT; 576 /* Map the remainder. */ 577 gpa = VM_HIGHMEM_BASE; 578 len = dom->size - low_len; 579 error = map_memory_segment(ctx, segid, gpa, len, 580 low_len, baseaddr); 581 if (error) 582 return (error); 583 } else { 584 len = dom->size; 585 error = map_memory_segment(ctx, segid, gpa, len, 0, 586 baseaddr); 587 if (error) 588 return (error); 589 } 590 if (gpa <= VM_LOWMEM_LIMIT) 591 ctx->lowmem_size += len; 592 else 593 ctx->highmem_size += len; 594 gpa += len; 595 } 596 ctx->baseaddr = baseaddr; 597 598 return (0); 599 } 600 601 int 602 vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) 603 { 604 struct vm_mem_domain dom0; 605 606 memset(&dom0, 0, sizeof(dom0)); 607 dom0.ds_policy = DOMAINSET_POLICY_INVALID; 608 dom0.size = memsize; 609 610 return (vm_setup_memory_domains(ctx, vms, &dom0, 1)); 611 } 612 613 /* 614 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in 615 * the lowmem or highmem regions. 616 * 617 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. 618 * The instruction emulation code depends on this behavior. 619 */ 620 void * 621 vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) 622 { 623 vm_size_t lowsize, highsize; 624 625 lowsize = ctx->lowmem_size; 626 if (lowsize > 0) { 627 if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize) 628 return (ctx->baseaddr + gaddr); 629 } 630 631 highsize = ctx->highmem_size; 632 if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) { 633 if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize && 634 gaddr + len <= VM_HIGHMEM_BASE + highsize) 635 return (ctx->baseaddr + gaddr); 636 } 637 638 return (NULL); 639 } 640 641 vm_paddr_t 642 vm_rev_map_gpa(struct vmctx *ctx, void *addr) 643 { 644 vm_paddr_t offaddr; 645 vm_size_t lowsize, highsize; 646 647 offaddr = (char *)addr - ctx->baseaddr; 648 649 lowsize = ctx->lowmem_size; 650 if (lowsize > 0) 651 if (offaddr <= lowsize) 652 return (offaddr); 653 654 highsize = ctx->highmem_size; 655 if (highsize > 0) 656 if (offaddr >= VM_HIGHMEM_BASE && 657 offaddr < VM_HIGHMEM_BASE + highsize) 658 return (offaddr); 659 660 return ((vm_paddr_t)-1); 661 } 662 663 const char * 664 vm_get_name(struct vmctx *ctx) 665 { 666 667 return (ctx->name); 668 } 669 670 size_t 671 vm_get_lowmem_size(struct vmctx *ctx) 672 { 673 return (ctx->lowmem_size); 674 } 675 676 vm_paddr_t 677 vm_get_highmem_base(struct vmctx *ctx __unused) 678 { 679 680 return (VM_HIGHMEM_BASE); 681 } 682 683 size_t 684 vm_get_highmem_size(struct vmctx *ctx) 685 { 686 return (ctx->highmem_size); 687 } 688 689 void * 690 vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) 691 { 692 char pathname[MAXPATHLEN]; 693 size_t len2; 694 char *base, *ptr; 695 int fd, error, flags; 696 697 fd = -1; 698 ptr = MAP_FAILED; 699 if (name == NULL || strlen(name) == 0) { 700 errno = EINVAL; 701 goto done; 702 } 703 704 error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0); 705 if (error) 706 goto done; 707 708 strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); 709 strlcat(pathname, ctx->name, sizeof(pathname)); 710 strlcat(pathname, ".", sizeof(pathname)); 711 strlcat(pathname, name, sizeof(pathname)); 712 713 fd = open(pathname, O_RDWR); 714 if (fd < 0) 715 goto done; 716 717 /* 718 * Stake out a contiguous region covering the device memory and the 719 * adjoining guard regions. 720 */ 721 len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; 722 base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 723 0); 724 if (base == MAP_FAILED) 725 goto done; 726 727 flags = MAP_SHARED | MAP_FIXED; 728 if ((ctx->memflags & VM_MEM_F_INCORE) == 0) 729 flags |= MAP_NOCORE; 730 731 /* mmap the devmem region in the host address space */ 732 ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); 733 done: 734 if (fd >= 0) 735 close(fd); 736 return (ptr); 737 } 738 739 int 740 vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) 741 { 742 /* 743 * XXX: fragile, handle with care 744 * Assumes that the first field of the ioctl data 745 * is the vcpuid. 746 */ 747 *(int *)arg = vcpu->vcpuid; 748 return (ioctl(vcpu->ctx->fd, cmd, arg)); 749 } 750 751 int 752 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) 753 { 754 int error; 755 struct vm_register vmreg; 756 757 bzero(&vmreg, sizeof(vmreg)); 758 vmreg.regnum = reg; 759 vmreg.regval = val; 760 761 error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); 762 return (error); 763 } 764 765 int 766 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) 767 { 768 int error; 769 struct vm_register vmreg; 770 771 bzero(&vmreg, sizeof(vmreg)); 772 vmreg.regnum = reg; 773 774 error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); 775 *ret_val = vmreg.regval; 776 return (error); 777 } 778 779 int 780 vm_set_register_set(struct vcpu *vcpu, unsigned int count, 781 const int *regnums, uint64_t *regvals) 782 { 783 int error; 784 struct vm_register_set vmregset; 785 786 bzero(&vmregset, sizeof(vmregset)); 787 vmregset.count = count; 788 vmregset.regnums = regnums; 789 vmregset.regvals = regvals; 790 791 error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); 792 return (error); 793 } 794 795 int 796 vm_get_register_set(struct vcpu *vcpu, unsigned int count, 797 const int *regnums, uint64_t *regvals) 798 { 799 int error; 800 struct vm_register_set vmregset; 801 802 bzero(&vmregset, sizeof(vmregset)); 803 vmregset.count = count; 804 vmregset.regnums = regnums; 805 vmregset.regvals = regvals; 806 807 error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset); 808 return (error); 809 } 810 811 int 812 vm_run(struct vcpu *vcpu, struct vm_run *vmrun) 813 { 814 return (vcpu_ioctl(vcpu, VM_RUN, vmrun)); 815 } 816 817 int 818 vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) 819 { 820 struct vm_suspend vmsuspend; 821 822 bzero(&vmsuspend, sizeof(vmsuspend)); 823 vmsuspend.how = how; 824 return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); 825 } 826 827 int 828 vm_reinit(struct vmctx *ctx) 829 { 830 831 return (ioctl(ctx->fd, VM_REINIT, 0)); 832 } 833 834 int 835 vm_capability_name2type(const char *capname) 836 { 837 int i; 838 839 for (i = 0; i < VM_CAP_MAX; i++) { 840 if (vm_capstrmap[i] != NULL && 841 strcmp(vm_capstrmap[i], capname) == 0) 842 return (i); 843 } 844 845 return (-1); 846 } 847 848 const char * 849 vm_capability_type2name(int type) 850 { 851 if (type >= 0 && type < VM_CAP_MAX) 852 return (vm_capstrmap[type]); 853 854 return (NULL); 855 } 856 857 int 858 vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval) 859 { 860 int error; 861 struct vm_capability vmcap; 862 863 bzero(&vmcap, sizeof(vmcap)); 864 vmcap.captype = cap; 865 866 error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap); 867 *retval = vmcap.capval; 868 return (error); 869 } 870 871 int 872 vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val) 873 { 874 struct vm_capability vmcap; 875 876 bzero(&vmcap, sizeof(vmcap)); 877 vmcap.captype = cap; 878 vmcap.capval = val; 879 880 return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap)); 881 } 882 883 uint64_t * 884 vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv, 885 int *ret_entries) 886 { 887 static _Thread_local uint64_t *stats_buf; 888 static _Thread_local u_int stats_count; 889 uint64_t *new_stats; 890 struct vm_stats vmstats; 891 u_int count, index; 892 bool have_stats; 893 894 have_stats = false; 895 count = 0; 896 for (index = 0;; index += nitems(vmstats.statbuf)) { 897 vmstats.index = index; 898 if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0) 899 break; 900 if (stats_count < index + vmstats.num_entries) { 901 new_stats = realloc(stats_buf, 902 (index + vmstats.num_entries) * sizeof(uint64_t)); 903 if (new_stats == NULL) { 904 errno = ENOMEM; 905 return (NULL); 906 } 907 stats_count = index + vmstats.num_entries; 908 stats_buf = new_stats; 909 } 910 memcpy(stats_buf + index, vmstats.statbuf, 911 vmstats.num_entries * sizeof(uint64_t)); 912 count += vmstats.num_entries; 913 have_stats = true; 914 915 if (vmstats.num_entries != nitems(vmstats.statbuf)) 916 break; 917 } 918 if (have_stats) { 919 if (ret_entries) 920 *ret_entries = count; 921 if (ret_tv) 922 *ret_tv = vmstats.tv; 923 return (stats_buf); 924 } else 925 return (NULL); 926 } 927 928 const char * 929 vm_get_stat_desc(struct vmctx *ctx, int index) 930 { 931 static struct vm_stat_desc statdesc; 932 933 statdesc.index = index; 934 if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0) 935 return (statdesc.desc); 936 else 937 return (NULL); 938 } 939 940 #ifdef __amd64__ 941 int 942 vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) 943 { 944 int error, i; 945 struct vm_gpa_pte gpapte; 946 947 bzero(&gpapte, sizeof(gpapte)); 948 gpapte.gpa = gpa; 949 950 error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte); 951 952 if (error == 0) { 953 *num = gpapte.ptenum; 954 for (i = 0; i < gpapte.ptenum; i++) 955 pte[i] = gpapte.pte[i]; 956 } 957 958 return (error); 959 } 960 961 int 962 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging, 963 uint64_t gla, int prot, uint64_t *gpa, int *fault) 964 { 965 struct vm_gla2gpa gg; 966 int error; 967 968 bzero(&gg, sizeof(struct vm_gla2gpa)); 969 gg.prot = prot; 970 gg.gla = gla; 971 gg.paging = *paging; 972 973 error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg); 974 if (error == 0) { 975 *fault = gg.fault; 976 *gpa = gg.gpa; 977 } 978 return (error); 979 } 980 #endif 981 982 int 983 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging, 984 uint64_t gla, int prot, uint64_t *gpa, int *fault) 985 { 986 struct vm_gla2gpa gg; 987 int error; 988 989 bzero(&gg, sizeof(struct vm_gla2gpa)); 990 gg.prot = prot; 991 gg.gla = gla; 992 gg.paging = *paging; 993 994 error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg); 995 if (error == 0) { 996 *fault = gg.fault; 997 *gpa = gg.gpa; 998 } 999 return (error); 1000 } 1001 1002 #ifndef min 1003 #define min(a,b) (((a) < (b)) ? (a) : (b)) 1004 #endif 1005 1006 #ifdef __amd64__ 1007 int 1008 vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging, 1009 uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, 1010 int *fault) 1011 { 1012 void *va; 1013 uint64_t gpa, off; 1014 int error, i, n; 1015 1016 for (i = 0; i < iovcnt; i++) { 1017 iov[i].iov_base = 0; 1018 iov[i].iov_len = 0; 1019 } 1020 1021 while (len) { 1022 assert(iovcnt > 0); 1023 error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault); 1024 if (error || *fault) 1025 return (error); 1026 1027 off = gpa & PAGE_MASK; 1028 n = MIN(len, PAGE_SIZE - off); 1029 1030 va = vm_map_gpa(vcpu->ctx, gpa, n); 1031 if (va == NULL) 1032 return (EFAULT); 1033 1034 iov->iov_base = va; 1035 iov->iov_len = n; 1036 iov++; 1037 iovcnt--; 1038 1039 gla += n; 1040 len -= n; 1041 } 1042 return (0); 1043 } 1044 #endif 1045 1046 void 1047 vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused) 1048 { 1049 /* 1050 * Intentionally empty. This is used by the instruction 1051 * emulation code shared with the kernel. The in-kernel 1052 * version of this is non-empty. 1053 */ 1054 } 1055 1056 void 1057 vm_copyin(struct iovec *iov, void *vp, size_t len) 1058 { 1059 const char *src; 1060 char *dst; 1061 size_t n; 1062 1063 dst = vp; 1064 while (len) { 1065 assert(iov->iov_len); 1066 n = min(len, iov->iov_len); 1067 src = iov->iov_base; 1068 bcopy(src, dst, n); 1069 1070 iov++; 1071 dst += n; 1072 len -= n; 1073 } 1074 } 1075 1076 void 1077 vm_copyout(const void *vp, struct iovec *iov, size_t len) 1078 { 1079 const char *src; 1080 char *dst; 1081 size_t n; 1082 1083 src = vp; 1084 while (len) { 1085 assert(iov->iov_len); 1086 n = min(len, iov->iov_len); 1087 dst = iov->iov_base; 1088 bcopy(src, dst, n); 1089 1090 iov++; 1091 src += n; 1092 len -= n; 1093 } 1094 } 1095 1096 static int 1097 vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) 1098 { 1099 struct vm_cpuset vm_cpuset; 1100 int error; 1101 1102 bzero(&vm_cpuset, sizeof(struct vm_cpuset)); 1103 vm_cpuset.which = which; 1104 vm_cpuset.cpusetsize = sizeof(cpuset_t); 1105 vm_cpuset.cpus = cpus; 1106 1107 error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); 1108 return (error); 1109 } 1110 1111 int 1112 vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) 1113 { 1114 1115 return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); 1116 } 1117 1118 int 1119 vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) 1120 { 1121 1122 return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); 1123 } 1124 1125 int 1126 vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) 1127 { 1128 1129 return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); 1130 } 1131 1132 int 1133 vm_activate_cpu(struct vcpu *vcpu) 1134 { 1135 struct vm_activate_cpu ac; 1136 int error; 1137 1138 bzero(&ac, sizeof(struct vm_activate_cpu)); 1139 error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); 1140 return (error); 1141 } 1142 1143 int 1144 vm_suspend_all_cpus(struct vmctx *ctx) 1145 { 1146 struct vm_activate_cpu ac; 1147 int error; 1148 1149 bzero(&ac, sizeof(struct vm_activate_cpu)); 1150 ac.vcpuid = -1; 1151 error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); 1152 return (error); 1153 } 1154 1155 int 1156 vm_suspend_cpu(struct vcpu *vcpu) 1157 { 1158 struct vm_activate_cpu ac; 1159 int error; 1160 1161 bzero(&ac, sizeof(struct vm_activate_cpu)); 1162 error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac); 1163 return (error); 1164 } 1165 1166 int 1167 vm_resume_cpu(struct vcpu *vcpu) 1168 { 1169 struct vm_activate_cpu ac; 1170 int error; 1171 1172 bzero(&ac, sizeof(struct vm_activate_cpu)); 1173 error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac); 1174 return (error); 1175 } 1176 1177 int 1178 vm_resume_all_cpus(struct vmctx *ctx) 1179 { 1180 struct vm_activate_cpu ac; 1181 int error; 1182 1183 bzero(&ac, sizeof(struct vm_activate_cpu)); 1184 ac.vcpuid = -1; 1185 error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); 1186 return (error); 1187 } 1188 1189 #ifdef __amd64__ 1190 int 1191 vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2) 1192 { 1193 struct vm_intinfo vmii; 1194 int error; 1195 1196 bzero(&vmii, sizeof(struct vm_intinfo)); 1197 error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii); 1198 if (error == 0) { 1199 *info1 = vmii.info1; 1200 *info2 = vmii.info2; 1201 } 1202 return (error); 1203 } 1204 1205 int 1206 vm_set_intinfo(struct vcpu *vcpu, uint64_t info1) 1207 { 1208 struct vm_intinfo vmii; 1209 int error; 1210 1211 bzero(&vmii, sizeof(struct vm_intinfo)); 1212 vmii.info1 = info1; 1213 error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii); 1214 return (error); 1215 } 1216 #endif 1217 1218 #ifdef WITH_VMMAPI_SNAPSHOT 1219 int 1220 vm_restart_instruction(struct vcpu *vcpu) 1221 { 1222 int arg; 1223 1224 return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg)); 1225 } 1226 1227 int 1228 vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta) 1229 { 1230 1231 if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { 1232 #ifdef SNAPSHOT_DEBUG 1233 fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", 1234 __func__, meta->dev_name, errno); 1235 #endif 1236 return (-1); 1237 } 1238 return (0); 1239 } 1240 1241 int 1242 vm_restore_time(struct vmctx *ctx) 1243 { 1244 int dummy; 1245 1246 dummy = 0; 1247 return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); 1248 } 1249 #endif 1250 1251 int 1252 vm_set_topology(struct vmctx *ctx, 1253 uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) 1254 { 1255 struct vm_cpu_topology topology; 1256 1257 bzero(&topology, sizeof (struct vm_cpu_topology)); 1258 topology.sockets = sockets; 1259 topology.cores = cores; 1260 topology.threads = threads; 1261 topology.maxcpus = maxcpus; 1262 return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); 1263 } 1264 1265 int 1266 vm_get_topology(struct vmctx *ctx, 1267 uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) 1268 { 1269 struct vm_cpu_topology topology; 1270 int error; 1271 1272 bzero(&topology, sizeof (struct vm_cpu_topology)); 1273 error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); 1274 if (error == 0) { 1275 *sockets = topology.sockets; 1276 *cores = topology.cores; 1277 *threads = topology.threads; 1278 *maxcpus = topology.maxcpus; 1279 } 1280 return (error); 1281 } 1282 1283 int 1284 vm_limit_rights(struct vmctx *ctx) 1285 { 1286 cap_rights_t rights; 1287 1288 cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); 1289 if (caph_rights_limit(ctx->fd, &rights) != 0) 1290 return (-1); 1291 if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0) 1292 return (-1); 1293 return (0); 1294 } 1295 1296 /* 1297 * Avoid using in new code. Operations on the fd should be wrapped here so that 1298 * capability rights can be kept in sync. 1299 */ 1300 int 1301 vm_get_device_fd(struct vmctx *ctx) 1302 { 1303 1304 return (ctx->fd); 1305 } 1306 1307 /* Legacy interface, do not use. */ 1308 const cap_ioctl_t * 1309 vm_get_ioctls(size_t *len) 1310 { 1311 cap_ioctl_t *cmds; 1312 size_t sz; 1313 1314 if (len == NULL) { 1315 sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]); 1316 cmds = malloc(sz); 1317 if (cmds == NULL) 1318 return (NULL); 1319 bcopy(vm_ioctl_cmds, cmds, sz); 1320 return (cmds); 1321 } 1322 1323 *len = vm_ioctl_ncmds; 1324 return (NULL); 1325 } 1326