/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>

#include <capsicum_helpers.h>
#include <err.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include <dev/vmm/vmm_dev.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

static int
vm_device_open(const char *name)
{
	char devpath[PATH_MAX];

	assert(strlen(name) <= VM_MAX_NAMELEN);
	(void)snprintf(devpath, sizeof(devpath), "/dev/vmm/%s", name);
	return (open(devpath, O_RDWR));
}

static int
vm_ctl_open(void)
{
	if (modfind("vmm") < 0)
		(void)kldload("vmm");
	return (open("/dev/vmmctl", O_RDWR, 0));
}

static int
vm_ctl_create(const char *name, int flags, int ctlfd)
{
	struct vmmctl_vm_create vmc;

	memset(&vmc, 0, sizeof(vmc));
	if ((flags & VMMAPI_OPEN_CREATE_DESTROY_ON_CLOSE) != 0)
		vmc.flags |= VMMCTL_CREATE_DESTROY_ON_CLOSE;
	if (strlcpy(vmc.name, name, sizeof(vmc.name)) >= sizeof(vmc.name)) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	return (ioctl(ctlfd, VMMCTL_VM_CREATE, &vmc));
}

int
vm_create(const char *name)
{
	int error, fd;

	fd = vm_ctl_open();
	if (fd < 0)
		return (-1);

	error = vm_ctl_create(name, 0, fd);
	if (error != 0) {
		error = errno;
		(void)close(fd);
		errno = error;
		return (-1);
	}
	(void)close(fd);
	return (0);
}

struct vmctx *
vm_open(const char *name)
{
	return (vm_openf(name, 0));
}

struct vmctx *
vm_openf(const char *name, int flags)
{
	struct vmctx *vm;
	int saved_errno;
	bool created;

	created = false;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = vm->ctlfd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->ctlfd = vm_ctl_open()) < 0)
		goto err;

	vm->fd = vm_device_open(vm->name);
	if (vm->fd < 0 && errno == ENOENT) {
		if (flags & VMMAPI_OPEN_CREATE) {
			if (vm_ctl_create(vm->name, flags, vm->ctlfd) != 0)
				goto err;
			vm->fd = vm_device_open(vm->name);
			created = true;
		}
	}
	if (vm->fd < 0)
		goto err;

	if (!created && (flags & VMMAPI_OPEN_REINIT) != 0 && vm_reinit(vm) != 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	if (created)
		vm_destroy(vm);
	else
		vm_close(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		(void)close(vm->fd);
	if (vm->ctlfd >= 0)
		(void)close(vm->ctlfd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	struct vmmctl_vm_destroy vmd;

	memset(&vmd, 0, sizeof(vmd));
	(void)strlcpy(vmd.name, vm->name, sizeof(vmd.name));
	if (ioctl(vm->ctlfd, VMMCTL_VM_DESTROY, &vmd) != 0)
		warn("ioctl(VMMCTL_VM_DESTROY)");

	vm_close(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}
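
/*
 * Example (illustrative sketch, not part of the library): a typical caller
 * opens or creates a VM with vm_openf() and then opens its vcpus. The VM
 * name is hypothetical and error handling is abbreviated.
 *
 *	struct vmctx *ctx;
 *	struct vcpu *vcpu0;
 *
 *	ctx = vm_openf("examplevm", VMMAPI_OPEN_CREATE);
 *	if (ctx == NULL)
 *		err(1, "vm_openf");
 *	vcpu0 = vm_vcpu_open(ctx, 0);
 *	...
 *	vm_vcpu_close(vcpu0);
 *	vm_close(ctx);
 */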

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->lowmem_size;
	*highmem_size = ctx->highmem_size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}
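
/*
 * Example (illustrative sketch): vm_mmap_getnext() reports the mapping with
 * the lowest guest physical address at or above *gpa, so existing mappings
 * can be enumerated by advancing past each one. Error handling is omitted.
 *
 *	vm_paddr_t gpa = 0;
 *	vm_ooffset_t segoff;
 *	size_t len;
 *	int segid, prot, flags;
 *
 *	while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &len,
 *	    &prot, &flags) == 0) {
 *		printf("segid %d at gpa %#jx, %#zx bytes\n",
 *		    segid, (uintmax_t)gpa, len);
 *		gpa += len;
 *	}
 */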

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name,
    int ds_policy, domainset_t *ds_mask, size_t ds_size)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (ds_mask == NULL) {
		memseg.ds_policy = DOMAINSET_POLICY_INVALID;
	} else {
		memseg.ds_policy = ds_policy;
		memseg.ds_mask = ds_mask;
		memseg.ds_mask_size = ds_size;
	}
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len,
    size_t segoff, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

/*
 * Allocates and maps virtual machine memory segments according
 * to the NUMA topology specified by the 'doms' array.
 *
 * The domains are laid out sequentially in the guest's physical address space.
 * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and
 * left unmapped.
 */
int
vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms,
    struct vm_mem_domain *doms, int ndoms)
{
	size_t low_len, len, totalsize;
	struct vm_mem_domain *dom;
	struct vm_memseg memseg;
	char *baseaddr, *ptr;
	int error, i, segid;
	vm_paddr_t gpa;

	/* Sanity checks. */
	assert(vms == VM_MMAP_ALL);
	if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) {
		errno = EINVAL;
		return (-1);
	}

	/* Calculate total memory size. */
	totalsize = 0;
	for (i = 0; i < ndoms; i++)
		totalsize += doms[i].size;

	if (totalsize > VM_LOWMEM_LIMIT)
		totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);
	baseaddr = ptr + VM_MMAP_GUARD_SIZE;

	/*
	 * Allocate and map memory segments for the virtual machine.
	 */
	gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE;
	ctx->lowmem_size = 0;
	ctx->highmem_size = 0;
	for (i = 0; i < ndoms; i++) {
		segid = VM_SYSMEM + i;
		dom = &doms[i];

		/*
		 * Check if the memory segment already exists.
		 * If 'ndoms' is greater than one, refuse to proceed if the
		 * memseg already exists. If only one domain was requested, use
		 * the existing segment to preserve the behaviour of the
		 * previous implementation.
		 *
		 * Splitting existing memory segments is tedious and
		 * error-prone, which is why we don't support NUMA
		 * domains for bhyveload(8)-loaded VMs.
		 */
		error = vm_get_memseg(ctx, segid, &len, memseg.name,
		    sizeof(memseg.name));
		if (error == 0 && len != 0) {
			if (ndoms != 1) {
				errno = EEXIST;
				return (-1);
			} else
				doms[0].size = len;
		} else {
			error = vm_alloc_memseg(ctx, segid, dom->size, NULL,
			    dom->ds_policy, dom->ds_mask, dom->ds_size);
			if (error)
				return (error);
		}

		/*
		 * If a domain is split by VM_LOWMEM_LIMIT then break
		 * its segment mapping into two parts, one below VM_LOWMEM_LIMIT
		 * and one above VM_HIGHMEM_BASE.
		 */
		if (gpa <= VM_LOWMEM_LIMIT &&
		    gpa + dom->size > VM_LOWMEM_LIMIT) {
			low_len = VM_LOWMEM_LIMIT - gpa;
			error = map_memory_segment(ctx, segid, gpa, low_len, 0,
			    baseaddr);
			if (error)
				return (error);
			ctx->lowmem_size = VM_LOWMEM_LIMIT;
			/* Map the remainder. */
			gpa = VM_HIGHMEM_BASE;
			len = dom->size - low_len;
			error = map_memory_segment(ctx, segid, gpa, len,
			    low_len, baseaddr);
			if (error)
				return (error);
		} else {
			len = dom->size;
			error = map_memory_segment(ctx, segid, gpa, len, 0,
			    baseaddr);
			if (error)
				return (error);
		}
		if (gpa <= VM_LOWMEM_LIMIT)
			ctx->lowmem_size += len;
		else
			ctx->highmem_size += len;
		gpa += len;
	}
	ctx->baseaddr = baseaddr;

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	struct vm_mem_domain dom0;

	memset(&dom0, 0, sizeof(dom0));
	dom0.ds_policy = DOMAINSET_POLICY_INVALID;
	dom0.size = memsize;

	return (vm_setup_memory_domains(ctx, vms, &dom0, 1));
}
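
/*
 * Example (illustrative): with the single-domain wrapper above on amd64,
 * an 8 GB guest is laid out as [0, 3GB) of lowmem and [4GB, 9GB) of highmem,
 * with the [3GB, 4GB) range left unmapped for MMIO; ctx->lowmem_size becomes
 * 3 GB and ctx->highmem_size becomes 5 GB, all backed by one contiguous host
 * reservation bracketed by the guard regions.
 *
 *	if (vm_setup_memory(ctx, 8 * GB, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 */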

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->lowmem_size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->highmem_size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->lowmem_size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->highmem_size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{
	return (ctx->lowmem_size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{
	return (ctx->highmem_size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
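
/*
 * Example (illustrative sketch): a device model can back guest-visible
 * memory with a named device memory segment, which is also exposed on the
 * host as /dev/vmm.io/<vmname>.<segname>. The segment id and size below are
 * for illustration only.
 *
 *	void *fbmem;
 *
 *	fbmem = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", 16 * MB);
 *	if (fbmem == MAP_FAILED)
 *		err(1, "vm_create_devmem");
 */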

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}
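
/*
 * Example (illustrative sketch): fetching and updating a single guest
 * register through the accessors above. The amd64 register name is only
 * for illustration; other architectures use their own VM_REG_* constants.
 *
 *	uint64_t rax;
 *
 *	if (vm_get_register(vcpu, VM_REG_GUEST_RAX, &rax) != 0)
 *		err(1, "vm_get_register");
 *	if (vm_set_register(vcpu, VM_REG_GUEST_RAX, rax + 1) != 0)
 *		err(1, "vm_set_register");
 */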

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif
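
/*
 * Example (illustrative sketch, amd64): reading guest memory at a guest
 * linear address the way the instruction emulation paths do. 'paging' and
 * 'gla' are assumed to have been filled in from the current exit.
 *
 *	struct iovec iov[2];
 *	char buf[64];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, sizeof(buf), PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, buf, sizeof(buf));
 */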

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif
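
/*
 * Example (illustrative sketch): checking whether vcpu 0 is currently
 * active using the cpuset accessors above.
 *
 *	cpuset_t cpus;
 *
 *	if (vm_active_cpus(ctx, &cpus) == 0 && CPU_ISSET(0, &cpus))
 *		printf("vcpu 0 is active\n");
 */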

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}