/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>

#include "vmmapi.h"
#include "internal.h"

#define MB (1024 * 1024UL)
#define GB (1024 * 1024 * 1024UL)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define VM_MMAP_GUARD_SIZE (4 * MB)

#define PROT_RW (PROT_READ | PROT_WRITE)
#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC)

#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

static int
vm_device_open(const char *name)
{
        int fd, len;
        char *vmfile;

        len = strlen("/dev/vmm/") + strlen(name) + 1;
        vmfile = malloc(len);
        assert(vmfile != NULL);
        snprintf(vmfile, len, "/dev/vmm/%s", name);

        /* Open the device file */
        fd = open(vmfile, O_RDWR, 0);

        free(vmfile);
        return (fd);
}

int
vm_create(const char *name)
{
        /* Try to load vmm(4) module before creating a guest. */
        if (modfind("vmm") < 0)
                kldload("vmm");
        return (CREATE(name));
}
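
/*
 * Illustrative lifecycle sketch (not part of the original source): a caller
 * typically creates the VM, opens a context for it, and destroys it when
 * done.  The guest name "example" and the use of err(3) are placeholders.
 *
 *        struct vmctx *ctx;
 *
 *        if (vm_create("example") != 0 && errno != EEXIST)
 *                err(1, "vm_create");
 *        if ((ctx = vm_open("example")) == NULL)
 *                err(1, "vm_open");
 *        ...
 *        vm_destroy(ctx);
 */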

struct vmctx *
vm_open(const char *name)
{
        struct vmctx *vm;
        int saved_errno;

        vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
        assert(vm != NULL);

        vm->fd = -1;
        vm->memflags = 0;
        vm->lowmem_limit = 3 * GB;
        vm->name = (char *)(vm + 1);
        strcpy(vm->name, name);

        if ((vm->fd = vm_device_open(vm->name)) < 0)
                goto err;

        return (vm);
err:
        saved_errno = errno;
        free(vm);
        errno = saved_errno;
        return (NULL);
}

void
vm_close(struct vmctx *vm)
{
        assert(vm != NULL);

        close(vm->fd);
        free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
        assert(vm != NULL);

        if (vm->fd >= 0)
                close(vm->fd);
        DESTROY(vm->name);

        free(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
        struct vcpu *vcpu;

        vcpu = malloc(sizeof(*vcpu));
        assert(vcpu != NULL);
        vcpu->ctx = ctx;
        vcpu->vcpuid = vcpuid;
        return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
        free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
        return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
        char *endptr;
        size_t optval;
        int error;

        optval = strtoul(opt, &endptr, 0);
        if (*opt != '\0' && *endptr == '\0') {
                /*
                 * For the sake of backward compatibility, if the memory size
                 * specified on the command line is less than a megabyte then
                 * it is interpreted as being in units of MB.
                 */
                if (optval < MB)
                        optval *= MB;
                *ret_memsize = optval;
                error = 0;
        } else
                error = expand_number(opt, ret_memsize);

        return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx)
{

        return (ctx->lowmem_limit);
}

void
vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
{

        ctx->lowmem_limit = limit;
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

        ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

        return (ctx->memflags);
}
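
/*
 * Illustrative sketch (not part of the original source): memory flags must be
 * chosen before guest memory is mapped, because vm_mmap_memseg() and
 * setup_memory_segment() below consult ctx->memflags at mapping time.  The
 * 'memsize' variable and the use of err(3) are placeholders.
 *
 *        vm_set_memflags(ctx, VM_MEM_F_WIRED);
 *        if (vm_setup_memory(ctx, memsize, VM_MMAP_ALL) != 0)
 *                err(1, "vm_setup_memory");
 */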

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
        struct vm_memmap memmap;
        int error, flags;

        memmap.gpa = gpa;
        memmap.segid = segid;
        memmap.segoff = off;
        memmap.len = len;
        memmap.prot = prot;
        memmap.flags = 0;

        if (ctx->memflags & VM_MEM_F_WIRED)
                memmap.flags |= VM_MEMMAP_F_WIRED;

        /*
         * If this mapping already exists then don't create it again. This
         * is the common case for SYSMEM mappings created by bhyveload(8).
         */
        error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
        if (error == 0 && gpa == memmap.gpa) {
                if (segid != memmap.segid || off != memmap.segoff ||
                    prot != memmap.prot || flags != memmap.flags) {
                        errno = EEXIST;
                        return (-1);
                } else {
                        return (0);
                }
        }

        error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
        return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

        *guest_baseaddr = ctx->baseaddr;
        *lowmem_size = ctx->lowmem;
        *highmem_size = ctx->highmem;
        return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
        struct vm_munmap munmap;
        int error;

        munmap.gpa = gpa;
        munmap.len = len;

        error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
        return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
        struct vm_memmap memmap;
        int error;

        bzero(&memmap, sizeof(struct vm_memmap));
        memmap.gpa = *gpa;
        error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
        if (error == 0) {
                *gpa = memmap.gpa;
                *segid = memmap.segid;
                *segoff = memmap.segoff;
                *len = memmap.len;
                *prot = memmap.prot;
                *flags = memmap.flags;
        }
        return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

        if (len == len2) {
                if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
                        return (0);
        }
        return (-1);
}
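
/*
 * Illustrative sketch (not part of the original source): the guest memory map
 * can be walked by advancing 'gpa' past each mapping reported by
 * vm_mmap_getnext() until the call fails.
 *
 *        vm_paddr_t gpa = 0;
 *        vm_ooffset_t segoff;
 *        size_t maplen;
 *        int segid, prot, flags;
 *
 *        while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen,
 *            &prot, &flags) == 0) {
 *                printf("gpa %#lx len %#zx segid %d\n",
 *                    (unsigned long)gpa, maplen, segid);
 *                gpa += maplen;
 *        }
 */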

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
        struct vm_memseg memseg;
        size_t n;
        int error;

        /*
         * If the memory segment has already been created then just return.
         * This is the usual case for the SYSMEM segment created by userspace
         * loaders like bhyveload(8).
         */
        error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
            sizeof(memseg.name));
        if (error)
                return (error);

        if (memseg.len != 0) {
                if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
                        errno = EINVAL;
                        return (-1);
                } else {
                        return (0);
                }
        }

        bzero(&memseg, sizeof(struct vm_memseg));
        memseg.segid = segid;
        memseg.len = len;
        if (name != NULL) {
                n = strlcpy(memseg.name, name, sizeof(memseg.name));
                if (n >= sizeof(memseg.name)) {
                        errno = ENAMETOOLONG;
                        return (-1);
                }
        }

        error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
        return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
        struct vm_memseg memseg;
        size_t n;
        int error;

        memseg.segid = segid;
        error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
        if (error == 0) {
                *lenp = memseg.len;
                n = strlcpy(namebuf, memseg.name, bufsize);
                if (n >= bufsize) {
                        errno = ENAMETOOLONG;
                        error = -1;
                }
        }
        return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
        char *ptr;
        int error, flags;

        /* Map 'len' bytes starting at 'gpa' in the guest address space */
        error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
        if (error)
                return (error);

        flags = MAP_SHARED | MAP_FIXED;
        if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
                flags |= MAP_NOCORE;

        /* mmap into the process address space on the host */
        ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
        if (ptr == MAP_FAILED)
                return (-1);

        return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
        size_t objsize, len;
        vm_paddr_t gpa;
        char *baseaddr, *ptr;
        int error;

        assert(vms == VM_MMAP_ALL);

        /*
         * If 'memsize' cannot fit entirely in the 'lowmem' segment then
         * create another 'highmem' segment above 4GB for the remainder.
         */
        if (memsize > ctx->lowmem_limit) {
                ctx->lowmem = ctx->lowmem_limit;
                ctx->highmem = memsize - ctx->lowmem_limit;
                objsize = 4*GB + ctx->highmem;
        } else {
                ctx->lowmem = memsize;
                ctx->highmem = 0;
                objsize = ctx->lowmem;
        }

        error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
        if (error)
                return (error);

        /*
         * Stake out a contiguous region covering the guest physical memory
         * and the adjoining guard regions.
         */
        len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
        ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
        if (ptr == MAP_FAILED)
                return (-1);

        baseaddr = ptr + VM_MMAP_GUARD_SIZE;
        if (ctx->highmem > 0) {
                gpa = 4*GB;
                len = ctx->highmem;
                error = setup_memory_segment(ctx, gpa, len, baseaddr);
                if (error)
                        return (error);
        }

        if (ctx->lowmem > 0) {
                gpa = 0;
                len = ctx->lowmem;
                error = setup_memory_segment(ctx, gpa, len, baseaddr);
                if (error)
                        return (error);
        }

        ctx->baseaddr = baseaddr;

        return (0);
}
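
/*
 * Worked example of the lowmem/highmem split above (illustrative, assuming
 * the default 3 GB lowmem_limit): an 8 GB guest gets lowmem = 3 GB at gpa 0
 * and highmem = 5 GB at gpa 4 GB, backed by a single SYSMEM segment of
 * 4 GB + 5 GB = 9 GB so that each mapping can use its gpa as the segment
 * offset.  The [3 GB, 4 GB) range is left unmapped and is typically used by
 * callers such as bhyve(8) for device MMIO.
 */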

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular, return NULL if [gaddr, gaddr+len) falls within the guest
 * MMIO region.  The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{

        if (ctx->lowmem > 0) {
                if (gaddr < ctx->lowmem && len <= ctx->lowmem &&
                    gaddr + len <= ctx->lowmem)
                        return (ctx->baseaddr + gaddr);
        }

        if (ctx->highmem > 0) {
                if (gaddr >= 4*GB) {
                        if (gaddr < 4*GB + ctx->highmem &&
                            len <= ctx->highmem &&
                            gaddr + len <= 4*GB + ctx->highmem)
                                return (ctx->baseaddr + gaddr);
                }
        }

        return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
        vm_paddr_t offaddr;

        offaddr = (char *)addr - ctx->baseaddr;

        if (ctx->lowmem > 0)
                if (offaddr <= ctx->lowmem)
                        return (offaddr);

        if (ctx->highmem > 0)
                if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem)
                        return (offaddr);

        return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

        return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

        return (ctx->lowmem);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

        return (ctx->highmem);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
        char pathname[MAXPATHLEN];
        size_t len2;
        char *base, *ptr;
        int fd, error, flags;

        fd = -1;
        ptr = MAP_FAILED;
        if (name == NULL || strlen(name) == 0) {
                errno = EINVAL;
                goto done;
        }

        error = vm_alloc_memseg(ctx, segid, len, name);
        if (error)
                goto done;

        strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
        strlcat(pathname, ctx->name, sizeof(pathname));
        strlcat(pathname, ".", sizeof(pathname));
        strlcat(pathname, name, sizeof(pathname));

        fd = open(pathname, O_RDWR);
        if (fd < 0)
                goto done;

        /*
         * Stake out a contiguous region covering the device memory and the
         * adjoining guard regions.
         */
        len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
        base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
            0);
        if (base == MAP_FAILED)
                goto done;

        flags = MAP_SHARED | MAP_FIXED;
        if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
                flags |= MAP_NOCORE;

        /* mmap the devmem region in the host address space */
        ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
        if (fd >= 0)
                close(fd);
        return (ptr);
}
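
/*
 * Illustrative sketch (not part of the original source): a device model maps
 * a named device memory segment into its own address space and then exposes
 * it to the guest with vm_mmap_memseg().  The segment id, segment name, size
 * and guest physical address below are placeholders.
 *
 *        void *fb;
 *
 *        fb = vm_create_devmem(ctx, segid, "framebuffer", fb_size);
 *        if (fb == MAP_FAILED)
 *                err(1, "vm_create_devmem");
 *        if (vm_mmap_memseg(ctx, fb_gpa, segid, 0, fb_size, PROT_RW) != 0)
 *                err(1, "vm_mmap_memseg");
 */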

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
        /*
         * XXX: fragile, handle with care
         * Assumes that the first field of the ioctl data
         * is the vcpuid.
         */
        *(int *)arg = vcpu->vcpuid;
        return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
        int error;
        struct vm_register vmreg;

        bzero(&vmreg, sizeof(vmreg));
        vmreg.regnum = reg;
        vmreg.regval = val;

        error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
        return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
        int error;
        struct vm_register vmreg;

        bzero(&vmreg, sizeof(vmreg));
        vmreg.regnum = reg;

        error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
        *ret_val = vmreg.regval;
        return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
        int error;
        struct vm_register_set vmregset;

        bzero(&vmregset, sizeof(vmregset));
        vmregset.count = count;
        vmregset.regnums = regnums;
        vmregset.regvals = regvals;

        error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
        return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
        int error;
        struct vm_register_set vmregset;

        bzero(&vmregset, sizeof(vmregset));
        vmregset.count = count;
        vmregset.regnums = regnums;
        vmregset.regvals = regvals;

        error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
        return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
        return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
        struct vm_suspend vmsuspend;

        bzero(&vmsuspend, sizeof(vmsuspend));
        vmsuspend.how = how;
        return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

        return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
        int i;

        for (i = 0; i < VM_CAP_MAX; i++) {
                if (vm_capstrmap[i] != NULL &&
                    strcmp(vm_capstrmap[i], capname) == 0)
                        return (i);
        }

        return (-1);
}

const char *
vm_capability_type2name(int type)
{
        if (type >= 0 && type < VM_CAP_MAX)
                return (vm_capstrmap[type]);

        return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
        int error;
        struct vm_capability vmcap;

        bzero(&vmcap, sizeof(vmcap));
        vmcap.captype = cap;

        error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
        *retval = vmcap.capval;
        return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
        struct vm_capability vmcap;

        bzero(&vmcap, sizeof(vmcap));
        vmcap.captype = cap;
        vmcap.capval = val;

        return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}
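
/*
 * Illustrative sketch (not part of the original source): capabilities are
 * toggled and queried per vcpu; VM_CAP_HALT_EXIT is used here purely as an
 * example capability.
 *
 *        int val;
 *
 *        if (vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1) != 0)
 *                err(1, "vm_set_capability");
 *        if (vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &val) == 0)
 *                assert(val == 1);
 */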

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
        static _Thread_local uint64_t *stats_buf;
        static _Thread_local u_int stats_count;
        uint64_t *new_stats;
        struct vm_stats vmstats;
        u_int count, index;
        bool have_stats;

        have_stats = false;
        count = 0;
        for (index = 0;; index += nitems(vmstats.statbuf)) {
                vmstats.index = index;
                if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
                        break;
                if (stats_count < index + vmstats.num_entries) {
                        new_stats = realloc(stats_buf,
                            (index + vmstats.num_entries) * sizeof(uint64_t));
                        if (new_stats == NULL) {
                                errno = ENOMEM;
                                return (NULL);
                        }
                        stats_count = index + vmstats.num_entries;
                        stats_buf = new_stats;
                }
                memcpy(stats_buf + index, vmstats.statbuf,
                    vmstats.num_entries * sizeof(uint64_t));
                count += vmstats.num_entries;
                have_stats = true;

                if (vmstats.num_entries != nitems(vmstats.statbuf))
                        break;
        }
        if (have_stats) {
                if (ret_entries)
                        *ret_entries = count;
                if (ret_tv)
                        *ret_tv = vmstats.tv;
                return (stats_buf);
        } else
                return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
        static struct vm_stat_desc statdesc;

        statdesc.index = index;
        if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
                return (statdesc.desc);
        else
                return (NULL);
}

int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
        int error, i;
        struct vm_gpa_pte gpapte;

        bzero(&gpapte, sizeof(gpapte));
        gpapte.gpa = gpa;

        error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

        if (error == 0) {
                *num = gpapte.ptenum;
                for (i = 0; i < gpapte.ptenum; i++)
                        pte[i] = gpapte.pte[i];
        }

        return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
        struct vm_gla2gpa gg;
        int error;

        bzero(&gg, sizeof(struct vm_gla2gpa));
        gg.prot = prot;
        gg.gla = gla;
        gg.paging = *paging;

        error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
        if (error == 0) {
                *fault = gg.fault;
                *gpa = gg.gpa;
        }
        return (error);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
        struct vm_gla2gpa gg;
        int error;

        bzero(&gg, sizeof(struct vm_gla2gpa));
        gg.prot = prot;
        gg.gla = gla;
        gg.paging = *paging;

        error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
        if (error == 0) {
                *fault = gg.fault;
                *gpa = gg.gpa;
        }
        return (error);
}

#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif

int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
        void *va;
        uint64_t gpa, off;
        int error, i, n;

        for (i = 0; i < iovcnt; i++) {
                iov[i].iov_base = 0;
                iov[i].iov_len = 0;
        }

        while (len) {
                assert(iovcnt > 0);
                error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
                if (error || *fault)
                        return (error);

                off = gpa & PAGE_MASK;
                n = MIN(len, PAGE_SIZE - off);

                va = vm_map_gpa(vcpu->ctx, gpa, n);
                if (va == NULL)
                        return (EFAULT);

                iov->iov_base = va;
                iov->iov_len = n;
                iov++;
                iovcnt--;

                gla += n;
                len -= n;
        }
        return (0);
}
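
/*
 * Illustrative sketch (not part of the original source): reading guest memory
 * through a guest linear address is a three step sequence.  The 'paging'
 * state and 'gla' would come from the current vcpu (e.g. from a VM exit).
 *
 *        struct iovec iov[2];
 *        char buf[64];
 *        int fault;
 *
 *        if (vm_copy_setup(vcpu, &paging, gla, sizeof(buf), PROT_READ,
 *            iov, nitems(iov), &fault) == 0 && !fault) {
 *                vm_copyin(iov, buf, sizeof(buf));
 *                vm_copy_teardown(iov, nitems(iov));
 *        }
 */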

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
        /*
         * Intentionally empty. This is used by the instruction
         * emulation code shared with the kernel. The in-kernel
         * version of this is non-empty.
         */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
        const char *src;
        char *dst;
        size_t n;

        dst = vp;
        while (len) {
                assert(iov->iov_len);
                n = min(len, iov->iov_len);
                src = iov->iov_base;
                bcopy(src, dst, n);

                iov++;
                dst += n;
                len -= n;
        }
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
        const char *src;
        char *dst;
        size_t n;

        src = vp;
        while (len) {
                assert(iov->iov_len);
                n = min(len, iov->iov_len);
                dst = iov->iov_base;
                bcopy(src, dst, n);

                iov++;
                src += n;
                len -= n;
        }
}

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
        struct vm_cpuset vm_cpuset;
        int error;

        bzero(&vm_cpuset, sizeof(struct vm_cpuset));
        vm_cpuset.which = which;
        vm_cpuset.cpusetsize = sizeof(cpuset_t);
        vm_cpuset.cpus = cpus;

        error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
        return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

        return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

        return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

        return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
        struct vm_activate_cpu ac;
        int error;

        bzero(&ac, sizeof(struct vm_activate_cpu));
        error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
        return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
        struct vm_activate_cpu ac;
        int error;

        bzero(&ac, sizeof(struct vm_activate_cpu));
        ac.vcpuid = -1;
        error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
        return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
        struct vm_activate_cpu ac;
        int error;

        bzero(&ac, sizeof(struct vm_activate_cpu));
        error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
        return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
        struct vm_activate_cpu ac;
        int error;

        bzero(&ac, sizeof(struct vm_activate_cpu));
        error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
        return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
        struct vm_activate_cpu ac;
        int error;

        bzero(&ac, sizeof(struct vm_activate_cpu));
        ac.vcpuid = -1;
        error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
        return (error);
}

int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
        struct vm_intinfo vmii;
        int error;

        bzero(&vmii, sizeof(struct vm_intinfo));
        error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
        if (error == 0) {
                *info1 = vmii.info1;
                *info2 = vmii.info2;
        }
        return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
        struct vm_intinfo vmii;
        int error;

        bzero(&vmii, sizeof(struct vm_intinfo));
        vmii.info1 = info1;
        error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
        return (error);
}

int
vm_restart_instruction(struct vcpu *vcpu)
{
        int arg;

        return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}
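
/*
 * Illustrative sketch (not part of the original source): the cpuset_t filled
 * in by the vcpu set queries above can be inspected with the standard
 * CPU_*() macros.
 *
 *        cpuset_t active;
 *        int i;
 *
 *        if (vm_active_cpus(ctx, &active) == 0) {
 *                for (i = 0; i < CPU_SETSIZE; i++)
 *                        if (CPU_ISSET(i, &active))
 *                                printf("vcpu %d is active\n", i);
 *        }
 */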

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

        if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
                fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
                    __func__, meta->dev_name, errno);
#endif
                return (-1);
        }
        return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
        int dummy;

        dummy = 0;
        return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
        struct vm_cpu_topology topology;

        bzero(&topology, sizeof (struct vm_cpu_topology));
        topology.sockets = sockets;
        topology.cores = cores;
        topology.threads = threads;
        topology.maxcpus = maxcpus;
        return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
        struct vm_cpu_topology topology;
        int error;

        bzero(&topology, sizeof (struct vm_cpu_topology));
        error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
        if (error == 0) {
                *sockets = topology.sockets;
                *cores = topology.cores;
                *threads = topology.threads;
                *maxcpus = topology.maxcpus;
        }
        return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
        cap_rights_t rights;

        cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
        if (caph_rights_limit(ctx->fd, &rights) != 0)
                return (-1);
        if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
                return (-1);
        return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

        return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
        cap_ioctl_t *cmds;
        size_t sz;

        if (len == NULL) {
                sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
                cmds = malloc(sz);
                if (cmds == NULL)
                        return (NULL);
                bcopy(vm_ioctl_cmds, cmds, sz);
                return (cmds);
        }

        *len = vm_ioctl_ncmds;
        return (NULL);
}
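
/*
 * Illustrative sketch (not part of the original source): a sandboxed consumer
 * would typically limit the rights on the VM descriptor and then enter
 * capability mode.
 *
 *        if (vm_limit_rights(ctx) != 0)
 *                err(1, "vm_limit_rights");
 *        if (caph_enter() != 0)
 *                err(1, "caph_enter");
 */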