/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0)
		kldload("vmm");
	return (CREATE(name));
}

struct vmctx *
vm_open(const char *name)
{
	struct vmctx *vm;
	int saved_errno;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->fd = vm_device_open(vm->name)) < 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	free(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	close(vm->fd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}
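
/*
 * Illustrative sketch, not part of the library: the typical create/open/
 * teardown sequence a caller such as bhyve(8) might perform.  The VM name
 * "example" is a placeholder, err(3) stands in for real error handling, and
 * vm_destroy() tears down the in-kernel VM instance as well, whereas
 * vm_close() only releases the descriptor and the context.
 *
 *	struct vmctx *ctx;
 *	struct vcpu *vcpu;
 *
 *	if (vm_create("example") != 0)
 *		err(1, "vm_create");
 *	if ((ctx = vm_open("example")) == NULL)
 *		err(1, "vm_open");
 *	vcpu = vm_vcpu_open(ctx, 0);
 *	...
 *	vm_vcpu_close(vcpu);
 *	vm_destroy(ctx);
 */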

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}
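
/*
 * Illustrative sketch, not part of the library: walking the guest memory
 * map with vm_mmap_getnext(), which returns the mapping at or above the
 * requested address.  Advancing by the returned length visits every
 * existing mapping in turn.
 *
 *	vm_paddr_t gpa;
 *	vm_ooffset_t segoff;
 *	size_t maplen;
 *	int error, flags, prot, segid;
 *
 *	for (gpa = 0;; gpa += maplen) {
 *		error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen,
 *		    &prot, &flags);
 *		if (error != 0)
 *			break;
 *		printf("gpa %#jx len %#zx segid %d\n", (uintmax_t)gpa,
 *		    maplen, segid);
 *	}
 */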

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}
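
/*
 * Illustrative sketch, not part of the library: checking whether the system
 * memory segment already exists (as it does when a loader such as
 * bhyveload(8) ran first).  A reply with a zero length means the segment
 * has not been created yet; the 64-byte buffer is an arbitrary size chosen
 * for this sketch.
 *
 *	size_t seglen;
 *	char segname[64];
 *
 *	if (vm_get_memseg(ctx, VM_SYSMEM, &seglen, segname,
 *	    sizeof(segname)) == 0 && seglen != 0)
 *		printf("sysmem segment: %zu bytes\n", seglen);
 */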

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}
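
/*
 * Illustrative sketch, not part of the library: backing a guest with 1 GB
 * of memory and translating a guest physical address into a host pointer.
 * The size and the address 0x1000 are placeholders.
 *
 *	void *p;
 *
 *	if (vm_setup_memory(ctx, 1024 * 1024 * 1024UL, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 *	p = vm_map_gpa(ctx, 0x1000, PAGE_SIZE);
 *	if (p == NULL)
 *		errx(1, "0x1000 is not in the lowmem or highmem region");
 */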

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}
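
/*
 * Illustrative sketch, not part of the library: enabling a capability by
 * its string name.  "hlt_exit" is only an example; the set of valid names
 * comes from the machine-dependent vm_capstrmap[] table.
 *
 *	int cap;
 *
 *	cap = vm_capability_name2type("hlt_exit");
 *	if (cap < 0 || vm_set_capability(vcpu, cap, 1) != 0)
 *		warnx("capability not supported");
 */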

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
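
/*
 * Illustrative sketch, not part of the library: printing every counter for
 * a vCPU together with its description, roughly what bhyvectl(8) does when
 * asked for statistics.
 *
 *	struct timeval tv;
 *	uint64_t *stats;
 *	int i, nstats;
 *
 *	stats = vm_get_stats(vcpu, &tv, &nstats);
 *	if (stats != NULL) {
 *		for (i = 0; i < nstats; i++)
 *			printf("%-40s %ju\n", vm_get_stat_desc(ctx, i),
 *			    (uintmax_t)stats[i]);
 *	}
 */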

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}
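
/*
 * Illustrative sketch, not part of the library: copying an object out of
 * guest memory given its guest linear address.  'gla' and 'paging' are
 * assumed to have been set up by the caller, with 'paging' describing the
 * vCPU's current paging mode; two iovec entries suffice here because a
 * 128-byte object can cross at most one page boundary.
 *
 *	struct iovec iov[2];
 *	char buf[128];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, sizeof(buf), PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, buf, sizeof(buf));
 */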

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}

int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}
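
/*
 * Illustrative sketch, not part of the library: a caller that sandboxes
 * itself with Capsicum would typically narrow the rights on the VM
 * descriptor before entering capability mode.
 *
 *	if (vm_limit_rights(ctx) != 0)
 *		err(1, "vm_limit_rights");
 *	if (caph_enter() != 0)
 *		err(1, "cap_enter");
 */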