/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

#define	CREATE(x)	sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x)	sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0)
		kldload("vmm");
	return (CREATE(name));
}
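
/*
 * Usage sketch (illustrative only; everything other than the vmmapi calls,
 * such as the VM name and the error handling, is hypothetical): a consumer
 * typically creates the kernel VM object by name, opens a handle to it, and
 * destroys it when finished.
 *
 *	struct vmctx *ctx;
 *
 *	if (vm_create("testvm") != 0)
 *		err(1, "vm_create");
 *	if ((ctx = vm_open("testvm")) == NULL)
 *		err(1, "vm_open");
 *	...
 *	vm_destroy(ctx);
 */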

struct vmctx *
vm_open(const char *name)
{
	struct vmctx *vm;
	int saved_errno;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->fd = vm_device_open(vm->name)) < 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	free(vm);
	errno = saved_errno;
	return (NULL);
}

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	close(vm->fd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}
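
/*
 * Illustrative examples of the parsing rule above (assuming the usual
 * MB/GB definitions from the top of this file):
 *
 *	"512"       -> 512 MB  (bare values below 1 MB are scaled by MB)
 *	"536870912" -> 512 MB  (values of 1 MB or more are taken as bytes)
 *	"2G"        -> 2 GB    (suffixed values are handled by expand_number(3))
 */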

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}
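
/*
 * Illustrative sketch: VM_MMAP_GETNEXT returns the mapping at or above the
 * requested guest physical address (as relied upon by vm_mmap_memseg()
 * above), so the guest memory map can be walked by advancing 'gpa' past
 * each mapping that is returned.
 *
 *	vm_paddr_t gpa = 0;
 *	vm_ooffset_t segoff;
 *	size_t maplen;
 *	int segid, prot, flags;
 *
 *	while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen,
 *	    &prot, &flags) == 0)
 *		gpa += maplen;
 */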

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}
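
/*
 * Illustrative sketch: querying the system memory segment by segment id.
 * The buffer size is arbitrary; per the check in vm_alloc_memseg() above, a
 * returned length of zero means the segment has not been created yet.
 *
 *	size_t seglen;
 *	char segname[64];
 *
 *	if (vm_get_memseg(ctx, VM_SYSMEM, &seglen, segname,
 *	    sizeof(segname)) == 0 && seglen != 0)
 *		...
 */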

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
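
/*
 * Worked example of the split above on amd64, where VM_LOWMEM_LIMIT is 3 GB
 * and VM_HIGHMEM_BASE is 4 GB: a 6 GB guest gets a 3 GB 'lowmem' segment at
 * [0, 3 GB) and a 3 GB 'highmem' segment at [4 GB, 7 GB), leaving the
 * [3 GB, 4 GB) hole unbacked for MMIO.  The backing object is sized to
 * VM_HIGHMEM_BASE + highmem = 7 GB because setup_memory_segment() uses the
 * guest physical address directly as the offset into the segment.
 */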

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
 * The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}
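
/*
 * Illustrative sketch of the vm_map_gpa() contract documented above, using
 * the amd64 layout and assuming a guest configured with more than 3 GB of
 * memory: a range wholly inside a RAM segment yields a host pointer, while
 * one touching the MMIO hole yields NULL.
 *
 *	void *p;
 *
 *	p = vm_map_gpa(ctx, 0x1000, PAGE_SIZE);	(valid host pointer)
 *	p = vm_map_gpa(ctx, 3 * GB - 16, 32);	(NULL: crosses into the hole)
 */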

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: fragile, handle with care
	 * Assumes that the first field of the ioctl data
	 * is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}
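
/*
 * Illustrative sketch: fetching several registers with one ioctl via the
 * batch interface instead of repeated vm_get_register() calls.  The register
 * names are amd64 examples.
 *
 *	const int regnums[] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RIP };
 *	uint64_t regvals[nitems(regnums)];
 *
 *	if (vm_get_register_set(vcpu, nitems(regnums), regnums, regvals) != 0)
 *		...
 */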

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}
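
/*
 * Illustrative sketch (amd64): reading a guest virtual address range that
 * may cross a page boundary by translating it into host iovecs first and
 * then copying out of them.  'paging' and 'gla' are assumed to come from the
 * caller, e.g. from a vcpu's current exit state.
 *
 *	struct iovec iov[2];
 *	char buf[256];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, &paging, gla, sizeof(buf), PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && fault == 0)
 *		vm_copyin(iov, buf, sizeof(buf));
 */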

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}

/*
 * Avoid using in new code. Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}
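
/*
 * Illustrative sketch: a sandboxed consumer typically calls vm_limit_rights()
 * after VM setup is complete and before entering capability mode.
 * caph_enter() is from capsicum_helpers(3); the ordering shown here is an
 * example, not a requirement imposed by this library.
 *
 *	if (vm_limit_rights(ctx) != 0)
 *		err(1, "vm_limit_rights");
 *	if (caph_enter() != 0)
 *		err(1, "caph_enter");
 */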