1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/types.h> 30 #ifndef WITHOUT_CAPSICUM 31 #include <sys/capsicum.h> 32 #endif 33 #include <sys/mman.h> 34 #ifdef BHYVE_SNAPSHOT 35 #include <sys/socket.h> 36 #include <sys/stat.h> 37 #endif 38 #include <sys/time.h> 39 #ifdef BHYVE_SNAPSHOT 40 #include <sys/un.h> 41 #endif 42 43 #include <machine/atomic.h> 44 45 #ifndef WITHOUT_CAPSICUM 46 #include <capsicum_helpers.h> 47 #endif 48 #include <stdio.h> 49 #include <stdlib.h> 50 #include <string.h> 51 #include <err.h> 52 #include <errno.h> 53 #ifdef BHYVE_SNAPSHOT 54 #include <fcntl.h> 55 #endif 56 #include <libgen.h> 57 #include <unistd.h> 58 #include <assert.h> 59 #include <pthread.h> 60 #include <pthread_np.h> 61 #include <sysexits.h> 62 #include <stdbool.h> 63 #include <stdint.h> 64 #ifdef BHYVE_SNAPSHOT 65 #include <ucl.h> 66 #include <unistd.h> 67 68 #include <libxo/xo.h> 69 #endif 70 71 #include <vmmapi.h> 72 73 #include "acpi.h" 74 #include "bhyverun.h" 75 #include "bootrom.h" 76 #include "config.h" 77 #include "debug.h" 78 #ifdef BHYVE_GDB 79 #include "gdb.h" 80 #endif 81 #include "mem.h" 82 #include "mevent.h" 83 #include "pci_emul.h" 84 #ifdef __amd64__ 85 #include "amd64/pci_lpc.h" 86 #endif 87 #include "qemu_fwcfg.h" 88 #ifdef BHYVE_SNAPSHOT 89 #include "snapshot.h" 90 #endif 91 #include "tpm_device.h" 92 #include "vmgenc.h" 93 #include "vmexit.h" 94 95 #define MB (1024UL * 1024) 96 #define GB (1024UL * MB) 97 98 int guest_ncpus; 99 uint16_t cpu_cores, cpu_sockets, cpu_threads; 100 101 int raw_stdio = 0; 102 103 static char *progname; 104 static const int BSP = 0; 105 106 static cpuset_t cpumask; 107 108 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); 109 110 static struct vcpu_info { 111 struct vmctx *ctx; 112 struct vcpu *vcpu; 113 int vcpuid; 114 } *vcpu_info; 115 116 static cpuset_t **vcpumap; 117 118 static void 119 usage(int code) 120 { 121 122 fprintf(stderr, 123 "Usage: %s [-AaCDeHhPSuWwxY]\n" 124 " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" 125 " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" 126 " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" 127 " -A: create ACPI tables\n" 128 " -a: local apic is in xAPIC mode (deprecated)\n" 129 " -C: include guest memory in core file\n" 130 " -c: number of CPUs and/or topology specification\n" 131 " -D: destroy on power-off\n" 132 " -e: exit on unhandled I/O access\n" 133 " -G: start a debug server\n" 134 " -H: vmexit from the guest on HLT\n" 135 " -h: help\n" 136 " -k: key=value flat config file\n" 137 " -K: PS2 keyboard layout\n" 138 " -l: LPC device configuration\n" 139 " -m: memory size\n" 140 " -o: set config 'var' to 'value'\n" 141 " -P: vmexit from the guest on pause\n" 142 " -p: pin 'vcpu' to 'hostcpu'\n" 143 #ifdef BHYVE_SNAPSHOT 144 " -r: path to checkpoint file\n" 145 #endif 146 " -S: guest memory cannot be swapped\n" 147 " -s: <slot,driver,configinfo> PCI slot config\n" 148 " -U: UUID\n" 149 " -u: RTC keeps UTC time\n" 150 " -W: force virtio to use single-vector MSI\n" 151 " -w: ignore unimplemented MSRs\n" 152 " -x: local APIC is in x2APIC mode\n" 153 " -Y: disable MPtable generation\n", 154 progname, (int)strlen(progname), "", (int)strlen(progname), "", 155 (int)strlen(progname), ""); 156 157 exit(code); 158 } 159 160 /* 161 * XXX This parser is known to have the following issues: 162 * 1. It accepts null key=value tokens ",," as setting "cpus" to an 163 * empty string. 164 * 165 * The acceptance of a null specification ('-c ""') is by design to match the 166 * manual page syntax specification, this results in a topology of 1 vCPU. 167 */ 168 static int 169 topology_parse(const char *opt) 170 { 171 char *cp, *str, *tofree; 172 173 if (*opt == '\0') { 174 set_config_value("sockets", "1"); 175 set_config_value("cores", "1"); 176 set_config_value("threads", "1"); 177 set_config_value("cpus", "1"); 178 return (0); 179 } 180 181 tofree = str = strdup(opt); 182 if (str == NULL) 183 errx(4, "Failed to allocate memory"); 184 185 while ((cp = strsep(&str, ",")) != NULL) { 186 if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) 187 set_config_value("cpus", cp + strlen("cpus=")); 188 else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) 189 set_config_value("sockets", cp + strlen("sockets=")); 190 else if (strncmp(cp, "cores=", strlen("cores=")) == 0) 191 set_config_value("cores", cp + strlen("cores=")); 192 else if (strncmp(cp, "threads=", strlen("threads=")) == 0) 193 set_config_value("threads", cp + strlen("threads=")); 194 else if (strchr(cp, '=') != NULL) 195 goto out; 196 else 197 set_config_value("cpus", cp); 198 } 199 free(tofree); 200 return (0); 201 202 out: 203 free(tofree); 204 return (-1); 205 } 206 207 static int 208 parse_int_value(const char *key, const char *value, int minval, int maxval) 209 { 210 char *cp; 211 long lval; 212 213 errno = 0; 214 lval = strtol(value, &cp, 0); 215 if (errno != 0 || *cp != '\0' || cp == value || lval < minval || 216 lval > maxval) 217 errx(4, "Invalid value for %s: '%s'", key, value); 218 return (lval); 219 } 220 221 /* 222 * Set the sockets, cores, threads, and guest_cpus variables based on 223 * the configured topology. 224 * 225 * The limits of UINT16_MAX are due to the types passed to 226 * vm_set_topology(). vmm.ko may enforce tighter limits. 227 */ 228 static void 229 calc_topology(void) 230 { 231 const char *value; 232 bool explicit_cpus; 233 uint64_t ncpus; 234 235 value = get_config_value("cpus"); 236 if (value != NULL) { 237 guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); 238 explicit_cpus = true; 239 } else { 240 guest_ncpus = 1; 241 explicit_cpus = false; 242 } 243 value = get_config_value("cores"); 244 if (value != NULL) 245 cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); 246 else 247 cpu_cores = 1; 248 value = get_config_value("threads"); 249 if (value != NULL) 250 cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); 251 else 252 cpu_threads = 1; 253 value = get_config_value("sockets"); 254 if (value != NULL) 255 cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); 256 else 257 cpu_sockets = guest_ncpus; 258 259 /* 260 * Compute sockets * cores * threads avoiding overflow. The 261 * range check above insures these are 16 bit values. 262 */ 263 ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; 264 if (ncpus > UINT16_MAX) 265 errx(4, "Computed number of vCPUs too high: %ju", 266 (uintmax_t)ncpus); 267 268 if (explicit_cpus) { 269 if (guest_ncpus != (int)ncpus) 270 errx(4, "Topology (%d sockets, %d cores, %d threads) " 271 "does not match %d vCPUs", 272 cpu_sockets, cpu_cores, cpu_threads, 273 guest_ncpus); 274 } else 275 guest_ncpus = ncpus; 276 } 277 278 static int 279 pincpu_parse(const char *opt) 280 { 281 const char *value; 282 char *newval; 283 char key[16]; 284 int vcpu, pcpu; 285 286 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { 287 fprintf(stderr, "invalid format: %s\n", opt); 288 return (-1); 289 } 290 291 if (vcpu < 0) { 292 fprintf(stderr, "invalid vcpu '%d'\n", vcpu); 293 return (-1); 294 } 295 296 if (pcpu < 0 || pcpu >= CPU_SETSIZE) { 297 fprintf(stderr, "hostcpu '%d' outside valid range from " 298 "0 to %d\n", pcpu, CPU_SETSIZE - 1); 299 return (-1); 300 } 301 302 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); 303 value = get_config_value(key); 304 305 if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", 306 value != NULL ? "," : "", pcpu) == -1) { 307 perror("failed to build new cpuset string"); 308 return (-1); 309 } 310 311 set_config_value(key, newval); 312 free(newval); 313 return (0); 314 } 315 316 static void 317 parse_cpuset(int vcpu, const char *list, cpuset_t *set) 318 { 319 char *cp, *token; 320 int pcpu, start; 321 322 CPU_ZERO(set); 323 start = -1; 324 token = __DECONST(char *, list); 325 for (;;) { 326 pcpu = strtoul(token, &cp, 0); 327 if (cp == token) 328 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); 329 if (pcpu < 0 || pcpu >= CPU_SETSIZE) 330 errx(4, "hostcpu '%d' outside valid range from 0 to %d", 331 pcpu, CPU_SETSIZE - 1); 332 switch (*cp) { 333 case ',': 334 case '\0': 335 if (start >= 0) { 336 if (start > pcpu) 337 errx(4, "Invalid hostcpu range %d-%d", 338 start, pcpu); 339 while (start < pcpu) { 340 CPU_SET(start, set); 341 start++; 342 } 343 start = -1; 344 } 345 CPU_SET(pcpu, set); 346 break; 347 case '-': 348 if (start >= 0) 349 errx(4, "invalid cpuset for vcpu %d: '%s'", 350 vcpu, list); 351 start = pcpu; 352 break; 353 default: 354 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); 355 } 356 if (*cp == '\0') 357 break; 358 token = cp + 1; 359 } 360 } 361 362 static void 363 build_vcpumaps(void) 364 { 365 char key[16]; 366 const char *value; 367 int vcpu; 368 369 vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); 370 for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { 371 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); 372 value = get_config_value(key); 373 if (value == NULL) 374 continue; 375 vcpumap[vcpu] = malloc(sizeof(cpuset_t)); 376 if (vcpumap[vcpu] == NULL) 377 err(4, "Failed to allocate cpuset for vcpu %d", vcpu); 378 parse_cpuset(vcpu, value, vcpumap[vcpu]); 379 } 380 } 381 382 void * 383 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) 384 { 385 386 return (vm_map_gpa(ctx, gaddr, len)); 387 } 388 389 #ifdef BHYVE_SNAPSHOT 390 uintptr_t 391 paddr_host2guest(struct vmctx *ctx, void *addr) 392 { 393 return (vm_rev_map_gpa(ctx, addr)); 394 } 395 #endif 396 397 int 398 fbsdrun_virtio_msix(void) 399 { 400 401 return (get_config_bool_default("virtio_msix", true)); 402 } 403 404 struct vcpu * 405 fbsdrun_vcpu(int vcpuid) 406 { 407 return (vcpu_info[vcpuid].vcpu); 408 } 409 410 static void * 411 fbsdrun_start_thread(void *param) 412 { 413 char tname[MAXCOMLEN + 1]; 414 struct vcpu_info *vi = param; 415 int error; 416 417 snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); 418 pthread_set_name_np(pthread_self(), tname); 419 420 if (vcpumap[vi->vcpuid] != NULL) { 421 error = pthread_setaffinity_np(pthread_self(), 422 sizeof(cpuset_t), vcpumap[vi->vcpuid]); 423 assert(error == 0); 424 } 425 426 #ifdef BHYVE_SNAPSHOT 427 checkpoint_cpu_add(vi->vcpuid); 428 #endif 429 #ifdef BHYVE_GDB 430 gdb_cpu_add(vi->vcpu); 431 #endif 432 433 vm_loop(vi->ctx, vi->vcpu); 434 435 /* not reached */ 436 exit(1); 437 return (NULL); 438 } 439 440 void 441 fbsdrun_addcpu(int vcpuid) 442 { 443 struct vcpu_info *vi; 444 pthread_t thr; 445 int error; 446 447 vi = &vcpu_info[vcpuid]; 448 449 error = vm_activate_cpu(vi->vcpu); 450 if (error != 0) 451 err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); 452 453 CPU_SET_ATOMIC(vcpuid, &cpumask); 454 455 vm_suspend_cpu(vi->vcpu); 456 457 error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); 458 assert(error == 0); 459 } 460 461 void 462 fbsdrun_deletecpu(int vcpu) 463 { 464 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; 465 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; 466 467 pthread_mutex_lock(&resetcpu_mtx); 468 if (!CPU_ISSET(vcpu, &cpumask)) { 469 EPRINTLN("Attempting to delete unknown cpu %d", vcpu); 470 exit(4); 471 } 472 473 CPU_CLR(vcpu, &cpumask); 474 475 if (vcpu != BSP) { 476 pthread_cond_signal(&resetcpu_cond); 477 pthread_mutex_unlock(&resetcpu_mtx); 478 pthread_exit(NULL); 479 /* NOTREACHED */ 480 } 481 482 while (!CPU_EMPTY(&cpumask)) { 483 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); 484 } 485 pthread_mutex_unlock(&resetcpu_mtx); 486 } 487 488 int 489 fbsdrun_suspendcpu(int vcpuid) 490 { 491 return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu)); 492 } 493 494 static void 495 vm_loop(struct vmctx *ctx, struct vcpu *vcpu) 496 { 497 struct vm_exit vme; 498 struct vm_run vmrun; 499 int error, rc; 500 enum vm_exitcode exitcode; 501 cpuset_t active_cpus, dmask; 502 503 error = vm_active_cpus(ctx, &active_cpus); 504 assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); 505 506 vmrun.vm_exit = &vme; 507 vmrun.cpuset = &dmask; 508 vmrun.cpusetsize = sizeof(dmask); 509 510 while (1) { 511 error = vm_run(vcpu, &vmrun); 512 if (error != 0) 513 break; 514 515 exitcode = vme.exitcode; 516 if (exitcode >= VM_EXITCODE_MAX || 517 vmexit_handlers[exitcode] == NULL) { 518 warnx("vm_loop: unexpected exitcode 0x%x", exitcode); 519 exit(4); 520 } 521 522 rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun); 523 524 switch (rc) { 525 case VMEXIT_CONTINUE: 526 break; 527 case VMEXIT_ABORT: 528 abort(); 529 default: 530 exit(4); 531 } 532 } 533 EPRINTLN("vm_run error %d, errno %d", error, errno); 534 } 535 536 static int 537 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) 538 { 539 uint16_t sockets, cores, threads, maxcpus; 540 int tmp, error; 541 542 /* 543 * The guest is allowed to spinup more than one processor only if the 544 * UNRESTRICTED_GUEST capability is available. 545 */ 546 error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); 547 if (error != 0) 548 return (1); 549 550 error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); 551 if (error == 0) 552 return (maxcpus); 553 else 554 return (1); 555 } 556 557 static struct vmctx * 558 do_open(const char *vmname) 559 { 560 struct vmctx *ctx; 561 int error; 562 bool reinit, romboot; 563 564 reinit = romboot = false; 565 566 #ifdef __amd64__ 567 if (lpc_bootrom()) 568 romboot = true; 569 #endif 570 571 error = vm_create(vmname); 572 if (error) { 573 if (errno == EEXIST) { 574 if (romboot) { 575 reinit = true; 576 } else { 577 /* 578 * The virtual machine has been setup by the 579 * userspace bootloader. 580 */ 581 } 582 } else { 583 perror("vm_create"); 584 exit(4); 585 } 586 } else { 587 if (!romboot) { 588 /* 589 * If the virtual machine was just created then a 590 * bootrom must be configured to boot it. 591 */ 592 fprintf(stderr, "virtual machine cannot be booted\n"); 593 exit(4); 594 } 595 } 596 597 ctx = vm_open(vmname); 598 if (ctx == NULL) { 599 perror("vm_open"); 600 exit(4); 601 } 602 603 #ifndef WITHOUT_CAPSICUM 604 if (vm_limit_rights(ctx) != 0) 605 err(EX_OSERR, "vm_limit_rights"); 606 #endif 607 608 if (reinit) { 609 error = vm_reinit(ctx); 610 if (error) { 611 perror("vm_reinit"); 612 exit(4); 613 } 614 } 615 error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); 616 if (error) 617 errx(EX_OSERR, "vm_set_topology"); 618 return (ctx); 619 } 620 621 static bool 622 parse_config_option(const char *option) 623 { 624 const char *value; 625 char *path; 626 627 value = strchr(option, '='); 628 if (value == NULL || value[1] == '\0') 629 return (false); 630 path = strndup(option, value - option); 631 if (path == NULL) 632 err(4, "Failed to allocate memory"); 633 set_config_value(path, value + 1); 634 return (true); 635 } 636 637 static void 638 parse_simple_config_file(const char *path) 639 { 640 FILE *fp; 641 char *line, *cp; 642 size_t linecap; 643 unsigned int lineno; 644 645 fp = fopen(path, "r"); 646 if (fp == NULL) 647 err(4, "Failed to open configuration file %s", path); 648 line = NULL; 649 linecap = 0; 650 lineno = 1; 651 for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { 652 if (*line == '#' || *line == '\n') 653 continue; 654 cp = strchr(line, '\n'); 655 if (cp != NULL) 656 *cp = '\0'; 657 if (!parse_config_option(line)) 658 errx(4, "%s line %u: invalid config option '%s'", path, 659 lineno, line); 660 } 661 free(line); 662 fclose(fp); 663 } 664 665 #ifdef BHYVE_GDB 666 static void 667 parse_gdb_options(const char *opt) 668 { 669 const char *sport; 670 char *colon; 671 672 if (opt[0] == 'w') { 673 set_config_bool("gdb.wait", true); 674 opt++; 675 } 676 677 colon = strrchr(opt, ':'); 678 if (colon == NULL) { 679 sport = opt; 680 } else { 681 *colon = '\0'; 682 colon++; 683 sport = colon; 684 set_config_value("gdb.address", opt); 685 } 686 687 set_config_value("gdb.port", sport); 688 } 689 #endif 690 691 int 692 main(int argc, char *argv[]) 693 { 694 int c, error; 695 int max_vcpus, memflags; 696 struct vcpu *bsp; 697 struct vmctx *ctx; 698 size_t memsize; 699 const char *optstr, *value, *vmname; 700 #ifdef BHYVE_SNAPSHOT 701 char *restore_file; 702 struct restore_state rstate; 703 704 restore_file = NULL; 705 #endif 706 707 bhyve_init_config(); 708 709 progname = basename(argv[0]); 710 711 #ifdef BHYVE_SNAPSHOT 712 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; 713 #else 714 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; 715 #endif 716 while ((c = getopt(argc, argv, optstr)) != -1) { 717 switch (c) { 718 #ifdef __amd64__ 719 case 'a': 720 set_config_bool("x86.x2apic", false); 721 break; 722 #endif 723 case 'A': 724 /* 725 * NOP. For backward compatibility. Most systems don't 726 * work properly without sane ACPI tables. Therefore, 727 * we're always generating them. 728 */ 729 break; 730 case 'D': 731 set_config_bool("destroy_on_poweroff", true); 732 break; 733 case 'p': 734 if (pincpu_parse(optarg) != 0) { 735 errx(EX_USAGE, "invalid vcpu pinning " 736 "configuration '%s'", optarg); 737 } 738 break; 739 case 'c': 740 if (topology_parse(optarg) != 0) { 741 errx(EX_USAGE, "invalid cpu topology " 742 "'%s'", optarg); 743 } 744 break; 745 case 'C': 746 set_config_bool("memory.guest_in_core", true); 747 break; 748 case 'f': 749 if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { 750 errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); 751 } 752 break; 753 #ifdef BHYVE_GDB 754 case 'G': 755 parse_gdb_options(optarg); 756 break; 757 #endif 758 case 'k': 759 parse_simple_config_file(optarg); 760 break; 761 case 'K': 762 set_config_value("keyboard.layout", optarg); 763 break; 764 #ifdef __amd64__ 765 case 'l': 766 if (strncmp(optarg, "help", strlen(optarg)) == 0) { 767 lpc_print_supported_devices(); 768 exit(0); 769 } else if (lpc_device_parse(optarg) != 0) { 770 errx(EX_USAGE, "invalid lpc device " 771 "configuration '%s'", optarg); 772 } 773 break; 774 #endif 775 #ifdef BHYVE_SNAPSHOT 776 case 'r': 777 restore_file = optarg; 778 break; 779 #endif 780 case 's': 781 if (strncmp(optarg, "help", strlen(optarg)) == 0) { 782 pci_print_supported_devices(); 783 exit(0); 784 } else if (pci_parse_slot(optarg) != 0) 785 exit(4); 786 else 787 break; 788 case 'S': 789 set_config_bool("memory.wired", true); 790 break; 791 case 'm': 792 set_config_value("memory.size", optarg); 793 break; 794 case 'o': 795 if (!parse_config_option(optarg)) 796 errx(EX_USAGE, "invalid configuration option '%s'", optarg); 797 break; 798 #ifdef __amd64__ 799 case 'H': 800 set_config_bool("x86.vmexit_on_hlt", true); 801 break; 802 case 'I': 803 /* 804 * The "-I" option was used to add an ioapic to the 805 * virtual machine. 806 * 807 * An ioapic is now provided unconditionally for each 808 * virtual machine and this option is now deprecated. 809 */ 810 break; 811 case 'P': 812 set_config_bool("x86.vmexit_on_pause", true); 813 break; 814 case 'e': 815 set_config_bool("x86.strictio", true); 816 break; 817 case 'u': 818 set_config_bool("rtc.use_localtime", false); 819 break; 820 #endif 821 case 'U': 822 set_config_value("uuid", optarg); 823 break; 824 #ifdef __amd64__ 825 case 'w': 826 set_config_bool("x86.strictmsr", false); 827 break; 828 #endif 829 case 'W': 830 set_config_bool("virtio_msix", false); 831 break; 832 #ifdef __amd64__ 833 case 'x': 834 set_config_bool("x86.x2apic", true); 835 break; 836 case 'Y': 837 set_config_bool("x86.mptable", false); 838 break; 839 #endif 840 case 'h': 841 usage(0); 842 default: 843 usage(1); 844 } 845 } 846 argc -= optind; 847 argv += optind; 848 849 if (argc > 1) 850 usage(1); 851 852 #ifdef BHYVE_SNAPSHOT 853 if (restore_file != NULL) { 854 error = load_restore_file(restore_file, &rstate); 855 if (error) { 856 fprintf(stderr, "Failed to read checkpoint info from " 857 "file: '%s'.\n", restore_file); 858 exit(1); 859 } 860 vmname = lookup_vmname(&rstate); 861 if (vmname != NULL) 862 set_config_value("name", vmname); 863 } 864 #endif 865 866 if (argc == 1) 867 set_config_value("name", argv[0]); 868 869 vmname = get_config_value("name"); 870 if (vmname == NULL) 871 usage(1); 872 873 if (get_config_bool_default("config.dump", false)) { 874 dump_config(); 875 exit(1); 876 } 877 878 calc_topology(); 879 build_vcpumaps(); 880 881 value = get_config_value("memory.size"); 882 error = vm_parse_memsize(value, &memsize); 883 if (error) 884 errx(EX_USAGE, "invalid memsize '%s'", value); 885 886 ctx = do_open(vmname); 887 888 #ifdef BHYVE_SNAPSHOT 889 if (restore_file != NULL) { 890 guest_ncpus = lookup_guest_ncpus(&rstate); 891 memflags = lookup_memflags(&rstate); 892 memsize = lookup_memsize(&rstate); 893 } 894 895 if (guest_ncpus < 1) { 896 fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); 897 exit(1); 898 } 899 #endif 900 901 bsp = vm_vcpu_open(ctx, BSP); 902 max_vcpus = num_vcpus_allowed(ctx, bsp); 903 if (guest_ncpus > max_vcpus) { 904 fprintf(stderr, "%d vCPUs requested but only %d available\n", 905 guest_ncpus, max_vcpus); 906 exit(4); 907 } 908 909 bhyve_init_vcpu(bsp); 910 911 /* Allocate per-VCPU resources. */ 912 vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); 913 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { 914 vcpu_info[vcpuid].ctx = ctx; 915 vcpu_info[vcpuid].vcpuid = vcpuid; 916 if (vcpuid == BSP) 917 vcpu_info[vcpuid].vcpu = bsp; 918 else 919 vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); 920 } 921 922 memflags = 0; 923 if (get_config_bool_default("memory.wired", false)) 924 memflags |= VM_MEM_F_WIRED; 925 if (get_config_bool_default("memory.guest_in_core", false)) 926 memflags |= VM_MEM_F_INCORE; 927 vm_set_memflags(ctx, memflags); 928 error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); 929 if (error) { 930 fprintf(stderr, "Unable to setup memory (%d)\n", errno); 931 exit(4); 932 } 933 934 init_mem(guest_ncpus); 935 init_bootrom(ctx); 936 if (bhyve_init_platform(ctx, bsp) != 0) 937 exit(4); 938 939 if (qemu_fwcfg_init(ctx) != 0) { 940 fprintf(stderr, "qemu fwcfg initialization error\n"); 941 exit(4); 942 } 943 944 if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), 945 &guest_ncpus) != 0) { 946 fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n"); 947 exit(4); 948 } 949 950 /* 951 * Exit if a device emulation finds an error in its initialization 952 */ 953 if (init_pci(ctx) != 0) { 954 EPRINTLN("Device emulation initialization error: %s", 955 strerror(errno)); 956 exit(4); 957 } 958 if (init_tpm(ctx) != 0) { 959 EPRINTLN("Failed to init TPM device"); 960 exit(4); 961 } 962 963 /* 964 * Initialize after PCI, to allow a bootrom file to reserve the high 965 * region. 966 */ 967 if (get_config_bool("acpi_tables")) 968 vmgenc_init(ctx); 969 970 #ifdef BHYVE_GDB 971 init_gdb(ctx); 972 #endif 973 974 /* 975 * Add all vCPUs. 976 */ 977 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) 978 bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP); 979 980 #ifdef BHYVE_SNAPSHOT 981 if (restore_file != NULL) { 982 FPRINTLN(stdout, "Pausing pci devs..."); 983 if (vm_pause_devices() != 0) { 984 EPRINTLN("Failed to pause PCI device state."); 985 exit(1); 986 } 987 988 FPRINTLN(stdout, "Restoring vm mem..."); 989 if (restore_vm_mem(ctx, &rstate) != 0) { 990 EPRINTLN("Failed to restore VM memory."); 991 exit(1); 992 } 993 994 FPRINTLN(stdout, "Restoring pci devs..."); 995 if (vm_restore_devices(&rstate) != 0) { 996 EPRINTLN("Failed to restore PCI device state."); 997 exit(1); 998 } 999 1000 FPRINTLN(stdout, "Restoring kernel structs..."); 1001 if (vm_restore_kern_structs(ctx, &rstate) != 0) { 1002 EPRINTLN("Failed to restore kernel structs."); 1003 exit(1); 1004 } 1005 1006 FPRINTLN(stdout, "Resuming pci devs..."); 1007 if (vm_resume_devices() != 0) { 1008 EPRINTLN("Failed to resume PCI device state."); 1009 exit(1); 1010 } 1011 } 1012 #endif 1013 1014 if (bhyve_init_platform_late(ctx, bsp) != 0) 1015 exit(4); 1016 1017 /* 1018 * Change the proc title to include the VM name. 1019 */ 1020 setproctitle("%s", vmname); 1021 1022 #ifdef BHYVE_SNAPSHOT 1023 /* 1024 * checkpointing thread for communication with bhyvectl 1025 */ 1026 if (init_checkpoint_thread(ctx) != 0) 1027 errx(EX_OSERR, "Failed to start checkpoint thread"); 1028 #endif 1029 1030 #ifndef WITHOUT_CAPSICUM 1031 caph_cache_catpages(); 1032 1033 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) 1034 errx(EX_OSERR, "Unable to apply rights for sandbox"); 1035 1036 if (caph_enter() == -1) 1037 errx(EX_OSERR, "cap_enter() failed"); 1038 #endif 1039 1040 #ifdef BHYVE_SNAPSHOT 1041 if (restore_file != NULL) { 1042 destroy_restore_state(&rstate); 1043 if (vm_restore_time(ctx) < 0) 1044 err(EX_OSERR, "Unable to restore time"); 1045 1046 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) 1047 vm_resume_cpu(vcpu_info[vcpuid].vcpu); 1048 } else 1049 #endif 1050 vm_resume_cpu(bsp); 1051 1052 /* 1053 * Head off to the main event dispatch loop 1054 */ 1055 mevent_dispatch(); 1056 1057 exit(4); 1058 } 1059