/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
27 */ 28 29 #include <sys/cdefs.h> 30 #include <sys/types.h> 31 #ifndef WITHOUT_CAPSICUM 32 #include <sys/capsicum.h> 33 #endif 34 #include <sys/mman.h> 35 #ifdef BHYVE_SNAPSHOT 36 #include <sys/socket.h> 37 #include <sys/stat.h> 38 #endif 39 #include <sys/time.h> 40 #ifdef BHYVE_SNAPSHOT 41 #include <sys/un.h> 42 #endif 43 44 #include <machine/atomic.h> 45 46 #ifndef WITHOUT_CAPSICUM 47 #include <capsicum_helpers.h> 48 #endif 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <err.h> 53 #include <errno.h> 54 #ifdef BHYVE_SNAPSHOT 55 #include <fcntl.h> 56 #endif 57 #include <libgen.h> 58 #include <unistd.h> 59 #include <assert.h> 60 #include <pthread.h> 61 #include <pthread_np.h> 62 #include <sysexits.h> 63 #include <stdbool.h> 64 #include <stdint.h> 65 #ifdef BHYVE_SNAPSHOT 66 #include <ucl.h> 67 #include <unistd.h> 68 69 #include <libxo/xo.h> 70 #endif 71 72 #include <vmmapi.h> 73 74 #include "acpi.h" 75 #include "bhyverun.h" 76 #include "bootrom.h" 77 #include "config.h" 78 #include "debug.h" 79 #ifdef BHYVE_GDB 80 #include "gdb.h" 81 #endif 82 #include "mem.h" 83 #include "mevent.h" 84 #include "pci_emul.h" 85 #ifdef __amd64__ 86 #include "amd64/pci_lpc.h" 87 #endif 88 #include "qemu_fwcfg.h" 89 #ifdef BHYVE_SNAPSHOT 90 #include "snapshot.h" 91 #endif 92 #include "tpm_device.h" 93 #include "vmgenc.h" 94 #include "vmexit.h" 95 96 #define MB (1024UL * 1024) 97 #define GB (1024UL * MB) 98 99 int guest_ncpus; 100 uint16_t cpu_cores, cpu_sockets, cpu_threads; 101 102 int raw_stdio = 0; 103 104 static char *progname; 105 static const int BSP = 0; 106 107 static cpuset_t cpumask; 108 109 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); 110 111 static struct vcpu_info { 112 struct vmctx *ctx; 113 struct vcpu *vcpu; 114 int vcpuid; 115 } *vcpu_info; 116 117 static cpuset_t **vcpumap; 118 119 static void 120 usage(int code) 121 { 122 123 fprintf(stderr, 124 "Usage: %s [-AaCDeHhPSuWwxY]\n" 125 " %*s [-c 
[[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" 126 " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" 127 " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" 128 " -A: create ACPI tables\n" 129 " -a: local apic is in xAPIC mode (deprecated)\n" 130 " -C: include guest memory in core file\n" 131 " -c: number of CPUs and/or topology specification\n" 132 " -D: destroy on power-off\n" 133 " -e: exit on unhandled I/O access\n" 134 " -G: start a debug server\n" 135 " -H: vmexit from the guest on HLT\n" 136 " -h: help\n" 137 " -k: key=value flat config file\n" 138 " -K: PS2 keyboard layout\n" 139 " -l: LPC device configuration\n" 140 " -m: memory size\n" 141 " -o: set config 'var' to 'value'\n" 142 " -P: vmexit from the guest on pause\n" 143 " -p: pin 'vcpu' to 'hostcpu'\n" 144 #ifdef BHYVE_SNAPSHOT 145 " -r: path to checkpoint file\n" 146 #endif 147 " -S: guest memory cannot be swapped\n" 148 " -s: <slot,driver,configinfo> PCI slot config\n" 149 " -U: UUID\n" 150 " -u: RTC keeps UTC time\n" 151 " -W: force virtio to use single-vector MSI\n" 152 " -w: ignore unimplemented MSRs\n" 153 " -x: local APIC is in x2APIC mode\n" 154 " -Y: disable MPtable generation\n", 155 progname, (int)strlen(progname), "", (int)strlen(progname), "", 156 (int)strlen(progname), ""); 157 158 exit(code); 159 } 160 161 /* 162 * XXX This parser is known to have the following issues: 163 * 1. It accepts null key=value tokens ",," as setting "cpus" to an 164 * empty string. 165 * 166 * The acceptance of a null specification ('-c ""') is by design to match the 167 * manual page syntax specification, this results in a topology of 1 vCPU. 
168 */ 169 static int 170 topology_parse(const char *opt) 171 { 172 char *cp, *str, *tofree; 173 174 if (*opt == '\0') { 175 set_config_value("sockets", "1"); 176 set_config_value("cores", "1"); 177 set_config_value("threads", "1"); 178 set_config_value("cpus", "1"); 179 return (0); 180 } 181 182 tofree = str = strdup(opt); 183 if (str == NULL) 184 errx(4, "Failed to allocate memory"); 185 186 while ((cp = strsep(&str, ",")) != NULL) { 187 if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) 188 set_config_value("cpus", cp + strlen("cpus=")); 189 else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) 190 set_config_value("sockets", cp + strlen("sockets=")); 191 else if (strncmp(cp, "cores=", strlen("cores=")) == 0) 192 set_config_value("cores", cp + strlen("cores=")); 193 else if (strncmp(cp, "threads=", strlen("threads=")) == 0) 194 set_config_value("threads", cp + strlen("threads=")); 195 else if (strchr(cp, '=') != NULL) 196 goto out; 197 else 198 set_config_value("cpus", cp); 199 } 200 free(tofree); 201 return (0); 202 203 out: 204 free(tofree); 205 return (-1); 206 } 207 208 static int 209 parse_int_value(const char *key, const char *value, int minval, int maxval) 210 { 211 char *cp; 212 long lval; 213 214 errno = 0; 215 lval = strtol(value, &cp, 0); 216 if (errno != 0 || *cp != '\0' || cp == value || lval < minval || 217 lval > maxval) 218 errx(4, "Invalid value for %s: '%s'", key, value); 219 return (lval); 220 } 221 222 /* 223 * Set the sockets, cores, threads, and guest_cpus variables based on 224 * the configured topology. 225 * 226 * The limits of UINT16_MAX are due to the types passed to 227 * vm_set_topology(). vmm.ko may enforce tighter limits. 
228 */ 229 static void 230 calc_topology(void) 231 { 232 const char *value; 233 bool explicit_cpus; 234 uint64_t ncpus; 235 236 value = get_config_value("cpus"); 237 if (value != NULL) { 238 guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); 239 explicit_cpus = true; 240 } else { 241 guest_ncpus = 1; 242 explicit_cpus = false; 243 } 244 value = get_config_value("cores"); 245 if (value != NULL) 246 cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); 247 else 248 cpu_cores = 1; 249 value = get_config_value("threads"); 250 if (value != NULL) 251 cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); 252 else 253 cpu_threads = 1; 254 value = get_config_value("sockets"); 255 if (value != NULL) 256 cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); 257 else 258 cpu_sockets = guest_ncpus; 259 260 /* 261 * Compute sockets * cores * threads avoiding overflow. The 262 * range check above insures these are 16 bit values. 263 */ 264 ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; 265 if (ncpus > UINT16_MAX) 266 errx(4, "Computed number of vCPUs too high: %ju", 267 (uintmax_t)ncpus); 268 269 if (explicit_cpus) { 270 if (guest_ncpus != (int)ncpus) 271 errx(4, "Topology (%d sockets, %d cores, %d threads) " 272 "does not match %d vCPUs", 273 cpu_sockets, cpu_cores, cpu_threads, 274 guest_ncpus); 275 } else 276 guest_ncpus = ncpus; 277 } 278 279 static int 280 pincpu_parse(const char *opt) 281 { 282 const char *value; 283 char *newval; 284 char key[16]; 285 int vcpu, pcpu; 286 287 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { 288 fprintf(stderr, "invalid format: %s\n", opt); 289 return (-1); 290 } 291 292 if (vcpu < 0) { 293 fprintf(stderr, "invalid vcpu '%d'\n", vcpu); 294 return (-1); 295 } 296 297 if (pcpu < 0 || pcpu >= CPU_SETSIZE) { 298 fprintf(stderr, "hostcpu '%d' outside valid range from " 299 "0 to %d\n", pcpu, CPU_SETSIZE - 1); 300 return (-1); 301 } 302 303 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); 304 value = 
get_config_value(key); 305 306 if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", 307 value != NULL ? "," : "", pcpu) == -1) { 308 perror("failed to build new cpuset string"); 309 return (-1); 310 } 311 312 set_config_value(key, newval); 313 free(newval); 314 return (0); 315 } 316 317 static void 318 parse_cpuset(int vcpu, const char *list, cpuset_t *set) 319 { 320 char *cp, *token; 321 int pcpu, start; 322 323 CPU_ZERO(set); 324 start = -1; 325 token = __DECONST(char *, list); 326 for (;;) { 327 pcpu = strtoul(token, &cp, 0); 328 if (cp == token) 329 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); 330 if (pcpu < 0 || pcpu >= CPU_SETSIZE) 331 errx(4, "hostcpu '%d' outside valid range from 0 to %d", 332 pcpu, CPU_SETSIZE - 1); 333 switch (*cp) { 334 case ',': 335 case '\0': 336 if (start >= 0) { 337 if (start > pcpu) 338 errx(4, "Invalid hostcpu range %d-%d", 339 start, pcpu); 340 while (start < pcpu) { 341 CPU_SET(start, set); 342 start++; 343 } 344 start = -1; 345 } 346 CPU_SET(pcpu, set); 347 break; 348 case '-': 349 if (start >= 0) 350 errx(4, "invalid cpuset for vcpu %d: '%s'", 351 vcpu, list); 352 start = pcpu; 353 break; 354 default: 355 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); 356 } 357 if (*cp == '\0') 358 break; 359 token = cp + 1; 360 } 361 } 362 363 static void 364 build_vcpumaps(void) 365 { 366 char key[16]; 367 const char *value; 368 int vcpu; 369 370 vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); 371 for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { 372 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); 373 value = get_config_value(key); 374 if (value == NULL) 375 continue; 376 vcpumap[vcpu] = malloc(sizeof(cpuset_t)); 377 if (vcpumap[vcpu] == NULL) 378 err(4, "Failed to allocate cpuset for vcpu %d", vcpu); 379 parse_cpuset(vcpu, value, vcpumap[vcpu]); 380 } 381 } 382 383 void * 384 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) 385 { 386 387 return (vm_map_gpa(ctx, gaddr, len)); 388 } 389 390 #ifdef 
BHYVE_SNAPSHOT 391 uintptr_t 392 paddr_host2guest(struct vmctx *ctx, void *addr) 393 { 394 return (vm_rev_map_gpa(ctx, addr)); 395 } 396 #endif 397 398 int 399 fbsdrun_virtio_msix(void) 400 { 401 402 return (get_config_bool_default("virtio_msix", true)); 403 } 404 405 struct vcpu * 406 fbsdrun_vcpu(int vcpuid) 407 { 408 return (vcpu_info[vcpuid].vcpu); 409 } 410 411 static void * 412 fbsdrun_start_thread(void *param) 413 { 414 char tname[MAXCOMLEN + 1]; 415 struct vcpu_info *vi = param; 416 int error; 417 418 snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); 419 pthread_set_name_np(pthread_self(), tname); 420 421 if (vcpumap[vi->vcpuid] != NULL) { 422 error = pthread_setaffinity_np(pthread_self(), 423 sizeof(cpuset_t), vcpumap[vi->vcpuid]); 424 assert(error == 0); 425 } 426 427 #ifdef BHYVE_SNAPSHOT 428 checkpoint_cpu_add(vi->vcpuid); 429 #endif 430 #ifdef BHYVE_GDB 431 gdb_cpu_add(vi->vcpu); 432 #endif 433 434 vm_loop(vi->ctx, vi->vcpu); 435 436 /* not reached */ 437 exit(1); 438 return (NULL); 439 } 440 441 void 442 fbsdrun_addcpu(int vcpuid) 443 { 444 struct vcpu_info *vi; 445 pthread_t thr; 446 int error; 447 448 vi = &vcpu_info[vcpuid]; 449 450 error = vm_activate_cpu(vi->vcpu); 451 if (error != 0) 452 err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); 453 454 CPU_SET_ATOMIC(vcpuid, &cpumask); 455 456 vm_suspend_cpu(vi->vcpu); 457 458 error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); 459 assert(error == 0); 460 } 461 462 void 463 fbsdrun_deletecpu(int vcpu) 464 { 465 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; 466 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; 467 468 pthread_mutex_lock(&resetcpu_mtx); 469 if (!CPU_ISSET(vcpu, &cpumask)) { 470 EPRINTLN("Attempting to delete unknown cpu %d", vcpu); 471 exit(4); 472 } 473 474 CPU_CLR(vcpu, &cpumask); 475 476 if (vcpu != BSP) { 477 pthread_cond_signal(&resetcpu_cond); 478 pthread_mutex_unlock(&resetcpu_mtx); 479 pthread_exit(NULL); 480 /* NOTREACHED */ 
481 } 482 483 while (!CPU_EMPTY(&cpumask)) { 484 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); 485 } 486 pthread_mutex_unlock(&resetcpu_mtx); 487 } 488 489 int 490 fbsdrun_suspendcpu(int vcpuid) 491 { 492 return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu)); 493 } 494 495 static void 496 vm_loop(struct vmctx *ctx, struct vcpu *vcpu) 497 { 498 struct vm_exit vme; 499 struct vm_run vmrun; 500 int error, rc; 501 enum vm_exitcode exitcode; 502 cpuset_t active_cpus, dmask; 503 504 error = vm_active_cpus(ctx, &active_cpus); 505 assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); 506 507 vmrun.vm_exit = &vme; 508 vmrun.cpuset = &dmask; 509 vmrun.cpusetsize = sizeof(dmask); 510 511 while (1) { 512 error = vm_run(vcpu, &vmrun); 513 if (error != 0) 514 break; 515 516 exitcode = vme.exitcode; 517 if (exitcode >= VM_EXITCODE_MAX || 518 vmexit_handlers[exitcode] == NULL) { 519 warnx("vm_loop: unexpected exitcode 0x%x", exitcode); 520 exit(4); 521 } 522 523 rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun); 524 525 switch (rc) { 526 case VMEXIT_CONTINUE: 527 break; 528 case VMEXIT_ABORT: 529 abort(); 530 default: 531 exit(4); 532 } 533 } 534 EPRINTLN("vm_run error %d, errno %d", error, errno); 535 } 536 537 static int 538 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) 539 { 540 uint16_t sockets, cores, threads, maxcpus; 541 int tmp, error; 542 543 /* 544 * The guest is allowed to spinup more than one processor only if the 545 * UNRESTRICTED_GUEST capability is available. 
546 */ 547 error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); 548 if (error != 0) 549 return (1); 550 551 error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); 552 if (error == 0) 553 return (maxcpus); 554 else 555 return (1); 556 } 557 558 static struct vmctx * 559 do_open(const char *vmname) 560 { 561 struct vmctx *ctx; 562 int error; 563 bool reinit, romboot; 564 565 reinit = romboot = false; 566 567 #ifdef __amd64__ 568 if (lpc_bootrom()) 569 romboot = true; 570 #endif 571 572 error = vm_create(vmname); 573 if (error) { 574 if (errno == EEXIST) { 575 if (romboot) { 576 reinit = true; 577 } else { 578 /* 579 * The virtual machine has been setup by the 580 * userspace bootloader. 581 */ 582 } 583 } else { 584 perror("vm_create"); 585 exit(4); 586 } 587 } else { 588 if (!romboot) { 589 /* 590 * If the virtual machine was just created then a 591 * bootrom must be configured to boot it. 592 */ 593 fprintf(stderr, "virtual machine cannot be booted\n"); 594 exit(4); 595 } 596 } 597 598 ctx = vm_open(vmname); 599 if (ctx == NULL) { 600 perror("vm_open"); 601 exit(4); 602 } 603 604 #ifndef WITHOUT_CAPSICUM 605 if (vm_limit_rights(ctx) != 0) 606 err(EX_OSERR, "vm_limit_rights"); 607 #endif 608 609 if (reinit) { 610 error = vm_reinit(ctx); 611 if (error) { 612 perror("vm_reinit"); 613 exit(4); 614 } 615 } 616 error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); 617 if (error) 618 errx(EX_OSERR, "vm_set_topology"); 619 return (ctx); 620 } 621 622 static bool 623 parse_config_option(const char *option) 624 { 625 const char *value; 626 char *path; 627 628 value = strchr(option, '='); 629 if (value == NULL || value[1] == '\0') 630 return (false); 631 path = strndup(option, value - option); 632 if (path == NULL) 633 err(4, "Failed to allocate memory"); 634 set_config_value(path, value + 1); 635 return (true); 636 } 637 638 static void 639 parse_simple_config_file(const char *path) 640 { 641 FILE *fp; 642 char *line, *cp; 643 size_t 
linecap; 644 unsigned int lineno; 645 646 fp = fopen(path, "r"); 647 if (fp == NULL) 648 err(4, "Failed to open configuration file %s", path); 649 line = NULL; 650 linecap = 0; 651 lineno = 1; 652 for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { 653 if (*line == '#' || *line == '\n') 654 continue; 655 cp = strchr(line, '\n'); 656 if (cp != NULL) 657 *cp = '\0'; 658 if (!parse_config_option(line)) 659 errx(4, "%s line %u: invalid config option '%s'", path, 660 lineno, line); 661 } 662 free(line); 663 fclose(fp); 664 } 665 666 #ifdef BHYVE_GDB 667 static void 668 parse_gdb_options(const char *opt) 669 { 670 const char *sport; 671 char *colon; 672 673 if (opt[0] == 'w') { 674 set_config_bool("gdb.wait", true); 675 opt++; 676 } 677 678 colon = strrchr(opt, ':'); 679 if (colon == NULL) { 680 sport = opt; 681 } else { 682 *colon = '\0'; 683 colon++; 684 sport = colon; 685 set_config_value("gdb.address", opt); 686 } 687 688 set_config_value("gdb.port", sport); 689 } 690 #endif 691 692 int 693 main(int argc, char *argv[]) 694 { 695 int c, error; 696 int max_vcpus, memflags; 697 struct vcpu *bsp; 698 struct vmctx *ctx; 699 size_t memsize; 700 const char *optstr, *value, *vmname; 701 #ifdef BHYVE_SNAPSHOT 702 char *restore_file; 703 struct restore_state rstate; 704 705 restore_file = NULL; 706 #endif 707 708 bhyve_init_config(); 709 710 progname = basename(argv[0]); 711 712 #ifdef BHYVE_SNAPSHOT 713 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; 714 #else 715 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; 716 #endif 717 while ((c = getopt(argc, argv, optstr)) != -1) { 718 switch (c) { 719 #ifdef __amd64__ 720 case 'a': 721 set_config_bool("x86.x2apic", false); 722 break; 723 #endif 724 case 'A': 725 /* 726 * NOP. For backward compatibility. Most systems don't 727 * work properly without sane ACPI tables. Therefore, 728 * we're always generating them. 
729 */ 730 break; 731 case 'D': 732 set_config_bool("destroy_on_poweroff", true); 733 break; 734 case 'p': 735 if (pincpu_parse(optarg) != 0) { 736 errx(EX_USAGE, "invalid vcpu pinning " 737 "configuration '%s'", optarg); 738 } 739 break; 740 case 'c': 741 if (topology_parse(optarg) != 0) { 742 errx(EX_USAGE, "invalid cpu topology " 743 "'%s'", optarg); 744 } 745 break; 746 case 'C': 747 set_config_bool("memory.guest_in_core", true); 748 break; 749 case 'f': 750 if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { 751 errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); 752 } 753 break; 754 #ifdef BHYVE_GDB 755 case 'G': 756 parse_gdb_options(optarg); 757 break; 758 #endif 759 case 'k': 760 parse_simple_config_file(optarg); 761 break; 762 case 'K': 763 set_config_value("keyboard.layout", optarg); 764 break; 765 #ifdef __amd64__ 766 case 'l': 767 if (strncmp(optarg, "help", strlen(optarg)) == 0) { 768 lpc_print_supported_devices(); 769 exit(0); 770 } else if (lpc_device_parse(optarg) != 0) { 771 errx(EX_USAGE, "invalid lpc device " 772 "configuration '%s'", optarg); 773 } 774 break; 775 #endif 776 #ifdef BHYVE_SNAPSHOT 777 case 'r': 778 restore_file = optarg; 779 break; 780 #endif 781 case 's': 782 if (strncmp(optarg, "help", strlen(optarg)) == 0) { 783 pci_print_supported_devices(); 784 exit(0); 785 } else if (pci_parse_slot(optarg) != 0) 786 exit(4); 787 else 788 break; 789 case 'S': 790 set_config_bool("memory.wired", true); 791 break; 792 case 'm': 793 set_config_value("memory.size", optarg); 794 break; 795 case 'o': 796 if (!parse_config_option(optarg)) 797 errx(EX_USAGE, "invalid configuration option '%s'", optarg); 798 break; 799 #ifdef __amd64__ 800 case 'H': 801 set_config_bool("x86.vmexit_on_hlt", true); 802 break; 803 case 'I': 804 /* 805 * The "-I" option was used to add an ioapic to the 806 * virtual machine. 807 * 808 * An ioapic is now provided unconditionally for each 809 * virtual machine and this option is now deprecated. 
810 */ 811 break; 812 case 'P': 813 set_config_bool("x86.vmexit_on_pause", true); 814 break; 815 case 'e': 816 set_config_bool("x86.strictio", true); 817 break; 818 case 'u': 819 set_config_bool("rtc.use_localtime", false); 820 break; 821 #endif 822 case 'U': 823 set_config_value("uuid", optarg); 824 break; 825 #ifdef __amd64__ 826 case 'w': 827 set_config_bool("x86.strictmsr", false); 828 break; 829 #endif 830 case 'W': 831 set_config_bool("virtio_msix", false); 832 break; 833 #ifdef __amd64__ 834 case 'x': 835 set_config_bool("x86.x2apic", true); 836 break; 837 case 'Y': 838 set_config_bool("x86.mptable", false); 839 break; 840 #endif 841 case 'h': 842 usage(0); 843 default: 844 usage(1); 845 } 846 } 847 argc -= optind; 848 argv += optind; 849 850 if (argc > 1) 851 usage(1); 852 853 #ifdef BHYVE_SNAPSHOT 854 if (restore_file != NULL) { 855 error = load_restore_file(restore_file, &rstate); 856 if (error) { 857 fprintf(stderr, "Failed to read checkpoint info from " 858 "file: '%s'.\n", restore_file); 859 exit(1); 860 } 861 vmname = lookup_vmname(&rstate); 862 if (vmname != NULL) 863 set_config_value("name", vmname); 864 } 865 #endif 866 867 if (argc == 1) 868 set_config_value("name", argv[0]); 869 870 vmname = get_config_value("name"); 871 if (vmname == NULL) 872 usage(1); 873 874 if (get_config_bool_default("config.dump", false)) { 875 dump_config(); 876 exit(1); 877 } 878 879 calc_topology(); 880 build_vcpumaps(); 881 882 value = get_config_value("memory.size"); 883 error = vm_parse_memsize(value, &memsize); 884 if (error) 885 errx(EX_USAGE, "invalid memsize '%s'", value); 886 887 ctx = do_open(vmname); 888 889 #ifdef BHYVE_SNAPSHOT 890 if (restore_file != NULL) { 891 guest_ncpus = lookup_guest_ncpus(&rstate); 892 memflags = lookup_memflags(&rstate); 893 memsize = lookup_memsize(&rstate); 894 } 895 896 if (guest_ncpus < 1) { 897 fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); 898 exit(1); 899 } 900 #endif 901 902 bsp = vm_vcpu_open(ctx, BSP); 903 
max_vcpus = num_vcpus_allowed(ctx, bsp); 904 if (guest_ncpus > max_vcpus) { 905 fprintf(stderr, "%d vCPUs requested but only %d available\n", 906 guest_ncpus, max_vcpus); 907 exit(4); 908 } 909 910 bhyve_init_vcpu(bsp); 911 912 /* Allocate per-VCPU resources. */ 913 vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); 914 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { 915 vcpu_info[vcpuid].ctx = ctx; 916 vcpu_info[vcpuid].vcpuid = vcpuid; 917 if (vcpuid == BSP) 918 vcpu_info[vcpuid].vcpu = bsp; 919 else 920 vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); 921 } 922 923 memflags = 0; 924 if (get_config_bool_default("memory.wired", false)) 925 memflags |= VM_MEM_F_WIRED; 926 if (get_config_bool_default("memory.guest_in_core", false)) 927 memflags |= VM_MEM_F_INCORE; 928 vm_set_memflags(ctx, memflags); 929 error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); 930 if (error) { 931 fprintf(stderr, "Unable to setup memory (%d)\n", errno); 932 exit(4); 933 } 934 935 init_mem(guest_ncpus); 936 init_bootrom(ctx); 937 if (bhyve_init_platform(ctx, bsp) != 0) 938 exit(4); 939 940 if (qemu_fwcfg_init(ctx) != 0) { 941 fprintf(stderr, "qemu fwcfg initialization error\n"); 942 exit(4); 943 } 944 945 if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), 946 &guest_ncpus) != 0) { 947 fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n"); 948 exit(4); 949 } 950 951 /* 952 * Exit if a device emulation finds an error in its initialization 953 */ 954 if (init_pci(ctx) != 0) { 955 EPRINTLN("Device emulation initialization error: %s", 956 strerror(errno)); 957 exit(4); 958 } 959 if (init_tpm(ctx) != 0) { 960 EPRINTLN("Failed to init TPM device"); 961 exit(4); 962 } 963 964 /* 965 * Initialize after PCI, to allow a bootrom file to reserve the high 966 * region. 967 */ 968 if (get_config_bool("acpi_tables")) 969 vmgenc_init(ctx); 970 971 #ifdef BHYVE_GDB 972 init_gdb(ctx); 973 #endif 974 975 /* 976 * Add all vCPUs. 
977 */ 978 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) 979 bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP); 980 981 #ifdef BHYVE_SNAPSHOT 982 if (restore_file != NULL) { 983 FPRINTLN(stdout, "Pausing pci devs..."); 984 if (vm_pause_devices() != 0) { 985 EPRINTLN("Failed to pause PCI device state."); 986 exit(1); 987 } 988 989 FPRINTLN(stdout, "Restoring vm mem..."); 990 if (restore_vm_mem(ctx, &rstate) != 0) { 991 EPRINTLN("Failed to restore VM memory."); 992 exit(1); 993 } 994 995 FPRINTLN(stdout, "Restoring pci devs..."); 996 if (vm_restore_devices(&rstate) != 0) { 997 EPRINTLN("Failed to restore PCI device state."); 998 exit(1); 999 } 1000 1001 FPRINTLN(stdout, "Restoring kernel structs..."); 1002 if (vm_restore_kern_structs(ctx, &rstate) != 0) { 1003 EPRINTLN("Failed to restore kernel structs."); 1004 exit(1); 1005 } 1006 1007 FPRINTLN(stdout, "Resuming pci devs..."); 1008 if (vm_resume_devices() != 0) { 1009 EPRINTLN("Failed to resume PCI device state."); 1010 exit(1); 1011 } 1012 } 1013 #endif 1014 1015 if (bhyve_init_platform_late(ctx, bsp) != 0) 1016 exit(4); 1017 1018 /* 1019 * Change the proc title to include the VM name. 
1020 */ 1021 setproctitle("%s", vmname); 1022 1023 #ifdef BHYVE_SNAPSHOT 1024 /* 1025 * checkpointing thread for communication with bhyvectl 1026 */ 1027 if (init_checkpoint_thread(ctx) != 0) 1028 errx(EX_OSERR, "Failed to start checkpoint thread"); 1029 #endif 1030 1031 #ifndef WITHOUT_CAPSICUM 1032 caph_cache_catpages(); 1033 1034 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) 1035 errx(EX_OSERR, "Unable to apply rights for sandbox"); 1036 1037 if (caph_enter() == -1) 1038 errx(EX_OSERR, "cap_enter() failed"); 1039 #endif 1040 1041 #ifdef BHYVE_SNAPSHOT 1042 if (restore_file != NULL) { 1043 destroy_restore_state(&rstate); 1044 if (vm_restore_time(ctx) < 0) 1045 err(EX_OSERR, "Unable to restore time"); 1046 1047 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) 1048 vm_resume_cpu(vcpu_info[vcpuid].vcpu); 1049 } else 1050 #endif 1051 vm_resume_cpu(bsp); 1052 1053 /* 1054 * Head off to the main event dispatch loop 1055 */ 1056 mevent_dispatch(); 1057 1058 exit(4); 1059 } 1060