/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/types.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/mman.h>
#ifdef BHYVE_SNAPSHOT
#include <sys/socket.h>
#include <sys/stat.h>
#endif
#include <sys/time.h>
#ifdef BHYVE_SNAPSHOT
#include <sys/un.h>
#endif

#include <amd64/vmm/intel/vmcs.h>
#include <x86/apicreg.h>

#include <machine/atomic.h>
#include <machine/segments.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#ifdef BHYVE_SNAPSHOT
#include <fcntl.h>
#endif
#include <libgen.h>
#include <unistd.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#include <stdbool.h>
#include <stdint.h>
#ifdef BHYVE_SNAPSHOT
#include <ucl.h>
#include <unistd.h>

#include <libxo/xo.h>
#endif

#include <machine/vmm.h>
#ifndef WITHOUT_CAPSICUM
#include <machine/vmm_dev.h>
#endif
#include <machine/vmm_instruction_emul.h>
#include <vmmapi.h>

#include "bhyverun.h"
#include "acpi.h"
#include "atkbdc.h"
#include "bootrom.h"
#include "config.h"
#include "inout.h"
#include "debug.h"
#include "e820.h"
#include "fwctl.h"
#include "gdb.h"
#include "ioapic.h"
#include "kernemu_dev.h"
#include "mem.h"
#include "mevent.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
#include "pci_lpc.h"
#include "qemu_fwcfg.h"
#include "smbiostbl.h"
#ifdef BHYVE_SNAPSHOT
#include "snapshot.h"
#endif
#include "xmsr.h"
#include "spinup_ap.h"
#include "rtc.h"
#include "vmgenc.h"

#define	MB	(1024UL * 1024)
#define	GB	(1024UL * MB)

static const char * const vmx_exit_reason_desc[] = {
	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
	[EXIT_REASON_EXT_INTR] = "External interrupt",
	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
	[EXIT_REASON_INIT] = "INIT signal",
	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
	[EXIT_REASON_SMI] = "Other SMI",
	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
	[EXIT_REASON_NMI_WINDOW] = "NMI window",
	[EXIT_REASON_TASK_SWITCH] = "Task switch",
	[EXIT_REASON_CPUID] = "CPUID",
	[EXIT_REASON_GETSEC] = "GETSEC",
	[EXIT_REASON_HLT] = "HLT",
	[EXIT_REASON_INVD] = "INVD",
	[EXIT_REASON_INVLPG] = "INVLPG",
	[EXIT_REASON_RDPMC] = "RDPMC",
	[EXIT_REASON_RDTSC] = "RDTSC",
	[EXIT_REASON_RSM] = "RSM",
	[EXIT_REASON_VMCALL] = "VMCALL",
	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
	[EXIT_REASON_VMPTRST] = "VMPTRST",
	[EXIT_REASON_VMREAD] = "VMREAD",
	[EXIT_REASON_VMRESUME] = "VMRESUME",
	[EXIT_REASON_VMWRITE] = "VMWRITE",
	[EXIT_REASON_VMXOFF] = "VMXOFF",
	[EXIT_REASON_VMXON] = "VMXON",
	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
	[EXIT_REASON_DR_ACCESS] = "MOV DR",
	[EXIT_REASON_INOUT] = "I/O instruction",
	[EXIT_REASON_RDMSR] = "RDMSR",
	[EXIT_REASON_WRMSR] = "WRMSR",
	[EXIT_REASON_INVAL_VMCS] =
	    "VM-entry failure due to invalid guest state",
	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
	[EXIT_REASON_MWAIT] = "MWAIT",
	[EXIT_REASON_MTF] = "Monitor trap flag",
	[EXIT_REASON_MONITOR] = "MONITOR",
	[EXIT_REASON_PAUSE] = "PAUSE",
	[EXIT_REASON_MCE_DURING_ENTRY] =
	    "VM-entry failure due to machine-check event",
	[EXIT_REASON_TPR] = "TPR below threshold",
	[EXIT_REASON_APIC_ACCESS] = "APIC access",
	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
	[EXIT_REASON_EPT_FAULT] = "EPT violation",
	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
	[EXIT_REASON_INVEPT] = "INVEPT",
	[EXIT_REASON_RDTSCP] = "RDTSCP",
	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
	[EXIT_REASON_INVVPID] = "INVVPID",
	[EXIT_REASON_WBINVD] = "WBINVD",
	[EXIT_REASON_XSETBV] = "XSETBV",
	[EXIT_REASON_APIC_WRITE] = "APIC write",
	[EXIT_REASON_RDRAND] = "RDRAND",
	[EXIT_REASON_INVPCID] = "INVPCID",
	[EXIT_REASON_VMFUNC] = "VMFUNC",
	[EXIT_REASON_ENCLS] = "ENCLS",
	[EXIT_REASON_RDSEED] = "RDSEED",
	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
	[EXIT_REASON_XSAVES] = "XSAVES",
	[EXIT_REASON_XRSTORS] = "XRSTORS"
};

typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *);

int guest_ncpus;
uint16_t cpu_cores, cpu_sockets, cpu_threads;

int raw_stdio = 0;

static char *progname;
static const int BSP = 0;

static cpuset_t cpumask;

static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);

static struct vcpu_info {
	struct vmctx	*ctx;
	struct vcpu	*vcpu;
	int		vcpuid;
} *vcpu_info;

static cpuset_t **vcpumap;

static void
usage(int code)
{

	fprintf(stderr,
	    "Usage: %s [-AaCDeHhPSuWwxY]\n"
	    "       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
	    "       %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n"
	    "       %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n"
	    "       -A: create ACPI tables\n"
	    "       -a: local apic is in xAPIC mode (deprecated)\n"
	    "       -C: include guest memory in core file\n"
	    "       -c: number of CPUs and/or topology specification\n"
	    "       -D: destroy on power-off\n"
	    "       -e: exit on unhandled I/O access\n"
	    "       -G: start a debug server\n"
	    "       -H: vmexit from the guest on HLT\n"
	    "       -h: help\n"
	    "       -k: key=value flat config file\n"
	    "       -K: PS2 keyboard layout\n"
	    "       -l: LPC device configuration\n"
	    "       -m: memory size\n"
	    "       -o: set config 'var' to 'value'\n"
	    "       -P: vmexit from the guest on pause\n"
	    "       -p: pin 'vcpu' to 'hostcpu'\n"
#ifdef BHYVE_SNAPSHOT
	    "       -r: path to checkpoint file\n"
#endif
	    "       -S: guest memory cannot be swapped\n"
	    "       -s: <slot,driver,configinfo> PCI slot config\n"
	    "       -U: UUID\n"
	    "       -u: RTC keeps UTC time\n"
	    "       -W: force virtio to use single-vector MSI\n"
	    "       -w: ignore unimplemented MSRs\n"
	    "       -x: local APIC is in x2APIC mode\n"
	    "       -Y: disable MPtable generation\n",
	    progname, (int)strlen(progname), "", (int)strlen(progname), "",
	    (int)strlen(progname), "");

	exit(code);
}

/*
 * XXX This parser is known to have the following issues:
 * 1.  It accepts null key=value tokens ",," as setting "cpus" to an
 *     empty string.
 *
 * The acceptance of a null specification ('-c ""') is by design to match the
 * manual page syntax specification; this results in a topology of 1 vCPU.
 */
static int
topology_parse(const char *opt)
{
	char *cp, *str, *tofree;

	if (*opt == '\0') {
		set_config_value("sockets", "1");
		set_config_value("cores", "1");
		set_config_value("threads", "1");
		set_config_value("cpus", "1");
		return (0);
	}

	tofree = str = strdup(opt);
	if (str == NULL)
		errx(4, "Failed to allocate memory");

	while ((cp = strsep(&str, ",")) != NULL) {
		if (strncmp(cp, "cpus=", strlen("cpus=")) == 0)
			set_config_value("cpus", cp + strlen("cpus="));
		else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0)
			set_config_value("sockets", cp + strlen("sockets="));
		else if (strncmp(cp, "cores=", strlen("cores=")) == 0)
			set_config_value("cores", cp + strlen("cores="));
		else if (strncmp(cp, "threads=", strlen("threads=")) == 0)
			set_config_value("threads", cp + strlen("threads="));
		else if (strchr(cp, '=') != NULL)
			goto out;
		else
			set_config_value("cpus", cp);
	}
	free(tofree);
	return (0);

out:
	free(tofree);
	return (-1);
}

static int
parse_int_value(const char *key, const char *value, int minval, int maxval)
{
	char *cp;
	long lval;

	errno = 0;
	lval = strtol(value, &cp, 0);
	if (errno != 0 || *cp != '\0' || cp == value || lval < minval ||
	    lval > maxval)
		errx(4, "Invalid value for %s: '%s'", key, value);
	return (lval);
}

/*
 * Set the sockets, cores, threads, and guest_cpus variables based on
 * the configured topology.
 *
 * The limits of UINT16_MAX are due to the types passed to
 * vm_set_topology().  vmm.ko may enforce tighter limits.
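 *
 * As a worked example (illustrative, not an exhaustive list of accepted
 * forms): "-c 4,sockets=2,cores=2" leaves threads at its default of 1 and
 * the explicit vCPU count of 4 matches the computed 2 * 2 * 1 = 4, while
 * "-c sockets=2,cores=2" with no explicit count simply yields 4 vCPUs.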
 */
static void
calc_topology(void)
{
	const char *value;
	bool explicit_cpus;
	uint64_t ncpus;

	value = get_config_value("cpus");
	if (value != NULL) {
		guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
		explicit_cpus = true;
	} else {
		guest_ncpus = 1;
		explicit_cpus = false;
	}
	value = get_config_value("cores");
	if (value != NULL)
		cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX);
	else
		cpu_cores = 1;
	value = get_config_value("threads");
	if (value != NULL)
		cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX);
	else
		cpu_threads = 1;
	value = get_config_value("sockets");
	if (value != NULL)
		cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
	else
		cpu_sockets = guest_ncpus;

	/*
	 * Compute sockets * cores * threads avoiding overflow.  The
	 * range check above ensures these are 16-bit values.
	 */
	ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads;
	if (ncpus > UINT16_MAX)
		errx(4, "Computed number of vCPUs too high: %ju",
		    (uintmax_t)ncpus);

	if (explicit_cpus) {
		if (guest_ncpus != (int)ncpus)
			errx(4, "Topology (%d sockets, %d cores, %d threads) "
			    "does not match %d vCPUs",
			    cpu_sockets, cpu_cores, cpu_threads,
			    guest_ncpus);
	} else
		guest_ncpus = ncpus;
}

static int
pincpu_parse(const char *opt)
{
	const char *value;
	char *newval;
	char key[16];
	int vcpu, pcpu;

	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
		fprintf(stderr, "invalid format: %s\n", opt);
		return (-1);
	}

	if (vcpu < 0) {
		fprintf(stderr, "invalid vcpu '%d'\n", vcpu);
		return (-1);
	}

	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
		fprintf(stderr, "hostcpu '%d' outside valid range from "
		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
		return (-1);
	}

	snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
	value = get_config_value(key);

	if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
	    value != NULL ? "," : "", pcpu) == -1) {
		perror("failed to build new cpuset string");
		return (-1);
	}

	set_config_value(key, newval);
	free(newval);
	return (0);
}

static void
parse_cpuset(int vcpu, const char *list, cpuset_t *set)
{
	char *cp, *token;
	int pcpu, start;

	CPU_ZERO(set);
	start = -1;
	token = __DECONST(char *, list);
	for (;;) {
		pcpu = strtoul(token, &cp, 0);
		if (cp == token)
			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
		if (pcpu < 0 || pcpu >= CPU_SETSIZE)
			errx(4, "hostcpu '%d' outside valid range from 0 to %d",
			    pcpu, CPU_SETSIZE - 1);
		switch (*cp) {
		case ',':
		case '\0':
			if (start >= 0) {
				if (start > pcpu)
					errx(4, "Invalid hostcpu range %d-%d",
					    start, pcpu);
				while (start < pcpu) {
					CPU_SET(start, set);
					start++;
				}
				start = -1;
			}
			CPU_SET(pcpu, set);
			break;
		case '-':
			if (start >= 0)
				errx(4, "invalid cpuset for vcpu %d: '%s'",
				    vcpu, list);
			start = pcpu;
			break;
		default:
			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
		}
		if (*cp == '\0')
			break;
		token = cp + 1;
	}
}

static void
build_vcpumaps(void)
{
	char key[16];
	const char *value;
	int vcpu;

	vcpumap = calloc(guest_ncpus, sizeof(*vcpumap));
	for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
		snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
		value = get_config_value(key);
		if (value == NULL)
			continue;
		vcpumap[vcpu] = malloc(sizeof(cpuset_t));
		if (vcpumap[vcpu] == NULL)
			err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
		parse_cpuset(vcpu, value, vcpumap[vcpu]);
	}
}

void
vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid,
    int errcode)
{
	int error, restart_instruction;

	restart_instruction = 1;

	error = vm_inject_exception(vcpu, vector, errcode_valid, errcode,
	    restart_instruction);
	assert(error == 0);
}

void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{

	return (vm_map_gpa(ctx, gaddr, len));
}

#ifdef BHYVE_SNAPSHOT
uintptr_t
paddr_host2guest(struct vmctx *ctx, void *addr)
{
	return (vm_rev_map_gpa(ctx, addr));
}
#endif

int
fbsdrun_virtio_msix(void)
{

	return (get_config_bool_default("virtio_msix", true));
}

static void *
fbsdrun_start_thread(void *param)
{
	char tname[MAXCOMLEN + 1];
	struct vcpu_info *vi = param;
	int error;

	snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid);
	pthread_set_name_np(pthread_self(), tname);

	if (vcpumap[vi->vcpuid] != NULL) {
		error = pthread_setaffinity_np(pthread_self(),
		    sizeof(cpuset_t), vcpumap[vi->vcpuid]);
		assert(error == 0);
	}

#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_add(vi->vcpuid);
#endif
	gdb_cpu_add(vi->vcpu);

	vm_loop(vi->ctx, vi->vcpu);

	/* not reached */
	exit(1);
	return (NULL);
}

static void
fbsdrun_addcpu(struct vcpu_info *vi)
{
	pthread_t thr;
	int error;

	error = vm_activate_cpu(vi->vcpu);
	if (error != 0)
		err(EX_OSERR, "could not activate CPU %d", vi->vcpuid);

	CPU_SET_ATOMIC(vi->vcpuid, &cpumask);

	vm_suspend_cpu(vi->vcpu);

	error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi);
	assert(error == 0);
}

static void
fbsdrun_deletecpu(int vcpu)
{
	static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;

	pthread_mutex_lock(&resetcpu_mtx);
	if (!CPU_ISSET(vcpu, &cpumask)) {
		fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu);
		exit(4);
	}

	CPU_CLR(vcpu, &cpumask);

	if (vcpu != BSP) {
		pthread_cond_signal(&resetcpu_cond);
		pthread_mutex_unlock(&resetcpu_mtx);
		pthread_exit(NULL);
		/* NOTREACHED */
	}

	while (!CPU_EMPTY(&cpumask)) {
		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
	}
	pthread_mutex_unlock(&resetcpu_mtx);
}

static int
vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct vm_exit *vme;
	int error;
	int bytes, port, in;

	vme = vmrun->vm_exit;
	port = vme->u.inout.port;
	bytes = vme->u.inout.bytes;
	in = vme->u.inout.in;

	error = emulate_inout(ctx, vcpu, vme);
	if (error) {
		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
		    in ? "in" : "out",
		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
		    port, vme->rip);
		return (VMEXIT_ABORT);
	} else {
		return (VMEXIT_CONTINUE);
	}
}

static int
vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun)
{
	struct vm_exit *vme;
	uint64_t val;
	uint32_t eax, edx;
	int error;

	vme = vmrun->vm_exit;

	val = 0;
	error = emulate_rdmsr(vcpu, vme->u.msr.code, &val);
	if (error != 0) {
		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
		    vme->u.msr.code, vcpu_id(vcpu));
		if (get_config_bool("x86.strictmsr")) {
			vm_inject_gp(vcpu);
			return (VMEXIT_CONTINUE);
		}
	}

	eax = val;
	error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax);
	assert(error == 0);

	edx = val >> 32;
	error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx);
	assert(error == 0);

	return (VMEXIT_CONTINUE);
}

static int
vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun)
{
	struct vm_exit *vme;
	int error;

	vme = vmrun->vm_exit;

	error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval);
	if (error != 0) {
		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
		    vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu));
		if (get_config_bool("x86.strictmsr")) {
			vm_inject_gp(vcpu);
			return (VMEXIT_CONTINUE);
		}
	}
	return (VMEXIT_CONTINUE);
}

#define	DEBUG_EPT_MISCONFIG
#ifdef DEBUG_EPT_MISCONFIG
#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400

static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
static int ept_misconfig_ptenum;
#endif

static const char *
vmexit_vmx_desc(uint32_t exit_reason)
{

	if (exit_reason >= nitems(vmx_exit_reason_desc) ||
	    vmx_exit_reason_desc[exit_reason] == NULL)
		return ("Unknown");
	return (vmx_exit_reason_desc[exit_reason]);
}

static int
vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct vm_exit *vme;

	vme = vmrun->vm_exit;

	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
	fprintf(stderr, "\treason\t\tVMX\n");
	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
	fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status);
	fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason,
	    vmexit_vmx_desc(vme->u.vmx.exit_reason));
	fprintf(stderr, "\tqualification\t0x%016lx\n",
	    vme->u.vmx.exit_qualification);
	fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type);
	fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error);
#ifdef DEBUG_EPT_MISCONFIG
	if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
		vm_get_register(vcpu,
		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
		    &ept_misconfig_gpa);
		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
		    &ept_misconfig_ptenum);
		fprintf(stderr, "\tEPT misconfiguration:\n");
		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
		    ept_misconfig_ptenum, ept_misconfig_pte[0],
		    ept_misconfig_pte[1], ept_misconfig_pte[2],
		    ept_misconfig_pte[3]);
	}
#endif	/* DEBUG_EPT_MISCONFIG */
	return (VMEXIT_ABORT);
}

static int
vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct vm_exit *vme;

	vme = vmrun->vm_exit;

	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
	fprintf(stderr, "\treason\t\tSVM\n");
	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
	fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode);
	fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1);
	fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2);
	return (VMEXIT_ABORT);
}

static int
vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
    struct vm_run *vmrun)
{
	assert(vmrun->vm_exit->inst_length == 0);

	return (VMEXIT_CONTINUE);
}

static int
vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
    struct vm_run *vmrun)
{
	assert(vmrun->vm_exit->inst_length == 0);

	return (VMEXIT_CONTINUE);
}

static int
vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
    struct vm_run *vmrun __unused)
{
	/*
	 * Just continue execution with the next instruction.  We use
	 * the HLT VM exit as a way to be friendly with the host
	 * scheduler.
	 */
	return (VMEXIT_CONTINUE);
}

static int
vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
    struct vm_run *vmrun __unused)
{
	return (VMEXIT_CONTINUE);
}

static int
vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun)
{
	assert(vmrun->vm_exit->inst_length == 0);

#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_suspend(vcpu_id(vcpu));
#endif
	gdb_cpu_mtrap(vcpu);
#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_resume(vcpu_id(vcpu));
#endif

	return (VMEXIT_CONTINUE);
}

static int
vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun)
{
	struct vm_exit *vme;
	struct vie *vie;
	int err, i, cs_d;
	enum vm_cpu_mode mode;

	vme = vmrun->vm_exit;

	vie = &vme->u.inst_emul.vie;
	if (!vie->decoded) {
		/*
		 * Attempt to decode in userspace as a fallback.  This allows
		 * updating instruction decode in bhyve without rebooting the
		 * kernel (rapid prototyping), albeit with much slower
		 * emulation.
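		 *
		 * If decoding fails here as well, the raw instruction bytes
		 * are dumped via the fail: path below and VMEXIT_ABORT is
		 * returned.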
		 */
		vie_restart(vie);
		mode = vme->u.inst_emul.paging.cpu_mode;
		cs_d = vme->u.inst_emul.cs_d;
		if (vmm_decode_instruction(mode, cs_d, vie) != 0)
			goto fail;
		if (vm_set_register(vcpu, VM_REG_GUEST_RIP,
		    vme->rip + vie->num_processed) != 0)
			goto fail;
	}

	err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie,
	    &vme->u.inst_emul.paging);
	if (err) {
		if (err == ESRCH) {
			EPRINTLN("Unhandled memory access to 0x%lx\n",
			    vme->u.inst_emul.gpa);
		}
		goto fail;
	}

	return (VMEXIT_CONTINUE);

fail:
	fprintf(stderr, "Failed to emulate instruction sequence [ ");
	for (i = 0; i < vie->num_valid; i++)
		fprintf(stderr, "%02x", vie->inst[i]);
	FPRINTLN(stderr, " ] at 0x%lx", vme->rip);
	return (VMEXIT_ABORT);
}

static int
vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
{
	struct vm_exit *vme;
	enum vm_suspend_how how;
	int vcpuid = vcpu_id(vcpu);

	vme = vmrun->vm_exit;

	how = vme->u.suspended.how;

	fbsdrun_deletecpu(vcpuid);

	switch (how) {
	case VM_SUSPEND_RESET:
		exit(0);
	case VM_SUSPEND_POWEROFF:
		if (get_config_bool_default("destroy_on_poweroff", false))
			vm_destroy(ctx);
		exit(1);
	case VM_SUSPEND_HALT:
		exit(2);
	case VM_SUSPEND_TRIPLEFAULT:
		exit(3);
	default:
		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
		exit(100);
	}
	return (0);	/* NOTREACHED */
}

static int
vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun __unused)
{

#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_suspend(vcpu_id(vcpu));
#endif
	gdb_cpu_suspend(vcpu);
#ifdef BHYVE_SNAPSHOT
	checkpoint_cpu_resume(vcpu_id(vcpu));
#endif
	/*
	 * XXX-MJ sleep for a short period to avoid chewing up the CPU in the
	 * window between activation of the vCPU thread and the STARTUP IPI.
	 */
	usleep(1000);
	return (VMEXIT_CONTINUE);
}

static int
vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu,
    struct vm_run *vmrun)
{
	gdb_cpu_breakpoint(vcpu, vmrun->vm_exit);
	return (VMEXIT_CONTINUE);
}

static int
vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
    struct vm_run *vmrun)
{
	struct vm_exit *vme;
	cpuset_t *dmask;
	int error = -1;
	int i;

	dmask = vmrun->cpuset;
	vme = vmrun->vm_exit;

	switch (vme->u.ipi.mode) {
	case APIC_DELMODE_INIT:
		CPU_FOREACH_ISSET(i, dmask) {
			error = vm_suspend_cpu(vcpu_info[i].vcpu);
			if (error) {
				warnx("%s: failed to suspend cpu %d\n",
				    __func__, i);
				break;
			}
		}
		break;
	case APIC_DELMODE_STARTUP:
		CPU_FOREACH_ISSET(i, dmask) {
			spinup_ap(vcpu_info[i].vcpu,
			    vme->u.ipi.vector << PAGE_SHIFT);
		}
		error = 0;
		break;
	default:
		break;
	}

	return (error);
}

static const vmexit_handler_t handler[VM_EXITCODE_MAX] = {
	[VM_EXITCODE_INOUT] = vmexit_inout,
	[VM_EXITCODE_INOUT_STR] = vmexit_inout,
	[VM_EXITCODE_VMX] = vmexit_vmx,
	[VM_EXITCODE_SVM] = vmexit_svm,
	[VM_EXITCODE_BOGUS] = vmexit_bogus,
	[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
	[VM_EXITCODE_RDMSR] = vmexit_rdmsr,
	[VM_EXITCODE_WRMSR] = vmexit_wrmsr,
	[VM_EXITCODE_MTRAP] = vmexit_mtrap,
	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
	[VM_EXITCODE_DEBUG] = vmexit_debug,
	[VM_EXITCODE_BPT] = vmexit_breakpoint,
	[VM_EXITCODE_IPI] = vmexit_ipi,
	[VM_EXITCODE_HLT] = vmexit_hlt,
	[VM_EXITCODE_PAUSE] = vmexit_pause,
};

static void
vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
{
	struct vm_exit vme;
	struct vm_run vmrun;
	int error, rc;
	enum vm_exitcode exitcode;
	cpuset_t active_cpus, dmask;

	error = vm_active_cpus(ctx, &active_cpus);
	assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus));

	vmrun.vm_exit = &vme;
	vmrun.cpuset = &dmask;
	vmrun.cpusetsize = sizeof(dmask);

	while (1) {
		error = vm_run(vcpu, &vmrun);
		if (error != 0)
			break;

		exitcode = vme.exitcode;
		if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
			fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
			    exitcode);
			exit(4);
		}

		rc = (*handler[exitcode])(ctx, vcpu, &vmrun);

		switch (rc) {
		case VMEXIT_CONTINUE:
			break;
		case VMEXIT_ABORT:
			abort();
		default:
			exit(4);
		}
	}
	fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
}

static int
num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu)
{
	uint16_t sockets, cores, threads, maxcpus;
	int tmp, error;

	/*
	 * The guest is allowed to spinup more than one processor only if the
	 * UNRESTRICTED_GUEST capability is available.
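	 *
	 * When the capability query fails, a single vCPU is conservatively
	 * reported; otherwise the limit is the maxcpus value from the
	 * kernel's view of the topology.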
	 */
	error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp);
	if (error != 0)
		return (1);

	error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus);
	if (error == 0)
		return (maxcpus);
	else
		return (1);
}

static void
fbsdrun_set_capabilities(struct vcpu *vcpu)
{
	int err, tmp;

	if (get_config_bool_default("x86.vmexit_on_hlt", false)) {
		err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp);
		if (err < 0) {
			fprintf(stderr, "VM exit on HLT not supported\n");
			exit(4);
		}
		vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1);
	}

	if (get_config_bool_default("x86.vmexit_on_pause", false)) {
		/*
		 * pause exit support required for this mode
		 */
		err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp);
		if (err < 0) {
			fprintf(stderr,
			    "SMP mux requested, no pause support\n");
			exit(4);
		}
		vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1);
	}

	if (get_config_bool_default("x86.x2apic", false))
		err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED);
	else
		err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED);

	if (err) {
		fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
		exit(4);
	}

	vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1);

	err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1);
	assert(err == 0);
}

static struct vmctx *
do_open(const char *vmname)
{
	struct vmctx *ctx;
	int error;
	bool reinit, romboot;

	reinit = romboot = false;

	if (lpc_bootrom())
		romboot = true;

	error = vm_create(vmname);
	if (error) {
		if (errno == EEXIST) {
			if (romboot) {
				reinit = true;
			} else {
				/*
				 * The virtual machine has been set up by the
				 * userspace bootloader.
				 */
			}
		} else {
			perror("vm_create");
			exit(4);
		}
	} else {
		if (!romboot) {
			/*
			 * If the virtual machine was just created then a
			 * bootrom must be configured to boot it.
			 */
			fprintf(stderr, "virtual machine cannot be booted\n");
			exit(4);
		}
	}

	ctx = vm_open(vmname);
	if (ctx == NULL) {
		perror("vm_open");
		exit(4);
	}

#ifndef WITHOUT_CAPSICUM
	if (vm_limit_rights(ctx) != 0)
		err(EX_OSERR, "vm_limit_rights");
#endif

	if (reinit) {
		error = vm_reinit(ctx);
		if (error) {
			perror("vm_reinit");
			exit(4);
		}
	}
	error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0);
	if (error)
		errx(EX_OSERR, "vm_set_topology");
	return (ctx);
}

static void
spinup_vcpu(struct vcpu_info *vi, bool bsp)
{
	int error;

	if (!bsp) {
		fbsdrun_set_capabilities(vi->vcpu);

		/*
		 * Enable the 'unrestricted guest' mode for APs.
		 *
		 * APs start up in power-on 16-bit mode.
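		 *
		 * On Intel VT-x, running real-mode (16-bit) guest code
		 * generally requires the unrestricted-guest capability,
		 * hence it is enabled here before the AP is started.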
		 */
		error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
		assert(error == 0);
	}

	fbsdrun_addcpu(vi);
}

static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);
	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);
	return (true);
}

static void
parse_simple_config_file(const char *path)
{
	FILE *fp;
	char *line, *cp;
	size_t linecap;
	unsigned int lineno;

	fp = fopen(path, "r");
	if (fp == NULL)
		err(4, "Failed to open configuration file %s", path);
	line = NULL;
	linecap = 0;
	lineno = 1;
	for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) {
		if (*line == '#' || *line == '\n')
			continue;
		cp = strchr(line, '\n');
		if (cp != NULL)
			*cp = '\0';
		if (!parse_config_option(line))
			errx(4, "%s line %u: invalid config option '%s'", path,
			    lineno, line);
	}
	free(line);
	fclose(fp);
}

static void
parse_gdb_options(const char *opt)
{
	const char *sport;
	char *colon;

	if (opt[0] == 'w') {
		set_config_bool("gdb.wait", true);
		opt++;
	}

	colon = strrchr(opt, ':');
	if (colon == NULL) {
		sport = opt;
	} else {
		*colon = '\0';
		colon++;
		sport = colon;
		set_config_value("gdb.address", opt);
	}

	set_config_value("gdb.port", sport);
}

static void
set_defaults(void)
{

	set_config_bool("acpi_tables", false);
	set_config_value("memory.size", "256M");
	set_config_bool("x86.strictmsr", true);
	set_config_value("lpc.fwcfg", "bhyve");
}

int
main(int argc, char *argv[])
{
	int c, error;
	int max_vcpus, memflags;
	struct vcpu *bsp;
	struct vmctx *ctx;
	struct qemu_fwcfg_item *e820_fwcfg_item;
	size_t memsize;
	const char *optstr, *value, *vmname;
#ifdef BHYVE_SNAPSHOT
	char *restore_file;
	struct restore_state rstate;

	restore_file = NULL;
#endif

	init_config();
	set_defaults();
	progname = basename(argv[0]);

#ifdef BHYVE_SNAPSHOT
	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
#else
	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
#endif
	while ((c = getopt(argc, argv, optstr)) != -1) {
		switch (c) {
		case 'a':
			set_config_bool("x86.x2apic", false);
			break;
		case 'A':
			set_config_bool("acpi_tables", true);
			break;
		case 'D':
			set_config_bool("destroy_on_poweroff", true);
			break;
		case 'p':
			if (pincpu_parse(optarg) != 0) {
				errx(EX_USAGE, "invalid vcpu pinning "
				    "configuration '%s'", optarg);
			}
			break;
		case 'c':
			if (topology_parse(optarg) != 0) {
				errx(EX_USAGE, "invalid cpu topology "
				    "'%s'", optarg);
			}
			break;
		case 'C':
			set_config_bool("memory.guest_in_core", true);
			break;
		case 'f':
			if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) {
				errx(EX_USAGE, "invalid fwcfg item '%s'", optarg);
			}
			break;
		case 'G':
			parse_gdb_options(optarg);
			break;
		case 'k':
			parse_simple_config_file(optarg);
			break;
		case 'K':
			set_config_value("keyboard.layout", optarg);
			break;
		case 'l':
			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
				lpc_print_supported_devices();
				exit(0);
			} else if (lpc_device_parse(optarg) != 0) {
				errx(EX_USAGE, "invalid lpc device "
				    "configuration '%s'", optarg);
			}
			break;
#ifdef BHYVE_SNAPSHOT
		case 'r':
			restore_file = optarg;
			break;
#endif
		case 's':
			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
				pci_print_supported_devices();
				exit(0);
			} else if (pci_parse_slot(optarg) != 0)
				exit(4);
			else
				break;
		case 'S':
			set_config_bool("memory.wired", true);
			break;
		case 'm':
			set_config_value("memory.size", optarg);
			break;
		case 'o':
			if (!parse_config_option(optarg))
				errx(EX_USAGE, "invalid configuration option '%s'", optarg);
			break;
		case 'H':
			set_config_bool("x86.vmexit_on_hlt", true);
			break;
		case 'I':
			/*
			 * The "-I" option was used to add an ioapic to the
			 * virtual machine.
			 *
			 * An ioapic is now provided unconditionally for each
			 * virtual machine and this option is now deprecated.
			 */
			break;
		case 'P':
			set_config_bool("x86.vmexit_on_pause", true);
			break;
		case 'e':
			set_config_bool("x86.strictio", true);
			break;
		case 'u':
			set_config_bool("rtc.use_localtime", false);
			break;
		case 'U':
			set_config_value("uuid", optarg);
			break;
		case 'w':
			set_config_bool("x86.strictmsr", false);
			break;
		case 'W':
			set_config_bool("virtio_msix", false);
			break;
		case 'x':
			set_config_bool("x86.x2apic", true);
			break;
		case 'Y':
			set_config_bool("x86.mptable", false);
			break;
		case 'h':
			usage(0);
		default:
			usage(1);
		}
	}
	argc -= optind;
	argv += optind;

	if (argc > 1)
		usage(1);

#ifdef BHYVE_SNAPSHOT
	if (restore_file != NULL) {
		error = load_restore_file(restore_file, &rstate);
		if (error) {
			fprintf(stderr, "Failed to read checkpoint info from "
			    "file: '%s'.\n", restore_file);
			exit(1);
		}
		vmname = lookup_vmname(&rstate);
		if (vmname != NULL)
			set_config_value("name", vmname);
	}
#endif

	if (argc == 1)
		set_config_value("name", argv[0]);

	vmname = get_config_value("name");
	if (vmname == NULL)
		usage(1);

	if (get_config_bool_default("config.dump", false)) {
		dump_config();
		exit(1);
	}

	calc_topology();
	build_vcpumaps();

	value = get_config_value("memory.size");
	error = vm_parse_memsize(value, &memsize);
	if (error)
		errx(EX_USAGE, "invalid memsize '%s'", value);

	ctx = do_open(vmname);

#ifdef BHYVE_SNAPSHOT
	if (restore_file != NULL) {
		guest_ncpus = lookup_guest_ncpus(&rstate);
		memflags = lookup_memflags(&rstate);
		memsize = lookup_memsize(&rstate);
	}

	if (guest_ncpus < 1) {
		fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
		exit(1);
	}
#endif

	bsp = vm_vcpu_open(ctx, BSP);
	max_vcpus = num_vcpus_allowed(ctx, bsp);
	if (guest_ncpus > max_vcpus) {
		fprintf(stderr, "%d vCPUs requested but only %d available\n",
		    guest_ncpus, max_vcpus);
		exit(4);
	}

	fbsdrun_set_capabilities(bsp);

	/* Allocate per-VCPU resources. */
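	/*
	 * vcpu_info[] is indexed by vcpu id; the handles stored here are
	 * also used by the INIT/SIPI exit handler and the snapshot restore
	 * path to suspend and resume individual vcpus.
	 */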
	vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info));
	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) {
		vcpu_info[vcpuid].ctx = ctx;
		vcpu_info[vcpuid].vcpuid = vcpuid;
		if (vcpuid == BSP)
			vcpu_info[vcpuid].vcpu = bsp;
		else
			vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
	}

	memflags = 0;
	if (get_config_bool_default("memory.wired", false))
		memflags |= VM_MEM_F_WIRED;
	if (get_config_bool_default("memory.guest_in_core", false))
		memflags |= VM_MEM_F_INCORE;
	vm_set_memflags(ctx, memflags);
	error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
	if (error) {
		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
		exit(4);
	}

	error = init_msr();
	if (error) {
		fprintf(stderr, "init_msr error %d", error);
		exit(4);
	}

	init_mem(guest_ncpus);
	init_inout();
	kernemu_dev_init();
	init_bootrom(ctx);
	atkbdc_init(ctx);
	pci_irq_init(ctx);
	ioapic_init(ctx);

	rtc_init(ctx);
	sci_init(ctx);

	if (qemu_fwcfg_init(ctx) != 0) {
		fprintf(stderr, "qemu fwcfg initialization error");
		exit(4);
	}

	if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus),
	    &guest_ncpus) != 0) {
		fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu");
		exit(4);
	}

	if (e820_init(ctx) != 0) {
		fprintf(stderr, "Unable to setup E820");
		exit(4);
	}

	/*
	 * Exit if a device emulation finds an error in its initialization
	 */
	if (init_pci(ctx) != 0) {
		perror("device emulation initialization error");
		exit(4);
	}

	/*
	 * Initialize after PCI, to allow a bootrom file to reserve the high
	 * region.
	 */
	if (get_config_bool("acpi_tables"))
		vmgenc_init(ctx);

	init_gdb(ctx);

	if (lpc_bootrom()) {
		if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) {
			fprintf(stderr, "ROM boot failed: unrestricted guest "
			    "capability not available\n");
			exit(4);
		}
		error = vcpu_reset(bsp);
		assert(error == 0);
	}

	/*
	 * Add all vCPUs.
	 */
	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
		spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP);

#ifdef BHYVE_SNAPSHOT
	if (restore_file != NULL) {
		fprintf(stdout, "Pausing pci devs...\r\n");
		if (vm_pause_devices() != 0) {
			fprintf(stderr, "Failed to pause PCI device state.\n");
			exit(1);
		}

		fprintf(stdout, "Restoring vm mem...\r\n");
		if (restore_vm_mem(ctx, &rstate) != 0) {
			fprintf(stderr, "Failed to restore VM memory.\n");
			exit(1);
		}

		fprintf(stdout, "Restoring pci devs...\r\n");
		if (vm_restore_devices(&rstate) != 0) {
			fprintf(stderr, "Failed to restore PCI device state.\n");
			exit(1);
		}

		fprintf(stdout, "Restoring kernel structs...\r\n");
		if (vm_restore_kern_structs(ctx, &rstate) != 0) {
			fprintf(stderr, "Failed to restore kernel structs.\n");
			exit(1);
		}

		fprintf(stdout, "Resuming pci devs...\r\n");
		if (vm_resume_devices() != 0) {
			fprintf(stderr, "Failed to resume PCI device state.\n");
			exit(1);
		}
	}
#endif

	/*
	 * build the guest tables, MP etc.
	 */
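	/*
	 * The MP table can be suppressed with -Y (x86.mptable); ACPI tables
	 * are only generated when -A (acpi_tables) was requested.
	 */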
	if (get_config_bool_default("x86.mptable", true)) {
		error = mptable_build(ctx, guest_ncpus);
		if (error) {
			perror("error to build the guest tables");
			exit(4);
		}
	}

	error = smbios_build(ctx);
	if (error != 0)
		exit(4);

	if (get_config_bool("acpi_tables")) {
		error = acpi_build(ctx, guest_ncpus);
		assert(error == 0);
	}

	e820_fwcfg_item = e820_get_fwcfg_item();
	if (e820_fwcfg_item == NULL) {
		fprintf(stderr, "invalid e820 table");
		exit(4);
	}
	if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size,
	    e820_fwcfg_item->data) != 0) {
		fprintf(stderr, "could not add qemu fwcfg etc/e820");
		exit(4);
	}
	free(e820_fwcfg_item);

	if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) {
		fwctl_init();
	}

	/*
	 * Change the proc title to include the VM name.
	 */
	setproctitle("%s", vmname);

#ifdef BHYVE_SNAPSHOT
	/* initialize mutex/cond variables */
	init_snapshot();

	/*
	 * checkpointing thread for communication with bhyvectl
	 */
	if (init_checkpoint_thread(ctx) != 0)
		errx(EX_OSERR, "Failed to start checkpoint thread");
#endif

#ifndef WITHOUT_CAPSICUM
	caph_cache_catpages();

	if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");

	if (caph_enter() == -1)
		errx(EX_OSERR, "cap_enter() failed");
#endif

#ifdef BHYVE_SNAPSHOT
	if (restore_file != NULL) {
		destroy_restore_state(&rstate);
		if (vm_restore_time(ctx) < 0)
			err(EX_OSERR, "Unable to restore time");

		for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
			vm_resume_cpu(vcpu_info[vcpuid].vcpu);
	} else
#endif
		vm_resume_cpu(bsp);

	/*
	 * Head off to the main event dispatch loop
	 */
	mevent_dispatch();

	exit(4);
}