1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/types.h> 35 #ifndef WITHOUT_CAPSICUM 36 #include <sys/capsicum.h> 37 #endif 38 #include <sys/mman.h> 39 #ifdef BHYVE_SNAPSHOT 40 #include <sys/socket.h> 41 #include <sys/stat.h> 42 #endif 43 #include <sys/time.h> 44 #ifdef BHYVE_SNAPSHOT 45 #include <sys/un.h> 46 #endif 47 48 #include <amd64/vmm/intel/vmcs.h> 49 #include <x86/apicreg.h> 50 51 #include <machine/atomic.h> 52 #include <machine/segments.h> 53 54 #ifndef WITHOUT_CAPSICUM 55 #include <capsicum_helpers.h> 56 #endif 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <string.h> 60 #include <err.h> 61 #include <errno.h> 62 #ifdef BHYVE_SNAPSHOT 63 #include <fcntl.h> 64 #endif 65 #include <libgen.h> 66 #include <unistd.h> 67 #include <assert.h> 68 #include <pthread.h> 69 #include <pthread_np.h> 70 #include <sysexits.h> 71 #include <stdbool.h> 72 #include <stdint.h> 73 #ifdef BHYVE_SNAPSHOT 74 #include <ucl.h> 75 #include <unistd.h> 76 77 #include <libxo/xo.h> 78 #endif 79 80 #include <machine/vmm.h> 81 #ifndef WITHOUT_CAPSICUM 82 #include <machine/vmm_dev.h> 83 #endif 84 #include <machine/vmm_instruction_emul.h> 85 #include <vmmapi.h> 86 87 #include "bhyverun.h" 88 #include "acpi.h" 89 #include "atkbdc.h" 90 #include "bootrom.h" 91 #include "config.h" 92 #include "inout.h" 93 #include "debug.h" 94 #include "e820.h" 95 #include "fwctl.h" 96 #include "gdb.h" 97 #include "ioapic.h" 98 #include "kernemu_dev.h" 99 #include "mem.h" 100 #include "mevent.h" 101 #ifdef BHYVE_SNAPSHOT 102 #include "migration.h" 103 #endif 104 #include "mptbl.h" 105 #include "pci_emul.h" 106 #include "pci_irq.h" 107 #include "pci_lpc.h" 108 #include "qemu_fwcfg.h" 109 #include "smbiostbl.h" 110 #ifdef BHYVE_SNAPSHOT 111 #include "snapshot.h" 112 #endif 113 #include "xmsr.h" 114 #include "spinup_ap.h" 115 #include "rtc.h" 116 #include "vmgenc.h" 117 118 #define MB (1024UL * 1024) 119 #define GB (1024UL * MB) 120 121 static const char * const vmx_exit_reason_desc[] = { 122 [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", 123 [EXIT_REASON_EXT_INTR] = "External interrupt", 124 [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", 125 [EXIT_REASON_INIT] = "INIT signal", 126 [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", 127 [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", 128 [EXIT_REASON_SMI] = "Other SMI", 129 [EXIT_REASON_INTR_WINDOW] = "Interrupt window", 130 [EXIT_REASON_NMI_WINDOW] = "NMI window", 131 [EXIT_REASON_TASK_SWITCH] = "Task switch", 132 [EXIT_REASON_CPUID] = "CPUID", 133 [EXIT_REASON_GETSEC] = "GETSEC", 134 [EXIT_REASON_HLT] = "HLT", 135 [EXIT_REASON_INVD] = "INVD", 136 [EXIT_REASON_INVLPG] = "INVLPG", 137 [EXIT_REASON_RDPMC] = "RDPMC", 138 [EXIT_REASON_RDTSC] = "RDTSC", 139 [EXIT_REASON_RSM] = "RSM", 140 [EXIT_REASON_VMCALL] = "VMCALL", 141 [EXIT_REASON_VMCLEAR] = "VMCLEAR", 142 [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", 143 [EXIT_REASON_VMPTRLD] = "VMPTRLD", 144 [EXIT_REASON_VMPTRST] = "VMPTRST", 145 [EXIT_REASON_VMREAD] = "VMREAD", 146 [EXIT_REASON_VMRESUME] = "VMRESUME", 147 [EXIT_REASON_VMWRITE] = "VMWRITE", 148 [EXIT_REASON_VMXOFF] = "VMXOFF", 149 [EXIT_REASON_VMXON] = "VMXON", 150 [EXIT_REASON_CR_ACCESS] = "Control-register accesses", 151 [EXIT_REASON_DR_ACCESS] = "MOV DR", 152 [EXIT_REASON_INOUT] = "I/O instruction", 153 [EXIT_REASON_RDMSR] = "RDMSR", 154 [EXIT_REASON_WRMSR] = "WRMSR", 155 [EXIT_REASON_INVAL_VMCS] = 156 "VM-entry failure due to invalid guest state", 157 [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", 158 [EXIT_REASON_MWAIT] = "MWAIT", 159 [EXIT_REASON_MTF] = "Monitor trap flag", 160 [EXIT_REASON_MONITOR] = "MONITOR", 161 [EXIT_REASON_PAUSE] = "PAUSE", 162 [EXIT_REASON_MCE_DURING_ENTRY] = 163 "VM-entry failure due to machine-check event", 164 [EXIT_REASON_TPR] = "TPR below threshold", 165 [EXIT_REASON_APIC_ACCESS] = "APIC access", 166 [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", 167 [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", 168 [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", 169 [EXIT_REASON_EPT_FAULT] = "EPT violation", 170 [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", 171 [EXIT_REASON_INVEPT] = "INVEPT", 172 [EXIT_REASON_RDTSCP] = "RDTSCP", 173 [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", 174 [EXIT_REASON_INVVPID] = "INVVPID", 175 [EXIT_REASON_WBINVD] = "WBINVD", 176 [EXIT_REASON_XSETBV] = "XSETBV", 177 [EXIT_REASON_APIC_WRITE] = "APIC write", 178 [EXIT_REASON_RDRAND] = "RDRAND", 179 [EXIT_REASON_INVPCID] = "INVPCID", 180 [EXIT_REASON_VMFUNC] = "VMFUNC", 181 [EXIT_REASON_ENCLS] = "ENCLS", 182 [EXIT_REASON_RDSEED] = "RDSEED", 183 [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", 184 [EXIT_REASON_XSAVES] = "XSAVES", 185 [EXIT_REASON_XRSTORS] = "XRSTORS" 186 }; 187 188 typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *); 189 190 int guest_ncpus; 191 uint16_t cpu_cores, cpu_sockets, cpu_threads; 192 193 int raw_stdio = 0; 194 195 static char *progname; 196 static const int BSP = 0; 197 198 static cpuset_t cpumask; 199 200 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); 201 202 static struct vcpu_info { 203 struct vmctx *ctx; 204 struct vcpu *vcpu; 205 int vcpuid; 206 } *vcpu_info; 207 208 static cpuset_t **vcpumap; 209 210 static void 211 usage(int code) 212 { 213 214 fprintf(stderr, 215 "Usage: %s [-AaCDeHhPSuWwxY]\n" 216 " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" 217 " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" 218 " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" 219 " -A: create ACPI tables\n" 220 " -a: local apic is in xAPIC mode (deprecated)\n" 221 " -C: include guest memory in core file\n" 222 " -c: number of CPUs and/or topology specification\n" 223 " -D: destroy on power-off\n" 224 " -e: exit on unhandled I/O access\n" 225 " -G: start a debug server\n" 226 " -H: vmexit from the guest on HLT\n" 227 " -h: help\n" 228 " -k: key=value flat config file\n" 229 " -K: PS2 keyboard layout\n" 230 " -l: LPC device configuration\n" 231 " -m: memory size\n" 232 " -o: set config 'var' to 'value'\n" 233 " -P: vmexit from the guest on pause\n" 234 " -p: pin 'vcpu' to 'hostcpu'\n" 235 #ifdef BHYVE_SNAPSHOT 236 " -r: path to checkpoint file\n" 237 " -R: <host[:port]> the source vm host and port for migration\n" 238 #endif 239 " -S: guest memory cannot be swapped\n" 240 " -s: <slot,driver,configinfo> PCI slot config\n" 241 " -U: UUID\n" 242 " -u: RTC keeps UTC time\n" 243 " -W: force virtio to use single-vector MSI\n" 244 " -w: ignore unimplemented MSRs\n" 245 " -x: local APIC is in x2APIC mode\n" 246 " -Y: disable MPtable generation\n", 247 progname, (int)strlen(progname), "", (int)strlen(progname), "", 248 (int)strlen(progname), ""); 249 250 exit(code); 251 } 252 253 /* 254 * XXX This parser is known to have the following issues: 255 * 1. It accepts null key=value tokens ",," as setting "cpus" to an 256 * empty string. 257 * 258 * The acceptance of a null specification ('-c ""') is by design to match the 259 * manual page syntax specification, this results in a topology of 1 vCPU. 260 */ 261 static int 262 topology_parse(const char *opt) 263 { 264 char *cp, *str, *tofree; 265 266 if (*opt == '\0') { 267 set_config_value("sockets", "1"); 268 set_config_value("cores", "1"); 269 set_config_value("threads", "1"); 270 set_config_value("cpus", "1"); 271 return (0); 272 } 273 274 tofree = str = strdup(opt); 275 if (str == NULL) 276 errx(4, "Failed to allocate memory"); 277 278 while ((cp = strsep(&str, ",")) != NULL) { 279 if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) 280 set_config_value("cpus", cp + strlen("cpus=")); 281 else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) 282 set_config_value("sockets", cp + strlen("sockets=")); 283 else if (strncmp(cp, "cores=", strlen("cores=")) == 0) 284 set_config_value("cores", cp + strlen("cores=")); 285 else if (strncmp(cp, "threads=", strlen("threads=")) == 0) 286 set_config_value("threads", cp + strlen("threads=")); 287 else if (strchr(cp, '=') != NULL) 288 goto out; 289 else 290 set_config_value("cpus", cp); 291 } 292 free(tofree); 293 return (0); 294 295 out: 296 free(tofree); 297 return (-1); 298 } 299 300 static int 301 parse_int_value(const char *key, const char *value, int minval, int maxval) 302 { 303 char *cp; 304 long lval; 305 306 errno = 0; 307 lval = strtol(value, &cp, 0); 308 if (errno != 0 || *cp != '\0' || cp == value || lval < minval || 309 lval > maxval) 310 errx(4, "Invalid value for %s: '%s'", key, value); 311 return (lval); 312 } 313 314 /* 315 * Set the sockets, cores, threads, and guest_cpus variables based on 316 * the configured topology. 317 * 318 * The limits of UINT16_MAX are due to the types passed to 319 * vm_set_topology(). vmm.ko may enforce tighter limits. 320 */ 321 static void 322 calc_topology(void) 323 { 324 const char *value; 325 bool explicit_cpus; 326 uint64_t ncpus; 327 328 value = get_config_value("cpus"); 329 if (value != NULL) { 330 guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); 331 explicit_cpus = true; 332 } else { 333 guest_ncpus = 1; 334 explicit_cpus = false; 335 } 336 value = get_config_value("cores"); 337 if (value != NULL) 338 cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); 339 else 340 cpu_cores = 1; 341 value = get_config_value("threads"); 342 if (value != NULL) 343 cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); 344 else 345 cpu_threads = 1; 346 value = get_config_value("sockets"); 347 if (value != NULL) 348 cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); 349 else 350 cpu_sockets = guest_ncpus; 351 352 /* 353 * Compute sockets * cores * threads avoiding overflow. The 354 * range check above insures these are 16 bit values. 355 */ 356 ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; 357 if (ncpus > UINT16_MAX) 358 errx(4, "Computed number of vCPUs too high: %ju", 359 (uintmax_t)ncpus); 360 361 if (explicit_cpus) { 362 if (guest_ncpus != (int)ncpus) 363 errx(4, "Topology (%d sockets, %d cores, %d threads) " 364 "does not match %d vCPUs", 365 cpu_sockets, cpu_cores, cpu_threads, 366 guest_ncpus); 367 } else 368 guest_ncpus = ncpus; 369 } 370 371 static int 372 pincpu_parse(const char *opt) 373 { 374 const char *value; 375 char *newval; 376 char key[16]; 377 int vcpu, pcpu; 378 379 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { 380 fprintf(stderr, "invalid format: %s\n", opt); 381 return (-1); 382 } 383 384 if (vcpu < 0) { 385 fprintf(stderr, "invalid vcpu '%d'\n", vcpu); 386 return (-1); 387 } 388 389 if (pcpu < 0 || pcpu >= CPU_SETSIZE) { 390 fprintf(stderr, "hostcpu '%d' outside valid range from " 391 "0 to %d\n", pcpu, CPU_SETSIZE - 1); 392 return (-1); 393 } 394 395 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); 396 value = get_config_value(key); 397 398 if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", 399 value != NULL ? "," : "", pcpu) == -1) { 400 perror("failed to build new cpuset string"); 401 return (-1); 402 } 403 404 set_config_value(key, newval); 405 free(newval); 406 return (0); 407 } 408 409 static void 410 parse_cpuset(int vcpu, const char *list, cpuset_t *set) 411 { 412 char *cp, *token; 413 int pcpu, start; 414 415 CPU_ZERO(set); 416 start = -1; 417 token = __DECONST(char *, list); 418 for (;;) { 419 pcpu = strtoul(token, &cp, 0); 420 if (cp == token) 421 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); 422 if (pcpu < 0 || pcpu >= CPU_SETSIZE) 423 errx(4, "hostcpu '%d' outside valid range from 0 to %d", 424 pcpu, CPU_SETSIZE - 1); 425 switch (*cp) { 426 case ',': 427 case '\0': 428 if (start >= 0) { 429 if (start > pcpu) 430 errx(4, "Invalid hostcpu range %d-%d", 431 start, pcpu); 432 while (start < pcpu) { 433 CPU_SET(start, set); 434 start++; 435 } 436 start = -1; 437 } 438 CPU_SET(pcpu, set); 439 break; 440 case '-': 441 if (start >= 0) 442 errx(4, "invalid cpuset for vcpu %d: '%s'", 443 vcpu, list); 444 start = pcpu; 445 break; 446 default: 447 errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); 448 } 449 if (*cp == '\0') 450 break; 451 token = cp + 1; 452 } 453 } 454 455 static void 456 build_vcpumaps(void) 457 { 458 char key[16]; 459 const char *value; 460 int vcpu; 461 462 vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); 463 for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { 464 snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); 465 value = get_config_value(key); 466 if (value == NULL) 467 continue; 468 vcpumap[vcpu] = malloc(sizeof(cpuset_t)); 469 if (vcpumap[vcpu] == NULL) 470 err(4, "Failed to allocate cpuset for vcpu %d", vcpu); 471 parse_cpuset(vcpu, value, vcpumap[vcpu]); 472 } 473 } 474 475 void 476 vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, 477 int errcode) 478 { 479 int error, restart_instruction; 480 481 restart_instruction = 1; 482 483 error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, 484 restart_instruction); 485 assert(error == 0); 486 } 487 488 void * 489 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) 490 { 491 492 return (vm_map_gpa(ctx, gaddr, len)); 493 } 494 495 #ifdef BHYVE_SNAPSHOT 496 uintptr_t 497 paddr_host2guest(struct vmctx *ctx, void *addr) 498 { 499 return (vm_rev_map_gpa(ctx, addr)); 500 } 501 #endif 502 503 int 504 fbsdrun_virtio_msix(void) 505 { 506 507 return (get_config_bool_default("virtio_msix", true)); 508 } 509 510 static void * 511 fbsdrun_start_thread(void *param) 512 { 513 char tname[MAXCOMLEN + 1]; 514 struct vcpu_info *vi = param; 515 int error; 516 517 snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); 518 pthread_set_name_np(pthread_self(), tname); 519 520 if (vcpumap[vi->vcpuid] != NULL) { 521 error = pthread_setaffinity_np(pthread_self(), 522 sizeof(cpuset_t), vcpumap[vi->vcpuid]); 523 assert(error == 0); 524 } 525 526 #ifdef BHYVE_SNAPSHOT 527 checkpoint_cpu_add(vi->vcpuid); 528 #endif 529 gdb_cpu_add(vi->vcpu); 530 531 vm_loop(vi->ctx, vi->vcpu); 532 533 /* not reached */ 534 exit(1); 535 return (NULL); 536 } 537 538 static void 539 fbsdrun_addcpu(struct vcpu_info *vi) 540 { 541 pthread_t thr; 542 int error; 543 544 error = vm_activate_cpu(vi->vcpu); 545 if (error != 0) 546 err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); 547 548 CPU_SET_ATOMIC(vi->vcpuid, &cpumask); 549 550 vm_suspend_cpu(vi->vcpu); 551 552 error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); 553 assert(error == 0); 554 } 555 556 static int 557 fbsdrun_deletecpu(int vcpu) 558 { 559 560 if (!CPU_ISSET(vcpu, &cpumask)) { 561 fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); 562 exit(4); 563 } 564 565 CPU_CLR_ATOMIC(vcpu, &cpumask); 566 return (CPU_EMPTY(&cpumask)); 567 } 568 569 static int 570 vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) 571 { 572 struct vm_exit *vme; 573 int error; 574 int bytes, port, in; 575 576 vme = vmrun->vm_exit; 577 port = vme->u.inout.port; 578 bytes = vme->u.inout.bytes; 579 in = vme->u.inout.in; 580 581 error = emulate_inout(ctx, vcpu, vme); 582 if (error) { 583 fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", 584 in ? "in" : "out", 585 bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), 586 port, vme->rip); 587 return (VMEXIT_ABORT); 588 } else { 589 return (VMEXIT_CONTINUE); 590 } 591 } 592 593 static int 594 vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, 595 struct vm_run *vmrun) 596 { 597 struct vm_exit *vme; 598 uint64_t val; 599 uint32_t eax, edx; 600 int error; 601 602 vme = vmrun->vm_exit; 603 604 val = 0; 605 error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); 606 if (error != 0) { 607 fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", 608 vme->u.msr.code, vcpu_id(vcpu)); 609 if (get_config_bool("x86.strictmsr")) { 610 vm_inject_gp(vcpu); 611 return (VMEXIT_CONTINUE); 612 } 613 } 614 615 eax = val; 616 error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); 617 assert(error == 0); 618 619 edx = val >> 32; 620 error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); 621 assert(error == 0); 622 623 return (VMEXIT_CONTINUE); 624 } 625 626 static int 627 vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, 628 struct vm_run *vmrun) 629 { 630 struct vm_exit *vme; 631 int error; 632 633 vme = vmrun->vm_exit; 634 635 error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); 636 if (error != 0) { 637 fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", 638 vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); 639 if (get_config_bool("x86.strictmsr")) { 640 vm_inject_gp(vcpu); 641 return (VMEXIT_CONTINUE); 642 } 643 } 644 return (VMEXIT_CONTINUE); 645 } 646 647 #define DEBUG_EPT_MISCONFIG 648 #ifdef DEBUG_EPT_MISCONFIG 649 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 650 651 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; 652 static int ept_misconfig_ptenum; 653 #endif 654 655 static const char * 656 vmexit_vmx_desc(uint32_t exit_reason) 657 { 658 659 if (exit_reason >= nitems(vmx_exit_reason_desc) || 660 vmx_exit_reason_desc[exit_reason] == NULL) 661 return ("Unknown"); 662 return (vmx_exit_reason_desc[exit_reason]); 663 } 664 665 static int 666 vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) 667 { 668 struct vm_exit *vme; 669 670 vme = vmrun->vm_exit; 671 672 fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); 673 fprintf(stderr, "\treason\t\tVMX\n"); 674 fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); 675 fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); 676 fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); 677 fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, 678 vmexit_vmx_desc(vme->u.vmx.exit_reason)); 679 fprintf(stderr, "\tqualification\t0x%016lx\n", 680 vme->u.vmx.exit_qualification); 681 fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); 682 fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); 683 #ifdef DEBUG_EPT_MISCONFIG 684 if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { 685 vm_get_register(vcpu, 686 VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), 687 &ept_misconfig_gpa); 688 vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, 689 &ept_misconfig_ptenum); 690 fprintf(stderr, "\tEPT misconfiguration:\n"); 691 fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); 692 fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", 693 ept_misconfig_ptenum, ept_misconfig_pte[0], 694 ept_misconfig_pte[1], ept_misconfig_pte[2], 695 ept_misconfig_pte[3]); 696 } 697 #endif /* DEBUG_EPT_MISCONFIG */ 698 return (VMEXIT_ABORT); 699 } 700 701 static int 702 vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) 703 { 704 struct vm_exit *vme; 705 706 vme = vmrun->vm_exit; 707 708 fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); 709 fprintf(stderr, "\treason\t\tSVM\n"); 710 fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); 711 fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); 712 fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); 713 fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); 714 fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); 715 return (VMEXIT_ABORT); 716 } 717 718 static int 719 vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, 720 struct vm_run *vmrun) 721 { 722 assert(vmrun->vm_exit->inst_length == 0); 723 724 return (VMEXIT_CONTINUE); 725 } 726 727 static int 728 vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, 729 struct vm_run *vmrun) 730 { 731 assert(vmrun->vm_exit->inst_length == 0); 732 733 return (VMEXIT_CONTINUE); 734 } 735 736 static int 737 vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, 738 struct vm_run *vmrun __unused) 739 { 740 /* 741 * Just continue execution with the next instruction. We use 742 * the HLT VM exit as a way to be friendly with the host 743 * scheduler. 744 */ 745 return (VMEXIT_CONTINUE); 746 } 747 748 static int 749 vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, 750 struct vm_run *vmrun __unused) 751 { 752 return (VMEXIT_CONTINUE); 753 } 754 755 static int 756 vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, 757 struct vm_run *vmrun) 758 { 759 assert(vmrun->vm_exit->inst_length == 0); 760 761 #ifdef BHYVE_SNAPSHOT 762 checkpoint_cpu_suspend(vcpu_id(vcpu)); 763 #endif 764 gdb_cpu_mtrap(vcpu); 765 #ifdef BHYVE_SNAPSHOT 766 checkpoint_cpu_resume(vcpu_id(vcpu)); 767 #endif 768 769 return (VMEXIT_CONTINUE); 770 } 771 772 static int 773 vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, 774 struct vm_run *vmrun) 775 { 776 struct vm_exit *vme; 777 struct vie *vie; 778 int err, i, cs_d; 779 enum vm_cpu_mode mode; 780 781 vme = vmrun->vm_exit; 782 783 vie = &vme->u.inst_emul.vie; 784 if (!vie->decoded) { 785 /* 786 * Attempt to decode in userspace as a fallback. This allows 787 * updating instruction decode in bhyve without rebooting the 788 * kernel (rapid prototyping), albeit with much slower 789 * emulation. 790 */ 791 vie_restart(vie); 792 mode = vme->u.inst_emul.paging.cpu_mode; 793 cs_d = vme->u.inst_emul.cs_d; 794 if (vmm_decode_instruction(mode, cs_d, vie) != 0) 795 goto fail; 796 if (vm_set_register(vcpu, VM_REG_GUEST_RIP, 797 vme->rip + vie->num_processed) != 0) 798 goto fail; 799 } 800 801 err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, 802 &vme->u.inst_emul.paging); 803 if (err) { 804 if (err == ESRCH) { 805 EPRINTLN("Unhandled memory access to 0x%lx\n", 806 vme->u.inst_emul.gpa); 807 } 808 goto fail; 809 } 810 811 return (VMEXIT_CONTINUE); 812 813 fail: 814 fprintf(stderr, "Failed to emulate instruction sequence [ "); 815 for (i = 0; i < vie->num_valid; i++) 816 fprintf(stderr, "%02x", vie->inst[i]); 817 FPRINTLN(stderr, " ] at 0x%lx", vme->rip); 818 return (VMEXIT_ABORT); 819 } 820 821 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; 822 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; 823 824 static int 825 vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) 826 { 827 struct vm_exit *vme; 828 enum vm_suspend_how how; 829 int vcpuid = vcpu_id(vcpu); 830 831 vme = vmrun->vm_exit; 832 833 how = vme->u.suspended.how; 834 835 fbsdrun_deletecpu(vcpuid); 836 837 if (vcpuid != BSP) { 838 pthread_mutex_lock(&resetcpu_mtx); 839 pthread_cond_signal(&resetcpu_cond); 840 pthread_mutex_unlock(&resetcpu_mtx); 841 pthread_exit(NULL); 842 } 843 844 pthread_mutex_lock(&resetcpu_mtx); 845 while (!CPU_EMPTY(&cpumask)) { 846 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); 847 } 848 pthread_mutex_unlock(&resetcpu_mtx); 849 850 switch (how) { 851 case VM_SUSPEND_RESET: 852 exit(0); 853 case VM_SUSPEND_POWEROFF: 854 if (get_config_bool_default("destroy_on_poweroff", false)) 855 vm_destroy(ctx); 856 exit(1); 857 case VM_SUSPEND_HALT: 858 exit(2); 859 case VM_SUSPEND_TRIPLEFAULT: 860 exit(3); 861 default: 862 fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); 863 exit(100); 864 } 865 return (0); /* NOTREACHED */ 866 } 867 868 static int 869 vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, 870 struct vm_run *vmrun __unused) 871 { 872 873 #ifdef BHYVE_SNAPSHOT 874 checkpoint_cpu_suspend(vcpu_id(vcpu)); 875 #endif 876 gdb_cpu_suspend(vcpu); 877 #ifdef BHYVE_SNAPSHOT 878 checkpoint_cpu_resume(vcpu_id(vcpu)); 879 #endif 880 /* 881 * XXX-MJ sleep for a short period to avoid chewing up the CPU in the 882 * window between activation of the vCPU thread and the STARTUP IPI. 883 */ 884 usleep(1000); 885 return (VMEXIT_CONTINUE); 886 } 887 888 static int 889 vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, 890 struct vm_run *vmrun) 891 { 892 gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); 893 return (VMEXIT_CONTINUE); 894 } 895 896 static int 897 vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, 898 struct vm_run *vmrun) 899 { 900 struct vm_exit *vme; 901 cpuset_t *dmask; 902 int error = -1; 903 int i; 904 905 dmask = vmrun->cpuset; 906 vme = vmrun->vm_exit; 907 908 switch (vme->u.ipi.mode) { 909 case APIC_DELMODE_INIT: 910 CPU_FOREACH_ISSET(i, dmask) { 911 error = vm_suspend_cpu(vcpu_info[i].vcpu); 912 if (error) { 913 warnx("%s: failed to suspend cpu %d\n", 914 __func__, i); 915 break; 916 } 917 } 918 break; 919 case APIC_DELMODE_STARTUP: 920 CPU_FOREACH_ISSET(i, dmask) { 921 spinup_ap(vcpu_info[i].vcpu, 922 vme->u.ipi.vector << PAGE_SHIFT); 923 } 924 error = 0; 925 break; 926 default: 927 break; 928 } 929 930 return (error); 931 } 932 933 static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 934 [VM_EXITCODE_INOUT] = vmexit_inout, 935 [VM_EXITCODE_INOUT_STR] = vmexit_inout, 936 [VM_EXITCODE_VMX] = vmexit_vmx, 937 [VM_EXITCODE_SVM] = vmexit_svm, 938 [VM_EXITCODE_BOGUS] = vmexit_bogus, 939 [VM_EXITCODE_REQIDLE] = vmexit_reqidle, 940 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 941 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 942 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 943 [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, 944 [VM_EXITCODE_SUSPENDED] = vmexit_suspend, 945 [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, 946 [VM_EXITCODE_DEBUG] = vmexit_debug, 947 [VM_EXITCODE_BPT] = vmexit_breakpoint, 948 [VM_EXITCODE_IPI] = vmexit_ipi, 949 }; 950 951 static void 952 vm_loop(struct vmctx *ctx, struct vcpu *vcpu) 953 { 954 struct vm_exit vme; 955 struct vm_run vmrun; 956 int error, rc; 957 enum vm_exitcode exitcode; 958 cpuset_t active_cpus, dmask; 959 960 error = vm_active_cpus(ctx, &active_cpus); 961 assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); 962 963 vmrun.vm_exit = &vme; 964 vmrun.cpuset = &dmask; 965 vmrun.cpusetsize = sizeof(dmask); 966 967 while (1) { 968 error = vm_run(vcpu, &vmrun); 969 if (error != 0) 970 break; 971 972 exitcode = vme.exitcode; 973 if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { 974 fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", 975 exitcode); 976 exit(4); 977 } 978 979 rc = (*handler[exitcode])(ctx, vcpu, &vmrun); 980 981 switch (rc) { 982 case VMEXIT_CONTINUE: 983 break; 984 case VMEXIT_ABORT: 985 abort(); 986 default: 987 exit(4); 988 } 989 } 990 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 991 } 992 993 static int 994 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) 995 { 996 uint16_t sockets, cores, threads, maxcpus; 997 int tmp, error; 998 999 /* 1000 * The guest is allowed to spinup more than one processor only if the 1001 * UNRESTRICTED_GUEST capability is available. 1002 */ 1003 error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); 1004 if (error != 0) 1005 return (1); 1006 1007 error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); 1008 if (error == 0) 1009 return (maxcpus); 1010 else 1011 return (1); 1012 } 1013 1014 static void 1015 fbsdrun_set_capabilities(struct vcpu *vcpu, bool bsp) 1016 { 1017 int err, tmp; 1018 1019 if (get_config_bool_default("x86.vmexit_on_hlt", false)) { 1020 err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); 1021 if (err < 0) { 1022 fprintf(stderr, "VM exit on HLT not supported\n"); 1023 exit(4); 1024 } 1025 vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); 1026 if (bsp) 1027 handler[VM_EXITCODE_HLT] = vmexit_hlt; 1028 } 1029 1030 if (get_config_bool_default("x86.vmexit_on_pause", false)) { 1031 /* 1032 * pause exit support required for this mode 1033 */ 1034 err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); 1035 if (err < 0) { 1036 fprintf(stderr, 1037 "SMP mux requested, no pause support\n"); 1038 exit(4); 1039 } 1040 vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); 1041 if (bsp) 1042 handler[VM_EXITCODE_PAUSE] = vmexit_pause; 1043 } 1044 1045 if (get_config_bool_default("x86.x2apic", false)) 1046 err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); 1047 else 1048 err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); 1049 1050 if (err) { 1051 fprintf(stderr, "Unable to set x2apic state (%d)\n", err); 1052 exit(4); 1053 } 1054 1055 vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); 1056 1057 err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); 1058 assert(err == 0); 1059 } 1060 1061 static struct vmctx * 1062 do_open(const char *vmname) 1063 { 1064 struct vmctx *ctx; 1065 int error; 1066 bool reinit, romboot; 1067 1068 reinit = romboot = false; 1069 1070 if (lpc_bootrom()) 1071 romboot = true; 1072 1073 error = vm_create(vmname); 1074 if (error) { 1075 if (errno == EEXIST) { 1076 if (romboot) { 1077 reinit = true; 1078 } else { 1079 /* 1080 * The virtual machine has been setup by the 1081 * userspace bootloader. 1082 */ 1083 } 1084 } else { 1085 perror("vm_create"); 1086 exit(4); 1087 } 1088 } else { 1089 #ifndef BHYVE_SNAPSHOT 1090 if (!romboot) { 1091 #else 1092 if (!romboot && !get_config_bool_default("is_migrated", false)) { 1093 #endif 1094 /* 1095 * If the virtual machine was just created then a 1096 * bootrom must be configured to boot it. 1097 */ 1098 fprintf(stderr, "virtual machine cannot be booted\n"); 1099 exit(4); 1100 } 1101 } 1102 1103 ctx = vm_open(vmname); 1104 if (ctx == NULL) { 1105 perror("vm_open"); 1106 exit(4); 1107 } 1108 1109 #ifndef WITHOUT_CAPSICUM 1110 if (vm_limit_rights(ctx) != 0) 1111 err(EX_OSERR, "vm_limit_rights"); 1112 #endif 1113 1114 if (reinit) { 1115 error = vm_reinit(ctx); 1116 if (error) { 1117 perror("vm_reinit"); 1118 exit(4); 1119 } 1120 } 1121 error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); 1122 if (error) 1123 errx(EX_OSERR, "vm_set_topology"); 1124 return (ctx); 1125 } 1126 1127 static void 1128 spinup_vcpu(struct vcpu_info *vi, bool bsp) 1129 { 1130 int error; 1131 1132 if (!bsp) { 1133 fbsdrun_set_capabilities(vi->vcpu, false); 1134 1135 /* 1136 * Enable the 'unrestricted guest' mode for APs. 1137 * 1138 * APs startup in power-on 16-bit mode. 1139 */ 1140 error = vm_set_capability(vi->vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); 1141 assert(error == 0); 1142 } 1143 1144 fbsdrun_addcpu(vi); 1145 } 1146 1147 static bool 1148 parse_config_option(const char *option) 1149 { 1150 const char *value; 1151 char *path; 1152 1153 value = strchr(option, '='); 1154 if (value == NULL || value[1] == '\0') 1155 return (false); 1156 path = strndup(option, value - option); 1157 if (path == NULL) 1158 err(4, "Failed to allocate memory"); 1159 set_config_value(path, value + 1); 1160 return (true); 1161 } 1162 1163 static void 1164 parse_simple_config_file(const char *path) 1165 { 1166 FILE *fp; 1167 char *line, *cp; 1168 size_t linecap; 1169 unsigned int lineno; 1170 1171 fp = fopen(path, "r"); 1172 if (fp == NULL) 1173 err(4, "Failed to open configuration file %s", path); 1174 line = NULL; 1175 linecap = 0; 1176 lineno = 1; 1177 for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { 1178 if (*line == '#' || *line == '\n') 1179 continue; 1180 cp = strchr(line, '\n'); 1181 if (cp != NULL) 1182 *cp = '\0'; 1183 if (!parse_config_option(line)) 1184 errx(4, "%s line %u: invalid config option '%s'", path, 1185 lineno, line); 1186 } 1187 free(line); 1188 fclose(fp); 1189 } 1190 1191 static void 1192 parse_gdb_options(const char *opt) 1193 { 1194 const char *sport; 1195 char *colon; 1196 1197 if (opt[0] == 'w') { 1198 set_config_bool("gdb.wait", true); 1199 opt++; 1200 } 1201 1202 colon = strrchr(opt, ':'); 1203 if (colon == NULL) { 1204 sport = opt; 1205 } else { 1206 *colon = '\0'; 1207 colon++; 1208 sport = colon; 1209 set_config_value("gdb.address", opt); 1210 } 1211 1212 set_config_value("gdb.port", sport); 1213 } 1214 1215 static void 1216 set_defaults(void) 1217 { 1218 1219 set_config_bool("acpi_tables", false); 1220 set_config_value("memory.size", "256M"); 1221 set_config_bool("x86.strictmsr", true); 1222 set_config_value("lpc.fwcfg", "bhyve"); 1223 } 1224 1225 int 1226 main(int argc, char *argv[]) 1227 { 1228 int c, error; 1229 int max_vcpus, memflags; 1230 struct vcpu *bsp; 1231 struct vmctx *ctx; 1232 struct qemu_fwcfg_item *e820_fwcfg_item; 1233 uint64_t rip; 1234 size_t memsize; 1235 const char *optstr, *value, *vmname; 1236 #ifdef BHYVE_SNAPSHOT 1237 char *restore_file; 1238 char *migration_host; 1239 struct restore_state rstate; 1240 1241 restore_file = NULL; 1242 migration_host = NULL; 1243 #endif 1244 1245 init_config(); 1246 set_defaults(); 1247 progname = basename(argv[0]); 1248 1249 #ifdef BHYVE_SNAPSHOT 1250 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:R:"; 1251 #else 1252 optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; 1253 #endif 1254 while ((c = getopt(argc, argv, optstr)) != -1) { 1255 switch (c) { 1256 case 'a': 1257 set_config_bool("x86.x2apic", false); 1258 break; 1259 case 'A': 1260 set_config_bool("acpi_tables", true); 1261 break; 1262 case 'D': 1263 set_config_bool("destroy_on_poweroff", true); 1264 break; 1265 case 'p': 1266 if (pincpu_parse(optarg) != 0) { 1267 errx(EX_USAGE, "invalid vcpu pinning " 1268 "configuration '%s'", optarg); 1269 } 1270 break; 1271 case 'c': 1272 if (topology_parse(optarg) != 0) { 1273 errx(EX_USAGE, "invalid cpu topology " 1274 "'%s'", optarg); 1275 } 1276 break; 1277 case 'C': 1278 set_config_bool("memory.guest_in_core", true); 1279 break; 1280 case 'f': 1281 if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { 1282 errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); 1283 } 1284 break; 1285 case 'G': 1286 parse_gdb_options(optarg); 1287 break; 1288 case 'k': 1289 parse_simple_config_file(optarg); 1290 break; 1291 case 'K': 1292 set_config_value("keyboard.layout", optarg); 1293 break; 1294 case 'l': 1295 if (strncmp(optarg, "help", strlen(optarg)) == 0) { 1296 lpc_print_supported_devices(); 1297 exit(0); 1298 } else if (lpc_device_parse(optarg) != 0) { 1299 errx(EX_USAGE, "invalid lpc device " 1300 "configuration '%s'", optarg); 1301 } 1302 break; 1303 #ifdef BHYVE_SNAPSHOT 1304 case 'r': 1305 restore_file = optarg; 1306 break; 1307 case 'R': 1308 migration_host = optarg; 1309 set_config_bool("is_migrated", true); 1310 break; 1311 #endif 1312 case 's': 1313 if (strncmp(optarg, "help", strlen(optarg)) == 0) { 1314 pci_print_supported_devices(); 1315 exit(0); 1316 } else if (pci_parse_slot(optarg) != 0) 1317 exit(4); 1318 else 1319 break; 1320 case 'S': 1321 set_config_bool("memory.wired", true); 1322 break; 1323 case 'm': 1324 set_config_value("memory.size", optarg); 1325 break; 1326 case 'o': 1327 if (!parse_config_option(optarg)) 1328 errx(EX_USAGE, "invalid configuration option '%s'", optarg); 1329 break; 1330 case 'H': 1331 set_config_bool("x86.vmexit_on_hlt", true); 1332 break; 1333 case 'I': 1334 /* 1335 * The "-I" option was used to add an ioapic to the 1336 * virtual machine. 1337 * 1338 * An ioapic is now provided unconditionally for each 1339 * virtual machine and this option is now deprecated. 1340 */ 1341 break; 1342 case 'P': 1343 set_config_bool("x86.vmexit_on_pause", true); 1344 break; 1345 case 'e': 1346 set_config_bool("x86.strictio", true); 1347 break; 1348 case 'u': 1349 set_config_bool("rtc.use_localtime", false); 1350 break; 1351 case 'U': 1352 set_config_value("uuid", optarg); 1353 break; 1354 case 'w': 1355 set_config_bool("x86.strictmsr", false); 1356 break; 1357 case 'W': 1358 set_config_bool("virtio_msix", false); 1359 break; 1360 case 'x': 1361 set_config_bool("x86.x2apic", true); 1362 break; 1363 case 'Y': 1364 set_config_bool("x86.mptable", false); 1365 break; 1366 case 'h': 1367 usage(0); 1368 default: 1369 usage(1); 1370 } 1371 } 1372 argc -= optind; 1373 argv += optind; 1374 1375 if (argc > 1) 1376 usage(1); 1377 1378 #ifdef BHYVE_SNAPSHOT 1379 if (restore_file != NULL) { 1380 error = load_restore_file(restore_file, &rstate); 1381 if (error) { 1382 fprintf(stderr, "Failed to read checkpoint info from " 1383 "file: '%s'.\n", restore_file); 1384 exit(1); 1385 } 1386 vmname = lookup_vmname(&rstate); 1387 if (vmname != NULL) 1388 set_config_value("name", vmname); 1389 } 1390 #endif 1391 1392 if (argc == 1) 1393 set_config_value("name", argv[0]); 1394 1395 vmname = get_config_value("name"); 1396 if (vmname == NULL) 1397 usage(1); 1398 1399 if (get_config_bool_default("config.dump", false)) { 1400 dump_config(); 1401 exit(1); 1402 } 1403 1404 calc_topology(); 1405 build_vcpumaps(); 1406 1407 value = get_config_value("memory.size"); 1408 error = vm_parse_memsize(value, &memsize); 1409 if (error) 1410 errx(EX_USAGE, "invalid memsize '%s'", value); 1411 1412 ctx = do_open(vmname); 1413 1414 #ifdef BHYVE_SNAPSHOT 1415 if (restore_file != NULL) { 1416 guest_ncpus = lookup_guest_ncpus(&rstate); 1417 memflags = lookup_memflags(&rstate); 1418 memsize = lookup_memsize(&rstate); 1419 } 1420 1421 if (guest_ncpus < 1) { 1422 fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); 1423 exit(1); 1424 } 1425 #endif 1426 1427 bsp = vm_vcpu_open(ctx, BSP); 1428 max_vcpus = num_vcpus_allowed(ctx, bsp); 1429 if (guest_ncpus > max_vcpus) { 1430 fprintf(stderr, "%d vCPUs requested but only %d available\n", 1431 guest_ncpus, max_vcpus); 1432 exit(4); 1433 } 1434 1435 fbsdrun_set_capabilities(bsp, true); 1436 1437 /* Allocate per-VCPU resources. */ 1438 vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); 1439 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { 1440 vcpu_info[vcpuid].ctx = ctx; 1441 vcpu_info[vcpuid].vcpuid = vcpuid; 1442 if (vcpuid == BSP) 1443 vcpu_info[vcpuid].vcpu = bsp; 1444 else 1445 vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); 1446 } 1447 1448 memflags = 0; 1449 if (get_config_bool_default("memory.wired", false)) 1450 memflags |= VM_MEM_F_WIRED; 1451 if (get_config_bool_default("memory.guest_in_core", false)) 1452 memflags |= VM_MEM_F_INCORE; 1453 vm_set_memflags(ctx, memflags); 1454 error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); 1455 if (error) { 1456 fprintf(stderr, "Unable to setup memory (%d)\n", errno); 1457 exit(4); 1458 } 1459 1460 error = init_msr(); 1461 if (error) { 1462 fprintf(stderr, "init_msr error %d", error); 1463 exit(4); 1464 } 1465 1466 init_mem(guest_ncpus); 1467 init_inout(); 1468 kernemu_dev_init(); 1469 init_bootrom(ctx); 1470 atkbdc_init(ctx); 1471 pci_irq_init(ctx); 1472 ioapic_init(ctx); 1473 1474 rtc_init(ctx); 1475 sci_init(ctx); 1476 1477 if (qemu_fwcfg_init(ctx) != 0) { 1478 fprintf(stderr, "qemu fwcfg initialization error"); 1479 exit(4); 1480 } 1481 1482 if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), 1483 &guest_ncpus) != 0) { 1484 fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); 1485 exit(4); 1486 } 1487 1488 if (e820_init(ctx) != 0) { 1489 fprintf(stderr, "Unable to setup E820"); 1490 exit(4); 1491 } 1492 1493 /* 1494 * Exit if a device emulation finds an error in its initialization 1495 */ 1496 if (init_pci(ctx) != 0) { 1497 perror("device emulation initialization error"); 1498 exit(4); 1499 } 1500 1501 /* 1502 * Initialize after PCI, to allow a bootrom file to reserve the high 1503 * region. 1504 */ 1505 if (get_config_bool("acpi_tables")) 1506 vmgenc_init(ctx); 1507 1508 init_gdb(ctx); 1509 1510 if (lpc_bootrom()) { 1511 if (vm_set_capability(bsp, VM_CAP_UNRESTRICTED_GUEST, 1)) { 1512 fprintf(stderr, "ROM boot failed: unrestricted guest " 1513 "capability not available\n"); 1514 exit(4); 1515 } 1516 error = vcpu_reset(bsp); 1517 assert(error == 0); 1518 } 1519 1520 /* 1521 * Add all vCPUs. 1522 */ 1523 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) 1524 spinup_vcpu(&vcpu_info[vcpuid], vcpuid == BSP); 1525 1526 #ifdef BHYVE_SNAPSHOT 1527 if (restore_file != NULL || migration_host != NULL) { 1528 fprintf(stdout, "Pausing pci devs...\n"); 1529 if (vm_pause_devices() != 0) { 1530 fprintf(stderr, "Failed to pause PCI device state.\n"); 1531 exit(1); 1532 } 1533 1534 if (restore_file != NULL) { 1535 fprintf(stdout, "Restoring vm mem...\n"); 1536 if (restore_vm_mem(ctx, &rstate) != 0) { 1537 fprintf(stderr, 1538 "Failed to restore VM memory.\n"); 1539 exit(1); 1540 } 1541 1542 fprintf(stdout, "Restoring pci devs...\n"); 1543 if (vm_restore_devices(&rstate) != 0) { 1544 fprintf(stderr, 1545 "Failed to restore PCI device state.\n"); 1546 exit(1); 1547 } 1548 1549 fprintf(stdout, "Restoring kernel structs...\n"); 1550 if (vm_restore_kern_structs(ctx, &rstate) != 0) { 1551 fprintf(stderr, 1552 "Failed to restore kernel structs.\n"); 1553 exit(1); 1554 } 1555 } 1556 1557 if (migration_host != NULL) { 1558 fprintf(stdout, "Starting the migration process...\n"); 1559 if (receive_vm_migration(ctx, migration_host) != 0) { 1560 fprintf(stderr, "Failed to migrate the vm.\n"); 1561 exit(1); 1562 } 1563 } 1564 1565 fprintf(stdout, "Resuming pci devs...\n"); 1566 if (vm_resume_devices() != 0) { 1567 fprintf(stderr, "Failed to resume PCI device state.\n"); 1568 exit(1); 1569 } 1570 } 1571 #endif /* BHYVE_SNAPSHOT */ 1572 1573 error = vm_get_register(bsp, VM_REG_GUEST_RIP, &rip); 1574 assert(error == 0); 1575 1576 /* 1577 * build the guest tables, MP etc. 1578 */ 1579 if (get_config_bool_default("x86.mptable", true)) { 1580 error = mptable_build(ctx, guest_ncpus); 1581 if (error) { 1582 perror("error to build the guest tables"); 1583 exit(4); 1584 } 1585 } 1586 1587 error = smbios_build(ctx); 1588 if (error != 0) 1589 exit(4); 1590 1591 if (get_config_bool("acpi_tables")) { 1592 error = acpi_build(ctx, guest_ncpus); 1593 assert(error == 0); 1594 } 1595 1596 e820_fwcfg_item = e820_get_fwcfg_item(); 1597 if (e820_fwcfg_item == NULL) { 1598 fprintf(stderr, "invalid e820 table"); 1599 exit(4); 1600 } 1601 if (qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, 1602 e820_fwcfg_item->data) != 0) { 1603 fprintf(stderr, "could not add qemu fwcfg etc/e820"); 1604 exit(4); 1605 } 1606 free(e820_fwcfg_item); 1607 1608 if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) { 1609 fwctl_init(); 1610 } 1611 1612 /* 1613 * Change the proc title to include the VM name. 1614 */ 1615 setproctitle("%s", vmname); 1616 1617 #ifdef BHYVE_SNAPSHOT 1618 /* initialize mutex/cond variables */ 1619 init_snapshot(); 1620 1621 /* 1622 * checkpointing thread for communication with bhyvectl 1623 */ 1624 if (init_checkpoint_thread(ctx) != 0) 1625 errx(EX_OSERR, "Failed to start checkpoint thread"); 1626 #endif 1627 1628 #ifndef WITHOUT_CAPSICUM 1629 caph_cache_catpages(); 1630 1631 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) 1632 errx(EX_OSERR, "Unable to apply rights for sandbox"); 1633 1634 if (caph_enter() == -1) 1635 errx(EX_OSERR, "cap_enter() failed"); 1636 #endif 1637 1638 #ifdef BHYVE_SNAPSHOT 1639 if (restore_file != NULL) 1640 destroy_restore_state(&rstate); 1641 if (restore_file != NULL || migration_host != NULL) { 1642 if (vm_restore_time(ctx) < 0) 1643 err(EX_OSERR, "Unable to restore time"); 1644 1645 for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) 1646 vm_resume_cpu(vcpu_info[vcpuid].vcpu); 1647 } else 1648 #endif 1649 vm_resume_cpu(bsp); 1650 1651 /* 1652 * Head off to the main event dispatch loop 1653 */ 1654 mevent_dispatch(); 1655 1656 exit(4); 1657 } 1658