1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/types.h> 35 #ifndef WITHOUT_CAPSICUM 36 #include <sys/capsicum.h> 37 #endif 38 #include <sys/mman.h> 39 #include <sys/time.h> 40 41 #include <machine/atomic.h> 42 #include <machine/segments.h> 43 44 #ifndef WITHOUT_CAPSICUM 45 #include <capsicum_helpers.h> 46 #endif 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <err.h> 51 #include <errno.h> 52 #include <libgen.h> 53 #include <unistd.h> 54 #include <assert.h> 55 #include <errno.h> 56 #include <pthread.h> 57 #include <pthread_np.h> 58 #include <sysexits.h> 59 #include <stdbool.h> 60 #include <stdint.h> 61 62 #include <machine/vmm.h> 63 #ifndef WITHOUT_CAPSICUM 64 #include <machine/vmm_dev.h> 65 #endif 66 #include <vmmapi.h> 67 68 #include "bhyverun.h" 69 #include "acpi.h" 70 #include "atkbdc.h" 71 #include "inout.h" 72 #include "dbgport.h" 73 #include "fwctl.h" 74 #include "ioapic.h" 75 #include "mem.h" 76 #include "mevent.h" 77 #include "mptbl.h" 78 #include "pci_emul.h" 79 #include "pci_irq.h" 80 #include "pci_lpc.h" 81 #include "smbiostbl.h" 82 #include "xmsr.h" 83 #include "spinup_ap.h" 84 #include "rtc.h" 85 86 #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ 87 88 #define MB (1024UL * 1024) 89 #define GB (1024UL * MB) 90 91 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); 92 extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); 93 94 char *vmname; 95 96 int guest_ncpus; 97 uint16_t cores, maxcpus, sockets, threads; 98 99 char *guest_uuid_str; 100 101 static int guest_vmexit_on_hlt, guest_vmexit_on_pause; 102 static int virtio_msix = 1; 103 static int x2apic_mode = 0; /* default is xAPIC */ 104 105 static int strictio; 106 static int strictmsr = 1; 107 108 static int acpi; 109 110 static char *progname; 111 static const int BSP = 0; 112 113 static cpuset_t cpumask; 114 115 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); 116 117 static struct vm_exit vmexit[VM_MAXCPU]; 118 119 struct bhyvestats { 120 uint64_t vmexit_bogus; 121 uint64_t vmexit_reqidle; 122 uint64_t vmexit_hlt; 123 uint64_t vmexit_pause; 124 uint64_t vmexit_mtrap; 125 uint64_t vmexit_inst_emul; 126 uint64_t cpu_switch_rotate; 127 uint64_t cpu_switch_direct; 128 } stats; 129 130 struct mt_vmm_info { 131 pthread_t mt_thr; 132 struct vmctx *mt_ctx; 133 int mt_vcpu; 134 } mt_vmm_info[VM_MAXCPU]; 135 136 static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; 137 138 static void 139 usage(int code) 140 { 141 142 fprintf(stderr, 143 "Usage: %s [-abehuwxACHPSWY]\n" 144 " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" 145 " %*s [-g <gdb port>] [-l <lpc>]\n" 146 " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n" 147 " -a: local apic is in xAPIC mode (deprecated)\n" 148 " -A: create ACPI tables\n" 149 " -c: number of cpus and/or topology specification" 150 " -C: include guest memory in core file\n" 151 " -e: exit on unhandled I/O access\n" 152 " -g: gdb port\n" 153 " -h: help\n" 154 " -H: vmexit from the guest on hlt\n" 155 " -l: LPC device configuration\n" 156 " -m: memory size in MB\n" 157 " -p: pin 'vcpu' to 'hostcpu'\n" 158 " -P: vmexit from the guest on pause\n" 159 " -s: <slot,driver,configinfo> PCI slot config\n" 160 " -S: guest memory cannot be swapped\n" 161 " -u: RTC keeps UTC time\n" 162 " -U: uuid\n" 163 " -w: ignore unimplemented MSRs\n" 164 " -W: force virtio to use single-vector MSI\n" 165 " -x: local apic is in x2APIC mode\n" 166 " -Y: disable MPtable generation\n", 167 progname, (int)strlen(progname), "", (int)strlen(progname), "", 168 (int)strlen(progname), ""); 169 170 exit(code); 171 } 172 173 /* 174 * XXX This parser is known to have the following issues: 175 * 1. It accepts null key=value tokens ",,". 176 * 2. It accepts whitespace after = and before value. 177 * 3. Values out of range of INT are silently wrapped. 178 * 4. It doesn't check non-final values. 179 * 5. The apparently bogus limits of UINT16_MAX are for future expansion. 180 * 181 * The acceptance of a null specification ('-c ""') is by design to match the 182 * manual page syntax specification, this results in a topology of 1 vCPU. 183 */ 184 static int 185 topology_parse(const char *opt) 186 { 187 uint64_t ncpus; 188 int c, chk, n, s, t, tmp; 189 char *cp, *str; 190 bool ns, scts; 191 192 c = 1, n = 1, s = 1, t = 1; 193 ns = false, scts = false; 194 str = strdup(opt); 195 196 while ((cp = strsep(&str, ",")) != NULL) { 197 if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { 198 n = tmp; 199 ns = true; 200 } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { 201 n = tmp; 202 ns = true; 203 } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { 204 s = tmp; 205 scts = true; 206 } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { 207 c = tmp; 208 scts = true; 209 } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { 210 t = tmp; 211 scts = true; 212 #ifdef notyet /* Do not expose this until vmm.ko implements it */ 213 } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { 214 m = tmp; 215 #endif 216 /* Skip the empty argument case from -c "" */ 217 } else if (cp[0] == '\0') 218 continue; 219 else 220 return (-1); 221 /* Any trailing garbage causes an error */ 222 if (cp[chk] != '\0') 223 return (-1); 224 } 225 /* 226 * Range check 1 <= n <= UINT16_MAX all values 227 */ 228 if (n < 1 || s < 1 || c < 1 || t < 1 || 229 n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || 230 t > UINT16_MAX) 231 return (-1); 232 233 /* If only the cpus was specified, use that as sockets */ 234 if (!scts) 235 s = n; 236 /* 237 * Compute sockets * cores * threads avoiding overflow 238 * The range check above insures these are 16 bit values 239 * If n was specified check it against computed ncpus 240 */ 241 ncpus = (uint64_t)s * c * t; 242 if (ncpus > UINT16_MAX || (ns && n != ncpus)) 243 return (-1); 244 245 guest_ncpus = ncpus; 246 sockets = s; 247 cores = c; 248 threads = t; 249 return(0); 250 } 251 252 static int 253 pincpu_parse(const char *opt) 254 { 255 int vcpu, pcpu; 256 257 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { 258 fprintf(stderr, "invalid format: %s\n", opt); 259 return (-1); 260 } 261 262 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 263 fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", 264 vcpu, VM_MAXCPU - 1); 265 return (-1); 266 } 267 268 if (pcpu < 0 || pcpu >= CPU_SETSIZE) { 269 fprintf(stderr, "hostcpu '%d' outside valid range from " 270 "0 to %d\n", pcpu, CPU_SETSIZE - 1); 271 return (-1); 272 } 273 274 if (vcpumap[vcpu] == NULL) { 275 if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { 276 perror("malloc"); 277 return (-1); 278 } 279 CPU_ZERO(vcpumap[vcpu]); 280 } 281 CPU_SET(pcpu, vcpumap[vcpu]); 282 return (0); 283 } 284 285 void 286 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, 287 int errcode) 288 { 289 struct vmctx *ctx; 290 int error, restart_instruction; 291 292 ctx = arg; 293 restart_instruction = 1; 294 295 error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, 296 restart_instruction); 297 assert(error == 0); 298 } 299 300 void * 301 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) 302 { 303 304 return (vm_map_gpa(ctx, gaddr, len)); 305 } 306 307 int 308 fbsdrun_vmexit_on_pause(void) 309 { 310 311 return (guest_vmexit_on_pause); 312 } 313 314 int 315 fbsdrun_vmexit_on_hlt(void) 316 { 317 318 return (guest_vmexit_on_hlt); 319 } 320 321 int 322 fbsdrun_virtio_msix(void) 323 { 324 325 return (virtio_msix); 326 } 327 328 static void * 329 fbsdrun_start_thread(void *param) 330 { 331 char tname[MAXCOMLEN + 1]; 332 struct mt_vmm_info *mtp; 333 int vcpu; 334 335 mtp = param; 336 vcpu = mtp->mt_vcpu; 337 338 snprintf(tname, sizeof(tname), "vcpu %d", vcpu); 339 pthread_set_name_np(mtp->mt_thr, tname); 340 341 vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); 342 343 /* not reached */ 344 exit(1); 345 return (NULL); 346 } 347 348 void 349 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) 350 { 351 int error; 352 353 assert(fromcpu == BSP); 354 355 /* 356 * The 'newcpu' must be activated in the context of 'fromcpu'. If 357 * vm_activate_cpu() is delayed until newcpu's pthread starts running 358 * then vmm.ko is out-of-sync with bhyve and this can create a race 359 * with vm_suspend(). 360 */ 361 error = vm_activate_cpu(ctx, newcpu); 362 if (error != 0) 363 err(EX_OSERR, "could not activate CPU %d", newcpu); 364 365 CPU_SET_ATOMIC(newcpu, &cpumask); 366 367 /* 368 * Set up the vmexit struct to allow execution to start 369 * at the given RIP 370 */ 371 vmexit[newcpu].rip = rip; 372 vmexit[newcpu].inst_length = 0; 373 374 mt_vmm_info[newcpu].mt_ctx = ctx; 375 mt_vmm_info[newcpu].mt_vcpu = newcpu; 376 377 error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, 378 fbsdrun_start_thread, &mt_vmm_info[newcpu]); 379 assert(error == 0); 380 } 381 382 static int 383 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) 384 { 385 386 if (!CPU_ISSET(vcpu, &cpumask)) { 387 fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); 388 exit(1); 389 } 390 391 CPU_CLR_ATOMIC(vcpu, &cpumask); 392 return (CPU_EMPTY(&cpumask)); 393 } 394 395 static int 396 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, 397 uint32_t eax) 398 { 399 #if BHYVE_DEBUG 400 /* 401 * put guest-driven debug here 402 */ 403 #endif 404 return (VMEXIT_CONTINUE); 405 } 406 407 static int 408 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 409 { 410 int error; 411 int bytes, port, in, out; 412 int vcpu; 413 414 vcpu = *pvcpu; 415 416 port = vme->u.inout.port; 417 bytes = vme->u.inout.bytes; 418 in = vme->u.inout.in; 419 out = !in; 420 421 /* Extra-special case of host notifications */ 422 if (out && port == GUEST_NIO_PORT) { 423 error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); 424 return (error); 425 } 426 427 error = emulate_inout(ctx, vcpu, vme, strictio); 428 if (error) { 429 fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", 430 in ? "in" : "out", 431 bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), 432 port, vmexit->rip); 433 return (VMEXIT_ABORT); 434 } else { 435 return (VMEXIT_CONTINUE); 436 } 437 } 438 439 static int 440 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 441 { 442 uint64_t val; 443 uint32_t eax, edx; 444 int error; 445 446 val = 0; 447 error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); 448 if (error != 0) { 449 fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", 450 vme->u.msr.code, *pvcpu); 451 if (strictmsr) { 452 vm_inject_gp(ctx, *pvcpu); 453 return (VMEXIT_CONTINUE); 454 } 455 } 456 457 eax = val; 458 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); 459 assert(error == 0); 460 461 edx = val >> 32; 462 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); 463 assert(error == 0); 464 465 return (VMEXIT_CONTINUE); 466 } 467 468 static int 469 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 470 { 471 int error; 472 473 error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); 474 if (error != 0) { 475 fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", 476 vme->u.msr.code, vme->u.msr.wval, *pvcpu); 477 if (strictmsr) { 478 vm_inject_gp(ctx, *pvcpu); 479 return (VMEXIT_CONTINUE); 480 } 481 } 482 return (VMEXIT_CONTINUE); 483 } 484 485 static int 486 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 487 { 488 489 (void)spinup_ap(ctx, *pvcpu, 490 vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); 491 492 return (VMEXIT_CONTINUE); 493 } 494 495 #define DEBUG_EPT_MISCONFIG 496 #ifdef DEBUG_EPT_MISCONFIG 497 #define EXIT_REASON_EPT_MISCONFIG 49 498 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 499 #define VMCS_IDENT(x) ((x) | 0x80000000) 500 501 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; 502 static int ept_misconfig_ptenum; 503 #endif 504 505 static int 506 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 507 { 508 509 fprintf(stderr, "vm exit[%d]\n", *pvcpu); 510 fprintf(stderr, "\treason\t\tVMX\n"); 511 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); 512 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); 513 fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); 514 fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); 515 fprintf(stderr, "\tqualification\t0x%016lx\n", 516 vmexit->u.vmx.exit_qualification); 517 fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); 518 fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); 519 #ifdef DEBUG_EPT_MISCONFIG 520 if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { 521 vm_get_register(ctx, *pvcpu, 522 VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), 523 &ept_misconfig_gpa); 524 vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, 525 &ept_misconfig_ptenum); 526 fprintf(stderr, "\tEPT misconfiguration:\n"); 527 fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); 528 fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", 529 ept_misconfig_ptenum, ept_misconfig_pte[0], 530 ept_misconfig_pte[1], ept_misconfig_pte[2], 531 ept_misconfig_pte[3]); 532 } 533 #endif /* DEBUG_EPT_MISCONFIG */ 534 return (VMEXIT_ABORT); 535 } 536 537 static int 538 vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 539 { 540 541 fprintf(stderr, "vm exit[%d]\n", *pvcpu); 542 fprintf(stderr, "\treason\t\tSVM\n"); 543 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); 544 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); 545 fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); 546 fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); 547 fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); 548 return (VMEXIT_ABORT); 549 } 550 551 static int 552 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 553 { 554 555 assert(vmexit->inst_length == 0); 556 557 stats.vmexit_bogus++; 558 559 return (VMEXIT_CONTINUE); 560 } 561 562 static int 563 vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 564 { 565 566 assert(vmexit->inst_length == 0); 567 568 stats.vmexit_reqidle++; 569 570 return (VMEXIT_CONTINUE); 571 } 572 573 static int 574 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 575 { 576 577 stats.vmexit_hlt++; 578 579 /* 580 * Just continue execution with the next instruction. We use 581 * the HLT VM exit as a way to be friendly with the host 582 * scheduler. 583 */ 584 return (VMEXIT_CONTINUE); 585 } 586 587 static int 588 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 589 { 590 591 stats.vmexit_pause++; 592 593 return (VMEXIT_CONTINUE); 594 } 595 596 static int 597 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 598 { 599 600 assert(vmexit->inst_length == 0); 601 602 stats.vmexit_mtrap++; 603 604 return (VMEXIT_CONTINUE); 605 } 606 607 static int 608 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 609 { 610 int err, i; 611 struct vie *vie; 612 613 stats.vmexit_inst_emul++; 614 615 vie = &vmexit->u.inst_emul.vie; 616 err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, 617 vie, &vmexit->u.inst_emul.paging); 618 619 if (err) { 620 if (err == ESRCH) { 621 fprintf(stderr, "Unhandled memory access to 0x%lx\n", 622 vmexit->u.inst_emul.gpa); 623 } 624 625 fprintf(stderr, "Failed to emulate instruction ["); 626 for (i = 0; i < vie->num_valid; i++) { 627 fprintf(stderr, "0x%02x%s", vie->inst[i], 628 i != (vie->num_valid - 1) ? " " : ""); 629 } 630 fprintf(stderr, "] at 0x%lx\n", vmexit->rip); 631 return (VMEXIT_ABORT); 632 } 633 634 return (VMEXIT_CONTINUE); 635 } 636 637 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; 638 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; 639 640 static int 641 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 642 { 643 enum vm_suspend_how how; 644 645 how = vmexit->u.suspended.how; 646 647 fbsdrun_deletecpu(ctx, *pvcpu); 648 649 if (*pvcpu != BSP) { 650 pthread_mutex_lock(&resetcpu_mtx); 651 pthread_cond_signal(&resetcpu_cond); 652 pthread_mutex_unlock(&resetcpu_mtx); 653 pthread_exit(NULL); 654 } 655 656 pthread_mutex_lock(&resetcpu_mtx); 657 while (!CPU_EMPTY(&cpumask)) { 658 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); 659 } 660 pthread_mutex_unlock(&resetcpu_mtx); 661 662 switch (how) { 663 case VM_SUSPEND_RESET: 664 exit(0); 665 case VM_SUSPEND_POWEROFF: 666 exit(1); 667 case VM_SUSPEND_HALT: 668 exit(2); 669 case VM_SUSPEND_TRIPLEFAULT: 670 exit(3); 671 default: 672 fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); 673 exit(100); 674 } 675 return (0); /* NOTREACHED */ 676 } 677 678 static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 679 [VM_EXITCODE_INOUT] = vmexit_inout, 680 [VM_EXITCODE_INOUT_STR] = vmexit_inout, 681 [VM_EXITCODE_VMX] = vmexit_vmx, 682 [VM_EXITCODE_SVM] = vmexit_svm, 683 [VM_EXITCODE_BOGUS] = vmexit_bogus, 684 [VM_EXITCODE_REQIDLE] = vmexit_reqidle, 685 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 686 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 687 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 688 [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, 689 [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, 690 [VM_EXITCODE_SUSPENDED] = vmexit_suspend, 691 [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, 692 }; 693 694 static void 695 vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) 696 { 697 int error, rc; 698 enum vm_exitcode exitcode; 699 cpuset_t active_cpus; 700 701 if (vcpumap[vcpu] != NULL) { 702 error = pthread_setaffinity_np(pthread_self(), 703 sizeof(cpuset_t), vcpumap[vcpu]); 704 assert(error == 0); 705 } 706 707 error = vm_active_cpus(ctx, &active_cpus); 708 assert(CPU_ISSET(vcpu, &active_cpus)); 709 710 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); 711 assert(error == 0); 712 713 while (1) { 714 error = vm_run(ctx, vcpu, &vmexit[vcpu]); 715 if (error != 0) 716 break; 717 718 exitcode = vmexit[vcpu].exitcode; 719 if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { 720 fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", 721 exitcode); 722 exit(1); 723 } 724 725 rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); 726 727 switch (rc) { 728 case VMEXIT_CONTINUE: 729 break; 730 case VMEXIT_ABORT: 731 abort(); 732 default: 733 exit(1); 734 } 735 } 736 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 737 } 738 739 static int 740 num_vcpus_allowed(struct vmctx *ctx) 741 { 742 int tmp, error; 743 744 error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); 745 746 /* 747 * The guest is allowed to spinup more than one processor only if the 748 * UNRESTRICTED_GUEST capability is available. 749 */ 750 if (error == 0) 751 return (VM_MAXCPU); 752 else 753 return (1); 754 } 755 756 void 757 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) 758 { 759 int err, tmp; 760 761 if (fbsdrun_vmexit_on_hlt()) { 762 err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); 763 if (err < 0) { 764 fprintf(stderr, "VM exit on HLT not supported\n"); 765 exit(1); 766 } 767 vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); 768 if (cpu == BSP) 769 handler[VM_EXITCODE_HLT] = vmexit_hlt; 770 } 771 772 if (fbsdrun_vmexit_on_pause()) { 773 /* 774 * pause exit support required for this mode 775 */ 776 err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); 777 if (err < 0) { 778 fprintf(stderr, 779 "SMP mux requested, no pause support\n"); 780 exit(1); 781 } 782 vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); 783 if (cpu == BSP) 784 handler[VM_EXITCODE_PAUSE] = vmexit_pause; 785 } 786 787 if (x2apic_mode) 788 err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); 789 else 790 err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); 791 792 if (err) { 793 fprintf(stderr, "Unable to set x2apic state (%d)\n", err); 794 exit(1); 795 } 796 797 vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); 798 } 799 800 static struct vmctx * 801 do_open(const char *vmname) 802 { 803 struct vmctx *ctx; 804 int error; 805 bool reinit, romboot; 806 #ifndef WITHOUT_CAPSICUM 807 cap_rights_t rights; 808 const cap_ioctl_t *cmds; 809 size_t ncmds; 810 #endif 811 812 reinit = romboot = false; 813 814 if (lpc_bootrom()) 815 romboot = true; 816 817 error = vm_create(vmname); 818 if (error) { 819 if (errno == EEXIST) { 820 if (romboot) { 821 reinit = true; 822 } else { 823 /* 824 * The virtual machine has been setup by the 825 * userspace bootloader. 826 */ 827 } 828 } else { 829 perror("vm_create"); 830 exit(1); 831 } 832 } else { 833 if (!romboot) { 834 /* 835 * If the virtual machine was just created then a 836 * bootrom must be configured to boot it. 837 */ 838 fprintf(stderr, "virtual machine cannot be booted\n"); 839 exit(1); 840 } 841 } 842 843 ctx = vm_open(vmname); 844 if (ctx == NULL) { 845 perror("vm_open"); 846 exit(1); 847 } 848 849 #ifndef WITHOUT_CAPSICUM 850 cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); 851 if (cap_rights_limit(vm_get_device_fd(ctx), &rights) == -1 && 852 errno != ENOSYS) 853 errx(EX_OSERR, "Unable to apply rights for sandbox"); 854 vm_get_ioctls(&ncmds); 855 cmds = vm_get_ioctls(NULL); 856 if (cmds == NULL) 857 errx(EX_OSERR, "out of memory"); 858 if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1 && 859 errno != ENOSYS) 860 errx(EX_OSERR, "Unable to apply rights for sandbox"); 861 free((cap_ioctl_t *)cmds); 862 #endif 863 864 if (reinit) { 865 error = vm_reinit(ctx); 866 if (error) { 867 perror("vm_reinit"); 868 exit(1); 869 } 870 } 871 error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); 872 if (error) 873 errx(EX_OSERR, "vm_set_topology"); 874 return (ctx); 875 } 876 877 int 878 main(int argc, char *argv[]) 879 { 880 int c, error, gdb_port, err, bvmcons; 881 int max_vcpus, mptgen, memflags; 882 int rtc_localtime; 883 struct vmctx *ctx; 884 uint64_t rip; 885 size_t memsize; 886 char *optstr; 887 888 bvmcons = 0; 889 progname = basename(argv[0]); 890 gdb_port = 0; 891 guest_ncpus = 1; 892 sockets = cores = threads = 1; 893 maxcpus = 0; 894 memsize = 256 * MB; 895 mptgen = 1; 896 rtc_localtime = 1; 897 memflags = 0; 898 899 optstr = "abehuwxACHIPSWYp:g:c:s:m:l:U:"; 900 while ((c = getopt(argc, argv, optstr)) != -1) { 901 switch (c) { 902 case 'a': 903 x2apic_mode = 0; 904 break; 905 case 'A': 906 acpi = 1; 907 break; 908 case 'b': 909 bvmcons = 1; 910 break; 911 case 'p': 912 if (pincpu_parse(optarg) != 0) { 913 errx(EX_USAGE, "invalid vcpu pinning " 914 "configuration '%s'", optarg); 915 } 916 break; 917 case 'c': 918 if (topology_parse(optarg) != 0) { 919 errx(EX_USAGE, "invalid cpu topology " 920 "'%s'", optarg); 921 } 922 break; 923 case 'C': 924 memflags |= VM_MEM_F_INCORE; 925 break; 926 case 'g': 927 gdb_port = atoi(optarg); 928 break; 929 case 'l': 930 if (lpc_device_parse(optarg) != 0) { 931 errx(EX_USAGE, "invalid lpc device " 932 "configuration '%s'", optarg); 933 } 934 break; 935 case 's': 936 if (pci_parse_slot(optarg) != 0) 937 exit(1); 938 else 939 break; 940 case 'S': 941 memflags |= VM_MEM_F_WIRED; 942 break; 943 case 'm': 944 error = vm_parse_memsize(optarg, &memsize); 945 if (error) 946 errx(EX_USAGE, "invalid memsize '%s'", optarg); 947 break; 948 case 'H': 949 guest_vmexit_on_hlt = 1; 950 break; 951 case 'I': 952 /* 953 * The "-I" option was used to add an ioapic to the 954 * virtual machine. 955 * 956 * An ioapic is now provided unconditionally for each 957 * virtual machine and this option is now deprecated. 958 */ 959 break; 960 case 'P': 961 guest_vmexit_on_pause = 1; 962 break; 963 case 'e': 964 strictio = 1; 965 break; 966 case 'u': 967 rtc_localtime = 0; 968 break; 969 case 'U': 970 guest_uuid_str = optarg; 971 break; 972 case 'w': 973 strictmsr = 0; 974 break; 975 case 'W': 976 virtio_msix = 0; 977 break; 978 case 'x': 979 x2apic_mode = 1; 980 break; 981 case 'Y': 982 mptgen = 0; 983 break; 984 case 'h': 985 usage(0); 986 default: 987 usage(1); 988 } 989 } 990 argc -= optind; 991 argv += optind; 992 993 if (argc != 1) 994 usage(1); 995 996 vmname = argv[0]; 997 ctx = do_open(vmname); 998 999 max_vcpus = num_vcpus_allowed(ctx); 1000 if (guest_ncpus > max_vcpus) { 1001 fprintf(stderr, "%d vCPUs requested but only %d available\n", 1002 guest_ncpus, max_vcpus); 1003 exit(1); 1004 } 1005 1006 fbsdrun_set_capabilities(ctx, BSP); 1007 1008 vm_set_memflags(ctx, memflags); 1009 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); 1010 if (err) { 1011 fprintf(stderr, "Unable to setup memory (%d)\n", errno); 1012 exit(1); 1013 } 1014 1015 error = init_msr(); 1016 if (error) { 1017 fprintf(stderr, "init_msr error %d", error); 1018 exit(1); 1019 } 1020 1021 init_mem(); 1022 init_inout(); 1023 atkbdc_init(ctx); 1024 pci_irq_init(ctx); 1025 ioapic_init(ctx); 1026 1027 rtc_init(ctx, rtc_localtime); 1028 sci_init(ctx); 1029 1030 /* 1031 * Exit if a device emulation finds an error in its initilization 1032 */ 1033 if (init_pci(ctx) != 0) 1034 exit(1); 1035 1036 if (gdb_port != 0) 1037 init_dbgport(gdb_port); 1038 1039 if (bvmcons) 1040 init_bvmcons(); 1041 1042 if (lpc_bootrom()) { 1043 if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { 1044 fprintf(stderr, "ROM boot failed: unrestricted guest " 1045 "capability not available\n"); 1046 exit(1); 1047 } 1048 error = vcpu_reset(ctx, BSP); 1049 assert(error == 0); 1050 } 1051 1052 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); 1053 assert(error == 0); 1054 1055 /* 1056 * build the guest tables, MP etc. 1057 */ 1058 if (mptgen) { 1059 error = mptable_build(ctx, guest_ncpus); 1060 if (error) 1061 exit(1); 1062 } 1063 1064 error = smbios_build(ctx); 1065 assert(error == 0); 1066 1067 if (acpi) { 1068 error = acpi_build(ctx, guest_ncpus); 1069 assert(error == 0); 1070 } 1071 1072 if (lpc_bootrom()) 1073 fwctl_init(); 1074 1075 #ifndef WITHOUT_CAPSICUM 1076 caph_cache_catpages(); 1077 1078 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) 1079 errx(EX_OSERR, "Unable to apply rights for sandbox"); 1080 1081 if (cap_enter() == -1 && errno != ENOSYS) 1082 errx(EX_OSERR, "cap_enter() failed"); 1083 #endif 1084 1085 /* 1086 * Change the proc title to include the VM name. 1087 */ 1088 setproctitle("%s", vmname); 1089 1090 /* 1091 * Add CPU 0 1092 */ 1093 fbsdrun_addcpu(ctx, BSP, BSP, rip); 1094 1095 /* 1096 * Head off to the main event dispatch loop 1097 */ 1098 mevent_dispatch(); 1099 1100 exit(1); 1101 } 1102