1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 31 #include <sys/cdefs.h> 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/types.h> 35 #ifndef WITHOUT_CAPSICUM 36 #include <sys/capsicum.h> 37 #endif 38 #include <sys/mman.h> 39 #include <sys/time.h> 40 41 #include <machine/atomic.h> 42 #include <machine/segments.h> 43 44 #ifndef WITHOUT_CAPSICUM 45 #include <capsicum_helpers.h> 46 #endif 47 #include <stdio.h> 48 #include <stdlib.h> 49 #include <string.h> 50 #include <err.h> 51 #include <errno.h> 52 #include <libgen.h> 53 #include <unistd.h> 54 #include <assert.h> 55 #include <errno.h> 56 #include <pthread.h> 57 #include <pthread_np.h> 58 #include <sysexits.h> 59 #include <stdbool.h> 60 #include <stdint.h> 61 62 #include <machine/vmm.h> 63 #ifndef WITHOUT_CAPSICUM 64 #include <machine/vmm_dev.h> 65 #endif 66 #include <vmmapi.h> 67 68 #include "bhyverun.h" 69 #include "acpi.h" 70 #include "atkbdc.h" 71 #include "inout.h" 72 #include "dbgport.h" 73 #include "fwctl.h" 74 #include "gdb.h" 75 #include "ioapic.h" 76 #include "mem.h" 77 #include "mevent.h" 78 #include "mptbl.h" 79 #include "pci_emul.h" 80 #include "pci_irq.h" 81 #include "pci_lpc.h" 82 #include "smbiostbl.h" 83 #include "xmsr.h" 84 #include "spinup_ap.h" 85 #include "rtc.h" 86 87 #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ 88 89 #define MB (1024UL * 1024) 90 #define GB (1024UL * MB) 91 92 typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); 93 extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); 94 95 char *vmname; 96 97 int guest_ncpus; 98 uint16_t cores, maxcpus, sockets, threads; 99 100 char *guest_uuid_str; 101 102 static int guest_vmexit_on_hlt, guest_vmexit_on_pause; 103 static int virtio_msix = 1; 104 static int x2apic_mode = 0; /* default is xAPIC */ 105 106 static int strictio; 107 static int strictmsr = 1; 108 109 static int acpi; 110 111 static char *progname; 112 static const int BSP = 0; 113 114 static cpuset_t cpumask; 115 116 static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); 117 118 static struct vm_exit vmexit[VM_MAXCPU]; 119 120 struct bhyvestats { 121 uint64_t vmexit_bogus; 122 uint64_t vmexit_reqidle; 123 uint64_t vmexit_hlt; 124 uint64_t vmexit_pause; 125 uint64_t vmexit_mtrap; 126 uint64_t vmexit_inst_emul; 127 uint64_t cpu_switch_rotate; 128 uint64_t cpu_switch_direct; 129 } stats; 130 131 struct mt_vmm_info { 132 pthread_t mt_thr; 133 struct vmctx *mt_ctx; 134 int mt_vcpu; 135 } mt_vmm_info[VM_MAXCPU]; 136 137 static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; 138 139 static void 140 usage(int code) 141 { 142 143 fprintf(stderr, 144 "Usage: %s [-abehuwxACHPSWY]\n" 145 " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" 146 " %*s [-g <gdb port>] [-l <lpc>]\n" 147 " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n" 148 " -a: local apic is in xAPIC mode (deprecated)\n" 149 " -A: create ACPI tables\n" 150 " -c: number of cpus and/or topology specification\n" 151 " -C: include guest memory in core file\n" 152 " -e: exit on unhandled I/O access\n" 153 " -g: gdb port\n" 154 " -h: help\n" 155 " -H: vmexit from the guest on hlt\n" 156 " -l: LPC device configuration\n" 157 " -m: memory size in MB\n" 158 " -p: pin 'vcpu' to 'hostcpu'\n" 159 " -P: vmexit from the guest on pause\n" 160 " -s: <slot,driver,configinfo> PCI slot config\n" 161 " -S: guest memory cannot be swapped\n" 162 " -u: RTC keeps UTC time\n" 163 " -U: uuid\n" 164 " -w: ignore unimplemented MSRs\n" 165 " -W: force virtio to use single-vector MSI\n" 166 " -x: local apic is in x2APIC mode\n" 167 " -Y: disable MPtable generation\n", 168 progname, (int)strlen(progname), "", (int)strlen(progname), "", 169 (int)strlen(progname), ""); 170 171 exit(code); 172 } 173 174 /* 175 * XXX This parser is known to have the following issues: 176 * 1. It accepts null key=value tokens ",,". 177 * 2. It accepts whitespace after = and before value. 178 * 3. Values out of range of INT are silently wrapped. 179 * 4. It doesn't check non-final values. 180 * 5. The apparently bogus limits of UINT16_MAX are for future expansion. 181 * 182 * The acceptance of a null specification ('-c ""') is by design to match the 183 * manual page syntax specification, this results in a topology of 1 vCPU. 184 */ 185 static int 186 topology_parse(const char *opt) 187 { 188 uint64_t ncpus; 189 int c, chk, n, s, t, tmp; 190 char *cp, *str; 191 bool ns, scts; 192 193 c = 1, n = 1, s = 1, t = 1; 194 ns = false, scts = false; 195 str = strdup(opt); 196 197 while ((cp = strsep(&str, ",")) != NULL) { 198 if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { 199 n = tmp; 200 ns = true; 201 } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { 202 n = tmp; 203 ns = true; 204 } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { 205 s = tmp; 206 scts = true; 207 } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { 208 c = tmp; 209 scts = true; 210 } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { 211 t = tmp; 212 scts = true; 213 #ifdef notyet /* Do not expose this until vmm.ko implements it */ 214 } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { 215 m = tmp; 216 #endif 217 /* Skip the empty argument case from -c "" */ 218 } else if (cp[0] == '\0') 219 continue; 220 else 221 return (-1); 222 /* Any trailing garbage causes an error */ 223 if (cp[chk] != '\0') 224 return (-1); 225 } 226 /* 227 * Range check 1 <= n <= UINT16_MAX all values 228 */ 229 if (n < 1 || s < 1 || c < 1 || t < 1 || 230 n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || 231 t > UINT16_MAX) 232 return (-1); 233 234 /* If only the cpus was specified, use that as sockets */ 235 if (!scts) 236 s = n; 237 /* 238 * Compute sockets * cores * threads avoiding overflow 239 * The range check above insures these are 16 bit values 240 * If n was specified check it against computed ncpus 241 */ 242 ncpus = (uint64_t)s * c * t; 243 if (ncpus > UINT16_MAX || (ns && n != ncpus)) 244 return (-1); 245 246 guest_ncpus = ncpus; 247 sockets = s; 248 cores = c; 249 threads = t; 250 return(0); 251 } 252 253 static int 254 pincpu_parse(const char *opt) 255 { 256 int vcpu, pcpu; 257 258 if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { 259 fprintf(stderr, "invalid format: %s\n", opt); 260 return (-1); 261 } 262 263 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 264 fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", 265 vcpu, VM_MAXCPU - 1); 266 return (-1); 267 } 268 269 if (pcpu < 0 || pcpu >= CPU_SETSIZE) { 270 fprintf(stderr, "hostcpu '%d' outside valid range from " 271 "0 to %d\n", pcpu, CPU_SETSIZE - 1); 272 return (-1); 273 } 274 275 if (vcpumap[vcpu] == NULL) { 276 if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { 277 perror("malloc"); 278 return (-1); 279 } 280 CPU_ZERO(vcpumap[vcpu]); 281 } 282 CPU_SET(pcpu, vcpumap[vcpu]); 283 return (0); 284 } 285 286 void 287 vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, 288 int errcode) 289 { 290 struct vmctx *ctx; 291 int error, restart_instruction; 292 293 ctx = arg; 294 restart_instruction = 1; 295 296 error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, 297 restart_instruction); 298 assert(error == 0); 299 } 300 301 void * 302 paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) 303 { 304 305 return (vm_map_gpa(ctx, gaddr, len)); 306 } 307 308 int 309 fbsdrun_vmexit_on_pause(void) 310 { 311 312 return (guest_vmexit_on_pause); 313 } 314 315 int 316 fbsdrun_vmexit_on_hlt(void) 317 { 318 319 return (guest_vmexit_on_hlt); 320 } 321 322 int 323 fbsdrun_virtio_msix(void) 324 { 325 326 return (virtio_msix); 327 } 328 329 static void * 330 fbsdrun_start_thread(void *param) 331 { 332 char tname[MAXCOMLEN + 1]; 333 struct mt_vmm_info *mtp; 334 int vcpu; 335 336 mtp = param; 337 vcpu = mtp->mt_vcpu; 338 339 snprintf(tname, sizeof(tname), "vcpu %d", vcpu); 340 pthread_set_name_np(mtp->mt_thr, tname); 341 342 gdb_cpu_add(vcpu); 343 344 vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); 345 346 /* not reached */ 347 exit(1); 348 return (NULL); 349 } 350 351 void 352 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) 353 { 354 int error; 355 356 assert(fromcpu == BSP); 357 358 /* 359 * The 'newcpu' must be activated in the context of 'fromcpu'. If 360 * vm_activate_cpu() is delayed until newcpu's pthread starts running 361 * then vmm.ko is out-of-sync with bhyve and this can create a race 362 * with vm_suspend(). 363 */ 364 error = vm_activate_cpu(ctx, newcpu); 365 if (error != 0) 366 err(EX_OSERR, "could not activate CPU %d", newcpu); 367 368 CPU_SET_ATOMIC(newcpu, &cpumask); 369 370 /* 371 * Set up the vmexit struct to allow execution to start 372 * at the given RIP 373 */ 374 vmexit[newcpu].rip = rip; 375 vmexit[newcpu].inst_length = 0; 376 377 mt_vmm_info[newcpu].mt_ctx = ctx; 378 mt_vmm_info[newcpu].mt_vcpu = newcpu; 379 380 error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, 381 fbsdrun_start_thread, &mt_vmm_info[newcpu]); 382 assert(error == 0); 383 } 384 385 static int 386 fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) 387 { 388 389 if (!CPU_ISSET(vcpu, &cpumask)) { 390 fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); 391 exit(1); 392 } 393 394 CPU_CLR_ATOMIC(vcpu, &cpumask); 395 return (CPU_EMPTY(&cpumask)); 396 } 397 398 static int 399 vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, 400 uint32_t eax) 401 { 402 #if BHYVE_DEBUG 403 /* 404 * put guest-driven debug here 405 */ 406 #endif 407 return (VMEXIT_CONTINUE); 408 } 409 410 static int 411 vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 412 { 413 int error; 414 int bytes, port, in, out; 415 int vcpu; 416 417 vcpu = *pvcpu; 418 419 port = vme->u.inout.port; 420 bytes = vme->u.inout.bytes; 421 in = vme->u.inout.in; 422 out = !in; 423 424 /* Extra-special case of host notifications */ 425 if (out && port == GUEST_NIO_PORT) { 426 error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); 427 return (error); 428 } 429 430 error = emulate_inout(ctx, vcpu, vme, strictio); 431 if (error) { 432 fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", 433 in ? "in" : "out", 434 bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), 435 port, vmexit->rip); 436 return (VMEXIT_ABORT); 437 } else { 438 return (VMEXIT_CONTINUE); 439 } 440 } 441 442 static int 443 vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 444 { 445 uint64_t val; 446 uint32_t eax, edx; 447 int error; 448 449 val = 0; 450 error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); 451 if (error != 0) { 452 fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", 453 vme->u.msr.code, *pvcpu); 454 if (strictmsr) { 455 vm_inject_gp(ctx, *pvcpu); 456 return (VMEXIT_CONTINUE); 457 } 458 } 459 460 eax = val; 461 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); 462 assert(error == 0); 463 464 edx = val >> 32; 465 error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); 466 assert(error == 0); 467 468 return (VMEXIT_CONTINUE); 469 } 470 471 static int 472 vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 473 { 474 int error; 475 476 error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); 477 if (error != 0) { 478 fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", 479 vme->u.msr.code, vme->u.msr.wval, *pvcpu); 480 if (strictmsr) { 481 vm_inject_gp(ctx, *pvcpu); 482 return (VMEXIT_CONTINUE); 483 } 484 } 485 return (VMEXIT_CONTINUE); 486 } 487 488 static int 489 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) 490 { 491 492 (void)spinup_ap(ctx, *pvcpu, 493 vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); 494 495 return (VMEXIT_CONTINUE); 496 } 497 498 #define DEBUG_EPT_MISCONFIG 499 #ifdef DEBUG_EPT_MISCONFIG 500 #define EXIT_REASON_EPT_MISCONFIG 49 501 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 502 #define VMCS_IDENT(x) ((x) | 0x80000000) 503 504 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; 505 static int ept_misconfig_ptenum; 506 #endif 507 508 static int 509 vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 510 { 511 512 fprintf(stderr, "vm exit[%d]\n", *pvcpu); 513 fprintf(stderr, "\treason\t\tVMX\n"); 514 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); 515 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); 516 fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); 517 fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); 518 fprintf(stderr, "\tqualification\t0x%016lx\n", 519 vmexit->u.vmx.exit_qualification); 520 fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); 521 fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); 522 #ifdef DEBUG_EPT_MISCONFIG 523 if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { 524 vm_get_register(ctx, *pvcpu, 525 VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), 526 &ept_misconfig_gpa); 527 vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, 528 &ept_misconfig_ptenum); 529 fprintf(stderr, "\tEPT misconfiguration:\n"); 530 fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); 531 fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", 532 ept_misconfig_ptenum, ept_misconfig_pte[0], 533 ept_misconfig_pte[1], ept_misconfig_pte[2], 534 ept_misconfig_pte[3]); 535 } 536 #endif /* DEBUG_EPT_MISCONFIG */ 537 return (VMEXIT_ABORT); 538 } 539 540 static int 541 vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 542 { 543 544 fprintf(stderr, "vm exit[%d]\n", *pvcpu); 545 fprintf(stderr, "\treason\t\tSVM\n"); 546 fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); 547 fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); 548 fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); 549 fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); 550 fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); 551 return (VMEXIT_ABORT); 552 } 553 554 static int 555 vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 556 { 557 558 assert(vmexit->inst_length == 0); 559 560 stats.vmexit_bogus++; 561 562 return (VMEXIT_CONTINUE); 563 } 564 565 static int 566 vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 567 { 568 569 assert(vmexit->inst_length == 0); 570 571 stats.vmexit_reqidle++; 572 573 return (VMEXIT_CONTINUE); 574 } 575 576 static int 577 vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 578 { 579 580 stats.vmexit_hlt++; 581 582 /* 583 * Just continue execution with the next instruction. We use 584 * the HLT VM exit as a way to be friendly with the host 585 * scheduler. 586 */ 587 return (VMEXIT_CONTINUE); 588 } 589 590 static int 591 vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 592 { 593 594 stats.vmexit_pause++; 595 596 return (VMEXIT_CONTINUE); 597 } 598 599 static int 600 vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 601 { 602 603 assert(vmexit->inst_length == 0); 604 605 stats.vmexit_mtrap++; 606 607 gdb_cpu_mtrap(*pvcpu); 608 609 return (VMEXIT_CONTINUE); 610 } 611 612 static int 613 vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 614 { 615 int err, i; 616 struct vie *vie; 617 618 stats.vmexit_inst_emul++; 619 620 vie = &vmexit->u.inst_emul.vie; 621 err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, 622 vie, &vmexit->u.inst_emul.paging); 623 624 if (err) { 625 if (err == ESRCH) { 626 fprintf(stderr, "Unhandled memory access to 0x%lx\n", 627 vmexit->u.inst_emul.gpa); 628 } 629 630 fprintf(stderr, "Failed to emulate instruction ["); 631 for (i = 0; i < vie->num_valid; i++) { 632 fprintf(stderr, "0x%02x%s", vie->inst[i], 633 i != (vie->num_valid - 1) ? " " : ""); 634 } 635 fprintf(stderr, "] at 0x%lx\n", vmexit->rip); 636 return (VMEXIT_ABORT); 637 } 638 639 return (VMEXIT_CONTINUE); 640 } 641 642 static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; 643 static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; 644 645 static int 646 vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 647 { 648 enum vm_suspend_how how; 649 650 how = vmexit->u.suspended.how; 651 652 fbsdrun_deletecpu(ctx, *pvcpu); 653 654 if (*pvcpu != BSP) { 655 pthread_mutex_lock(&resetcpu_mtx); 656 pthread_cond_signal(&resetcpu_cond); 657 pthread_mutex_unlock(&resetcpu_mtx); 658 pthread_exit(NULL); 659 } 660 661 pthread_mutex_lock(&resetcpu_mtx); 662 while (!CPU_EMPTY(&cpumask)) { 663 pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); 664 } 665 pthread_mutex_unlock(&resetcpu_mtx); 666 667 switch (how) { 668 case VM_SUSPEND_RESET: 669 exit(0); 670 case VM_SUSPEND_POWEROFF: 671 exit(1); 672 case VM_SUSPEND_HALT: 673 exit(2); 674 case VM_SUSPEND_TRIPLEFAULT: 675 exit(3); 676 default: 677 fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); 678 exit(100); 679 } 680 return (0); /* NOTREACHED */ 681 } 682 683 static int 684 vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) 685 { 686 687 gdb_cpu_suspend(*pvcpu); 688 return (VMEXIT_CONTINUE); 689 } 690 691 static vmexit_handler_t handler[VM_EXITCODE_MAX] = { 692 [VM_EXITCODE_INOUT] = vmexit_inout, 693 [VM_EXITCODE_INOUT_STR] = vmexit_inout, 694 [VM_EXITCODE_VMX] = vmexit_vmx, 695 [VM_EXITCODE_SVM] = vmexit_svm, 696 [VM_EXITCODE_BOGUS] = vmexit_bogus, 697 [VM_EXITCODE_REQIDLE] = vmexit_reqidle, 698 [VM_EXITCODE_RDMSR] = vmexit_rdmsr, 699 [VM_EXITCODE_WRMSR] = vmexit_wrmsr, 700 [VM_EXITCODE_MTRAP] = vmexit_mtrap, 701 [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, 702 [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, 703 [VM_EXITCODE_SUSPENDED] = vmexit_suspend, 704 [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, 705 [VM_EXITCODE_DEBUG] = vmexit_debug, 706 }; 707 708 static void 709 vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) 710 { 711 int error, rc; 712 enum vm_exitcode exitcode; 713 cpuset_t active_cpus; 714 715 if (vcpumap[vcpu] != NULL) { 716 error = pthread_setaffinity_np(pthread_self(), 717 sizeof(cpuset_t), vcpumap[vcpu]); 718 assert(error == 0); 719 } 720 721 error = vm_active_cpus(ctx, &active_cpus); 722 assert(CPU_ISSET(vcpu, &active_cpus)); 723 724 error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); 725 assert(error == 0); 726 727 while (1) { 728 error = vm_run(ctx, vcpu, &vmexit[vcpu]); 729 if (error != 0) 730 break; 731 732 exitcode = vmexit[vcpu].exitcode; 733 if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { 734 fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", 735 exitcode); 736 exit(1); 737 } 738 739 rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); 740 741 switch (rc) { 742 case VMEXIT_CONTINUE: 743 break; 744 case VMEXIT_ABORT: 745 abort(); 746 default: 747 exit(1); 748 } 749 } 750 fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); 751 } 752 753 static int 754 num_vcpus_allowed(struct vmctx *ctx) 755 { 756 int tmp, error; 757 758 error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); 759 760 /* 761 * The guest is allowed to spinup more than one processor only if the 762 * UNRESTRICTED_GUEST capability is available. 763 */ 764 if (error == 0) 765 return (VM_MAXCPU); 766 else 767 return (1); 768 } 769 770 void 771 fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) 772 { 773 int err, tmp; 774 775 if (fbsdrun_vmexit_on_hlt()) { 776 err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); 777 if (err < 0) { 778 fprintf(stderr, "VM exit on HLT not supported\n"); 779 exit(1); 780 } 781 vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); 782 if (cpu == BSP) 783 handler[VM_EXITCODE_HLT] = vmexit_hlt; 784 } 785 786 if (fbsdrun_vmexit_on_pause()) { 787 /* 788 * pause exit support required for this mode 789 */ 790 err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); 791 if (err < 0) { 792 fprintf(stderr, 793 "SMP mux requested, no pause support\n"); 794 exit(1); 795 } 796 vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); 797 if (cpu == BSP) 798 handler[VM_EXITCODE_PAUSE] = vmexit_pause; 799 } 800 801 if (x2apic_mode) 802 err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); 803 else 804 err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); 805 806 if (err) { 807 fprintf(stderr, "Unable to set x2apic state (%d)\n", err); 808 exit(1); 809 } 810 811 vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); 812 } 813 814 static struct vmctx * 815 do_open(const char *vmname) 816 { 817 struct vmctx *ctx; 818 int error; 819 bool reinit, romboot; 820 #ifndef WITHOUT_CAPSICUM 821 cap_rights_t rights; 822 const cap_ioctl_t *cmds; 823 size_t ncmds; 824 #endif 825 826 reinit = romboot = false; 827 828 if (lpc_bootrom()) 829 romboot = true; 830 831 error = vm_create(vmname); 832 if (error) { 833 if (errno == EEXIST) { 834 if (romboot) { 835 reinit = true; 836 } else { 837 /* 838 * The virtual machine has been setup by the 839 * userspace bootloader. 840 */ 841 } 842 } else { 843 perror("vm_create"); 844 exit(1); 845 } 846 } else { 847 if (!romboot) { 848 /* 849 * If the virtual machine was just created then a 850 * bootrom must be configured to boot it. 851 */ 852 fprintf(stderr, "virtual machine cannot be booted\n"); 853 exit(1); 854 } 855 } 856 857 ctx = vm_open(vmname); 858 if (ctx == NULL) { 859 perror("vm_open"); 860 exit(1); 861 } 862 863 #ifndef WITHOUT_CAPSICUM 864 cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); 865 if (cap_rights_limit(vm_get_device_fd(ctx), &rights) == -1 && 866 errno != ENOSYS) 867 errx(EX_OSERR, "Unable to apply rights for sandbox"); 868 vm_get_ioctls(&ncmds); 869 cmds = vm_get_ioctls(NULL); 870 if (cmds == NULL) 871 errx(EX_OSERR, "out of memory"); 872 if (cap_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1 && 873 errno != ENOSYS) 874 errx(EX_OSERR, "Unable to apply rights for sandbox"); 875 free((cap_ioctl_t *)cmds); 876 #endif 877 878 if (reinit) { 879 error = vm_reinit(ctx); 880 if (error) { 881 perror("vm_reinit"); 882 exit(1); 883 } 884 } 885 error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); 886 if (error) 887 errx(EX_OSERR, "vm_set_topology"); 888 return (ctx); 889 } 890 891 int 892 main(int argc, char *argv[]) 893 { 894 int c, error, dbg_port, gdb_port, err, bvmcons; 895 int max_vcpus, mptgen, memflags; 896 int rtc_localtime; 897 bool gdb_stop; 898 struct vmctx *ctx; 899 uint64_t rip; 900 size_t memsize; 901 char *optstr; 902 903 bvmcons = 0; 904 progname = basename(argv[0]); 905 dbg_port = 0; 906 gdb_port = 0; 907 gdb_stop = false; 908 guest_ncpus = 1; 909 sockets = cores = threads = 1; 910 maxcpus = 0; 911 memsize = 256 * MB; 912 mptgen = 1; 913 rtc_localtime = 1; 914 memflags = 0; 915 916 optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; 917 while ((c = getopt(argc, argv, optstr)) != -1) { 918 switch (c) { 919 case 'a': 920 x2apic_mode = 0; 921 break; 922 case 'A': 923 acpi = 1; 924 break; 925 case 'b': 926 bvmcons = 1; 927 break; 928 case 'p': 929 if (pincpu_parse(optarg) != 0) { 930 errx(EX_USAGE, "invalid vcpu pinning " 931 "configuration '%s'", optarg); 932 } 933 break; 934 case 'c': 935 if (topology_parse(optarg) != 0) { 936 errx(EX_USAGE, "invalid cpu topology " 937 "'%s'", optarg); 938 } 939 break; 940 case 'C': 941 memflags |= VM_MEM_F_INCORE; 942 break; 943 case 'g': 944 dbg_port = atoi(optarg); 945 break; 946 case 'G': 947 if (optarg[0] == 'w') { 948 gdb_stop = true; 949 optarg++; 950 } 951 gdb_port = atoi(optarg); 952 break; 953 case 'l': 954 if (lpc_device_parse(optarg) != 0) { 955 errx(EX_USAGE, "invalid lpc device " 956 "configuration '%s'", optarg); 957 } 958 break; 959 case 's': 960 if (pci_parse_slot(optarg) != 0) 961 exit(1); 962 else 963 break; 964 case 'S': 965 memflags |= VM_MEM_F_WIRED; 966 break; 967 case 'm': 968 error = vm_parse_memsize(optarg, &memsize); 969 if (error) 970 errx(EX_USAGE, "invalid memsize '%s'", optarg); 971 break; 972 case 'H': 973 guest_vmexit_on_hlt = 1; 974 break; 975 case 'I': 976 /* 977 * The "-I" option was used to add an ioapic to the 978 * virtual machine. 979 * 980 * An ioapic is now provided unconditionally for each 981 * virtual machine and this option is now deprecated. 982 */ 983 break; 984 case 'P': 985 guest_vmexit_on_pause = 1; 986 break; 987 case 'e': 988 strictio = 1; 989 break; 990 case 'u': 991 rtc_localtime = 0; 992 break; 993 case 'U': 994 guest_uuid_str = optarg; 995 break; 996 case 'w': 997 strictmsr = 0; 998 break; 999 case 'W': 1000 virtio_msix = 0; 1001 break; 1002 case 'x': 1003 x2apic_mode = 1; 1004 break; 1005 case 'Y': 1006 mptgen = 0; 1007 break; 1008 case 'h': 1009 usage(0); 1010 default: 1011 usage(1); 1012 } 1013 } 1014 argc -= optind; 1015 argv += optind; 1016 1017 if (argc != 1) 1018 usage(1); 1019 1020 vmname = argv[0]; 1021 ctx = do_open(vmname); 1022 1023 max_vcpus = num_vcpus_allowed(ctx); 1024 if (guest_ncpus > max_vcpus) { 1025 fprintf(stderr, "%d vCPUs requested but only %d available\n", 1026 guest_ncpus, max_vcpus); 1027 exit(1); 1028 } 1029 1030 fbsdrun_set_capabilities(ctx, BSP); 1031 1032 vm_set_memflags(ctx, memflags); 1033 err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); 1034 if (err) { 1035 fprintf(stderr, "Unable to setup memory (%d)\n", errno); 1036 exit(1); 1037 } 1038 1039 error = init_msr(); 1040 if (error) { 1041 fprintf(stderr, "init_msr error %d", error); 1042 exit(1); 1043 } 1044 1045 init_mem(); 1046 init_inout(); 1047 atkbdc_init(ctx); 1048 pci_irq_init(ctx); 1049 ioapic_init(ctx); 1050 1051 rtc_init(ctx, rtc_localtime); 1052 sci_init(ctx); 1053 1054 /* 1055 * Exit if a device emulation finds an error in its initilization 1056 */ 1057 if (init_pci(ctx) != 0) 1058 exit(1); 1059 1060 if (dbg_port != 0) 1061 init_dbgport(dbg_port); 1062 1063 if (gdb_port != 0) 1064 init_gdb(ctx, gdb_port, gdb_stop); 1065 1066 if (bvmcons) 1067 init_bvmcons(); 1068 1069 if (lpc_bootrom()) { 1070 if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { 1071 fprintf(stderr, "ROM boot failed: unrestricted guest " 1072 "capability not available\n"); 1073 exit(1); 1074 } 1075 error = vcpu_reset(ctx, BSP); 1076 assert(error == 0); 1077 } 1078 1079 error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); 1080 assert(error == 0); 1081 1082 /* 1083 * build the guest tables, MP etc. 1084 */ 1085 if (mptgen) { 1086 error = mptable_build(ctx, guest_ncpus); 1087 if (error) 1088 exit(1); 1089 } 1090 1091 error = smbios_build(ctx); 1092 assert(error == 0); 1093 1094 if (acpi) { 1095 error = acpi_build(ctx, guest_ncpus); 1096 assert(error == 0); 1097 } 1098 1099 if (lpc_bootrom()) 1100 fwctl_init(); 1101 1102 #ifndef WITHOUT_CAPSICUM 1103 caph_cache_catpages(); 1104 1105 if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) 1106 errx(EX_OSERR, "Unable to apply rights for sandbox"); 1107 1108 if (cap_enter() == -1 && errno != ENOSYS) 1109 errx(EX_OSERR, "cap_enter() failed"); 1110 #endif 1111 1112 /* 1113 * Change the proc title to include the VM name. 1114 */ 1115 setproctitle("%s", vmname); 1116 1117 /* 1118 * Add CPU 0 1119 */ 1120 fbsdrun_addcpu(ctx, BSP, BSP, rip); 1121 1122 /* 1123 * Head off to the main event dispatch loop 1124 */ 1125 mevent_dispatch(); 1126 1127 exit(1); 1128 } 1129