xref: /freebsd/usr.sbin/bhyve/bhyverun.c (revision 2008043f386721d58158e37e0d7e50df8095942d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/types.h>
31 #ifndef WITHOUT_CAPSICUM
32 #include <sys/capsicum.h>
33 #endif
34 #include <sys/mman.h>
35 #ifdef BHYVE_SNAPSHOT
36 #include <sys/socket.h>
37 #include <sys/stat.h>
38 #endif
39 #include <sys/time.h>
40 #ifdef BHYVE_SNAPSHOT
41 #include <sys/un.h>
42 #endif
43 
44 #include <machine/atomic.h>
45 
46 #ifndef WITHOUT_CAPSICUM
47 #include <capsicum_helpers.h>
48 #endif
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <err.h>
53 #include <errno.h>
54 #ifdef BHYVE_SNAPSHOT
55 #include <fcntl.h>
56 #endif
57 #include <libgen.h>
58 #include <unistd.h>
59 #include <assert.h>
60 #include <pthread.h>
61 #include <pthread_np.h>
62 #include <sysexits.h>
63 #include <stdbool.h>
64 #include <stdint.h>
65 #ifdef BHYVE_SNAPSHOT
66 #include <ucl.h>
67 #include <unistd.h>
68 
69 #include <libxo/xo.h>
70 #endif
71 
72 #include <vmmapi.h>
73 
74 #include "acpi.h"
75 #include "bhyverun.h"
76 #include "bootrom.h"
77 #include "config.h"
78 #include "debug.h"
79 #ifdef BHYVE_GDB
80 #include "gdb.h"
81 #endif
82 #include "mem.h"
83 #include "mevent.h"
84 #include "pci_emul.h"
85 #ifdef __amd64__
86 #include "amd64/pci_lpc.h"
87 #endif
88 #include "qemu_fwcfg.h"
89 #ifdef BHYVE_SNAPSHOT
90 #include "snapshot.h"
91 #endif
92 #include "tpm_device.h"
93 #include "vmgenc.h"
94 #include "vmexit.h"
95 
96 #define MB		(1024UL * 1024)
97 #define GB		(1024UL * MB)
98 
99 int guest_ncpus;
100 uint16_t cpu_cores, cpu_sockets, cpu_threads;
101 
102 int raw_stdio = 0;
103 
104 static char *progname;
105 static const int BSP = 0;
106 
107 static cpuset_t cpumask;
108 
109 static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu);
110 
111 static struct vcpu_info {
112 	struct vmctx	*ctx;
113 	struct vcpu	*vcpu;
114 	int		vcpuid;
115 } *vcpu_info;
116 
117 static cpuset_t **vcpumap;
118 
119 static void
120 usage(int code)
121 {
122 
123 	fprintf(stderr,
124 		"Usage: %s [-AaCDeHhPSuWwxY]\n"
125 		"       %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n"
126 		"       %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n"
127 		"       %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n"
128 		"       -A: create ACPI tables\n"
129 		"       -a: local apic is in xAPIC mode (deprecated)\n"
130 		"       -C: include guest memory in core file\n"
131 		"       -c: number of CPUs and/or topology specification\n"
132 		"       -D: destroy on power-off\n"
133 		"       -e: exit on unhandled I/O access\n"
134 		"       -G: start a debug server\n"
135 		"       -H: vmexit from the guest on HLT\n"
136 		"       -h: help\n"
137 		"       -k: key=value flat config file\n"
138 		"       -K: PS2 keyboard layout\n"
139 		"       -l: LPC device configuration\n"
140 		"       -m: memory size\n"
141 		"       -o: set config 'var' to 'value'\n"
142 		"       -P: vmexit from the guest on pause\n"
143 		"       -p: pin 'vcpu' to 'hostcpu'\n"
144 #ifdef BHYVE_SNAPSHOT
145 		"       -r: path to checkpoint file\n"
146 #endif
147 		"       -S: guest memory cannot be swapped\n"
148 		"       -s: <slot,driver,configinfo> PCI slot config\n"
149 		"       -U: UUID\n"
150 		"       -u: RTC keeps UTC time\n"
151 		"       -W: force virtio to use single-vector MSI\n"
152 		"       -w: ignore unimplemented MSRs\n"
153 		"       -x: local APIC is in x2APIC mode\n"
154 		"       -Y: disable MPtable generation\n",
155 		progname, (int)strlen(progname), "", (int)strlen(progname), "",
156 		(int)strlen(progname), "");
157 
158 	exit(code);
159 }
160 
/*
 * Parse a '-c' argument of the form
 * [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n] and record each
 * element as a config value.  Values are stored as text; numeric
 * validation happens later in calc_topology().
 *
 * Returns 0 on success, or -1 when a token contains '=' but does not
 * start with one of the recognized keys.  Tokens preceding the bad one
 * have already been stored by then.
 *
 * XXX Known issue: an empty token (as in ",,") is accepted and stores an
 * empty string as "cpus".
 *
 * An empty specification ('-c ""') is accepted by design, to match the
 * manual page syntax, and yields a topology of 1 vCPU.
 */
static int
topology_parse(const char *opt)
{
	static const char *const keys[] = {
		"cpus", "sockets", "cores", "threads"
	};
	const size_t nkeys = sizeof(keys) / sizeof(keys[0]);
	char *buf, *cursor, *tok;
	size_t i, klen;
	int rv;

	if (opt[0] == '\0') {
		set_config_value("sockets", "1");
		set_config_value("cores", "1");
		set_config_value("threads", "1");
		set_config_value("cpus", "1");
		return (0);
	}

	buf = strdup(opt);
	if (buf == NULL)
		errx(4, "Failed to allocate memory");

	rv = 0;
	cursor = buf;
	while (rv == 0 && (tok = strsep(&cursor, ",")) != NULL) {
		/* Look for a leading "<key>=". */
		klen = 0;
		for (i = 0; i < nkeys; i++) {
			klen = strlen(keys[i]);
			if (strncmp(tok, keys[i], klen) == 0 &&
			    tok[klen] == '=')
				break;
		}

		if (i < nkeys)
			set_config_value(keys[i], tok + klen + 1);
		else if (strchr(tok, '=') == NULL)
			set_config_value("cpus", tok);	/* bare count */
		else
			rv = -1;			/* unknown key */
	}

	free(buf);
	return (rv);
}
207 
/*
 * Convert the text 'value' to an integer in [minval, maxval].
 *
 * The base is auto-detected by strtol(3) (base 0).  The whole string must
 * be consumed and must not be empty.  On any violation the process exits
 * with status 4 and a message naming 'key'.
 */
static int
parse_int_value(const char *key, const char *value, int minval, int maxval)
{
	char *end;
	long parsed;

	errno = 0;
	parsed = strtol(value, &end, 0);

	if (errno == 0 && end != value && *end == '\0' &&
	    parsed >= minval && parsed <= maxval)
		return (parsed);

	errx(4, "Invalid value for %s: '%s'", key, value);
}
221 
222 /*
223  * Set the sockets, cores, threads, and guest_cpus variables based on
224  * the configured topology.
225  *
226  * The limits of UINT16_MAX are due to the types passed to
227  * vm_set_topology().  vmm.ko may enforce tighter limits.
228  */
229 static void
230 calc_topology(void)
231 {
232 	const char *value;
233 	bool explicit_cpus;
234 	uint64_t ncpus;
235 
236 	value = get_config_value("cpus");
237 	if (value != NULL) {
238 		guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX);
239 		explicit_cpus = true;
240 	} else {
241 		guest_ncpus = 1;
242 		explicit_cpus = false;
243 	}
244 	value = get_config_value("cores");
245 	if (value != NULL)
246 		cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX);
247 	else
248 		cpu_cores = 1;
249 	value = get_config_value("threads");
250 	if (value != NULL)
251 		cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX);
252 	else
253 		cpu_threads = 1;
254 	value = get_config_value("sockets");
255 	if (value != NULL)
256 		cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX);
257 	else
258 		cpu_sockets = guest_ncpus;
259 
260 	/*
261 	 * Compute sockets * cores * threads avoiding overflow.  The
262 	 * range check above insures these are 16 bit values.
263 	 */
264 	ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads;
265 	if (ncpus > UINT16_MAX)
266 		errx(4, "Computed number of vCPUs too high: %ju",
267 		    (uintmax_t)ncpus);
268 
269 	if (explicit_cpus) {
270 		if (guest_ncpus != (int)ncpus)
271 			errx(4, "Topology (%d sockets, %d cores, %d threads) "
272 			    "does not match %d vCPUs",
273 			    cpu_sockets, cpu_cores, cpu_threads,
274 			    guest_ncpus);
275 	} else
276 		guest_ncpus = ncpus;
277 }
278 
279 static int
280 pincpu_parse(const char *opt)
281 {
282 	const char *value;
283 	char *newval;
284 	char key[16];
285 	int vcpu, pcpu;
286 
287 	if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) {
288 		fprintf(stderr, "invalid format: %s\n", opt);
289 		return (-1);
290 	}
291 
292 	if (vcpu < 0) {
293 		fprintf(stderr, "invalid vcpu '%d'\n", vcpu);
294 		return (-1);
295 	}
296 
297 	if (pcpu < 0 || pcpu >= CPU_SETSIZE) {
298 		fprintf(stderr, "hostcpu '%d' outside valid range from "
299 		    "0 to %d\n", pcpu, CPU_SETSIZE - 1);
300 		return (-1);
301 	}
302 
303 	snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
304 	value = get_config_value(key);
305 
306 	if (asprintf(&newval, "%s%s%d", value != NULL ? value : "",
307 	    value != NULL ? "," : "", pcpu) == -1) {
308 		perror("failed to build new cpuset string");
309 		return (-1);
310 	}
311 
312 	set_config_value(key, newval);
313 	free(newval);
314 	return (0);
315 }
316 
317 static void
318 parse_cpuset(int vcpu, const char *list, cpuset_t *set)
319 {
320 	char *cp, *token;
321 	int pcpu, start;
322 
323 	CPU_ZERO(set);
324 	start = -1;
325 	token = __DECONST(char *, list);
326 	for (;;) {
327 		pcpu = strtoul(token, &cp, 0);
328 		if (cp == token)
329 			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
330 		if (pcpu < 0 || pcpu >= CPU_SETSIZE)
331 			errx(4, "hostcpu '%d' outside valid range from 0 to %d",
332 			    pcpu, CPU_SETSIZE - 1);
333 		switch (*cp) {
334 		case ',':
335 		case '\0':
336 			if (start >= 0) {
337 				if (start > pcpu)
338 					errx(4, "Invalid hostcpu range %d-%d",
339 					    start, pcpu);
340 				while (start < pcpu) {
341 					CPU_SET(start, set);
342 					start++;
343 				}
344 				start = -1;
345 			}
346 			CPU_SET(pcpu, set);
347 			break;
348 		case '-':
349 			if (start >= 0)
350 				errx(4, "invalid cpuset for vcpu %d: '%s'",
351 				    vcpu, list);
352 			start = pcpu;
353 			break;
354 		default:
355 			errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list);
356 		}
357 		if (*cp == '\0')
358 			break;
359 		token = cp + 1;
360 	}
361 }
362 
363 static void
364 build_vcpumaps(void)
365 {
366 	char key[16];
367 	const char *value;
368 	int vcpu;
369 
370 	vcpumap = calloc(guest_ncpus, sizeof(*vcpumap));
371 	for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
372 		snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu);
373 		value = get_config_value(key);
374 		if (value == NULL)
375 			continue;
376 		vcpumap[vcpu] = malloc(sizeof(cpuset_t));
377 		if (vcpumap[vcpu] == NULL)
378 			err(4, "Failed to allocate cpuset for vcpu %d", vcpu);
379 		parse_cpuset(vcpu, value, vcpumap[vcpu]);
380 	}
381 }
382 
/*
 * Translate the guest physical range [gaddr, gaddr + len) into a host
 * pointer.  This is a direct pass-through to vm_map_gpa(); its result is
 * returned unchanged.
 */
void *
paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
{
	void *hostaddr;

	hostaddr = vm_map_gpa(ctx, gaddr, len);
	return (hostaddr);
}
389 
#ifdef BHYVE_SNAPSHOT
/*
 * Translate a host pointer back into a guest physical address.  This is a
 * direct pass-through to vm_rev_map_gpa().
 */
uintptr_t
paddr_host2guest(struct vmctx *ctx, void *addr)
{
	uintptr_t gpa;

	gpa = vm_rev_map_gpa(ctx, addr);
	return (gpa);
}
#endif
397 
/*
 * Report the "virtio_msix" config setting, which defaults to true when
 * unset.  main() sets it to false for the -W option.
 */
int
fbsdrun_virtio_msix(void)
{
	int enabled;

	enabled = get_config_bool_default("virtio_msix", true);
	return (enabled);
}
404 
405 struct vcpu *
406 fbsdrun_vcpu(int vcpuid)
407 {
408 	return (vcpu_info[vcpuid].vcpu);
409 }
410 
411 static void *
412 fbsdrun_start_thread(void *param)
413 {
414 	char tname[MAXCOMLEN + 1];
415 	struct vcpu_info *vi = param;
416 	int error;
417 
418 	snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid);
419 	pthread_set_name_np(pthread_self(), tname);
420 
421 	if (vcpumap[vi->vcpuid] != NULL) {
422 		error = pthread_setaffinity_np(pthread_self(),
423 		    sizeof(cpuset_t), vcpumap[vi->vcpuid]);
424 		assert(error == 0);
425 	}
426 
427 #ifdef BHYVE_SNAPSHOT
428 	checkpoint_cpu_add(vi->vcpuid);
429 #endif
430 #ifdef BHYVE_GDB
431 	gdb_cpu_add(vi->vcpu);
432 #endif
433 
434 	vm_loop(vi->ctx, vi->vcpu);
435 
436 	/* not reached */
437 	exit(1);
438 	return (NULL);
439 }
440 
441 void
442 fbsdrun_addcpu(int vcpuid)
443 {
444 	struct vcpu_info *vi;
445 	pthread_t thr;
446 	int error;
447 
448 	vi = &vcpu_info[vcpuid];
449 
450 	error = vm_activate_cpu(vi->vcpu);
451 	if (error != 0)
452 		err(EX_OSERR, "could not activate CPU %d", vi->vcpuid);
453 
454 	CPU_SET_ATOMIC(vcpuid, &cpumask);
455 
456 	vm_suspend_cpu(vi->vcpu);
457 
458 	error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi);
459 	assert(error == 0);
460 }
461 
462 void
463 fbsdrun_deletecpu(int vcpu)
464 {
465 	static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
466 	static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER;
467 
468 	pthread_mutex_lock(&resetcpu_mtx);
469 	if (!CPU_ISSET(vcpu, &cpumask)) {
470 		EPRINTLN("Attempting to delete unknown cpu %d", vcpu);
471 		exit(4);
472 	}
473 
474 	CPU_CLR(vcpu, &cpumask);
475 
476 	if (vcpu != BSP) {
477 		pthread_cond_signal(&resetcpu_cond);
478 		pthread_mutex_unlock(&resetcpu_mtx);
479 		pthread_exit(NULL);
480 		/* NOTREACHED */
481 	}
482 
483 	while (!CPU_EMPTY(&cpumask)) {
484 		pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx);
485 	}
486 	pthread_mutex_unlock(&resetcpu_mtx);
487 }
488 
489 int
490 fbsdrun_suspendcpu(int vcpuid)
491 {
492 	return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu));
493 }
494 
495 static void
496 vm_loop(struct vmctx *ctx, struct vcpu *vcpu)
497 {
498 	struct vm_exit vme;
499 	struct vm_run vmrun;
500 	int error, rc;
501 	enum vm_exitcode exitcode;
502 	cpuset_t active_cpus, dmask;
503 
504 	error = vm_active_cpus(ctx, &active_cpus);
505 	assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus));
506 
507 	vmrun.vm_exit = &vme;
508 	vmrun.cpuset = &dmask;
509 	vmrun.cpusetsize = sizeof(dmask);
510 
511 	while (1) {
512 		error = vm_run(vcpu, &vmrun);
513 		if (error != 0)
514 			break;
515 
516 		exitcode = vme.exitcode;
517 		if (exitcode >= VM_EXITCODE_MAX ||
518 		    vmexit_handlers[exitcode] == NULL) {
519 			warnx("vm_loop: unexpected exitcode 0x%x", exitcode);
520 			exit(4);
521 		}
522 
523 		rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun);
524 
525 		switch (rc) {
526 		case VMEXIT_CONTINUE:
527 			break;
528 		case VMEXIT_ABORT:
529 			abort();
530 		default:
531 			exit(4);
532 		}
533 	}
534 	EPRINTLN("vm_run error %d, errno %d", error, errno);
535 }
536 
537 static int
538 num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu)
539 {
540 	uint16_t sockets, cores, threads, maxcpus;
541 	int tmp, error;
542 
543 	/*
544 	 * The guest is allowed to spinup more than one processor only if the
545 	 * UNRESTRICTED_GUEST capability is available.
546 	 */
547 	error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp);
548 	if (error != 0)
549 		return (1);
550 
551 	error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus);
552 	if (error == 0)
553 		return (maxcpus);
554 	else
555 		return (1);
556 }
557 
558 static struct vmctx *
559 do_open(const char *vmname)
560 {
561 	struct vmctx *ctx;
562 	int error;
563 	bool reinit, romboot;
564 
565 	reinit = romboot = false;
566 
567 #ifdef __amd64__
568 	if (lpc_bootrom())
569 		romboot = true;
570 #endif
571 
572 	error = vm_create(vmname);
573 	if (error) {
574 		if (errno == EEXIST) {
575 			if (romboot) {
576 				reinit = true;
577 			} else {
578 				/*
579 				 * The virtual machine has been setup by the
580 				 * userspace bootloader.
581 				 */
582 			}
583 		} else {
584 			perror("vm_create");
585 			exit(4);
586 		}
587 	} else {
588 		if (!romboot) {
589 			/*
590 			 * If the virtual machine was just created then a
591 			 * bootrom must be configured to boot it.
592 			 */
593 			fprintf(stderr, "virtual machine cannot be booted\n");
594 			exit(4);
595 		}
596 	}
597 
598 	ctx = vm_open(vmname);
599 	if (ctx == NULL) {
600 		perror("vm_open");
601 		exit(4);
602 	}
603 
604 #ifndef WITHOUT_CAPSICUM
605 	if (vm_limit_rights(ctx) != 0)
606 		err(EX_OSERR, "vm_limit_rights");
607 #endif
608 
609 	if (reinit) {
610 		error = vm_reinit(ctx);
611 		if (error) {
612 			perror("vm_reinit");
613 			exit(4);
614 		}
615 	}
616 	error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0);
617 	if (error)
618 		errx(EX_OSERR, "vm_set_topology");
619 	return (ctx);
620 }
621 
/*
 * Parse a single "path=value" option and store it as a config value.
 *
 * Returns false when 'option' contains no '=' or when nothing follows the
 * first '='; returns true once the value has been stored.  An allocation
 * failure terminates the process with status 4.
 */
static bool
parse_config_option(const char *option)
{
	const char *value;
	char *path;

	value = strchr(option, '=');
	if (value == NULL || value[1] == '\0')
		return (false);
	path = strndup(option, value - option);
	if (path == NULL)
		err(4, "Failed to allocate memory");
	set_config_value(path, value + 1);
	/*
	 * Other callers in this file release their buffers right after
	 * set_config_value(), so the temporary path copy is freed here too
	 * instead of being leaked.
	 */
	free(path);
	return (true);
}
637 
/*
 * Read a flat configuration file made of "path=value" lines and store
 * every option as a config value.
 *
 * Lines whose first character is '#' and empty lines are skipped.  A
 * line that parse_config_option() rejects, a file that cannot be opened,
 * or a read error terminates the process with status 4.
 */
static void
parse_simple_config_file(const char *path)
{
	FILE *fp;
	char *line, *cp;
	size_t linecap;
	unsigned int lineno;

	fp = fopen(path, "r");
	if (fp == NULL)
		err(4, "Failed to open configuration file %s", path);
	line = NULL;
	linecap = 0;
	for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) {
		if (*line == '#' || *line == '\n')
			continue;
		/* Strip the trailing newline, if there is one. */
		cp = strchr(line, '\n');
		if (cp != NULL)
			*cp = '\0';
		if (!parse_config_option(line))
			errx(4, "%s line %u: invalid config option '%s'", path,
			    lineno, line);
	}
	/* getline() ends the loop on EOF and on error alike; tell them apart. */
	if (ferror(fp))
		err(4, "Failed to read configuration file %s", path);
	free(line);
	fclose(fp);
}
665 
#ifdef BHYVE_GDB
/*
 * Parse a '-G' argument of the form [w][address:]port.
 *
 * A leading 'w' sets "gdb.wait".  If the remainder contains a ':', the
 * text before the last ':' is stored as "gdb.address" and the text after
 * it as "gdb.port"; otherwise the whole remainder is the port.
 *
 * The address is copied out instead of being split in place, so the
 * caller's string is never modified through the const parameter.
 */
static void
parse_gdb_options(const char *opt)
{
	const char *colon, *sport;
	char *addr;

	if (opt[0] == 'w') {
		set_config_bool("gdb.wait", true);
		opt++;
	}

	sport = opt;
	colon = strrchr(opt, ':');
	if (colon != NULL) {
		addr = strndup(opt, colon - opt);
		if (addr == NULL)
			err(4, "Failed to allocate memory");
		set_config_value("gdb.address", addr);
		free(addr);
		sport = colon + 1;
	}

	set_config_value("gdb.port", sport);
}
#endif
691 
692 int
693 main(int argc, char *argv[])
694 {
695 	int c, error;
696 	int max_vcpus, memflags;
697 	struct vcpu *bsp;
698 	struct vmctx *ctx;
699 	size_t memsize;
700 	const char *optstr, *value, *vmname;
701 #ifdef BHYVE_SNAPSHOT
702 	char *restore_file;
703 	struct restore_state rstate;
704 
705 	restore_file = NULL;
706 #endif
707 
708 	bhyve_init_config();
709 
710 	progname = basename(argv[0]);
711 
712 #ifdef BHYVE_SNAPSHOT
713 	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:";
714 #else
715 	optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:";
716 #endif
717 	while ((c = getopt(argc, argv, optstr)) != -1) {
718 		switch (c) {
719 #ifdef __amd64__
720 		case 'a':
721 			set_config_bool("x86.x2apic", false);
722 			break;
723 #endif
724 		case 'A':
725 			/*
726 			 * NOP. For backward compatibility. Most systems don't
727 			 * work properly without sane ACPI tables. Therefore,
728 			 * we're always generating them.
729 			 */
730 			break;
731 		case 'D':
732 			set_config_bool("destroy_on_poweroff", true);
733 			break;
734 		case 'p':
735 			if (pincpu_parse(optarg) != 0) {
736 				errx(EX_USAGE, "invalid vcpu pinning "
737 				    "configuration '%s'", optarg);
738 			}
739 			break;
740 		case 'c':
741 			if (topology_parse(optarg) != 0) {
742 			    errx(EX_USAGE, "invalid cpu topology "
743 				"'%s'", optarg);
744 			}
745 			break;
746 		case 'C':
747 			set_config_bool("memory.guest_in_core", true);
748 			break;
749 		case 'f':
750 			if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) {
751 			    errx(EX_USAGE, "invalid fwcfg item '%s'", optarg);
752 			}
753 			break;
754 #ifdef BHYVE_GDB
755 		case 'G':
756 			parse_gdb_options(optarg);
757 			break;
758 #endif
759 		case 'k':
760 			parse_simple_config_file(optarg);
761 			break;
762 		case 'K':
763 			set_config_value("keyboard.layout", optarg);
764 			break;
765 #ifdef __amd64__
766 		case 'l':
767 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
768 				lpc_print_supported_devices();
769 				exit(0);
770 			} else if (lpc_device_parse(optarg) != 0) {
771 				errx(EX_USAGE, "invalid lpc device "
772 				    "configuration '%s'", optarg);
773 			}
774 			break;
775 #endif
776 #ifdef BHYVE_SNAPSHOT
777 		case 'r':
778 			restore_file = optarg;
779 			break;
780 #endif
781 		case 's':
782 			if (strncmp(optarg, "help", strlen(optarg)) == 0) {
783 				pci_print_supported_devices();
784 				exit(0);
785 			} else if (pci_parse_slot(optarg) != 0)
786 				exit(4);
787 			else
788 				break;
789 		case 'S':
790 			set_config_bool("memory.wired", true);
791 			break;
792 		case 'm':
793 			set_config_value("memory.size", optarg);
794 			break;
795 		case 'o':
796 			if (!parse_config_option(optarg))
797 				errx(EX_USAGE, "invalid configuration option '%s'", optarg);
798 			break;
799 #ifdef __amd64__
800 		case 'H':
801 			set_config_bool("x86.vmexit_on_hlt", true);
802 			break;
803 		case 'I':
804 			/*
805 			 * The "-I" option was used to add an ioapic to the
806 			 * virtual machine.
807 			 *
808 			 * An ioapic is now provided unconditionally for each
809 			 * virtual machine and this option is now deprecated.
810 			 */
811 			break;
812 		case 'P':
813 			set_config_bool("x86.vmexit_on_pause", true);
814 			break;
815 		case 'e':
816 			set_config_bool("x86.strictio", true);
817 			break;
818 		case 'u':
819 			set_config_bool("rtc.use_localtime", false);
820 			break;
821 #endif
822 		case 'U':
823 			set_config_value("uuid", optarg);
824 			break;
825 #ifdef __amd64__
826 		case 'w':
827 			set_config_bool("x86.strictmsr", false);
828 			break;
829 #endif
830 		case 'W':
831 			set_config_bool("virtio_msix", false);
832 			break;
833 #ifdef __amd64__
834 		case 'x':
835 			set_config_bool("x86.x2apic", true);
836 			break;
837 		case 'Y':
838 			set_config_bool("x86.mptable", false);
839 			break;
840 #endif
841 		case 'h':
842 			usage(0);
843 		default:
844 			usage(1);
845 		}
846 	}
847 	argc -= optind;
848 	argv += optind;
849 
850 	if (argc > 1)
851 		usage(1);
852 
853 #ifdef BHYVE_SNAPSHOT
854 	if (restore_file != NULL) {
855 		error = load_restore_file(restore_file, &rstate);
856 		if (error) {
857 			fprintf(stderr, "Failed to read checkpoint info from "
858 					"file: '%s'.\n", restore_file);
859 			exit(1);
860 		}
861 		vmname = lookup_vmname(&rstate);
862 		if (vmname != NULL)
863 			set_config_value("name", vmname);
864 	}
865 #endif
866 
867 	if (argc == 1)
868 		set_config_value("name", argv[0]);
869 
870 	vmname = get_config_value("name");
871 	if (vmname == NULL)
872 		usage(1);
873 
874 	if (get_config_bool_default("config.dump", false)) {
875 		dump_config();
876 		exit(1);
877 	}
878 
879 	calc_topology();
880 	build_vcpumaps();
881 
882 	value = get_config_value("memory.size");
883 	error = vm_parse_memsize(value, &memsize);
884 	if (error)
885 		errx(EX_USAGE, "invalid memsize '%s'", value);
886 
887 	ctx = do_open(vmname);
888 
889 #ifdef BHYVE_SNAPSHOT
890 	if (restore_file != NULL) {
891 		guest_ncpus = lookup_guest_ncpus(&rstate);
892 		memflags = lookup_memflags(&rstate);
893 		memsize = lookup_memsize(&rstate);
894 	}
895 
896 	if (guest_ncpus < 1) {
897 		fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus);
898 		exit(1);
899 	}
900 #endif
901 
902 	bsp = vm_vcpu_open(ctx, BSP);
903 	max_vcpus = num_vcpus_allowed(ctx, bsp);
904 	if (guest_ncpus > max_vcpus) {
905 		fprintf(stderr, "%d vCPUs requested but only %d available\n",
906 			guest_ncpus, max_vcpus);
907 		exit(4);
908 	}
909 
910 	bhyve_init_vcpu(bsp);
911 
912 	/* Allocate per-VCPU resources. */
913 	vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info));
914 	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) {
915 		vcpu_info[vcpuid].ctx = ctx;
916 		vcpu_info[vcpuid].vcpuid = vcpuid;
917 		if (vcpuid == BSP)
918 			vcpu_info[vcpuid].vcpu = bsp;
919 		else
920 			vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid);
921 	}
922 
923 	memflags = 0;
924 	if (get_config_bool_default("memory.wired", false))
925 		memflags |= VM_MEM_F_WIRED;
926 	if (get_config_bool_default("memory.guest_in_core", false))
927 		memflags |= VM_MEM_F_INCORE;
928 	vm_set_memflags(ctx, memflags);
929 	error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
930 	if (error) {
931 		fprintf(stderr, "Unable to setup memory (%d)\n", errno);
932 		exit(4);
933 	}
934 
935 	init_mem(guest_ncpus);
936 	init_bootrom(ctx);
937 	if (bhyve_init_platform(ctx, bsp) != 0)
938 		exit(4);
939 
940 	if (qemu_fwcfg_init(ctx) != 0) {
941 		fprintf(stderr, "qemu fwcfg initialization error\n");
942 		exit(4);
943 	}
944 
945 	if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus),
946 	    &guest_ncpus) != 0) {
947 		fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n");
948 		exit(4);
949 	}
950 
951 	/*
952 	 * Exit if a device emulation finds an error in its initialization
953 	 */
954 	if (init_pci(ctx) != 0) {
955 		EPRINTLN("Device emulation initialization error: %s",
956 		    strerror(errno));
957 		exit(4);
958 	}
959 	if (init_tpm(ctx) != 0) {
960 		EPRINTLN("Failed to init TPM device");
961 		exit(4);
962 	}
963 
964 	/*
965 	 * Initialize after PCI, to allow a bootrom file to reserve the high
966 	 * region.
967 	 */
968 	if (get_config_bool("acpi_tables"))
969 		vmgenc_init(ctx);
970 
971 #ifdef BHYVE_GDB
972 	init_gdb(ctx);
973 #endif
974 
975 	/*
976 	 * Add all vCPUs.
977 	 */
978 	for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
979 		bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP);
980 
981 #ifdef BHYVE_SNAPSHOT
982 	if (restore_file != NULL) {
983 		FPRINTLN(stdout, "Pausing pci devs...");
984 		if (vm_pause_devices() != 0) {
985 			EPRINTLN("Failed to pause PCI device state.");
986 			exit(1);
987 		}
988 
989 		FPRINTLN(stdout, "Restoring vm mem...");
990 		if (restore_vm_mem(ctx, &rstate) != 0) {
991 			EPRINTLN("Failed to restore VM memory.");
992 			exit(1);
993 		}
994 
995 		FPRINTLN(stdout, "Restoring pci devs...");
996 		if (vm_restore_devices(&rstate) != 0) {
997 			EPRINTLN("Failed to restore PCI device state.");
998 			exit(1);
999 		}
1000 
1001 		FPRINTLN(stdout, "Restoring kernel structs...");
1002 		if (vm_restore_kern_structs(ctx, &rstate) != 0) {
1003 			EPRINTLN("Failed to restore kernel structs.");
1004 			exit(1);
1005 		}
1006 
1007 		FPRINTLN(stdout, "Resuming pci devs...");
1008 		if (vm_resume_devices() != 0) {
1009 			EPRINTLN("Failed to resume PCI device state.");
1010 			exit(1);
1011 		}
1012 	}
1013 #endif
1014 
1015 	if (bhyve_init_platform_late(ctx, bsp) != 0)
1016 		exit(4);
1017 
1018 	/*
1019 	 * Change the proc title to include the VM name.
1020 	 */
1021 	setproctitle("%s", vmname);
1022 
1023 #ifdef BHYVE_SNAPSHOT
1024 	/*
1025 	 * checkpointing thread for communication with bhyvectl
1026 	 */
1027 	if (init_checkpoint_thread(ctx) != 0)
1028 		errx(EX_OSERR, "Failed to start checkpoint thread");
1029 #endif
1030 
1031 #ifndef WITHOUT_CAPSICUM
1032 	caph_cache_catpages();
1033 
1034 	if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1)
1035 		errx(EX_OSERR, "Unable to apply rights for sandbox");
1036 
1037 	if (caph_enter() == -1)
1038 		errx(EX_OSERR, "cap_enter() failed");
1039 #endif
1040 
1041 #ifdef BHYVE_SNAPSHOT
1042 	if (restore_file != NULL) {
1043 		destroy_restore_state(&rstate);
1044 		if (vm_restore_time(ctx) < 0)
1045 			err(EX_OSERR, "Unable to restore time");
1046 
1047 		for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++)
1048 			vm_resume_cpu(vcpu_info[vcpuid].vcpu);
1049 	} else
1050 #endif
1051 		vm_resume_cpu(bsp);
1052 
1053 	/*
1054 	 * Head off to the main event dispatch loop
1055 	 */
1056 	mevent_dispatch();
1057 
1058 	exit(4);
1059 }
1060