xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm.c (revision 61b20185b3a9f12c5f69672abe47b79dfb002cab)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 /*
31  * This file and its contents are supplied under the terms of the
32  * Common Development and Distribution License ("CDDL"), version 1.0.
33  * You may only use this file in accordance with the terms of version
34  * 1.0 of the CDDL.
35  *
36  * A full copy of the text of the CDDL should have accompanied this
37  * source.  A copy of the CDDL is also available via the Internet at
38  * http://www.illumos.org/license/CDDL.
39  *
40  * Copyright 2015 Pluribus Networks Inc.
41  * Copyright 2018 Joyent, Inc.
42  * Copyright 2022 Oxide Computer Company
43  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44  */
45 
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/module.h>
53 #include <sys/sysctl.h>
54 #include <sys/malloc.h>
55 #include <sys/pcpu.h>
56 #include <sys/mutex.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/sched.h>
60 #include <sys/systm.h>
61 #include <sys/sunddi.h>
62 #include <sys/hma.h>
63 
64 #include <machine/md_var.h>
65 #include <x86/psl.h>
66 #include <x86/apicreg.h>
67 
68 #include <machine/specialreg.h>
69 #include <machine/vmm.h>
70 #include <machine/vmm_dev.h>
71 #include <machine/vmparam.h>
72 #include <sys/vmm_instruction_emul.h>
73 #include <sys/vmm_vm.h>
74 #include <sys/vmm_gpt.h>
75 
76 #include "vmm_ioport.h"
77 #include "vmm_ktr.h"
78 #include "vmm_host.h"
79 #include "vmm_util.h"
80 #include "vatpic.h"
81 #include "vatpit.h"
82 #include "vhpet.h"
83 #include "vioapic.h"
84 #include "vlapic.h"
85 #include "vpmtmr.h"
86 #include "vrtc.h"
87 #include "vmm_stat.h"
88 #include "vmm_lapic.h"
89 
90 #include "io/ppt.h"
91 #include "io/iommu.h"
92 
93 struct vlapic;
94 
95 /* Flags for vtc_status */
96 #define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
97 #define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */
98 
99 typedef struct vm_thread_ctx {
100 	struct vm	*vtc_vm;
101 	int		vtc_vcpuid;
102 	uint_t		vtc_status;
103 	enum vcpu_ustate vtc_ustate;
104 } vm_thread_ctx_t;
105 
106 #define	VMM_MTRR_VAR_MAX 10
107 #define	VMM_MTRR_DEF_MASK \
108 	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
109 #define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
110 #define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
111 struct vm_mtrr {
112 	uint64_t def_type;
113 	uint64_t fixed4k[8];
114 	uint64_t fixed16k[2];
115 	uint64_t fixed64k;
116 	struct {
117 		uint64_t base;
118 		uint64_t mask;
119 	} var[VMM_MTRR_VAR_MAX];
120 };
121 
122 /*
123  * Initialization:
124  * (a) allocated when vcpu is created
125  * (i) initialized when vcpu is created and when it is reinitialized
126  * (o) initialized the first time the vcpu is created
127  * (x) initialized before use
128  */
129 struct vcpu {
130 	/* (o) protects state, run_state, hostcpu, sipi_vector */
131 	kmutex_t	lock;
132 
133 	enum vcpu_state	state;		/* (o) vcpu state */
134 	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
135 	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
136 	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
137 	int		hostcpu;	/* (o) vcpu's current host cpu */
138 	int		lastloccpu;	/* (o) last host cpu localized to */
139 	int		reqidle;	/* (i) request vcpu to idle */
140 	struct vlapic	*vlapic;	/* (i) APIC device model */
141 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
142 	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
143 	uint64_t	exc_pending;	/* (i) exception pending */
144 	bool		nmi_pending;	/* (i) NMI pending */
145 	bool		extint_pending;	/* (i) INTR pending */
146 
147 	uint8_t		sipi_vector;	/* (i) SIPI vector */
148 	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
149 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
150 	void		*stats;		/* (a,i) statistics */
151 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
152 	uint64_t	nextrip;	/* (x) next instruction to execute */
153 	struct vie	*vie_ctx;	/* (x) instruction emulation context */
154 	vm_client_t	*vmclient;	/* (a) VM-system client */
155 	uint64_t	tsc_offset;	/* (x) offset from host TSC */
156 	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
157 
158 	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
159 	hrtime_t	ustate_when;	/* (i) time of last ustate change */
160 	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
161 	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
162 	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
163 };
164 
165 #define	vcpu_lock(v)		mutex_enter(&((v)->lock))
166 #define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
167 #define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))
168 
169 struct mem_seg {
170 	size_t	len;
171 	bool	sysmem;
172 	vm_object_t *object;
173 };
174 #define	VM_MAX_MEMSEGS	5
175 
176 struct mem_map {
177 	vm_paddr_t	gpa;
178 	size_t		len;
179 	vm_ooffset_t	segoff;
180 	int		segid;
181 	int		prot;
182 	int		flags;
183 };
184 #define	VM_MAX_MEMMAPS	8
185 
186 /*
187  * Initialization:
188  * (o) initialized the first time the VM is created
189  * (i) initialized when VM is created and when it is reinitialized
190  * (x) initialized before use
191  */
192 struct vm {
193 	void		*cookie;		/* (i) cpu-specific data */
194 	void		*iommu;			/* (x) iommu-specific data */
195 	struct vhpet	*vhpet;			/* (i) virtual HPET */
196 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
197 	struct vatpic	*vatpic;		/* (i) virtual atpic */
198 	struct vatpit	*vatpit;		/* (i) virtual atpit */
199 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
200 	struct vrtc	*vrtc;			/* (o) virtual RTC */
201 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
202 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
203 	int		suspend;		/* (i) stop VM execution */
204 	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
205 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
206 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
207 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
208 	struct vmspace	*vmspace;		/* (o) guest's address space */
209 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
210 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
211 	/* The following describe the vm cpu topology */
212 	uint16_t	sockets;		/* (o) num of sockets */
213 	uint16_t	cores;			/* (o) num of cores/socket */
214 	uint16_t	threads;		/* (o) num of threads/core */
215 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
216 	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
217 
218 	struct ioport_config ioports;		/* (o) ioport handling */
219 
220 	bool		mem_transient;		/* (o) alloc transient memory */
221 };
222 
223 static int vmm_initialized;
224 
225 
226 static void
227 nullop_panic(void)
228 {
229 	panic("null vmm operation call");
230 }
231 
232 /* Do not allow use of an un-set `ops` to do anything but panic */
233 static struct vmm_ops vmm_ops_null = {
234 	.init		= (vmm_init_func_t)nullop_panic,
235 	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
236 	.resume		= (vmm_resume_func_t)nullop_panic,
237 	.vminit		= (vmi_init_func_t)nullop_panic,
238 	.vmrun		= (vmi_run_func_t)nullop_panic,
239 	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
240 	.vmgetreg	= (vmi_get_register_t)nullop_panic,
241 	.vmsetreg	= (vmi_set_register_t)nullop_panic,
242 	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
243 	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
244 	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
245 	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
246 	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
247 	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
248 	.vmsavectx	= (vmi_savectx)nullop_panic,
249 	.vmrestorectx	= (vmi_restorectx)nullop_panic,
250 };
251 
252 static struct vmm_ops *ops = &vmm_ops_null;
253 static vmm_pte_ops_t *pte_ops = NULL;
254 
255 #define	VMM_INIT()			((*ops->init)())
256 #define	VMM_CLEANUP()			((*ops->cleanup)())
257 #define	VMM_RESUME()			((*ops->resume)())
258 
259 #define	VMINIT(vm)		((*ops->vminit)(vm))
260 #define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
261 #define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))
262 
263 #define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
264 #define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
265 #define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
266 #define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
267 #define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
268 #define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
269 #define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
270 #define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))
271 
272 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
273 #define	fpu_stop_emulating()	clts()
274 
275 SDT_PROVIDER_DEFINE(vmm);
276 
277 static MALLOC_DEFINE(M_VM, "vm", "vm");
278 
279 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
280     NULL);
281 
282 /*
283  * Halt the guest if all vcpus are executing a HLT instruction with
284  * interrupts disabled.
285  */
286 static int halt_detection_enabled = 1;
287 
288 /* Trap into hypervisor on all guest exceptions and reflect them back */
289 static int trace_guest_exceptions;
290 
291 static void vm_free_memmap(struct vm *vm, int ident);
292 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
293 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
294 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
295 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
296 
297 static void vmm_savectx(void *);
298 static void vmm_restorectx(void *);
299 static const struct ctxop_template vmm_ctxop_tpl = {
300 	.ct_rev		= CTXOP_TPL_REV,
301 	.ct_save	= vmm_savectx,
302 	.ct_restore	= vmm_restorectx,
303 };
304 
305 #ifdef KTR
306 static const char *
307 vcpu_state2str(enum vcpu_state state)
308 {
309 
310 	switch (state) {
311 	case VCPU_IDLE:
312 		return ("idle");
313 	case VCPU_FROZEN:
314 		return ("frozen");
315 	case VCPU_RUNNING:
316 		return ("running");
317 	case VCPU_SLEEPING:
318 		return ("sleeping");
319 	default:
320 		return ("unknown");
321 	}
322 }
323 #endif
324 
325 static void
326 vcpu_cleanup(struct vm *vm, int i, bool destroy)
327 {
328 	struct vcpu *vcpu = &vm->vcpu[i];
329 
330 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
331 	if (destroy) {
332 		vmm_stat_free(vcpu->stats);
333 
334 		hma_fpu_free(vcpu->guestfpu);
335 		vcpu->guestfpu = NULL;
336 
337 		vie_free(vcpu->vie_ctx);
338 		vcpu->vie_ctx = NULL;
339 
340 		vmc_destroy(vcpu->vmclient);
341 		vcpu->vmclient = NULL;
342 
343 		ctxop_free(vcpu->ctxop);
344 		mutex_destroy(&vcpu->lock);
345 	}
346 }
347 
348 static void
349 vcpu_init(struct vm *vm, int vcpu_id, bool create)
350 {
351 	struct vcpu *vcpu;
352 
353 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
354 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
355 
356 	vcpu = &vm->vcpu[vcpu_id];
357 
358 	if (create) {
359 		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);
360 
361 		vcpu->state = VCPU_IDLE;
362 		vcpu->hostcpu = NOCPU;
363 		vcpu->lastloccpu = NOCPU;
364 		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
365 		vcpu->stats = vmm_stat_alloc();
366 		vcpu->vie_ctx = vie_alloc();
367 
368 		vcpu->ustate = VU_INIT;
369 		vcpu->ustate_when = gethrtime();
370 
371 		vcpu->vtc.vtc_vm = vm;
372 		vcpu->vtc.vtc_vcpuid = vcpu_id;
373 		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
374 	} else {
375 		vie_reset(vcpu->vie_ctx);
376 		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
377 		if (vcpu->ustate != VU_INIT) {
378 			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
379 		}
380 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
381 	}
382 
383 	vcpu->run_state = VRS_HALT;
384 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
385 	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
386 	vcpu->reqidle = 0;
387 	vcpu->exit_intinfo = 0;
388 	vcpu->nmi_pending = false;
389 	vcpu->extint_pending = false;
390 	vcpu->exc_pending = 0;
391 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
392 	(void) hma_fpu_init(vcpu->guestfpu);
393 	vmm_stat_init(vcpu->stats);
394 	vcpu->tsc_offset = 0;
395 }
396 
397 int
398 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
399 {
400 
401 	return (trace_guest_exceptions);
402 }
403 
404 struct vm_exit *
405 vm_exitinfo(struct vm *vm, int cpuid)
406 {
407 	struct vcpu *vcpu;
408 
409 	if (cpuid < 0 || cpuid >= vm->maxcpus)
410 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
411 
412 	vcpu = &vm->vcpu[cpuid];
413 
414 	return (&vcpu->exitinfo);
415 }
416 
417 struct vie *
418 vm_vie_ctx(struct vm *vm, int cpuid)
419 {
420 	if (cpuid < 0 || cpuid >= vm->maxcpus)
421 		panic("vm_vie_ctx: invalid cpuid %d", cpuid);
422 
423 	return (vm->vcpu[cpuid].vie_ctx);
424 }
425 
426 static int
427 vmm_init(void)
428 {
429 	vmm_host_state_init();
430 
431 	if (vmm_is_intel()) {
432 		ops = &vmm_ops_intel;
433 		pte_ops = &ept_pte_ops;
434 	} else if (vmm_is_svm()) {
435 		ops = &vmm_ops_amd;
436 		pte_ops = &rvi_pte_ops;
437 	} else {
438 		return (ENXIO);
439 	}
440 
441 	return (VMM_INIT());
442 }
443 
444 int
445 vmm_mod_load()
446 {
447 	int	error;
448 
449 	VERIFY(vmm_initialized == 0);
450 
451 	error = vmm_init();
452 	if (error == 0)
453 		vmm_initialized = 1;
454 
455 	return (error);
456 }
457 
458 int
459 vmm_mod_unload()
460 {
461 	int	error;
462 
463 	VERIFY(vmm_initialized == 1);
464 
465 	iommu_cleanup();
466 	error = VMM_CLEANUP();
467 	if (error)
468 		return (error);
469 	vmm_initialized = 0;
470 
471 	return (0);
472 }
473 
474 static void
475 vm_init(struct vm *vm, bool create)
476 {
477 	int i;
478 
479 	vm->cookie = VMINIT(vm);
480 	vm->iommu = NULL;
481 	vm->vioapic = vioapic_init(vm);
482 	vm->vhpet = vhpet_init(vm);
483 	vm->vatpic = vatpic_init(vm);
484 	vm->vatpit = vatpit_init(vm);
485 	vm->vpmtmr = vpmtmr_init(vm);
486 	if (create)
487 		vm->vrtc = vrtc_init(vm);
488 
489 	vm_inout_init(vm, &vm->ioports);
490 
491 	CPU_ZERO(&vm->active_cpus);
492 	CPU_ZERO(&vm->debug_cpus);
493 
494 	vm->suspend = 0;
495 	CPU_ZERO(&vm->suspended_cpus);
496 
497 	for (i = 0; i < vm->maxcpus; i++)
498 		vcpu_init(vm, i, create);
499 
500 	/*
501 	 * Configure the VM-wide TSC offset so that the call to vm_init()
502 	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
503 	 * have its own offset from this, which is altered if/when the guest
504 	 * writes to MSR_TSC.
505 	 *
506 	 * The TSC offsetting math is all unsigned, using overflow for negative
507 	 * offets.  A reading of the TSC is negated to form the boot offset.
508 	 */
509 	vm->boot_tsc_offset = (uint64_t)(-(int64_t)rdtsc_offset());
510 }
511 
512 /*
513  * The default CPU topology is a single thread per package.
514  */
515 uint_t cores_per_package = 1;
516 uint_t threads_per_core = 1;
517 
518 /*
519  * Debugging tunable to enable dirty-page-tracking.
520  * (Remains off by default for now)
521  */
522 bool gpt_track_dirty = false;
523 
524 int
525 vm_create(const char *name, uint64_t flags, struct vm **retvm)
526 {
527 	struct vm *vm;
528 	struct vmspace *vmspace;
529 
530 	/*
531 	 * If vmm.ko could not be successfully initialized then don't attempt
532 	 * to create the virtual machine.
533 	 */
534 	if (!vmm_initialized)
535 		return (ENXIO);
536 
537 	/* Name validation has already occurred */
538 	VERIFY3U(strnlen(name, VM_MAX_NAMELEN), <, VM_MAX_NAMELEN);
539 
540 	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
541 	if (vmspace == NULL)
542 		return (ENOMEM);
543 
544 	vm = malloc(sizeof (struct vm), M_VM, M_WAITOK | M_ZERO);
545 	(void) strlcpy(vm->name, name, sizeof (vm->name));
546 
547 	vm->vmspace = vmspace;
548 	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
549 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
550 		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
551 	}
552 
553 	vm->sockets = 1;
554 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
555 	vm->threads = threads_per_core;	/* XXX backwards compatibility */
556 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
557 
558 	vm_init(vm, true);
559 
560 	*retvm = vm;
561 	return (0);
562 }
563 
564 void
565 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
566     uint16_t *threads, uint16_t *maxcpus)
567 {
568 	*sockets = vm->sockets;
569 	*cores = vm->cores;
570 	*threads = vm->threads;
571 	*maxcpus = vm->maxcpus;
572 }
573 
574 uint16_t
575 vm_get_maxcpus(struct vm *vm)
576 {
577 	return (vm->maxcpus);
578 }
579 
580 int
581 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
582     uint16_t threads, uint16_t maxcpus)
583 {
584 	if (maxcpus != 0)
585 		return (EINVAL);	/* XXX remove when supported */
586 	if ((sockets * cores * threads) > vm->maxcpus)
587 		return (EINVAL);
588 	/* XXX need to check sockets * cores * threads == vCPU, how? */
589 	vm->sockets = sockets;
590 	vm->cores = cores;
591 	vm->threads = threads;
592 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
593 	return (0);
594 }
595 
596 static void
597 vm_cleanup(struct vm *vm, bool destroy)
598 {
599 	struct mem_map *mm;
600 	int i;
601 
602 	ppt_unassign_all(vm);
603 
604 	if (vm->iommu != NULL)
605 		iommu_destroy_domain(vm->iommu);
606 
607 	/*
608 	 * Devices which attach their own ioport hooks should be cleaned up
609 	 * first so they can tear down those registrations.
610 	 */
611 	vpmtmr_cleanup(vm->vpmtmr);
612 
613 	vm_inout_cleanup(vm, &vm->ioports);
614 
615 	if (destroy)
616 		vrtc_cleanup(vm->vrtc);
617 	else
618 		vrtc_reset(vm->vrtc);
619 
620 	vatpit_cleanup(vm->vatpit);
621 	vhpet_cleanup(vm->vhpet);
622 	vatpic_cleanup(vm->vatpic);
623 	vioapic_cleanup(vm->vioapic);
624 
625 	for (i = 0; i < vm->maxcpus; i++)
626 		vcpu_cleanup(vm, i, destroy);
627 
628 	VMCLEANUP(vm->cookie);
629 
630 	/*
631 	 * System memory is removed from the guest address space only when
632 	 * the VM is destroyed. This is because the mapping remains the same
633 	 * across VM reset.
634 	 *
635 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
636 	 * so those mappings are removed on a VM reset.
637 	 */
638 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
639 		mm = &vm->mem_maps[i];
640 		if (destroy || !sysmem_mapping(vm, mm)) {
641 			vm_free_memmap(vm, i);
642 		} else {
643 			/*
644 			 * We need to reset the IOMMU flag so this mapping can
645 			 * be reused when a VM is rebooted. Since the IOMMU
646 			 * domain has already been destroyed we can just reset
647 			 * the flag here.
648 			 */
649 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
650 		}
651 	}
652 
653 	if (destroy) {
654 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
655 			vm_free_memseg(vm, i);
656 
657 		vmspace_destroy(vm->vmspace);
658 		vm->vmspace = NULL;
659 	}
660 }
661 
662 void
663 vm_destroy(struct vm *vm)
664 {
665 	vm_cleanup(vm, true);
666 	free(vm, M_VM);
667 }
668 
669 int
670 vm_reinit(struct vm *vm, uint64_t flags)
671 {
672 	/* A virtual machine can be reset only if all vcpus are suspended. */
673 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
674 		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
675 			return (EBUSY);
676 		}
677 
678 		/*
679 		 * Force the VM (and all its vCPUs) into a suspended state.
680 		 * This should be quick and easy, since the vm_reinit() call is
681 		 * made while holding the VM write lock, which requires holding
682 		 * all of the vCPUs in the VCPU_FROZEN state.
683 		 */
684 		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
685 		    VM_SUSPEND_RESET);
686 		for (uint_t i = 0; i < vm->maxcpus; i++) {
687 			struct vcpu *vcpu = &vm->vcpu[i];
688 
689 			if (CPU_ISSET(i, &vm->suspended_cpus) ||
690 			    !CPU_ISSET(i, &vm->active_cpus)) {
691 				continue;
692 			}
693 
694 			vcpu_lock(vcpu);
695 			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
696 			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
697 			vcpu_unlock(vcpu);
698 		}
699 
700 		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
701 	}
702 
703 	vm_cleanup(vm, false);
704 	vm_init(vm, false);
705 	return (0);
706 }
707 
708 const char *
709 vm_name(struct vm *vm)
710 {
711 	return (vm->name);
712 }
713 
714 int
715 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
716 {
717 	vm_object_t *obj;
718 
719 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
720 		return (ENOMEM);
721 	else
722 		return (0);
723 }
724 
725 int
726 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
727 {
728 	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
729 }
730 
731 /*
732  * Return 'true' if 'gpa' is allocated in the guest address space.
733  *
734  * This function is called in the context of a running vcpu which acts as
735  * an implicit lock on 'vm->mem_maps[]'.
736  */
737 bool
738 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
739 {
740 	struct mem_map *mm;
741 	int i;
742 
743 #ifdef INVARIANTS
744 	int hostcpu, state;
745 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
746 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
747 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
748 #endif
749 
750 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
751 		mm = &vm->mem_maps[i];
752 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
753 			return (true);		/* 'gpa' is sysmem or devmem */
754 	}
755 
756 	if (ppt_is_mmio(vm, gpa))
757 		return (true);			/* 'gpa' is pci passthru mmio */
758 
759 	return (false);
760 }
761 
762 int
763 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
764 {
765 	struct mem_seg *seg;
766 	vm_object_t *obj;
767 
768 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
769 		return (EINVAL);
770 
771 	if (len == 0 || (len & PAGE_MASK))
772 		return (EINVAL);
773 
774 	seg = &vm->mem_segs[ident];
775 	if (seg->object != NULL) {
776 		if (seg->len == len && seg->sysmem == sysmem)
777 			return (EEXIST);
778 		else
779 			return (EINVAL);
780 	}
781 
782 	obj = vm_object_mem_allocate(len, vm->mem_transient);
783 	if (obj == NULL)
784 		return (ENOMEM);
785 
786 	seg->len = len;
787 	seg->object = obj;
788 	seg->sysmem = sysmem;
789 	return (0);
790 }
791 
792 int
793 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
794     vm_object_t **objptr)
795 {
796 	struct mem_seg *seg;
797 
798 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
799 		return (EINVAL);
800 
801 	seg = &vm->mem_segs[ident];
802 	if (len)
803 		*len = seg->len;
804 	if (sysmem)
805 		*sysmem = seg->sysmem;
806 	if (objptr)
807 		*objptr = seg->object;
808 	return (0);
809 }
810 
811 void
812 vm_free_memseg(struct vm *vm, int ident)
813 {
814 	struct mem_seg *seg;
815 
816 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
817 	    ("%s: invalid memseg ident %d", __func__, ident));
818 
819 	seg = &vm->mem_segs[ident];
820 	if (seg->object != NULL) {
821 		vm_object_release(seg->object);
822 		bzero(seg, sizeof (struct mem_seg));
823 	}
824 }
825 
826 int
827 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
828     size_t len, int prot, int flags)
829 {
830 	struct mem_seg *seg;
831 	struct mem_map *m, *map;
832 	vm_ooffset_t last;
833 	int i, error;
834 
835 	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
836 		return (EINVAL);
837 
838 	if (flags & ~VM_MEMMAP_F_WIRED)
839 		return (EINVAL);
840 
841 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
842 		return (EINVAL);
843 
844 	seg = &vm->mem_segs[segid];
845 	if (seg->object == NULL)
846 		return (EINVAL);
847 
848 	last = first + len;
849 	if (first < 0 || first >= last || last > seg->len)
850 		return (EINVAL);
851 
852 	if ((gpa | first | last) & PAGE_MASK)
853 		return (EINVAL);
854 
855 	map = NULL;
856 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
857 		m = &vm->mem_maps[i];
858 		if (m->len == 0) {
859 			map = m;
860 			break;
861 		}
862 	}
863 
864 	if (map == NULL)
865 		return (ENOSPC);
866 
867 	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
868 	if (error != 0)
869 		return (EFAULT);
870 
871 	vm_object_reference(seg->object);
872 
873 	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
874 		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
875 		if (error != 0) {
876 			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
877 			return (EFAULT);
878 		}
879 	}
880 
881 	map->gpa = gpa;
882 	map->len = len;
883 	map->segoff = first;
884 	map->segid = segid;
885 	map->prot = prot;
886 	map->flags = flags;
887 	return (0);
888 }
889 
890 int
891 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
892 {
893 	struct mem_map *m;
894 	int i;
895 
896 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
897 		m = &vm->mem_maps[i];
898 		if (m->gpa == gpa && m->len == len &&
899 		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
900 			vm_free_memmap(vm, i);
901 			return (0);
902 		}
903 	}
904 
905 	return (EINVAL);
906 }
907 
908 int
909 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
910     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
911 {
912 	struct mem_map *mm, *mmnext;
913 	int i;
914 
915 	mmnext = NULL;
916 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
917 		mm = &vm->mem_maps[i];
918 		if (mm->len == 0 || mm->gpa < *gpa)
919 			continue;
920 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
921 			mmnext = mm;
922 	}
923 
924 	if (mmnext != NULL) {
925 		*gpa = mmnext->gpa;
926 		if (segid)
927 			*segid = mmnext->segid;
928 		if (segoff)
929 			*segoff = mmnext->segoff;
930 		if (len)
931 			*len = mmnext->len;
932 		if (prot)
933 			*prot = mmnext->prot;
934 		if (flags)
935 			*flags = mmnext->flags;
936 		return (0);
937 	} else {
938 		return (ENOENT);
939 	}
940 }
941 
942 static void
943 vm_free_memmap(struct vm *vm, int ident)
944 {
945 	struct mem_map *mm;
946 	int error;
947 
948 	mm = &vm->mem_maps[ident];
949 	if (mm->len) {
950 		error = vmspace_unmap(vm->vmspace, mm->gpa,
951 		    mm->gpa + mm->len);
952 		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
953 		    __func__, error));
954 		bzero(mm, sizeof (struct mem_map));
955 	}
956 }
957 
958 static __inline bool
959 sysmem_mapping(struct vm *vm, struct mem_map *mm)
960 {
961 
962 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
963 		return (true);
964 	else
965 		return (false);
966 }
967 
968 vm_paddr_t
969 vmm_sysmem_maxaddr(struct vm *vm)
970 {
971 	struct mem_map *mm;
972 	vm_paddr_t maxaddr;
973 	int i;
974 
975 	maxaddr = 0;
976 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
977 		mm = &vm->mem_maps[i];
978 		if (sysmem_mapping(vm, mm)) {
979 			if (maxaddr < mm->gpa + mm->len)
980 				maxaddr = mm->gpa + mm->len;
981 		}
982 	}
983 	return (maxaddr);
984 }
985 
986 static void
987 vm_iommu_modify(struct vm *vm, bool map)
988 {
989 	int i, sz;
990 	vm_paddr_t gpa, hpa;
991 	struct mem_map *mm;
992 #ifdef __FreeBSD__
993 	void *vp, *cookie, *host_domain;
994 #endif
995 	vm_client_t *vmc;
996 
997 	sz = PAGE_SIZE;
998 #ifdef __FreeBSD__
999 	host_domain = iommu_host_domain();
1000 #endif
1001 	vmc = vmspace_client_alloc(vm->vmspace);
1002 
1003 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1004 		mm = &vm->mem_maps[i];
1005 		if (!sysmem_mapping(vm, mm))
1006 			continue;
1007 
1008 		if (map) {
1009 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1010 			    ("iommu map found invalid memmap %lx/%lx/%x",
1011 			    mm->gpa, mm->len, mm->flags));
1012 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1013 				continue;
1014 			mm->flags |= VM_MEMMAP_F_IOMMU;
1015 		} else {
1016 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1017 				continue;
1018 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
1019 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1020 			    ("iommu unmap found invalid memmap %lx/%lx/%x",
1021 			    mm->gpa, mm->len, mm->flags));
1022 		}
1023 
1024 		gpa = mm->gpa;
1025 		while (gpa < mm->gpa + mm->len) {
1026 			vm_page_t *vmp;
1027 
1028 			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
1029 			ASSERT(vmp != NULL);
1030 			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
1031 			(void) vmp_release(vmp);
1032 
1033 			if (map) {
1034 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1035 #ifdef __FreeBSD__
1036 				iommu_remove_mapping(host_domain, hpa, sz);
1037 #endif
1038 			} else {
1039 				iommu_remove_mapping(vm->iommu, gpa, sz);
1040 #ifdef __FreeBSD__
1041 				iommu_create_mapping(host_domain, hpa, hpa, sz);
1042 #endif
1043 			}
1044 
1045 			gpa += PAGE_SIZE;
1046 		}
1047 	}
1048 	vmc_destroy(vmc);
1049 
1050 	/*
1051 	 * Invalidate the cached translations associated with the domain
1052 	 * from which pages were removed.
1053 	 */
1054 #ifdef __FreeBSD__
1055 	if (map)
1056 		iommu_invalidate_tlb(host_domain);
1057 	else
1058 		iommu_invalidate_tlb(vm->iommu);
1059 #else
1060 	iommu_invalidate_tlb(vm->iommu);
1061 #endif
1062 }
1063 
1064 int
1065 vm_unassign_pptdev(struct vm *vm, int pptfd)
1066 {
1067 	int error;
1068 
1069 	error = ppt_unassign_device(vm, pptfd);
1070 	if (error)
1071 		return (error);
1072 
1073 	if (ppt_assigned_devices(vm) == 0)
1074 		vm_iommu_modify(vm, false);
1075 
1076 	return (0);
1077 }
1078 
1079 int
1080 vm_assign_pptdev(struct vm *vm, int pptfd)
1081 {
1082 	int error;
1083 	vm_paddr_t maxaddr;
1084 
1085 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1086 	if (ppt_assigned_devices(vm) == 0) {
1087 		KASSERT(vm->iommu == NULL,
1088 		    ("vm_assign_pptdev: iommu must be NULL"));
1089 		maxaddr = vmm_sysmem_maxaddr(vm);
1090 		vm->iommu = iommu_create_domain(maxaddr);
1091 		if (vm->iommu == NULL)
1092 			return (ENXIO);
1093 		vm_iommu_modify(vm, true);
1094 	}
1095 
1096 	error = ppt_assign_device(vm, pptfd);
1097 	return (error);
1098 }
1099 
1100 int
1101 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1102 {
1103 
1104 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1105 		return (EINVAL);
1106 
1107 	if (reg >= VM_REG_LAST)
1108 		return (EINVAL);
1109 
1110 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
1111 }
1112 
1113 int
1114 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1115 {
1116 	struct vcpu *vcpu;
1117 	int error;
1118 
1119 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1120 		return (EINVAL);
1121 
1122 	if (reg >= VM_REG_LAST)
1123 		return (EINVAL);
1124 
1125 	error = VMSETREG(vm->cookie, vcpuid, reg, val);
1126 	if (error || reg != VM_REG_GUEST_RIP)
1127 		return (error);
1128 
1129 	/* Set 'nextrip' to match the value of %rip */
1130 	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %lx", val);
1131 	vcpu = &vm->vcpu[vcpuid];
1132 	vcpu->nextrip = val;
1133 	return (0);
1134 }
1135 
1136 static bool
1137 is_descriptor_table(int reg)
1138 {
1139 	switch (reg) {
1140 	case VM_REG_GUEST_IDTR:
1141 	case VM_REG_GUEST_GDTR:
1142 		return (true);
1143 	default:
1144 		return (false);
1145 	}
1146 }
1147 
1148 static bool
1149 is_segment_register(int reg)
1150 {
1151 	switch (reg) {
1152 	case VM_REG_GUEST_ES:
1153 	case VM_REG_GUEST_CS:
1154 	case VM_REG_GUEST_SS:
1155 	case VM_REG_GUEST_DS:
1156 	case VM_REG_GUEST_FS:
1157 	case VM_REG_GUEST_GS:
1158 	case VM_REG_GUEST_TR:
1159 	case VM_REG_GUEST_LDTR:
1160 		return (true);
1161 	default:
1162 		return (false);
1163 	}
1164 }
1165 
1166 int
1167 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1168 {
1169 
1170 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1171 		return (EINVAL);
1172 
1173 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1174 		return (EINVAL);
1175 
1176 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1177 }
1178 
1179 int
1180 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1181 {
1182 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1183 		return (EINVAL);
1184 
1185 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1186 		return (EINVAL);
1187 
1188 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1189 }
1190 
1191 static int
1192 translate_hma_xsave_result(hma_fpu_xsave_result_t res)
1193 {
1194 	switch (res) {
1195 	case HFXR_OK:
1196 		return (0);
1197 	case HFXR_NO_SPACE:
1198 		return (ENOSPC);
1199 	case HFXR_BAD_ALIGN:
1200 	case HFXR_UNSUP_FMT:
1201 	case HFXR_UNSUP_FEAT:
1202 	case HFXR_INVALID_DATA:
1203 		return (EINVAL);
1204 	default:
1205 		panic("unexpected xsave result");
1206 	}
1207 }
1208 
1209 int
1210 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1211 {
1212 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1213 		return (EINVAL);
1214 
1215 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1216 	hma_fpu_xsave_result_t res;
1217 
1218 	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
1219 	return (translate_hma_xsave_result(res));
1220 }
1221 
1222 int
1223 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1224 {
1225 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1226 		return (EINVAL);
1227 
1228 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1229 	hma_fpu_xsave_result_t res;
1230 
1231 	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
1232 	return (translate_hma_xsave_result(res));
1233 }
1234 
1235 int
1236 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1237 {
1238 	struct vcpu *vcpu;
1239 
1240 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1241 		return (EINVAL);
1242 	}
1243 
1244 	vcpu = &vm->vcpu[vcpuid];
1245 
1246 	vcpu_lock(vcpu);
1247 	*state = vcpu->run_state;
1248 	*sipi_vec = vcpu->sipi_vector;
1249 	vcpu_unlock(vcpu);
1250 
1251 	return (0);
1252 }
1253 
1254 int
1255 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1256 {
1257 	struct vcpu *vcpu;
1258 
1259 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1260 		return (EINVAL);
1261 	}
1262 	if (!VRS_IS_VALID(state)) {
1263 		return (EINVAL);
1264 	}
1265 
1266 	vcpu = &vm->vcpu[vcpuid];
1267 
1268 	vcpu_lock(vcpu);
1269 	vcpu->run_state = state;
1270 	vcpu->sipi_vector = sipi_vec;
1271 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1272 	vcpu_unlock(vcpu);
1273 
1274 	return (0);
1275 }
1276 
1277 void
1278 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
1279 {
1280 	vmspace_t *vms = vm_get_vmspace(vm);
1281 	vmspace_track_dirty(vms, gpa, len, bitmap);
1282 }
1283 
1284 static void
1285 restore_guest_fpustate(struct vcpu *vcpu)
1286 {
1287 	/* Save host FPU and restore guest FPU */
1288 	fpu_stop_emulating();
1289 	hma_fpu_start_guest(vcpu->guestfpu);
1290 
1291 	/* restore guest XCR0 if XSAVE is enabled in the host */
1292 	if (rcr4() & CR4_XSAVE)
1293 		load_xcr(0, vcpu->guest_xcr0);
1294 
1295 	/*
1296 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1297 	 * to trap any access to the FPU by the host.
1298 	 */
1299 	fpu_start_emulating();
1300 }
1301 
1302 static void
1303 save_guest_fpustate(struct vcpu *vcpu)
1304 {
1305 
1306 	if ((rcr0() & CR0_TS) == 0)
1307 		panic("fpu emulation not enabled in host!");
1308 
1309 	/* save guest XCR0 and restore host XCR0 */
1310 	if (rcr4() & CR4_XSAVE) {
1311 		vcpu->guest_xcr0 = rxcr(0);
1312 		load_xcr(0, vmm_get_host_xcr0());
1313 	}
1314 
1315 	/* save guest FPU and restore host FPU */
1316 	fpu_stop_emulating();
1317 	hma_fpu_stop_guest(vcpu->guestfpu);
1318 	/*
1319 	 * When the host state has been restored, we should not re-enable
1320 	 * CR0.TS on illumos for eager FPU.
1321 	 */
1322 }
1323 
1324 static int
1325 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1326     bool from_idle)
1327 {
1328 	struct vcpu *vcpu;
1329 	int error;
1330 
1331 	vcpu = &vm->vcpu[vcpuid];
1332 	vcpu_assert_locked(vcpu);
1333 
1334 	/*
1335 	 * State transitions from the vmmdev_ioctl() must always begin from
1336 	 * the VCPU_IDLE state. This guarantees that there is only a single
1337 	 * ioctl() operating on a vcpu at any point.
1338 	 */
1339 	if (from_idle) {
1340 		while (vcpu->state != VCPU_IDLE) {
1341 			vcpu->reqidle = 1;
1342 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1343 			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1344 			    "idle requested", vcpu_state2str(vcpu->state));
1345 			cv_wait(&vcpu->state_cv, &vcpu->lock);
1346 		}
1347 	} else {
1348 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1349 		    "vcpu idle state"));
1350 	}
1351 
1352 	if (vcpu->state == VCPU_RUNNING) {
1353 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1354 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1355 	} else {
1356 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1357 		    "vcpu that is not running", vcpu->hostcpu));
1358 	}
1359 
1360 	/*
1361 	 * The following state transitions are allowed:
1362 	 * IDLE -> FROZEN -> IDLE
1363 	 * FROZEN -> RUNNING -> FROZEN
1364 	 * FROZEN -> SLEEPING -> FROZEN
1365 	 */
1366 	switch (vcpu->state) {
1367 	case VCPU_IDLE:
1368 	case VCPU_RUNNING:
1369 	case VCPU_SLEEPING:
1370 		error = (newstate != VCPU_FROZEN);
1371 		break;
1372 	case VCPU_FROZEN:
1373 		error = (newstate == VCPU_FROZEN);
1374 		break;
1375 	default:
1376 		error = 1;
1377 		break;
1378 	}
1379 
1380 	if (error)
1381 		return (EBUSY);
1382 
1383 	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1384 	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1385 
1386 	vcpu->state = newstate;
1387 	if (newstate == VCPU_RUNNING)
1388 		vcpu->hostcpu = curcpu;
1389 	else
1390 		vcpu->hostcpu = NOCPU;
1391 
1392 	if (newstate == VCPU_IDLE) {
1393 		cv_broadcast(&vcpu->state_cv);
1394 	}
1395 
1396 	return (0);
1397 }
1398 
1399 static void
1400 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1401 {
1402 	int error;
1403 
1404 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1405 		panic("Error %d setting state to %d\n", error, newstate);
1406 }
1407 
1408 static void
1409 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1410 {
1411 	int error;
1412 
1413 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1414 		panic("Error %d setting state to %d", error, newstate);
1415 }
1416 
1417 /*
1418  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1419  */
1420 static int
1421 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1422 {
1423 	struct vcpu *vcpu;
1424 	int vcpu_halted, vm_halted;
1425 	bool userspace_exit = false;
1426 
1427 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1428 
1429 	vcpu = &vm->vcpu[vcpuid];
1430 	vcpu_halted = 0;
1431 	vm_halted = 0;
1432 
1433 	vcpu_lock(vcpu);
1434 	while (1) {
1435 		/*
1436 		 * Do a final check for pending interrupts (including NMI and
1437 		 * INIT) before putting this thread to sleep.
1438 		 */
1439 		if (vm_nmi_pending(vm, vcpuid))
1440 			break;
1441 		if (vcpu_run_state_pending(vm, vcpuid))
1442 			break;
1443 		if (!intr_disabled) {
1444 			if (vm_extint_pending(vm, vcpuid) ||
1445 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1446 				break;
1447 			}
1448 		}
1449 
1450 		/*
1451 		 * Also check for software events which would cause a wake-up.
1452 		 * This will set the appropriate exitcode directly, rather than
1453 		 * requiring a trip through VM_RUN().
1454 		 */
1455 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1456 			userspace_exit = true;
1457 			break;
1458 		}
1459 
1460 		/*
1461 		 * Some Linux guests implement "halt" by having all vcpus
1462 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1463 		 * track of the vcpus that have entered this state. When all
1464 		 * vcpus enter the halted state the virtual machine is halted.
1465 		 */
1466 		if (intr_disabled) {
1467 			if (!vcpu_halted && halt_detection_enabled) {
1468 				vcpu_halted = 1;
1469 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1470 			}
1471 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1472 				vm_halted = 1;
1473 				break;
1474 			}
1475 		}
1476 
1477 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1478 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1479 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1480 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1481 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1482 	}
1483 
1484 	if (vcpu_halted)
1485 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1486 
1487 	vcpu_unlock(vcpu);
1488 
1489 	if (vm_halted) {
1490 		(void) vm_suspend(vm, VM_SUSPEND_HALT);
1491 	}
1492 
1493 	return (userspace_exit ? -1 : 0);
1494 }
1495 
1496 static int
1497 vm_handle_paging(struct vm *vm, int vcpuid)
1498 {
1499 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1500 	vm_client_t *vmc = vcpu->vmclient;
1501 	struct vm_exit *vme = &vcpu->exitinfo;
1502 	int rv, ftype;
1503 
1504 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1505 	    __func__, vme->inst_length));
1506 
1507 	ftype = vme->u.paging.fault_type;
1508 	KASSERT(ftype == PROT_READ ||
1509 	    ftype == PROT_WRITE || ftype == PROT_EXEC,
1510 	    ("vm_handle_paging: invalid fault_type %d", ftype));
1511 
1512 	rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);
1513 
1514 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %lx, "
1515 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1516 
1517 	if (rv != 0)
1518 		return (EFAULT);
1519 	return (0);
1520 }
1521 
1522 int
1523 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1524     int rsize)
1525 {
1526 	int err = ESRCH;
1527 
1528 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1529 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1530 
1531 		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
1532 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1533 		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1534 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1535 		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1536 	}
1537 
1538 	return (err);
1539 }
1540 
1541 int
1542 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1543     int wsize)
1544 {
1545 	int err = ESRCH;
1546 
1547 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1548 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1549 
1550 		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
1551 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1552 		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1553 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1554 		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1555 	}
1556 
1557 	return (err);
1558 }
1559 
1560 static int
1561 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1562 {
1563 	struct vie *vie;
1564 	struct vcpu *vcpu;
1565 	struct vm_exit *vme;
1566 	uint64_t inst_addr;
1567 	int error, fault, cs_d;
1568 
1569 	vcpu = &vm->vcpu[vcpuid];
1570 	vme = &vcpu->exitinfo;
1571 	vie = vcpu->vie_ctx;
1572 
1573 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1574 	    __func__, vme->inst_length));
1575 
1576 	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1577 	cs_d = vme->u.mmio_emul.cs_d;
1578 
1579 	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %lx",
1580 	    vme->u.mmio_emul.gpa);
1581 
1582 	/* Fetch the faulting instruction */
1583 	if (vie_needs_fetch(vie)) {
1584 		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1585 		    &fault);
1586 		if (error != 0) {
1587 			return (error);
1588 		} else if (fault) {
1589 			/*
1590 			 * If a fault during instruction fetch was encountered,
1591 			 * it will have asserted that the appropriate exception
1592 			 * be injected at next entry.
1593 			 * No further work is required.
1594 			 */
1595 			return (0);
1596 		}
1597 	}
1598 
1599 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1600 		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %lx",
1601 		    inst_addr);
1602 		/* Dump (unrecognized) instruction bytes in userspace */
1603 		vie_fallback_exitinfo(vie, vme);
1604 		return (-1);
1605 	}
1606 	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1607 	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1608 		/* Decoded GLA does not match GLA from VM exit state */
1609 		vie_fallback_exitinfo(vie, vme);
1610 		return (-1);
1611 	}
1612 
1613 repeat:
1614 	error = vie_emulate_mmio(vie, vm, vcpuid);
1615 	if (error < 0) {
1616 		/*
1617 		 * MMIO not handled by any of the in-kernel-emulated devices, so
1618 		 * make a trip out to userspace for it.
1619 		 */
1620 		vie_exitinfo(vie, vme);
1621 	} else if (error == EAGAIN) {
1622 		/*
1623 		 * Continue emulating the rep-prefixed instruction, which has
1624 		 * not completed its iterations.
1625 		 *
1626 		 * In case this can be emulated in-kernel and has a high
1627 		 * repetition count (causing a tight spin), it should be
1628 		 * deferential to yield conditions.
1629 		 */
1630 		if (!vcpu_should_yield(vm, vcpuid)) {
1631 			goto repeat;
1632 		} else {
1633 			/*
1634 			 * Defer to the contending load by making a trip to
1635 			 * userspace with a no-op (BOGUS) exit reason.
1636 			 */
1637 			vie_reset(vie);
1638 			vme->exitcode = VM_EXITCODE_BOGUS;
1639 			return (-1);
1640 		}
1641 	} else if (error == 0) {
1642 		/* Update %rip now that instruction has been emulated */
1643 		vie_advance_pc(vie, &vcpu->nextrip);
1644 	}
1645 	return (error);
1646 }
1647 
1648 static int
1649 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1650 {
1651 	struct vcpu *vcpu;
1652 	struct vie *vie;
1653 	int err;
1654 
1655 	vcpu = &vm->vcpu[vcpuid];
1656 	vie = vcpu->vie_ctx;
1657 
1658 repeat:
1659 	err = vie_emulate_inout(vie, vm, vcpuid);
1660 
1661 	if (err < 0) {
1662 		/*
1663 		 * In/out not handled by any of the in-kernel-emulated devices,
1664 		 * so make a trip out to userspace for it.
1665 		 */
1666 		vie_exitinfo(vie, vme);
1667 		return (err);
1668 	} else if (err == EAGAIN) {
1669 		/*
1670 		 * Continue emulating the rep-prefixed ins/outs, which has not
1671 		 * completed its iterations.
1672 		 *
1673 		 * In case this can be emulated in-kernel and has a high
1674 		 * repetition count (causing a tight spin), it should be
1675 		 * deferential to yield conditions.
1676 		 */
1677 		if (!vcpu_should_yield(vm, vcpuid)) {
1678 			goto repeat;
1679 		} else {
1680 			/*
1681 			 * Defer to the contending load by making a trip to
1682 			 * userspace with a no-op (BOGUS) exit reason.
1683 			 */
1684 			vie_reset(vie);
1685 			vme->exitcode = VM_EXITCODE_BOGUS;
1686 			return (-1);
1687 		}
1688 	} else if (err != 0) {
1689 		/* Emulation failure.  Bail all the way out to userspace. */
1690 		vme->exitcode = VM_EXITCODE_INST_EMUL;
1691 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1692 		return (-1);
1693 	}
1694 
1695 	vie_advance_pc(vie, &vcpu->nextrip);
1696 	return (0);
1697 }
1698 
1699 static int
1700 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1701 {
1702 	struct vie *vie;
1703 	struct vcpu *vcpu;
1704 	struct vm_exit *vme;
1705 	uint64_t cs_base;
1706 	int error, fault, cs_d;
1707 
1708 	vcpu = &vm->vcpu[vcpuid];
1709 	vme = &vcpu->exitinfo;
1710 	vie = vcpu->vie_ctx;
1711 
1712 	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1713 
1714 	/* Fetch the faulting instruction */
1715 	ASSERT(vie_needs_fetch(vie));
1716 	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1717 	    &fault);
1718 	if (error != 0) {
1719 		return (error);
1720 	} else if (fault) {
1721 		/*
1722 		 * If a fault during instruction fetch was encounted, it will
1723 		 * have asserted that the appropriate exception be injected at
1724 		 * next entry.  No further work is required.
1725 		 */
1726 		return (0);
1727 	}
1728 
1729 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1730 		/* Dump (unrecognized) instruction bytes in userspace */
1731 		vie_fallback_exitinfo(vie, vme);
1732 		return (-1);
1733 	}
1734 
1735 	error = vie_emulate_other(vie, vm, vcpuid);
1736 	if (error != 0) {
1737 		/*
1738 		 * Instruction emulation was unable to complete successfully, so
1739 		 * kick it out to userspace for handling.
1740 		 */
1741 		vie_fallback_exitinfo(vie, vme);
1742 	} else {
1743 		/* Update %rip now that instruction has been emulated */
1744 		vie_advance_pc(vie, &vcpu->nextrip);
1745 	}
1746 	return (error);
1747 }
1748 
1749 static int
1750 vm_handle_suspend(struct vm *vm, int vcpuid)
1751 {
1752 	int i;
1753 	struct vcpu *vcpu;
1754 
1755 	vcpu = &vm->vcpu[vcpuid];
1756 
1757 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1758 
1759 	/*
1760 	 * Wait until all 'active_cpus' have suspended themselves.
1761 	 */
1762 	vcpu_lock(vcpu);
1763 	vcpu_ustate_change(vm, vcpuid, VU_INIT);
1764 	while (1) {
1765 		int rc;
1766 
1767 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1768 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1769 			break;
1770 		}
1771 
1772 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1773 		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
1774 		    TR_CLOCK_TICK);
1775 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1776 
1777 		/*
1778 		 * If the userspace process driving the instance is killed, any
1779 		 * vCPUs yet to be marked suspended (because they are not
1780 		 * VM_RUN-ing in the kernel presently) will never reach that
1781 		 * state.
1782 		 *
1783 		 * To avoid vm_handle_suspend() getting stuck in the kernel
1784 		 * waiting for those vCPUs, offer a bail-out even though it
1785 		 * means returning without all vCPUs in a suspended state.
1786 		 */
1787 		if (rc <= 0) {
1788 			if ((curproc->p_flag & SEXITING) != 0) {
1789 				break;
1790 			}
1791 		}
1792 	}
1793 	vcpu_unlock(vcpu);
1794 
1795 	/*
1796 	 * Wakeup the other sleeping vcpus and return to userspace.
1797 	 */
1798 	for (i = 0; i < vm->maxcpus; i++) {
1799 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1800 			vcpu_notify_event(vm, i);
1801 		}
1802 	}
1803 
1804 	return (-1);
1805 }
1806 
1807 static int
1808 vm_handle_reqidle(struct vm *vm, int vcpuid)
1809 {
1810 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1811 
1812 	vcpu_lock(vcpu);
1813 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1814 	vcpu->reqidle = 0;
1815 	vcpu_unlock(vcpu);
1816 	return (-1);
1817 }
1818 
1819 static int
1820 vm_handle_run_state(struct vm *vm, int vcpuid)
1821 {
1822 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1823 	bool handled = false;
1824 
1825 	vcpu_lock(vcpu);
1826 	while (1) {
1827 		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1828 			vcpu_unlock(vcpu);
1829 			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1830 			vcpu_lock(vcpu);
1831 
1832 			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1833 			vcpu->run_state |= VRS_INIT;
1834 		}
1835 
1836 		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1837 		    (VRS_INIT | VRS_PEND_SIPI)) {
1838 			const uint8_t vector = vcpu->sipi_vector;
1839 
1840 			vcpu_unlock(vcpu);
1841 			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1842 			vcpu_lock(vcpu);
1843 
1844 			vcpu->run_state &= ~VRS_PEND_SIPI;
1845 			vcpu->run_state |= VRS_RUN;
1846 		}
1847 
1848 		/*
1849 		 * If the vCPU is now in the running state, there is no need to
1850 		 * wait for anything prior to re-entry.
1851 		 */
1852 		if ((vcpu->run_state & VRS_RUN) != 0) {
1853 			handled = true;
1854 			break;
1855 		}
1856 
1857 		/*
1858 		 * Also check for software events which would cause a wake-up.
1859 		 * This will set the appropriate exitcode directly, rather than
1860 		 * requiring a trip through VM_RUN().
1861 		 */
1862 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1863 			break;
1864 		}
1865 
1866 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1867 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1868 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1869 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1870 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1871 	}
1872 	vcpu_unlock(vcpu);
1873 
1874 	return (handled ? 0 : -1);
1875 }
1876 
1877 static int
1878 vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
1879 {
1880 	switch (num) {
1881 	case MSR_MTRRcap:
1882 		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
1883 		break;
1884 	case MSR_MTRRdefType:
1885 		*val = mtrr->def_type;
1886 		break;
1887 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1888 		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
1889 		break;
1890 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1891 		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
1892 		break;
1893 	case MSR_MTRR64kBase:
1894 		*val = mtrr->fixed64k;
1895 		break;
1896 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
1897 		uint_t offset = num - MSR_MTRRVarBase;
1898 		if (offset % 2 == 0) {
1899 			*val = mtrr->var[offset / 2].base;
1900 		} else {
1901 			*val = mtrr->var[offset / 2].mask;
1902 		}
1903 		break;
1904 	}
1905 	default:
1906 		return (-1);
1907 	}
1908 
1909 	return (0);
1910 }
1911 
1912 static int
1913 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
1914 {
1915 	switch (num) {
1916 	case MSR_MTRRcap:
1917 		/* MTRRCAP is read only */
1918 		return (-1);
1919 	case MSR_MTRRdefType:
1920 		if (val & ~VMM_MTRR_DEF_MASK) {
1921 			/* generate #GP on writes to reserved fields */
1922 			return (-1);
1923 		}
1924 		mtrr->def_type = val;
1925 		break;
1926 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1927 		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
1928 		break;
1929 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1930 		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
1931 		break;
1932 	case MSR_MTRR64kBase:
1933 		mtrr->fixed64k = val;
1934 		break;
1935 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
1936 		uint_t offset = num - MSR_MTRRVarBase;
1937 		if (offset % 2 == 0) {
1938 			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
1939 				/* generate #GP on writes to reserved fields */
1940 				return (-1);
1941 			}
1942 			mtrr->var[offset / 2].base = val;
1943 		} else {
1944 			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
1945 				/* generate #GP on writes to reserved fields */
1946 				return (-1);
1947 			}
1948 			mtrr->var[offset / 2].mask = val;
1949 		}
1950 		break;
1951 	}
1952 	default:
1953 		return (-1);
1954 	}
1955 
1956 	return (0);
1957 }
1958 
1959 static int
1960 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1961 {
1962 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1963 	const uint32_t code = vme->u.msr.code;
1964 	uint64_t val = 0;
1965 
1966 	switch (code) {
1967 	case MSR_MCG_CAP:
1968 	case MSR_MCG_STATUS:
1969 		val = 0;
1970 		break;
1971 
1972 	case MSR_MTRRcap:
1973 	case MSR_MTRRdefType:
1974 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1975 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1976 	case MSR_MTRR64kBase:
1977 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
1978 		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
1979 			vm_inject_gp(vm, vcpuid);
1980 		break;
1981 
1982 	case MSR_TSC:
1983 		/*
1984 		 * In all likelihood, this should always be handled in guest
1985 		 * context by VMX/SVM rather than taking an exit.  (Both VMX and
1986 		 * SVM pass through read-only access to MSR_TSC to the guest.)
1987 		 *
1988 		 * No physical offset is requested of vcpu_tsc_offset() since
1989 		 * rdtsc_offset() takes care of that instead.
1990 		 */
1991 		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1992 		break;
1993 
1994 	default:
1995 		/*
1996 		 * Anything not handled at this point will be kicked out to
1997 		 * userspace for attempted processing there.
1998 		 */
1999 		return (-1);
2000 	}
2001 
2002 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2003 	    val & 0xffffffff));
2004 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
2005 	    val >> 32));
2006 	return (0);
2007 }
2008 
2009 static int
2010 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
2011 {
2012 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2013 	const uint32_t code = vme->u.msr.code;
2014 	const uint64_t val = vme->u.msr.wval;
2015 
2016 	switch (code) {
2017 	case MSR_MCG_CAP:
2018 	case MSR_MCG_STATUS:
2019 		/* Ignore writes */
2020 		break;
2021 
2022 	case MSR_MTRRcap:
2023 	case MSR_MTRRdefType:
2024 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2025 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2026 	case MSR_MTRR64kBase:
2027 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2028 		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
2029 			vm_inject_gp(vm, vcpuid);
2030 		break;
2031 
2032 	case MSR_TSC:
2033 		/*
2034 		 * The effect of writing the TSC MSR is that a subsequent read
2035 		 * of the TSC would report that value written (plus any time
2036 		 * elapsed between the write and the read).  The guest TSC value
2037 		 * is calculated from a global offset for the guest (which
2038 		 * effectively makes its TSC read 0 at guest boot) and a
2039 		 * per-vCPU offset to handle these writes to the MSR.
2040 		 *
2041 		 * To calculate that per-vCPU offset, we can work backwards from
2042 		 * the guest value at the time of write:
2043 		 *
2044 		 * value = host TSC + VM boot offset + vCPU offset
2045 		 *
2046 		 * so therefore:
2047 		 *
2048 		 * value - host TSC - VM boot offset = vCPU offset
2049 		 */
2050 		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
2051 		break;
2052 
2053 	default:
2054 		/*
2055 		 * Anything not handled at this point will be kicked out to
2056 		 * userspace for attempted processing there.
2057 		 */
2058 		return (-1);
2059 	}
2060 
2061 	return (0);
2062 }
2063 
2064 int
2065 vm_suspend(struct vm *vm, enum vm_suspend_how how)
2066 {
2067 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
2068 		return (EINVAL);
2069 
2070 	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
2071 		return (EALREADY);
2072 	}
2073 
2074 	/*
2075 	 * Notify all active vcpus that they are now suspended.
2076 	 */
2077 	for (uint_t i = 0; i < vm->maxcpus; i++) {
2078 		struct vcpu *vcpu = &vm->vcpu[i];
2079 
2080 		vcpu_lock(vcpu);
2081 		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
2082 			/*
2083 			 * Any vCPUs not actively running or in HLT can be
2084 			 * marked as suspended immediately.
2085 			 */
2086 			if (CPU_ISSET(i, &vm->active_cpus)) {
2087 				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
2088 			}
2089 		} else {
2090 			/*
2091 			 * Those which are running or in HLT will pick up the
2092 			 * suspended state after notification.
2093 			 */
2094 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2095 		}
2096 		vcpu_unlock(vcpu);
2097 	}
2098 	return (0);
2099 }
2100 
2101 void
2102 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
2103 {
2104 	struct vm_exit *vmexit;
2105 
2106 	vmexit = vm_exitinfo(vm, vcpuid);
2107 	vmexit->rip = rip;
2108 	vmexit->inst_length = 0;
2109 	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2110 	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2111 }
2112 
2113 /*
2114  * Some vmm resources, such as the lapic, may have CPU-specific resources
2115  * allocated to them which would benefit from migration onto the host CPU which
2116  * is processing the vcpu state.
2117  */
2118 static void
2119 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2120 {
2121 	/*
2122 	 * Localizing cyclic resources requires acquisition of cpu_lock, and
2123 	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2124 	 */
2125 	VERIFY(curthread->t_preempt == 0);
2126 
2127 	/*
2128 	 * Do not bother with localization if this vCPU is about to return to
2129 	 * the host CPU it was last localized to.
2130 	 */
2131 	if (vcpu->lastloccpu == curcpu)
2132 		return;
2133 
2134 	/*
2135 	 * Localize system-wide resources to the primary boot vCPU.  While any
2136 	 * of the other vCPUs may access them, it keeps the potential interrupt
2137 	 * footprint constrained to CPUs involved with this instance.
2138 	 */
2139 	if (vcpu == &vm->vcpu[0]) {
2140 		vhpet_localize_resources(vm->vhpet);
2141 		vrtc_localize_resources(vm->vrtc);
2142 		vatpit_localize_resources(vm->vatpit);
2143 	}
2144 
2145 	vlapic_localize_resources(vcpu->vlapic);
2146 
2147 	vcpu->lastloccpu = curcpu;
2148 }
2149 
2150 static void
2151 vmm_savectx(void *arg)
2152 {
2153 	vm_thread_ctx_t *vtc = arg;
2154 	struct vm *vm = vtc->vtc_vm;
2155 	const int vcpuid = vtc->vtc_vcpuid;
2156 
2157 	if (ops->vmsavectx != NULL) {
2158 		ops->vmsavectx(vm->cookie, vcpuid);
2159 	}
2160 
2161 	/*
2162 	 * Account for going off-cpu, unless the vCPU is idled, where being
2163 	 * off-cpu is the explicit point.
2164 	 */
2165 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2166 		vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2167 		vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2168 	}
2169 
2170 	/*
2171 	 * If the CPU holds the restored guest FPU state, save it and restore
2172 	 * the host FPU state before this thread goes off-cpu.
2173 	 */
2174 	if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2175 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2176 
2177 		save_guest_fpustate(vcpu);
2178 		vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2179 	}
2180 }
2181 
2182 static void
2183 vmm_restorectx(void *arg)
2184 {
2185 	vm_thread_ctx_t *vtc = arg;
2186 	struct vm *vm = vtc->vtc_vm;
2187 	const int vcpuid = vtc->vtc_vcpuid;
2188 
2189 	/* Complete microstate accounting for vCPU being off-cpu */
2190 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2191 		vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2192 	}
2193 
2194 	/*
2195 	 * When coming back on-cpu, only restore the guest FPU status if the
2196 	 * thread is in a context marked as requiring it.  This should be rare,
2197 	 * occurring only when a future logic error results in a voluntary
2198 	 * sleep during the VMRUN critical section.
2199 	 *
2200 	 * The common case will result in elision of the guest FPU state
2201 	 * restoration, deferring that action until it is clearly necessary
2202 	 * during vm_run.
2203 	 */
2204 	VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2205 	if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2206 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2207 
2208 		restore_guest_fpustate(vcpu);
2209 		vtc->vtc_status |= VTCS_FPU_RESTORED;
2210 	}
2211 
2212 	if (ops->vmrestorectx != NULL) {
2213 		ops->vmrestorectx(vm->cookie, vcpuid);
2214 	}
2215 
2216 }
2217 
2218 static int
2219 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2220     struct vm_exit *vme)
2221 {
2222 	struct vcpu *vcpu;
2223 	struct vie *vie;
2224 	int err;
2225 
2226 	vcpu = &vm->vcpu[vcpuid];
2227 	vie = vcpu->vie_ctx;
2228 	err = 0;
2229 
2230 	switch (entry->cmd) {
2231 	case VEC_DEFAULT:
2232 		return (0);
2233 	case VEC_DISCARD_INSTR:
2234 		vie_reset(vie);
2235 		return (0);
2236 	case VEC_FULFILL_MMIO:
2237 		err = vie_fulfill_mmio(vie, &entry->u.mmio);
2238 		if (err == 0) {
2239 			err = vie_emulate_mmio(vie, vm, vcpuid);
2240 			if (err == 0) {
2241 				vie_advance_pc(vie, &vcpu->nextrip);
2242 			} else if (err < 0) {
2243 				vie_exitinfo(vie, vme);
2244 			} else if (err == EAGAIN) {
2245 				/*
2246 				 * Clear the instruction emulation state in
2247 				 * order to re-enter VM context and continue
2248 				 * this 'rep <instruction>'
2249 				 */
2250 				vie_reset(vie);
2251 				err = 0;
2252 			}
2253 		}
2254 		break;
2255 	case VEC_FULFILL_INOUT:
2256 		err = vie_fulfill_inout(vie, &entry->u.inout);
2257 		if (err == 0) {
2258 			err = vie_emulate_inout(vie, vm, vcpuid);
2259 			if (err == 0) {
2260 				vie_advance_pc(vie, &vcpu->nextrip);
2261 			} else if (err < 0) {
2262 				vie_exitinfo(vie, vme);
2263 			} else if (err == EAGAIN) {
2264 				/*
2265 				 * Clear the instruction emulation state in
2266 				 * order to re-enter VM context and continue
2267 				 * this 'rep ins/outs'
2268 				 */
2269 				vie_reset(vie);
2270 				err = 0;
2271 			}
2272 		}
2273 		break;
2274 	default:
2275 		return (EINVAL);
2276 	}
2277 	return (err);
2278 }
2279 
2280 static int
2281 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2282 {
2283 	struct vie *vie;
2284 
2285 	vie = vm->vcpu[vcpuid].vie_ctx;
2286 
2287 	if (vie_pending(vie)) {
2288 		/*
2289 		 * Userspace has not fulfilled the pending needs of the
2290 		 * instruction emulation, so bail back out.
2291 		 */
2292 		vie_exitinfo(vie, vme);
2293 		return (-1);
2294 	}
2295 
2296 	return (0);
2297 }
2298 
2299 int
2300 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2301 {
2302 	int error;
2303 	struct vcpu *vcpu;
2304 	struct vm_exit *vme;
2305 	bool intr_disabled;
2306 	int affinity_type = CPU_CURRENT;
2307 
2308 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2309 		return (EINVAL);
2310 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2311 		return (EINVAL);
2312 
2313 	vcpu = &vm->vcpu[vcpuid];
2314 	vme = &vcpu->exitinfo;
2315 
2316 	vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2317 
2318 	vcpu->vtc.vtc_status = 0;
2319 	ctxop_attach(curthread, vcpu->ctxop);
2320 
2321 	error = vm_entry_actions(vm, vcpuid, entry, vme);
2322 	if (error != 0) {
2323 		goto exit;
2324 	}
2325 
2326 restart:
2327 	error = vm_loop_checks(vm, vcpuid, vme);
2328 	if (error != 0) {
2329 		goto exit;
2330 	}
2331 
2332 	thread_affinity_set(curthread, affinity_type);
2333 	/*
2334 	 * Resource localization should happen after the CPU affinity for the
2335 	 * thread has been set to ensure that access from restricted contexts,
2336 	 * such as VMX-accelerated APIC operations, can occur without inducing
2337 	 * cyclic cross-calls.
2338 	 *
2339 	 * This must be done prior to disabling kpreempt via critical_enter().
2340 	 */
2341 	vm_localize_resources(vm, vcpu);
2342 	affinity_type = CPU_CURRENT;
2343 	critical_enter();
2344 
2345 	/* Force a trip through update_sregs to reload %fs/%gs and friends */
2346 	PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2347 
2348 	if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2349 		restore_guest_fpustate(vcpu);
2350 		vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED;
2351 	}
2352 	vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2353 
2354 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2355 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
2356 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2357 
2358 	/*
2359 	 * Once clear of the delicate contexts comprising the VM_RUN handler,
2360 	 * thread CPU affinity can be loosened while other processing occurs.
2361 	 */
2362 	vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2363 	thread_affinity_clear(curthread);
2364 	critical_exit();
2365 
2366 	if (error != 0) {
2367 		/* Communicate out any error from VMRUN() above */
2368 		goto exit;
2369 	}
2370 
2371 	vcpu->nextrip = vme->rip + vme->inst_length;
2372 	switch (vme->exitcode) {
2373 	case VM_EXITCODE_REQIDLE:
2374 		error = vm_handle_reqidle(vm, vcpuid);
2375 		break;
2376 	case VM_EXITCODE_RUN_STATE:
2377 		error = vm_handle_run_state(vm, vcpuid);
2378 		break;
2379 	case VM_EXITCODE_SUSPENDED:
2380 		error = vm_handle_suspend(vm, vcpuid);
2381 		break;
2382 	case VM_EXITCODE_IOAPIC_EOI:
2383 		vioapic_process_eoi(vm, vcpuid,
2384 		    vme->u.ioapic_eoi.vector);
2385 		break;
2386 	case VM_EXITCODE_HLT:
2387 		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2388 		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2389 		break;
2390 	case VM_EXITCODE_PAGING:
2391 		error = vm_handle_paging(vm, vcpuid);
2392 		break;
2393 	case VM_EXITCODE_MMIO_EMUL:
2394 		error = vm_handle_mmio_emul(vm, vcpuid);
2395 		break;
2396 	case VM_EXITCODE_INOUT:
2397 		error = vm_handle_inout(vm, vcpuid, vme);
2398 		break;
2399 	case VM_EXITCODE_INST_EMUL:
2400 		error = vm_handle_inst_emul(vm, vcpuid);
2401 		break;
2402 	case VM_EXITCODE_MONITOR:
2403 	case VM_EXITCODE_MWAIT:
2404 	case VM_EXITCODE_VMINSN:
2405 		vm_inject_ud(vm, vcpuid);
2406 		break;
2407 	case VM_EXITCODE_RDMSR:
2408 		error = vm_handle_rdmsr(vm, vcpuid, vme);
2409 		break;
2410 	case VM_EXITCODE_WRMSR:
2411 		error = vm_handle_wrmsr(vm, vcpuid, vme);
2412 		break;
2413 	case VM_EXITCODE_HT:
2414 		affinity_type = CPU_BEST;
2415 		break;
2416 	case VM_EXITCODE_MTRAP:
2417 		VERIFY0(vm_suspend_cpu(vm, vcpuid));
2418 		error = -1;
2419 		break;
2420 	default:
2421 		/* handled in userland */
2422 		error = -1;
2423 		break;
2424 	}
2425 
2426 	if (error == 0) {
2427 		/* VM exit conditions handled in-kernel, continue running */
2428 		goto restart;
2429 	}
2430 
2431 exit:
2432 	kpreempt_disable();
2433 	ctxop_detach(curthread, vcpu->ctxop);
2434 	/* Make sure all of the needed vCPU context state is saved */
2435 	vmm_savectx(&vcpu->vtc);
2436 	kpreempt_enable();
2437 
2438 	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
2439 
2440 	vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2441 	return (error);
2442 }
2443 
2444 int
2445 vm_restart_instruction(void *arg, int vcpuid)
2446 {
2447 	struct vm *vm;
2448 	struct vcpu *vcpu;
2449 	enum vcpu_state state;
2450 	uint64_t rip;
2451 	int error;
2452 
2453 	vm = arg;
2454 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2455 		return (EINVAL);
2456 
2457 	vcpu = &vm->vcpu[vcpuid];
2458 	state = vcpu_get_state(vm, vcpuid, NULL);
2459 	if (state == VCPU_RUNNING) {
2460 		/*
2461 		 * When a vcpu is "running" the next instruction is determined
2462 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2463 		 * Thus setting 'inst_length' to zero will cause the current
2464 		 * instruction to be restarted.
2465 		 */
2466 		vcpu->exitinfo.inst_length = 0;
2467 		VCPU_CTR1(vm, vcpuid, "restarting instruction at %lx by "
2468 		    "setting inst_length to zero", vcpu->exitinfo.rip);
2469 	} else if (state == VCPU_FROZEN) {
2470 		/*
2471 		 * When a vcpu is "frozen" it is outside the critical section
2472 		 * around VMRUN() and 'nextrip' points to the next instruction.
2473 		 * Thus instruction restart is achieved by setting 'nextrip'
2474 		 * to the vcpu's %rip.
2475 		 */
2476 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2477 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2478 		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
2479 		    "nextrip from %lx to %lx", vcpu->nextrip, rip);
2480 		vcpu->nextrip = rip;
2481 	} else {
2482 		panic("%s: invalid state %d", __func__, state);
2483 	}
2484 	return (0);
2485 }
2486 
2487 int
2488 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2489 {
2490 	struct vcpu *vcpu;
2491 
2492 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2493 		return (EINVAL);
2494 
2495 	vcpu = &vm->vcpu[vcpuid];
2496 
2497 	if (VM_INTINFO_PENDING(info)) {
2498 		const uint32_t type = VM_INTINFO_TYPE(info);
2499 		const uint8_t vector = VM_INTINFO_VECTOR(info);
2500 
2501 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2502 			return (EINVAL);
2503 		if (type == VM_INTINFO_HWEXCP && vector >= 32)
2504 			return (EINVAL);
2505 		if (info & VM_INTINFO_MASK_RSVD)
2506 			return (EINVAL);
2507 	} else {
2508 		info = 0;
2509 	}
2510 	vcpu->exit_intinfo = info;
2511 	return (0);
2512 }
2513 
2514 enum exc_class {
2515 	EXC_BENIGN,
2516 	EXC_CONTRIBUTORY,
2517 	EXC_PAGEFAULT
2518 };
2519 
2520 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
2521 
2522 static enum exc_class
2523 exception_class(uint64_t info)
2524 {
2525 	ASSERT(VM_INTINFO_PENDING(info));
2526 
2527 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2528 	switch (VM_INTINFO_TYPE(info)) {
2529 	case VM_INTINFO_HWINTR:
2530 	case VM_INTINFO_SWINTR:
2531 	case VM_INTINFO_NMI:
2532 		return (EXC_BENIGN);
2533 	default:
2534 		/*
2535 		 * Hardware exception.
2536 		 *
2537 		 * SVM and VT-x use identical type values to represent NMI,
2538 		 * hardware interrupt and software interrupt.
2539 		 *
2540 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2541 		 * for exceptions except #BP and #OF. #BP and #OF use a type
2542 		 * value of '5' or '6'. Therefore we don't check for explicit
2543 		 * values of 'type' to classify 'intinfo' into a hardware
2544 		 * exception.
2545 		 */
2546 		break;
2547 	}
2548 
2549 	switch (VM_INTINFO_VECTOR(info)) {
2550 	case IDT_PF:
2551 	case IDT_VE:
2552 		return (EXC_PAGEFAULT);
2553 	case IDT_DE:
2554 	case IDT_TS:
2555 	case IDT_NP:
2556 	case IDT_SS:
2557 	case IDT_GP:
2558 		return (EXC_CONTRIBUTORY);
2559 	default:
2560 		return (EXC_BENIGN);
2561 	}
2562 }
2563 
2564 /*
2565  * Fetch event pending injection into the guest, if one exists.
2566  *
2567  * Returns true if an event is to be injected (which is placed in `retinfo`).
2568  */
2569 bool
2570 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2571 {
2572 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2573 	const uint64_t info1 = vcpu->exit_intinfo;
2574 	vcpu->exit_intinfo = 0;
2575 	const uint64_t info2 = vcpu->exc_pending;
2576 	vcpu->exc_pending = 0;
2577 
2578 	if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) {
2579 		/*
2580 		 * If an exception occurs while attempting to call the
2581 		 * double-fault handler the processor enters shutdown mode
2582 		 * (aka triple fault).
2583 		 */
2584 		if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
2585 		    VM_INTINFO_VECTOR(info1) == IDT_DF) {
2586 			(void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2587 			*retinfo = 0;
2588 			return (false);
2589 		}
2590 		/*
2591 		 * "Conditions for Generating a Double Fault"
2592 		 *  Intel SDM, Vol3, Table 6-5
2593 		 */
2594 		const enum exc_class exc1 = exception_class(info1);
2595 		const enum exc_class exc2 = exception_class(info2);
2596 		if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2597 		    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2598 			/* Convert nested fault into a double fault. */
2599 			*retinfo =
2600 			    VM_INTINFO_VALID |
2601 			    VM_INTINFO_DEL_ERRCODE |
2602 			    VM_INTINFO_HWEXCP |
2603 			    IDT_DF;
2604 		} else {
2605 			/* Handle exceptions serially */
2606 			vcpu->exit_intinfo = info1;
2607 			*retinfo = info2;
2608 		}
2609 		return (true);
2610 	} else if (VM_INTINFO_PENDING(info1)) {
2611 		*retinfo = info1;
2612 		return (true);
2613 	} else if (VM_INTINFO_PENDING(info2)) {
2614 		*retinfo = info2;
2615 		return (true);
2616 	}
2617 
2618 	return (false);
2619 }
2620 
2621 int
2622 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2623 {
2624 	struct vcpu *vcpu;
2625 
2626 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2627 		return (EINVAL);
2628 
2629 	vcpu = &vm->vcpu[vcpuid];
2630 	*info1 = vcpu->exit_intinfo;
2631 	*info2 = vcpu->exc_pending;
2632 	return (0);
2633 }
2634 
2635 int
2636 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
2637     bool errcode_valid, uint32_t errcode, bool restart_instruction)
2638 {
2639 	struct vcpu *vcpu;
2640 	uint64_t regval;
2641 	int error;
2642 
2643 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2644 		return (EINVAL);
2645 
2646 	if (vector >= 32)
2647 		return (EINVAL);
2648 
2649 	/*
2650 	 * NMIs are to be injected via their own specialized path using
2651 	 * vm_inject_nmi().
2652 	 */
2653 	if (vector == IDT_NMI) {
2654 		return (EINVAL);
2655 	}
2656 
2657 	/*
2658 	 * A double fault exception should never be injected directly into
2659 	 * the guest. It is a derived exception that results from specific
2660 	 * combinations of nested faults.
2661 	 */
2662 	if (vector == IDT_DF) {
2663 		return (EINVAL);
2664 	}
2665 
2666 	vcpu = &vm->vcpu[vcpuid];
2667 
2668 	if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
2669 		/* Unable to inject exception due to one already pending */
2670 		return (EBUSY);
2671 	}
2672 
2673 	if (errcode_valid) {
2674 		/*
2675 		 * Exceptions don't deliver an error code in real mode.
2676 		 */
2677 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2678 		VERIFY0(error);
2679 		if ((regval & CR0_PE) == 0) {
2680 			errcode_valid = false;
2681 		}
2682 	}
2683 
2684 	/*
2685 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2686 	 *
2687 	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2688 	 * one instruction or incurs an exception.
2689 	 */
2690 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2691 	VERIFY0(error);
2692 
2693 	if (restart_instruction) {
2694 		VERIFY0(vm_restart_instruction(vm, vcpuid));
2695 	}
2696 
2697 	uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector;
2698 	if (errcode_valid) {
2699 		val |= VM_INTINFO_DEL_ERRCODE;
2700 		val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE;
2701 	}
2702 	vcpu->exc_pending = val;
2703 	return (0);
2704 }
2705 
2706 void
2707 vm_inject_ud(struct vm *vm, int vcpuid)
2708 {
2709 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true));
2710 }
2711 
2712 void
2713 vm_inject_gp(struct vm *vm, int vcpuid)
2714 {
2715 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true));
2716 }
2717 
2718 void
2719 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode)
2720 {
2721 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true));
2722 }
2723 
2724 void
2725 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode)
2726 {
2727 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true));
2728 }
2729 
2730 void
2731 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2)
2732 {
2733 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2));
2734 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true));
2735 }
2736 
2737 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2738 
2739 int
2740 vm_inject_nmi(struct vm *vm, int vcpuid)
2741 {
2742 	struct vcpu *vcpu;
2743 
2744 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2745 		return (EINVAL);
2746 
2747 	vcpu = &vm->vcpu[vcpuid];
2748 
2749 	vcpu->nmi_pending = true;
2750 	vcpu_notify_event(vm, vcpuid);
2751 	return (0);
2752 }
2753 
2754 bool
2755 vm_nmi_pending(struct vm *vm, int vcpuid)
2756 {
2757 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2758 
2759 	return (vcpu->nmi_pending);
2760 }
2761 
2762 void
2763 vm_nmi_clear(struct vm *vm, int vcpuid)
2764 {
2765 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2766 
2767 	ASSERT(vcpu->nmi_pending);
2768 
2769 	vcpu->nmi_pending = false;
2770 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2771 }
2772 
2773 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2774 
2775 int
2776 vm_inject_extint(struct vm *vm, int vcpuid)
2777 {
2778 	struct vcpu *vcpu;
2779 
2780 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2781 		return (EINVAL);
2782 
2783 	vcpu = &vm->vcpu[vcpuid];
2784 
2785 	vcpu->extint_pending = true;
2786 	vcpu_notify_event(vm, vcpuid);
2787 	return (0);
2788 }
2789 
2790 bool
2791 vm_extint_pending(struct vm *vm, int vcpuid)
2792 {
2793 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2794 
2795 	return (vcpu->extint_pending);
2796 }
2797 
2798 void
2799 vm_extint_clear(struct vm *vm, int vcpuid)
2800 {
2801 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2802 
2803 	ASSERT(vcpu->extint_pending);
2804 
2805 	vcpu->extint_pending = false;
2806 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2807 }
2808 
2809 int
2810 vm_inject_init(struct vm *vm, int vcpuid)
2811 {
2812 	struct vcpu *vcpu;
2813 
2814 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2815 		return (EINVAL);
2816 
2817 	vcpu = &vm->vcpu[vcpuid];
2818 	vcpu_lock(vcpu);
2819 	vcpu->run_state |= VRS_PEND_INIT;
2820 	/*
2821 	 * As part of queuing the INIT request, clear any pending SIPI.  It
2822 	 * would not otherwise survive across the reset of the vCPU when it
2823 	 * undergoes the requested INIT.  We would not want it to linger when it
2824 	 * could be mistaken as a subsequent (after the INIT) SIPI request.
2825 	 */
2826 	vcpu->run_state &= ~VRS_PEND_SIPI;
2827 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2828 
2829 	vcpu_unlock(vcpu);
2830 	return (0);
2831 }
2832 
2833 int
2834 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2835 {
2836 	struct vcpu *vcpu;
2837 
2838 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2839 		return (EINVAL);
2840 
2841 	vcpu = &vm->vcpu[vcpuid];
2842 	vcpu_lock(vcpu);
2843 	vcpu->run_state |= VRS_PEND_SIPI;
2844 	vcpu->sipi_vector = vector;
2845 	/* SIPI is only actionable if the CPU is waiting in INIT state */
2846 	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2847 		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2848 	}
2849 	vcpu_unlock(vcpu);
2850 	return (0);
2851 }
2852 
2853 bool
2854 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2855 {
2856 	struct vcpu *vcpu;
2857 
2858 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2859 	vcpu = &vm->vcpu[vcpuid];
2860 
2861 	/* Of interest: vCPU not in running state or with pending INIT */
2862 	return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2863 }
2864 
2865 int
2866 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2867 {
2868 	struct seg_desc desc;
2869 	const enum vm_reg_name clear_regs[] = {
2870 		VM_REG_GUEST_CR2,
2871 		VM_REG_GUEST_CR3,
2872 		VM_REG_GUEST_CR4,
2873 		VM_REG_GUEST_RAX,
2874 		VM_REG_GUEST_RBX,
2875 		VM_REG_GUEST_RCX,
2876 		VM_REG_GUEST_RSI,
2877 		VM_REG_GUEST_RDI,
2878 		VM_REG_GUEST_RBP,
2879 		VM_REG_GUEST_RSP,
2880 		VM_REG_GUEST_R8,
2881 		VM_REG_GUEST_R9,
2882 		VM_REG_GUEST_R10,
2883 		VM_REG_GUEST_R11,
2884 		VM_REG_GUEST_R12,
2885 		VM_REG_GUEST_R13,
2886 		VM_REG_GUEST_R14,
2887 		VM_REG_GUEST_R15,
2888 		VM_REG_GUEST_DR0,
2889 		VM_REG_GUEST_DR1,
2890 		VM_REG_GUEST_DR2,
2891 		VM_REG_GUEST_DR3,
2892 		VM_REG_GUEST_EFER,
2893 	};
2894 	const enum vm_reg_name data_segs[] = {
2895 		VM_REG_GUEST_SS,
2896 		VM_REG_GUEST_DS,
2897 		VM_REG_GUEST_ES,
2898 		VM_REG_GUEST_FS,
2899 		VM_REG_GUEST_GS,
2900 	};
2901 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2902 
2903 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2904 		return (EINVAL);
2905 
2906 	for (uint_t i = 0; i < nitems(clear_regs); i++) {
2907 		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2908 	}
2909 
2910 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2911 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2912 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2913 
2914 	/*
2915 	 * The prescribed contents of %rdx differ slightly between the Intel and
2916 	 * AMD architectural definitions.  The former expects the Extended Model
2917 	 * in bits 16-19 where the latter expects all the Family, Model, and
2918 	 * Stepping be there.  Common boot ROMs appear to disregard this
2919 	 * anyways, so we stick with a compromise value similar to what is
2920 	 * spelled out in the Intel SDM.
2921 	 */
2922 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2923 
2924 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2925 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2926 
2927 	/* CS: Present, R/W, Accessed */
2928 	desc.access = 0x0093;
2929 	desc.base = 0xffff0000;
2930 	desc.limit = 0xffff;
2931 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2932 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2933 
2934 	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2935 	desc.access = 0x0093;
2936 	desc.base = 0;
2937 	desc.limit = 0xffff;
2938 	for (uint_t i = 0; i < nitems(data_segs); i++) {
2939 		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2940 		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2941 	}
2942 
2943 	/* GDTR, IDTR */
2944 	desc.base = 0;
2945 	desc.limit = 0xffff;
2946 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2947 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2948 
2949 	/* LDTR: Present, LDT */
2950 	desc.access = 0x0082;
2951 	desc.base = 0;
2952 	desc.limit = 0xffff;
2953 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2954 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2955 
2956 	/* TR: Present, 32-bit TSS */
2957 	desc.access = 0x008b;
2958 	desc.base = 0;
2959 	desc.limit = 0xffff;
2960 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2961 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2962 
2963 	vlapic_reset(vm_lapic(vm, vcpuid));
2964 
2965 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2966 
2967 	vcpu->exit_intinfo = 0;
2968 	vcpu->exc_pending = 0;
2969 	vcpu->nmi_pending = false;
2970 	vcpu->extint_pending = 0;
2971 
2972 	/*
2973 	 * A CPU reset caused by power-on or system reset clears more state than
2974 	 * one which is trigged from an INIT IPI.
2975 	 */
2976 	if (!init_only) {
2977 		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2978 		(void) hma_fpu_init(vcpu->guestfpu);
2979 
2980 		/* XXX: clear MSRs and other pieces */
2981 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
2982 	}
2983 
2984 	return (0);
2985 }
2986 
2987 static int
2988 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2989 {
2990 	struct seg_desc desc;
2991 
2992 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2993 		return (EINVAL);
2994 
2995 	/* CS: Present, R/W, Accessed */
2996 	desc.access = 0x0093;
2997 	desc.base = (uint64_t)vector << 12;
2998 	desc.limit = 0xffff;
2999 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
3000 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
3001 	    (uint64_t)vector << 8));
3002 
3003 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
3004 
3005 	return (0);
3006 }
3007 
3008 int
3009 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3010 {
3011 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3012 		return (EINVAL);
3013 
3014 	if (type < 0 || type >= VM_CAP_MAX)
3015 		return (EINVAL);
3016 
3017 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
3018 }
3019 
3020 int
3021 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3022 {
3023 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3024 		return (EINVAL);
3025 
3026 	if (type < 0 || type >= VM_CAP_MAX)
3027 		return (EINVAL);
3028 
3029 	return (VMSETCAP(vm->cookie, vcpu, type, val));
3030 }
3031 
3032 struct vlapic *
3033 vm_lapic(struct vm *vm, int cpu)
3034 {
3035 	return (vm->vcpu[cpu].vlapic);
3036 }
3037 
3038 struct vioapic *
3039 vm_ioapic(struct vm *vm)
3040 {
3041 
3042 	return (vm->vioapic);
3043 }
3044 
3045 struct vhpet *
3046 vm_hpet(struct vm *vm)
3047 {
3048 
3049 	return (vm->vhpet);
3050 }
3051 
3052 void *
3053 vm_iommu_domain(struct vm *vm)
3054 {
3055 
3056 	return (vm->iommu);
3057 }
3058 
3059 int
3060 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3061     bool from_idle)
3062 {
3063 	int error;
3064 	struct vcpu *vcpu;
3065 
3066 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3067 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3068 
3069 	vcpu = &vm->vcpu[vcpuid];
3070 
3071 	vcpu_lock(vcpu);
3072 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3073 	vcpu_unlock(vcpu);
3074 
3075 	return (error);
3076 }
3077 
3078 enum vcpu_state
3079 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3080 {
3081 	struct vcpu *vcpu;
3082 	enum vcpu_state state;
3083 
3084 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3085 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3086 
3087 	vcpu = &vm->vcpu[vcpuid];
3088 
3089 	vcpu_lock(vcpu);
3090 	state = vcpu->state;
3091 	if (hostcpu != NULL)
3092 		*hostcpu = vcpu->hostcpu;
3093 	vcpu_unlock(vcpu);
3094 
3095 	return (state);
3096 }
3097 
3098 uint64_t
3099 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3100 {
3101 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3102 
3103 	uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3104 
3105 	if (phys_adj) {
3106 		/* Include any offset for the current physical CPU too */
3107 		extern hrtime_t tsc_gethrtime_tick_delta(void);
3108 		vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3109 	}
3110 
3111 	return (vcpu_off);
3112 }
3113 
3114 int
3115 vm_activate_cpu(struct vm *vm, int vcpuid)
3116 {
3117 
3118 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3119 		return (EINVAL);
3120 
3121 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
3122 		return (EBUSY);
3123 
3124 	if (vm->suspend != 0) {
3125 		return (EBUSY);
3126 	}
3127 
3128 	VCPU_CTR0(vm, vcpuid, "activated");
3129 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3130 
3131 	/*
3132 	 * It is possible that this vCPU was undergoing activation at the same
3133 	 * time that the VM was being suspended.  If that happens to be the
3134 	 * case, it should reflect the suspended state immediately.
3135 	 */
3136 	if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) {
3137 		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
3138 	}
3139 
3140 	return (0);
3141 }
3142 
3143 int
3144 vm_suspend_cpu(struct vm *vm, int vcpuid)
3145 {
3146 	int i;
3147 
3148 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3149 		return (EINVAL);
3150 
3151 	if (vcpuid == -1) {
3152 		vm->debug_cpus = vm->active_cpus;
3153 		for (i = 0; i < vm->maxcpus; i++) {
3154 			if (CPU_ISSET(i, &vm->active_cpus))
3155 				vcpu_notify_event(vm, i);
3156 		}
3157 	} else {
3158 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3159 			return (EINVAL);
3160 
3161 		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3162 		vcpu_notify_event(vm, vcpuid);
3163 	}
3164 	return (0);
3165 }
3166 
3167 int
3168 vm_resume_cpu(struct vm *vm, int vcpuid)
3169 {
3170 
3171 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3172 		return (EINVAL);
3173 
3174 	if (vcpuid == -1) {
3175 		CPU_ZERO(&vm->debug_cpus);
3176 	} else {
3177 		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3178 			return (EINVAL);
3179 
3180 		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3181 	}
3182 	return (0);
3183 }
3184 
3185 static bool
3186 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3187     uint64_t entry_rip)
3188 {
3189 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3190 	struct vm_exit *vme = &vcpu->exitinfo;
3191 	bool bail = false;
3192 
3193 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3194 
3195 	if (vm->suspend) {
3196 		if (on_entry) {
3197 			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3198 			    vm->suspend < VM_SUSPEND_LAST);
3199 
3200 			vme->exitcode = VM_EXITCODE_SUSPENDED;
3201 			vme->u.suspended.how = vm->suspend;
3202 		} else {
3203 			/*
3204 			 * Handling VM suspend is complicated, so if that
3205 			 * condition is detected outside of VM-entry itself,
3206 			 * just emit a BOGUS exitcode so we take a lap to pick
3207 			 * up the event during an entry and are directed into
3208 			 * the vm_handle_suspend() logic.
3209 			 */
3210 			vme->exitcode = VM_EXITCODE_BOGUS;
3211 		}
3212 		bail = true;
3213 	}
3214 	if (vcpu->reqidle) {
3215 		vme->exitcode = VM_EXITCODE_REQIDLE;
3216 		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3217 
3218 		if (!on_entry) {
3219 			/*
3220 			 * A reqidle request detected outside of VM-entry can be
3221 			 * handled directly by clearing the request (and taking
3222 			 * a lap to userspace).
3223 			 */
3224 			vcpu_assert_locked(vcpu);
3225 			vcpu->reqidle = 0;
3226 		}
3227 		bail = true;
3228 	}
3229 	if (vcpu_should_yield(vm, vcpuid)) {
3230 		vme->exitcode = VM_EXITCODE_BOGUS;
3231 		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3232 		bail = true;
3233 	}
3234 	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3235 		vme->exitcode = VM_EXITCODE_DEBUG;
3236 		bail = true;
3237 	}
3238 
3239 	if (bail) {
3240 		if (on_entry) {
3241 			/*
3242 			 * If bailing out during VM-entry, the current %rip must
3243 			 * be recorded in the exitinfo.
3244 			 */
3245 			vme->rip = entry_rip;
3246 		}
3247 		vme->inst_length = 0;
3248 	}
3249 	return (bail);
3250 }
3251 
3252 static bool
3253 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3254 {
3255 	/*
3256 	 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or
3257 	 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3258 	 * structure, and we would only modify the exitcode.
3259 	 */
3260 	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3261 }
3262 
3263 bool
3264 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3265 {
3266 	/*
3267 	 * Bail-out checks done as part of VM entry require an updated %rip to
3268 	 * populate the vm_exit struct if any of the conditions of interest are
3269 	 * matched in the check.
3270 	 */
3271 	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3272 }
3273 
3274 cpuset_t
3275 vm_active_cpus(struct vm *vm)
3276 {
3277 
3278 	return (vm->active_cpus);
3279 }
3280 
3281 cpuset_t
3282 vm_debug_cpus(struct vm *vm)
3283 {
3284 
3285 	return (vm->debug_cpus);
3286 }
3287 
3288 cpuset_t
3289 vm_suspended_cpus(struct vm *vm)
3290 {
3291 
3292 	return (vm->suspended_cpus);
3293 }
3294 
3295 void *
3296 vcpu_stats(struct vm *vm, int vcpuid)
3297 {
3298 
3299 	return (vm->vcpu[vcpuid].stats);
3300 }
3301 
3302 int
3303 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3304 {
3305 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3306 		return (EINVAL);
3307 
3308 	*state = vm->vcpu[vcpuid].x2apic_state;
3309 
3310 	return (0);
3311 }
3312 
3313 int
3314 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3315 {
3316 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3317 		return (EINVAL);
3318 
3319 	if (state >= X2APIC_STATE_LAST)
3320 		return (EINVAL);
3321 
3322 	vm->vcpu[vcpuid].x2apic_state = state;
3323 
3324 	vlapic_set_x2apic_state(vm, vcpuid, state);
3325 
3326 	return (0);
3327 }
3328 
3329 /*
3330  * This function is called to ensure that a vcpu "sees" a pending event
3331  * as soon as possible:
3332  * - If the vcpu thread is sleeping then it is woken up.
3333  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3334  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3335  */
3336 static void
3337 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3338 {
3339 	int hostcpu;
3340 
3341 	ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT);
3342 
3343 	hostcpu = vcpu->hostcpu;
3344 	if (vcpu->state == VCPU_RUNNING) {
3345 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3346 		if (hostcpu != curcpu) {
3347 			if (ntype == VCPU_NOTIFY_APIC) {
3348 				vlapic_post_intr(vcpu->vlapic, hostcpu);
3349 			} else {
3350 				poke_cpu(hostcpu);
3351 			}
3352 		} else {
3353 			/*
3354 			 * If the 'vcpu' is running on 'curcpu' then it must
3355 			 * be sending a notification to itself (e.g. SELF_IPI).
3356 			 * The pending event will be picked up when the vcpu
3357 			 * transitions back to guest context.
3358 			 */
3359 		}
3360 	} else {
3361 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3362 		    "with hostcpu %d", vcpu->state, hostcpu));
3363 		if (vcpu->state == VCPU_SLEEPING) {
3364 			cv_signal(&vcpu->vcpu_cv);
3365 		}
3366 	}
3367 }
3368 
3369 void
3370 vcpu_notify_event(struct vm *vm, int vcpuid)
3371 {
3372 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3373 
3374 	vcpu_lock(vcpu);
3375 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3376 	vcpu_unlock(vcpu);
3377 }
3378 
3379 void
3380 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3381 {
3382 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3383 
3384 	if (ntype == VCPU_NOTIFY_NONE) {
3385 		return;
3386 	}
3387 
3388 	vcpu_lock(vcpu);
3389 	vcpu_notify_event_locked(vcpu, ntype);
3390 	vcpu_unlock(vcpu);
3391 }
3392 
3393 void
3394 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3395 {
3396 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3397 	hrtime_t now = gethrtime();
3398 
3399 	ASSERT3U(ustate, !=, vcpu->ustate);
3400 	ASSERT3S(ustate, <, VU_MAX);
3401 	ASSERT3S(ustate, >=, VU_INIT);
3402 
3403 	hrtime_t delta = now - vcpu->ustate_when;
3404 	vcpu->ustate_total[vcpu->ustate] += delta;
3405 
3406 	membar_producer();
3407 
3408 	vcpu->ustate_when = now;
3409 	vcpu->ustate = ustate;
3410 }
3411 
3412 struct vmspace *
3413 vm_get_vmspace(struct vm *vm)
3414 {
3415 
3416 	return (vm->vmspace);
3417 }
3418 
3419 struct vm_client *
3420 vm_get_vmclient(struct vm *vm, int vcpuid)
3421 {
3422 	return (vm->vcpu[vcpuid].vmclient);
3423 }
3424 
3425 int
3426 vm_apicid2vcpuid(struct vm *vm, int apicid)
3427 {
3428 	/*
3429 	 * XXX apic id is assumed to be numerically identical to vcpu id
3430 	 */
3431 	return (apicid);
3432 }
3433 
3434 struct vatpic *
3435 vm_atpic(struct vm *vm)
3436 {
3437 	return (vm->vatpic);
3438 }
3439 
3440 struct vatpit *
3441 vm_atpit(struct vm *vm)
3442 {
3443 	return (vm->vatpit);
3444 }
3445 
3446 struct vpmtmr *
3447 vm_pmtmr(struct vm *vm)
3448 {
3449 
3450 	return (vm->vpmtmr);
3451 }
3452 
3453 struct vrtc *
3454 vm_rtc(struct vm *vm)
3455 {
3456 
3457 	return (vm->vrtc);
3458 }
3459 
3460 enum vm_reg_name
3461 vm_segment_name(int seg)
3462 {
3463 	static enum vm_reg_name seg_names[] = {
3464 		VM_REG_GUEST_ES,
3465 		VM_REG_GUEST_CS,
3466 		VM_REG_GUEST_SS,
3467 		VM_REG_GUEST_DS,
3468 		VM_REG_GUEST_FS,
3469 		VM_REG_GUEST_GS
3470 	};
3471 
3472 	KASSERT(seg >= 0 && seg < nitems(seg_names),
3473 	    ("%s: invalid segment encoding %d", __func__, seg));
3474 	return (seg_names[seg]);
3475 }
3476 
3477 void
3478 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3479     uint_t num_copyinfo)
3480 {
3481 	for (uint_t idx = 0; idx < num_copyinfo; idx++) {
3482 		if (copyinfo[idx].cookie != NULL) {
3483 			(void) vmp_release((vm_page_t *)copyinfo[idx].cookie);
3484 		}
3485 	}
3486 	bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3487 }
3488 
3489 int
3490 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3491     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3492     uint_t num_copyinfo, int *fault)
3493 {
3494 	uint_t idx, nused;
3495 	size_t n, off, remaining;
3496 	vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);
3497 
3498 	bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3499 
3500 	nused = 0;
3501 	remaining = len;
3502 	while (remaining > 0) {
3503 		uint64_t gpa;
3504 		int error;
3505 
3506 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3507 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3508 		if (error || *fault)
3509 			return (error);
3510 		off = gpa & PAGEOFFSET;
3511 		n = min(remaining, PAGESIZE - off);
3512 		copyinfo[nused].gpa = gpa;
3513 		copyinfo[nused].len = n;
3514 		remaining -= n;
3515 		gla += n;
3516 		nused++;
3517 	}
3518 
3519 	for (idx = 0; idx < nused; idx++) {
3520 		vm_page_t *vmp;
3521 		caddr_t hva;
3522 
3523 		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
3524 		if (vmp == NULL) {
3525 			break;
3526 		}
3527 		if ((prot & PROT_WRITE) != 0) {
3528 			hva = (caddr_t)vmp_get_writable(vmp);
3529 		} else {
3530 			hva = (caddr_t)vmp_get_readable(vmp);
3531 		}
3532 		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
3533 		copyinfo[idx].cookie = vmp;
3534 		copyinfo[idx].prot = prot;
3535 	}
3536 
3537 	if (idx != nused) {
3538 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3539 		return (EFAULT);
3540 	} else {
3541 		*fault = 0;
3542 		return (0);
3543 	}
3544 }
3545 
3546 void
3547 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3548     size_t len)
3549 {
3550 	char *dst;
3551 	int idx;
3552 
3553 	dst = kaddr;
3554 	idx = 0;
3555 	while (len > 0) {
3556 		ASSERT(copyinfo[idx].prot & PROT_READ);
3557 
3558 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3559 		len -= copyinfo[idx].len;
3560 		dst += copyinfo[idx].len;
3561 		idx++;
3562 	}
3563 }
3564 
3565 void
3566 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3567     struct vm_copyinfo *copyinfo, size_t len)
3568 {
3569 	const char *src;
3570 	int idx;
3571 
3572 	src = kaddr;
3573 	idx = 0;
3574 	while (len > 0) {
3575 		ASSERT(copyinfo[idx].prot & PROT_WRITE);
3576 
3577 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3578 		len -= copyinfo[idx].len;
3579 		src += copyinfo[idx].len;
3580 		idx++;
3581 	}
3582 }
3583 
3584 /*
3585  * Return the amount of in-use and wired memory for the VM. Since
3586  * these are global stats, only return the values with for vCPU 0
3587  */
3588 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3589 
3590 static void
3591 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3592 {
3593 	if (vcpu == 0) {
3594 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3595 		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3596 	}
3597 }
3598 
3599 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3600 
3601 int
3602 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3603     uint8_t bytes, uint32_t *val)
3604 {
3605 	return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3606 }
3607 
3608 /*
3609  * bhyve-internal interfaces to attach or detach IO port handlers.
3610  * Must be called with VM write lock held for safety.
3611  */
3612 int
3613 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3614     void **cookie)
3615 {
3616 	int err;
3617 	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3618 	if (err == 0) {
3619 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3620 	}
3621 	return (err);
3622 }
3623 int
3624 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3625     void **old_arg)
3626 {
3627 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3628 	int err;
3629 
3630 	err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3631 	if (err == 0) {
3632 		*cookie = NULL;
3633 	}
3634 	return (err);
3635 }
3636 
3637 /*
3638  * External driver interfaces to attach or detach IO port handlers.
3639  * Must be called with VM write lock held for safety.
3640  */
3641 int
3642 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3643     void *arg, void **cookie)
3644 {
3645 	int err;
3646 
3647 	if (port == 0) {
3648 		return (EINVAL);
3649 	}
3650 
3651 	err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3652 	if (err == 0) {
3653 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3654 	}
3655 	return (err);
3656 }
3657 void
3658 vm_ioport_unhook(struct vm *vm, void **cookie)
3659 {
3660 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3661 	ioport_handler_t old_func;
3662 	void *old_arg;
3663 	int err;
3664 
3665 	err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3666 
3667 	/* ioport-hook-using drivers are expected to be well-behaved */
3668 	VERIFY0(err);
3669 	VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3670 
3671 	*cookie = NULL;
3672 }
3673 
3674 int
3675 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3676 {
3677 	struct vm *vm = ksp->ks_private;
3678 	vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3679 	const int vcpuid = vvk->vvk_vcpu.value.ui32;
3680 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3681 
3682 	ASSERT3U(vcpuid, <, VM_MAXCPU);
3683 
3684 	vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3685 	vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3686 	vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3687 	vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3688 	vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3689 	vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3690 
3691 	return (0);
3692 }
3693