xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 /*
31  * This file and its contents are supplied under the terms of the
32  * Common Development and Distribution License ("CDDL"), version 1.0.
33  * You may only use this file in accordance with the terms of version
34  * 1.0 of the CDDL.
35  *
36  * A full copy of the text of the CDDL should have accompanied this
37  * source.  A copy of the CDDL is also available via the Internet at
38  * http://www.illumos.org/license/CDDL.
39  *
40  * Copyright 2015 Pluribus Networks Inc.
41  * Copyright 2018 Joyent, Inc.
42  * Copyright 2022 Oxide Computer Company
43  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44  */
45 
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/module.h>
53 #include <sys/sysctl.h>
54 #include <sys/kmem.h>
55 #include <sys/pcpu.h>
56 #include <sys/mutex.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/sched.h>
60 #include <sys/systm.h>
61 #include <sys/sunddi.h>
62 #include <sys/hma.h>
63 
64 #include <machine/md_var.h>
65 #include <x86/psl.h>
66 #include <x86/apicreg.h>
67 
68 #include <machine/specialreg.h>
69 #include <machine/vmm.h>
70 #include <machine/vmm_dev.h>
71 #include <machine/vmparam.h>
72 #include <sys/vmm_instruction_emul.h>
73 #include <sys/vmm_vm.h>
74 #include <sys/vmm_gpt.h>
75 #include <sys/vmm_data.h>
76 
77 #include "vmm_ioport.h"
78 #include "vmm_host.h"
79 #include "vmm_util.h"
80 #include "vatpic.h"
81 #include "vatpit.h"
82 #include "vhpet.h"
83 #include "vioapic.h"
84 #include "vlapic.h"
85 #include "vpmtmr.h"
86 #include "vrtc.h"
87 #include "vmm_stat.h"
88 #include "vmm_lapic.h"
89 
90 #include "io/ppt.h"
91 #include "io/iommu.h"
92 
93 struct vlapic;
94 
95 /* Flags for vtc_status */
96 #define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
97 #define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */
98 
99 typedef struct vm_thread_ctx {
100 	struct vm	*vtc_vm;
101 	int		vtc_vcpuid;
102 	uint_t		vtc_status;
103 	enum vcpu_ustate vtc_ustate;
104 } vm_thread_ctx_t;
105 
106 #define	VMM_MTRR_VAR_MAX 10
107 #define	VMM_MTRR_DEF_MASK \
108 	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
109 #define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
110 #define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
111 struct vm_mtrr {
112 	uint64_t def_type;
113 	uint64_t fixed4k[8];
114 	uint64_t fixed16k[2];
115 	uint64_t fixed64k;
116 	struct {
117 		uint64_t base;
118 		uint64_t mask;
119 	} var[VMM_MTRR_VAR_MAX];
120 };
121 
122 /*
123  * Initialization:
124  * (a) allocated when vcpu is created
125  * (i) initialized when vcpu is created and when it is reinitialized
126  * (o) initialized the first time the vcpu is created
127  * (x) initialized before use
128  */
129 struct vcpu {
130 	/* (o) protects state, run_state, hostcpu, sipi_vector */
131 	kmutex_t	lock;
132 
133 	enum vcpu_state	state;		/* (o) vcpu state */
134 	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
135 	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
136 	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
137 	int		hostcpu;	/* (o) vcpu's current host cpu */
138 	int		lastloccpu;	/* (o) last host cpu localized to */
139 	int		reqidle;	/* (i) request vcpu to idle */
140 	struct vlapic	*vlapic;	/* (i) APIC device model */
141 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
142 	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
143 	uint64_t	exc_pending;	/* (i) exception pending */
144 	bool		nmi_pending;	/* (i) NMI pending */
145 	bool		extint_pending;	/* (i) INTR pending */
146 
147 	uint8_t		sipi_vector;	/* (i) SIPI vector */
148 	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
149 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
150 	void		*stats;		/* (a,i) statistics */
151 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
152 	uint64_t	nextrip;	/* (x) next instruction to execute */
153 	struct vie	*vie_ctx;	/* (x) instruction emulation context */
154 	vm_client_t	*vmclient;	/* (a) VM-system client */
155 	uint64_t	tsc_offset;	/* (x) offset from host TSC */
156 	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
157 	vcpu_cpuid_config_t cpuid_cfg;	/* (x) cpuid configuration */
158 
159 	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
160 	hrtime_t	ustate_when;	/* (i) time of last ustate change */
161 	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
162 	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
163 	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
164 };
165 
166 #define	vcpu_lock(v)		mutex_enter(&((v)->lock))
167 #define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
168 #define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))
169 
170 struct mem_seg {
171 	size_t	len;
172 	bool	sysmem;
173 	vm_object_t *object;
174 };
175 #define	VM_MAX_MEMSEGS	5
176 
177 struct mem_map {
178 	vm_paddr_t	gpa;
179 	size_t		len;
180 	vm_ooffset_t	segoff;
181 	int		segid;
182 	int		prot;
183 	int		flags;
184 };
185 #define	VM_MAX_MEMMAPS	8
186 
187 /*
188  * Initialization:
189  * (o) initialized the first time the VM is created
190  * (i) initialized when VM is created and when it is reinitialized
191  * (x) initialized before use
192  */
193 struct vm {
194 	void		*cookie;		/* (i) cpu-specific data */
195 	void		*iommu;			/* (x) iommu-specific data */
196 	struct vhpet	*vhpet;			/* (i) virtual HPET */
197 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
198 	struct vatpic	*vatpic;		/* (i) virtual atpic */
199 	struct vatpit	*vatpit;		/* (i) virtual atpit */
200 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
201 	struct vrtc	*vrtc;			/* (o) virtual RTC */
202 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
203 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
204 	int		suspend;		/* (i) stop VM execution */
205 	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
206 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
207 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
208 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
209 	struct vmspace	*vmspace;		/* (o) guest's address space */
210 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
211 	/* The following describe the vm cpu topology */
212 	uint16_t	sockets;		/* (o) num of sockets */
213 	uint16_t	cores;			/* (o) num of cores/socket */
214 	uint16_t	threads;		/* (o) num of threads/core */
215 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
216 
217 	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
218 	hrtime_t	boot_hrtime;		/* (i) hrtime at VM boot */
219 
220 	struct ioport_config ioports;		/* (o) ioport handling */
221 
222 	bool		mem_transient;		/* (o) alloc transient memory */
223 	bool		is_paused;		/* (i) instance is paused */
224 };
225 
226 static int vmm_initialized;
227 
228 
229 static void
230 nullop_panic(void)
231 {
232 	panic("null vmm operation call");
233 }
234 
235 /* Do not allow use of an un-set `ops` to do anything but panic */
236 static struct vmm_ops vmm_ops_null = {
237 	.init		= (vmm_init_func_t)nullop_panic,
238 	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
239 	.resume		= (vmm_resume_func_t)nullop_panic,
240 	.vminit		= (vmi_init_func_t)nullop_panic,
241 	.vmrun		= (vmi_run_func_t)nullop_panic,
242 	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
243 	.vmgetreg	= (vmi_get_register_t)nullop_panic,
244 	.vmsetreg	= (vmi_set_register_t)nullop_panic,
245 	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
246 	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
247 	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
248 	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
249 	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
250 	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
251 	.vmsavectx	= (vmi_savectx)nullop_panic,
252 	.vmrestorectx	= (vmi_restorectx)nullop_panic,
253 	.vmgetmsr	= (vmi_get_msr_t)nullop_panic,
254 	.vmsetmsr	= (vmi_set_msr_t)nullop_panic,
255 };
256 
257 static struct vmm_ops *ops = &vmm_ops_null;
258 static vmm_pte_ops_t *pte_ops = NULL;
259 
260 #define	VMM_INIT()			((*ops->init)())
261 #define	VMM_CLEANUP()			((*ops->cleanup)())
262 #define	VMM_RESUME()			((*ops->resume)())
263 
264 #define	VMINIT(vm)		((*ops->vminit)(vm))
265 #define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
266 #define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))
267 
268 #define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
269 #define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
270 #define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
271 #define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
272 #define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
273 #define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
274 #define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
275 #define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))
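
/*
 * Illustrative sketch (not part of the original source): until vmm_init()
 * installs a real backend, `ops` points at vmm_ops_null, so a premature
 * dispatch through these macros panics instead of dereferencing garbage:
 *
 *	void *cookie = VMINIT(vm);	panics via nullop_panic() unless
 *					vmm_init() has already run
 */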
276 
277 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
278 #define	fpu_stop_emulating()	clts()
279 
280 SDT_PROVIDER_DEFINE(vmm);
281 
282 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
283     NULL);
284 
285 /*
286  * Halt the guest if all vcpus are executing a HLT instruction with
287  * interrupts disabled.
288  */
289 int halt_detection_enabled = 1;
290 
291 /* Trap into hypervisor on all guest exceptions and reflect them back */
292 int trace_guest_exceptions;
293 
294 /* Trap WBINVD and ignore it */
295 int trap_wbinvd = 1;
296 
297 static void vm_free_memmap(struct vm *vm, int ident);
298 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
299 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
300 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
301 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
302 
303 static void vmm_savectx(void *);
304 static void vmm_restorectx(void *);
305 static const struct ctxop_template vmm_ctxop_tpl = {
306 	.ct_rev		= CTXOP_TPL_REV,
307 	.ct_save	= vmm_savectx,
308 	.ct_restore	= vmm_restorectx,
309 };
310 
311 #ifdef KTR
312 static const char *
313 vcpu_state2str(enum vcpu_state state)
314 {
315 
316 	switch (state) {
317 	case VCPU_IDLE:
318 		return ("idle");
319 	case VCPU_FROZEN:
320 		return ("frozen");
321 	case VCPU_RUNNING:
322 		return ("running");
323 	case VCPU_SLEEPING:
324 		return ("sleeping");
325 	default:
326 		return ("unknown");
327 	}
328 }
329 #endif
330 
331 static void
332 vcpu_cleanup(struct vm *vm, int i, bool destroy)
333 {
334 	struct vcpu *vcpu = &vm->vcpu[i];
335 
336 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
337 	if (destroy) {
338 		vmm_stat_free(vcpu->stats);
339 
340 		vcpu_cpuid_cleanup(&vcpu->cpuid_cfg);
341 
342 		hma_fpu_free(vcpu->guestfpu);
343 		vcpu->guestfpu = NULL;
344 
345 		vie_free(vcpu->vie_ctx);
346 		vcpu->vie_ctx = NULL;
347 
348 		vmc_destroy(vcpu->vmclient);
349 		vcpu->vmclient = NULL;
350 
351 		ctxop_free(vcpu->ctxop);
352 		mutex_destroy(&vcpu->lock);
353 	}
354 }
355 
356 static void
357 vcpu_init(struct vm *vm, int vcpu_id, bool create)
358 {
359 	struct vcpu *vcpu;
360 
361 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
362 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
363 
364 	vcpu = &vm->vcpu[vcpu_id];
365 
366 	if (create) {
367 		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);
368 
369 		vcpu->state = VCPU_IDLE;
370 		vcpu->hostcpu = NOCPU;
371 		vcpu->lastloccpu = NOCPU;
372 		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
373 		vcpu->stats = vmm_stat_alloc();
374 		vcpu->vie_ctx = vie_alloc();
375 		vcpu_cpuid_init(&vcpu->cpuid_cfg);
376 
377 		vcpu->ustate = VU_INIT;
378 		vcpu->ustate_when = gethrtime();
379 
380 		vcpu->vtc.vtc_vm = vm;
381 		vcpu->vtc.vtc_vcpuid = vcpu_id;
382 		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
383 	} else {
384 		vie_reset(vcpu->vie_ctx);
385 		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
386 		if (vcpu->ustate != VU_INIT) {
387 			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
388 		}
389 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
390 	}
391 
392 	vcpu->run_state = VRS_HALT;
393 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
394 	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
395 	vcpu->reqidle = 0;
396 	vcpu->exit_intinfo = 0;
397 	vcpu->nmi_pending = false;
398 	vcpu->extint_pending = false;
399 	vcpu->exc_pending = 0;
400 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
401 	(void) hma_fpu_init(vcpu->guestfpu);
402 	vmm_stat_init(vcpu->stats);
403 	vcpu->tsc_offset = 0;
404 }
405 
406 int
407 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
408 {
409 	return (trace_guest_exceptions);
410 }
411 
412 int
413 vcpu_trap_wbinvd(struct vm *vm, int vcpuid)
414 {
415 	return (trap_wbinvd);
416 }
417 
418 struct vm_exit *
419 vm_exitinfo(struct vm *vm, int cpuid)
420 {
421 	struct vcpu *vcpu;
422 
423 	if (cpuid < 0 || cpuid >= vm->maxcpus)
424 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
425 
426 	vcpu = &vm->vcpu[cpuid];
427 
428 	return (&vcpu->exitinfo);
429 }
430 
431 struct vie *
432 vm_vie_ctx(struct vm *vm, int cpuid)
433 {
434 	if (cpuid < 0 || cpuid >= vm->maxcpus)
435 		panic("vm_vie_ctx: invalid cpuid %d", cpuid);
436 
437 	return (vm->vcpu[cpuid].vie_ctx);
438 }
439 
440 static int
441 vmm_init(void)
442 {
443 	vmm_host_state_init();
444 
445 	if (vmm_is_intel()) {
446 		ops = &vmm_ops_intel;
447 		pte_ops = &ept_pte_ops;
448 	} else if (vmm_is_svm()) {
449 		ops = &vmm_ops_amd;
450 		pte_ops = &rvi_pte_ops;
451 	} else {
452 		return (ENXIO);
453 	}
454 
455 	return (VMM_INIT());
456 }
457 
458 int
459 vmm_mod_load()
460 {
461 	int	error;
462 
463 	VERIFY(vmm_initialized == 0);
464 
465 	error = vmm_init();
466 	if (error == 0)
467 		vmm_initialized = 1;
468 
469 	return (error);
470 }
471 
472 int
473 vmm_mod_unload()
474 {
475 	int	error;
476 
477 	VERIFY(vmm_initialized == 1);
478 
479 	error = VMM_CLEANUP();
480 	if (error)
481 		return (error);
482 	vmm_initialized = 0;
483 
484 	return (0);
485 }
486 
487 /*
488  * Create a test IOMMU domain to see if the host system has the necessary
489  * hardware and drivers to do so.
490  */
491 bool
492 vmm_check_iommu(void)
493 {
494 	void *domain;
495 	const size_t arb_test_sz = (1UL << 32);
496 
497 	domain = iommu_create_domain(arb_test_sz);
498 	if (domain == NULL) {
499 		return (false);
500 	}
501 	iommu_destroy_domain(domain);
502 	return (true);
503 }
504 
505 static void
506 vm_init(struct vm *vm, bool create)
507 {
508 	int i;
509 
510 	vm->cookie = VMINIT(vm);
511 	vm->iommu = NULL;
512 	vm->vioapic = vioapic_init(vm);
513 	vm->vhpet = vhpet_init(vm);
514 	vm->vatpic = vatpic_init(vm);
515 	vm->vatpit = vatpit_init(vm);
516 	vm->vpmtmr = vpmtmr_init(vm);
517 	if (create)
518 		vm->vrtc = vrtc_init(vm);
519 
520 	vm_inout_init(vm, &vm->ioports);
521 
522 	CPU_ZERO(&vm->active_cpus);
523 	CPU_ZERO(&vm->debug_cpus);
524 
525 	vm->suspend = 0;
526 	CPU_ZERO(&vm->suspended_cpus);
527 
528 	for (i = 0; i < vm->maxcpus; i++)
529 		vcpu_init(vm, i, create);
530 
531 	/*
532 	 * Configure the VM-wide TSC offset so that the call to vm_init()
533 	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
534 	 * have its own offset from this, which is altered if/when the guest
535 	 * writes to MSR_TSC.
536 	 *
537 	 * The TSC offsetting math is all unsigned, using overflow for negative
538 	 * offsets.  A reading of the TSC is negated to form the boot offset.
539 	 */
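	/*
	 * Worked example (illustrative, made-up numbers): if the host TSC
	 * reads 5000 here, the stored offset is (uint64_t)-5000, i.e.
	 * 2^64 - 5000.  A later host TSC reading of 12000 plus that offset
	 * wraps to 7000 -- the cycles elapsed since boot, which is what the
	 * guest is meant to observe.
	 */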
540 	const uint64_t boot_tsc = rdtsc_offset();
541 	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);
542 
543 	/* Convert the boot TSC reading to hrtime */
544 	vm->boot_hrtime = (hrtime_t)boot_tsc;
545 	scalehrtime(&vm->boot_hrtime);
546 }
547 
548 /*
549  * The default CPU topology is a single thread per package.
550  */
551 uint_t cores_per_package = 1;
552 uint_t threads_per_core = 1;
553 
554 int
555 vm_create(uint64_t flags, struct vm **retvm)
556 {
557 	struct vm *vm;
558 	struct vmspace *vmspace;
559 
560 	/*
561 	 * If the vmm module could not be successfully initialized then don't
562 	 * attempt to create the virtual machine.
563 	 */
564 	if (!vmm_initialized)
565 		return (ENXIO);
566 
567 	bool track_dirty = (flags & VCF_TRACK_DIRTY) != 0;
568 	if (track_dirty && !pte_ops->vpeo_hw_ad_supported())
569 		return (ENOTSUP);
570 
571 	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, track_dirty);
572 	if (vmspace == NULL)
573 		return (ENOMEM);
574 
575 	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);
576 
577 	vm->vmspace = vmspace;
578 	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
579 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
580 		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
581 	}
582 
583 	vm->sockets = 1;
584 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
585 	vm->threads = threads_per_core;	/* XXX backwards compatibility */
586 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
587 
588 	vm_init(vm, true);
589 
590 	*retvm = vm;
591 	return (0);
592 }
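
/*
 * Illustrative sketch (assumed caller, not from this file): the device
 * interface above this layer is expected to drive the lifecycle roughly
 * as follows.
 *
 *	struct vm *vm;
 *	if (vm_create(VCF_RESERVOIR_MEM | VCF_TRACK_DIRTY, &vm) == 0) {
 *		(void) vm_set_topology(vm, 1, 2, 1, 0);
 *		...		(configure memory, run vCPUs)
 *		vm_destroy(vm);
 *	}
 *
 * Note that VCF_TRACK_DIRTY is rejected with ENOTSUP when the nested page
 * tables cannot provide hardware A/D bit tracking (vpeo_hw_ad_supported()).
 */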
593 
594 void
595 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
596     uint16_t *threads, uint16_t *maxcpus)
597 {
598 	*sockets = vm->sockets;
599 	*cores = vm->cores;
600 	*threads = vm->threads;
601 	*maxcpus = vm->maxcpus;
602 }
603 
604 uint16_t
605 vm_get_maxcpus(struct vm *vm)
606 {
607 	return (vm->maxcpus);
608 }
609 
610 int
611 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
612     uint16_t threads, uint16_t maxcpus)
613 {
614 	if (maxcpus != 0)
615 		return (EINVAL);	/* XXX remove when supported */
616 	if ((sockets * cores * threads) > vm->maxcpus)
617 		return (EINVAL);
618 	/* XXX need to check sockets * cores * threads == vCPU, how? */
619 	vm->sockets = sockets;
620 	vm->cores = cores;
621 	vm->threads = threads;
622 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
623 	return (0);
624 }
625 
626 static void
627 vm_cleanup(struct vm *vm, bool destroy)
628 {
629 	struct mem_map *mm;
630 	int i;
631 
632 	ppt_unassign_all(vm);
633 
634 	if (vm->iommu != NULL)
635 		iommu_destroy_domain(vm->iommu);
636 
637 	/*
638 	 * Devices which attach their own ioport hooks should be cleaned up
639 	 * first so they can tear down those registrations.
640 	 */
641 	vpmtmr_cleanup(vm->vpmtmr);
642 
643 	vm_inout_cleanup(vm, &vm->ioports);
644 
645 	if (destroy)
646 		vrtc_cleanup(vm->vrtc);
647 	else
648 		vrtc_reset(vm->vrtc);
649 
650 	vatpit_cleanup(vm->vatpit);
651 	vhpet_cleanup(vm->vhpet);
652 	vatpic_cleanup(vm->vatpic);
653 	vioapic_cleanup(vm->vioapic);
654 
655 	for (i = 0; i < vm->maxcpus; i++)
656 		vcpu_cleanup(vm, i, destroy);
657 
658 	VMCLEANUP(vm->cookie);
659 
660 	/*
661 	 * System memory is removed from the guest address space only when
662 	 * the VM is destroyed. This is because the mapping remains the same
663 	 * across VM reset.
664 	 *
665 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
666 	 * so those mappings are removed on a VM reset.
667 	 */
668 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
669 		mm = &vm->mem_maps[i];
670 		if (destroy || !sysmem_mapping(vm, mm)) {
671 			vm_free_memmap(vm, i);
672 		} else {
673 			/*
674 			 * We need to reset the IOMMU flag so this mapping can
675 			 * be reused when a VM is rebooted. Since the IOMMU
676 			 * domain has already been destroyed we can just reset
677 			 * the flag here.
678 			 */
679 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
680 		}
681 	}
682 
683 	if (destroy) {
684 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
685 			vm_free_memseg(vm, i);
686 
687 		vmspace_destroy(vm->vmspace);
688 		vm->vmspace = NULL;
689 	}
690 }
691 
692 void
693 vm_destroy(struct vm *vm)
694 {
695 	vm_cleanup(vm, true);
696 	kmem_free(vm, sizeof (*vm));
697 }
698 
699 int
700 vm_reinit(struct vm *vm, uint64_t flags)
701 {
702 	/* A virtual machine can be reset only if all vcpus are suspended. */
703 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
704 		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
705 			return (EBUSY);
706 		}
707 
708 		/*
709 		 * Force the VM (and all its vCPUs) into a suspended state.
710 		 * This should be quick and easy, since the vm_reinit() call is
711 		 * made while holding the VM write lock, which requires holding
712 		 * all of the vCPUs in the VCPU_FROZEN state.
713 		 */
714 		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
715 		    VM_SUSPEND_RESET);
716 		for (uint_t i = 0; i < vm->maxcpus; i++) {
717 			struct vcpu *vcpu = &vm->vcpu[i];
718 
719 			if (CPU_ISSET(i, &vm->suspended_cpus) ||
720 			    !CPU_ISSET(i, &vm->active_cpus)) {
721 				continue;
722 			}
723 
724 			vcpu_lock(vcpu);
725 			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
726 			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
727 			vcpu_unlock(vcpu);
728 		}
729 
730 		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
731 	}
732 
733 	vm_cleanup(vm, false);
734 	vm_init(vm, false);
735 	return (0);
736 }
737 
738 bool
739 vm_is_paused(struct vm *vm)
740 {
741 	return (vm->is_paused);
742 }
743 
744 int
745 vm_pause_instance(struct vm *vm)
746 {
747 	if (vm->is_paused) {
748 		return (EALREADY);
749 	}
750 	vm->is_paused = true;
751 
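	/*
	 * Per-vCPU vlapics are paused ahead of the VM-wide timer devices;
	 * vm_resume_instance() brings everything back in the reverse order.
	 */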
752 	for (uint_t i = 0; i < vm->maxcpus; i++) {
753 		struct vcpu *vcpu = &vm->vcpu[i];
754 
755 		if (!CPU_ISSET(i, &vm->active_cpus)) {
756 			continue;
757 		}
758 		vlapic_pause(vcpu->vlapic);
759 	}
760 	vhpet_pause(vm->vhpet);
761 	vatpit_pause(vm->vatpit);
762 	vrtc_pause(vm->vrtc);
763 
764 	return (0);
765 }
766 
767 int
768 vm_resume_instance(struct vm *vm)
769 {
770 	if (!vm->is_paused) {
771 		return (EALREADY);
772 	}
773 	vm->is_paused = false;
774 
775 	vrtc_resume(vm->vrtc);
776 	vatpit_resume(vm->vatpit);
777 	vhpet_resume(vm->vhpet);
778 	for (uint_t i = 0; i < vm->maxcpus; i++) {
779 		struct vcpu *vcpu = &vm->vcpu[i];
780 
781 		if (!CPU_ISSET(i, &vm->active_cpus)) {
782 			continue;
783 		}
784 		vlapic_resume(vcpu->vlapic);
785 	}
786 
787 	return (0);
788 }
789 
790 int
791 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
792 {
793 	vm_object_t *obj;
794 
795 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
796 		return (ENOMEM);
797 	else
798 		return (0);
799 }
800 
801 int
802 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
803 {
804 	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
805 }
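
/*
 * Illustrative sketch (not from the original source): a passthrough
 * device BAR could be exposed to the guest by pairing the two calls
 * above, e.g.
 *
 *	if (vm_map_mmio(vm, gpa, bar_size, bar_hpa) == 0) {
 *		...
 *		(void) vm_unmap_mmio(vm, gpa, bar_size);
 *	}
 *
 * where 'bar_size' and 'bar_hpa' are hypothetical names for the BAR's
 * length and host physical address.
 */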
806 
807 /*
808  * Return 'true' if 'gpa' is allocated in the guest address space.
809  *
810  * This function is called in the context of a running vcpu which acts as
811  * an implicit lock on 'vm->mem_maps[]'.
812  */
813 bool
814 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
815 {
816 	struct mem_map *mm;
817 	int i;
818 
819 #ifdef INVARIANTS
820 	int hostcpu, state;
821 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
822 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
823 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
824 #endif
825 
826 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
827 		mm = &vm->mem_maps[i];
828 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
829 			return (true);		/* 'gpa' is sysmem or devmem */
830 	}
831 
832 	if (ppt_is_mmio(vm, gpa))
833 		return (true);			/* 'gpa' is pci passthru mmio */
834 
835 	return (false);
836 }
837 
838 int
839 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
840 {
841 	struct mem_seg *seg;
842 	vm_object_t *obj;
843 
844 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
845 		return (EINVAL);
846 
847 	if (len == 0 || (len & PAGE_MASK))
848 		return (EINVAL);
849 
850 	seg = &vm->mem_segs[ident];
851 	if (seg->object != NULL) {
852 		if (seg->len == len && seg->sysmem == sysmem)
853 			return (EEXIST);
854 		else
855 			return (EINVAL);
856 	}
857 
858 	obj = vm_object_mem_allocate(len, vm->mem_transient);
859 	if (obj == NULL)
860 		return (ENOMEM);
861 
862 	seg->len = len;
863 	seg->object = obj;
864 	seg->sysmem = sysmem;
865 	return (0);
866 }
867 
868 int
869 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
870     vm_object_t **objptr)
871 {
872 	struct mem_seg *seg;
873 
874 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
875 		return (EINVAL);
876 
877 	seg = &vm->mem_segs[ident];
878 	if (len)
879 		*len = seg->len;
880 	if (sysmem)
881 		*sysmem = seg->sysmem;
882 	if (objptr)
883 		*objptr = seg->object;
884 	return (0);
885 }
886 
887 void
888 vm_free_memseg(struct vm *vm, int ident)
889 {
890 	struct mem_seg *seg;
891 
892 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
893 	    ("%s: invalid memseg ident %d", __func__, ident));
894 
895 	seg = &vm->mem_segs[ident];
896 	if (seg->object != NULL) {
897 		vm_object_release(seg->object);
898 		bzero(seg, sizeof (struct mem_seg));
899 	}
900 }
901 
902 int
903 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
904     size_t len, int prot, int flags)
905 {
906 	struct mem_seg *seg;
907 	struct mem_map *m, *map;
908 	vm_ooffset_t last;
909 	int i, error;
910 
911 	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
912 		return (EINVAL);
913 
914 	if (flags & ~VM_MEMMAP_F_WIRED)
915 		return (EINVAL);
916 
917 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
918 		return (EINVAL);
919 
920 	seg = &vm->mem_segs[segid];
921 	if (seg->object == NULL)
922 		return (EINVAL);
923 
924 	last = first + len;
925 	if (first < 0 || first >= last || last > seg->len)
926 		return (EINVAL);
927 
928 	if ((gpa | first | last) & PAGE_MASK)
929 		return (EINVAL);
930 
931 	map = NULL;
932 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
933 		m = &vm->mem_maps[i];
934 		if (m->len == 0) {
935 			map = m;
936 			break;
937 		}
938 	}
939 
940 	if (map == NULL)
941 		return (ENOSPC);
942 
943 	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
944 	if (error != 0)
945 		return (EFAULT);
946 
947 	vm_object_reference(seg->object);
948 
949 	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
950 		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
951 		if (error != 0) {
952 			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
953 			return (EFAULT);
954 		}
955 	}
956 
957 	map->gpa = gpa;
958 	map->len = len;
959 	map->segoff = first;
960 	map->segid = segid;
961 	map->prot = prot;
962 	map->flags = flags;
963 	return (0);
964 }
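
/*
 * Illustrative sketch (not from the original source): backing 1 GiB of
 * guest RAM at GPA 0 means allocating a sysmem segment and then mapping
 * it into the guest physical address space.
 *
 *	const size_t sz = 1024UL * 1024 * 1024;
 *	if (vm_alloc_memseg(vm, 0, sz, true) == 0) {
 *		(void) vm_mmap_memseg(vm, 0, 0, 0, sz, PROT_ALL,
 *		    VM_MEMMAP_F_WIRED);
 *	}
 *
 * VM_MEMMAP_F_WIRED populates the pages up front and is a prerequisite for
 * later exposing the region to an IOMMU domain (see vm_iommu_modify()).
 */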
965 
966 int
967 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
968 {
969 	struct mem_map *m;
970 	int i;
971 
972 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
973 		m = &vm->mem_maps[i];
974 		if (m->gpa == gpa && m->len == len &&
975 		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
976 			vm_free_memmap(vm, i);
977 			return (0);
978 		}
979 	}
980 
981 	return (EINVAL);
982 }
983 
984 int
985 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
986     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
987 {
988 	struct mem_map *mm, *mmnext;
989 	int i;
990 
991 	mmnext = NULL;
992 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
993 		mm = &vm->mem_maps[i];
994 		if (mm->len == 0 || mm->gpa < *gpa)
995 			continue;
996 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
997 			mmnext = mm;
998 	}
999 
1000 	if (mmnext != NULL) {
1001 		*gpa = mmnext->gpa;
1002 		if (segid)
1003 			*segid = mmnext->segid;
1004 		if (segoff)
1005 			*segoff = mmnext->segoff;
1006 		if (len)
1007 			*len = mmnext->len;
1008 		if (prot)
1009 			*prot = mmnext->prot;
1010 		if (flags)
1011 			*flags = mmnext->flags;
1012 		return (0);
1013 	} else {
1014 		return (ENOENT);
1015 	}
1016 }
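
/*
 * Illustrative sketch (not from the original source): callers enumerate
 * the populated memory maps by repeatedly asking for the lowest mapping at
 * or above '*gpa' and then stepping past the one returned.
 *
 *	vm_paddr_t gpa = 0;
 *	size_t len;
 *	while (vm_mmap_getnext(vm, &gpa, NULL, NULL, &len, NULL, NULL) == 0)
 *		gpa += len;
 */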
1017 
1018 static void
1019 vm_free_memmap(struct vm *vm, int ident)
1020 {
1021 	struct mem_map *mm;
1022 	int error;
1023 
1024 	mm = &vm->mem_maps[ident];
1025 	if (mm->len) {
1026 		error = vmspace_unmap(vm->vmspace, mm->gpa,
1027 		    mm->gpa + mm->len);
1028 		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
1029 		    __func__, error));
1030 		bzero(mm, sizeof (struct mem_map));
1031 	}
1032 }
1033 
1034 static __inline bool
1035 sysmem_mapping(struct vm *vm, struct mem_map *mm)
1036 {
1037 
1038 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
1039 		return (true);
1040 	else
1041 		return (false);
1042 }
1043 
1044 vm_paddr_t
1045 vmm_sysmem_maxaddr(struct vm *vm)
1046 {
1047 	struct mem_map *mm;
1048 	vm_paddr_t maxaddr;
1049 	int i;
1050 
1051 	maxaddr = 0;
1052 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1053 		mm = &vm->mem_maps[i];
1054 		if (sysmem_mapping(vm, mm)) {
1055 			if (maxaddr < mm->gpa + mm->len)
1056 				maxaddr = mm->gpa + mm->len;
1057 		}
1058 	}
1059 	return (maxaddr);
1060 }
1061 
1062 static void
1063 vm_iommu_modify(struct vm *vm, bool map)
1064 {
1065 	int i, sz;
1066 	vm_paddr_t gpa, hpa;
1067 	struct mem_map *mm;
1068 	vm_client_t *vmc;
1069 
1070 	sz = PAGE_SIZE;
1071 	vmc = vmspace_client_alloc(vm->vmspace);
1072 
1073 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1074 		mm = &vm->mem_maps[i];
1075 		if (!sysmem_mapping(vm, mm))
1076 			continue;
1077 
1078 		if (map) {
1079 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1080 			    ("iommu map found invalid memmap %lx/%lx/%x",
1081 			    mm->gpa, mm->len, mm->flags));
1082 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1083 				continue;
1084 			mm->flags |= VM_MEMMAP_F_IOMMU;
1085 		} else {
1086 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1087 				continue;
1088 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
1089 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1090 			    ("iommu unmap found invalid memmap %lx/%lx/%x",
1091 			    mm->gpa, mm->len, mm->flags));
1092 		}
1093 
1094 		gpa = mm->gpa;
1095 		while (gpa < mm->gpa + mm->len) {
1096 			vm_page_t *vmp;
1097 
1098 			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
1099 			ASSERT(vmp != NULL);
1100 			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
1101 			(void) vmp_release(vmp);
1102 
1103 			/*
1104 			 * When originally ported from FreeBSD, the logic for
1105 			 * adding memory to the guest domain would
1106 			 * simultaneously remove it from the host domain.  The
1107 			 * justification for that is not clear, and FreeBSD has
1108 			 * subsequently changed the behavior to not remove the
1109 			 * memory from the host domain.
1110 			 *
1111 			 * Leaving the guest memory in the host domain for the
1112 			 * life of the VM is necessary to make it available for
1113 			 * DMA, such as through viona in the TX path.
1114 			 */
1115 			if (map) {
1116 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1117 			} else {
1118 				iommu_remove_mapping(vm->iommu, gpa, sz);
1119 			}
1120 
1121 			gpa += PAGE_SIZE;
1122 		}
1123 	}
1124 	vmc_destroy(vmc);
1125 
1126 	/*
1127 	 * Invalidate the cached translations associated with the domain
1128 	 * from which pages were removed.
1129 	 */
1130 	iommu_invalidate_tlb(vm->iommu);
1131 }
1132 
1133 int
1134 vm_unassign_pptdev(struct vm *vm, int pptfd)
1135 {
1136 	int error;
1137 
1138 	error = ppt_unassign_device(vm, pptfd);
1139 	if (error)
1140 		return (error);
1141 
1142 	if (ppt_assigned_devices(vm) == 0)
1143 		vm_iommu_modify(vm, false);
1144 
1145 	return (0);
1146 }
1147 
1148 int
1149 vm_assign_pptdev(struct vm *vm, int pptfd)
1150 {
1151 	int error;
1152 	vm_paddr_t maxaddr;
1153 
1154 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1155 	if (ppt_assigned_devices(vm) == 0) {
1156 		KASSERT(vm->iommu == NULL,
1157 		    ("vm_assign_pptdev: iommu must be NULL"));
1158 		maxaddr = vmm_sysmem_maxaddr(vm);
1159 		vm->iommu = iommu_create_domain(maxaddr);
1160 		if (vm->iommu == NULL)
1161 			return (ENXIO);
1162 		vm_iommu_modify(vm, true);
1163 	}
1164 
1165 	error = ppt_assign_device(vm, pptfd);
1166 	return (error);
1167 }
1168 
1169 int
1170 vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
1171 {
1172 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1173 		return (EINVAL);
1174 
1175 	if (reg >= VM_REG_LAST)
1176 		return (EINVAL);
1177 
1178 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1179 	switch (reg) {
1180 	case VM_REG_GUEST_XCR0:
1181 		*retval = vcpu->guest_xcr0;
1182 		return (0);
1183 	default:
1184 		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
1185 	}
1186 }
1187 
1188 int
1189 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1190 {
1191 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1192 		return (EINVAL);
1193 
1194 	if (reg >= VM_REG_LAST)
1195 		return (EINVAL);
1196 
1197 	int error;
1198 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1199 	switch (reg) {
1200 	case VM_REG_GUEST_RIP:
1201 		error = VMSETREG(vm->cookie, vcpuid, reg, val);
1202 		if (error == 0) {
1203 			vcpu->nextrip = val;
1204 		}
1205 		return (error);
1206 	case VM_REG_GUEST_XCR0:
1207 		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
1208 			return (EINVAL);
1209 		}
1210 		vcpu->guest_xcr0 = val;
1211 		return (0);
1212 	default:
1213 		return (VMSETREG(vm->cookie, vcpuid, reg, val));
1214 	}
1215 }
1216 
1217 static bool
1218 is_descriptor_table(int reg)
1219 {
1220 	switch (reg) {
1221 	case VM_REG_GUEST_IDTR:
1222 	case VM_REG_GUEST_GDTR:
1223 		return (true);
1224 	default:
1225 		return (false);
1226 	}
1227 }
1228 
1229 static bool
1230 is_segment_register(int reg)
1231 {
1232 	switch (reg) {
1233 	case VM_REG_GUEST_ES:
1234 	case VM_REG_GUEST_CS:
1235 	case VM_REG_GUEST_SS:
1236 	case VM_REG_GUEST_DS:
1237 	case VM_REG_GUEST_FS:
1238 	case VM_REG_GUEST_GS:
1239 	case VM_REG_GUEST_TR:
1240 	case VM_REG_GUEST_LDTR:
1241 		return (true);
1242 	default:
1243 		return (false);
1244 	}
1245 }
1246 
1247 int
1248 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1249 {
1250 
1251 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1252 		return (EINVAL);
1253 
1254 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1255 		return (EINVAL);
1256 
1257 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1258 }
1259 
1260 int
1261 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1262 {
1263 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1264 		return (EINVAL);
1265 
1266 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1267 		return (EINVAL);
1268 
1269 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1270 }
1271 
1272 static int
1273 translate_hma_xsave_result(hma_fpu_xsave_result_t res)
1274 {
1275 	switch (res) {
1276 	case HFXR_OK:
1277 		return (0);
1278 	case HFXR_NO_SPACE:
1279 		return (ENOSPC);
1280 	case HFXR_BAD_ALIGN:
1281 	case HFXR_UNSUP_FMT:
1282 	case HFXR_UNSUP_FEAT:
1283 	case HFXR_INVALID_DATA:
1284 		return (EINVAL);
1285 	default:
1286 		panic("unexpected xsave result");
1287 	}
1288 }
1289 
1290 int
1291 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1292 {
1293 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1294 		return (EINVAL);
1295 
1296 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1297 	hma_fpu_xsave_result_t res;
1298 
1299 	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
1300 	return (translate_hma_xsave_result(res));
1301 }
1302 
1303 int
1304 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1305 {
1306 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1307 		return (EINVAL);
1308 
1309 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1310 	hma_fpu_xsave_result_t res;
1311 
1312 	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
1313 	return (translate_hma_xsave_result(res));
1314 }
1315 
1316 int
1317 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1318 {
1319 	struct vcpu *vcpu;
1320 
1321 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1322 		return (EINVAL);
1323 	}
1324 
1325 	vcpu = &vm->vcpu[vcpuid];
1326 
1327 	vcpu_lock(vcpu);
1328 	*state = vcpu->run_state;
1329 	*sipi_vec = vcpu->sipi_vector;
1330 	vcpu_unlock(vcpu);
1331 
1332 	return (0);
1333 }
1334 
1335 int
1336 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1337 {
1338 	struct vcpu *vcpu;
1339 
1340 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1341 		return (EINVAL);
1342 	}
1343 	if (!VRS_IS_VALID(state)) {
1344 		return (EINVAL);
1345 	}
1346 
1347 	vcpu = &vm->vcpu[vcpuid];
1348 
1349 	vcpu_lock(vcpu);
1350 	vcpu->run_state = state;
1351 	vcpu->sipi_vector = sipi_vec;
1352 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1353 	vcpu_unlock(vcpu);
1354 
1355 	return (0);
1356 }
1357 
1358 int
1359 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
1360 {
1361 	vmspace_t *vms = vm_get_vmspace(vm);
1362 	return (vmspace_track_dirty(vms, gpa, len, bitmap));
1363 }
1364 
1365 static void
1366 restore_guest_fpustate(struct vcpu *vcpu)
1367 {
1368 	/* Save host FPU and restore guest FPU */
1369 	fpu_stop_emulating();
1370 	hma_fpu_start_guest(vcpu->guestfpu);
1371 
1372 	/* restore guest XCR0 if XSAVE is enabled in the host */
1373 	if (rcr4() & CR4_XSAVE)
1374 		load_xcr(0, vcpu->guest_xcr0);
1375 
1376 	/*
1377 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1378 	 * to trap any access to the FPU by the host.
1379 	 */
1380 	fpu_start_emulating();
1381 }
1382 
1383 static void
1384 save_guest_fpustate(struct vcpu *vcpu)
1385 {
1386 
1387 	if ((rcr0() & CR0_TS) == 0)
1388 		panic("fpu emulation not enabled in host!");
1389 
1390 	/* save guest XCR0 and restore host XCR0 */
1391 	if (rcr4() & CR4_XSAVE) {
1392 		vcpu->guest_xcr0 = rxcr(0);
1393 		load_xcr(0, vmm_get_host_xcr0());
1394 	}
1395 
1396 	/* save guest FPU and restore host FPU */
1397 	fpu_stop_emulating();
1398 	hma_fpu_stop_guest(vcpu->guestfpu);
1399 	/*
1400 	 * With the host state restored, CR0.TS is deliberately not re-enabled
1401 	 * here: illumos uses eager FPU switching rather than lazy restore.
1402 	 */
1403 }
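
/*
 * Illustrative note (not from the original source): these two routines are
 * used as a matched pair around guest execution, roughly
 *
 *	restore_guest_fpustate(vcpu);	host FPU saved, CR0.TS armed
 *	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
 *	save_guest_fpustate(vcpu);	guest FPU saved, host FPU restored
 *
 * with CR0.TS acting as a tripwire against stray host FPU use in between.
 */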
1404 
1405 static int
1406 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1407     bool from_idle)
1408 {
1409 	struct vcpu *vcpu;
1410 	int error;
1411 
1412 	vcpu = &vm->vcpu[vcpuid];
1413 	vcpu_assert_locked(vcpu);
1414 
1415 	/*
1416 	 * State transitions from the vmmdev_ioctl() must always begin from
1417 	 * the VCPU_IDLE state. This guarantees that there is only a single
1418 	 * ioctl() operating on a vcpu at any point.
1419 	 */
1420 	if (from_idle) {
1421 		while (vcpu->state != VCPU_IDLE) {
1422 			vcpu->reqidle = 1;
1423 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1424 			cv_wait(&vcpu->state_cv, &vcpu->lock);
1425 		}
1426 	} else {
1427 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1428 		    "vcpu idle state"));
1429 	}
1430 
1431 	if (vcpu->state == VCPU_RUNNING) {
1432 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1433 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1434 	} else {
1435 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1436 		    "vcpu that is not running", vcpu->hostcpu));
1437 	}
1438 
1439 	/*
1440 	 * The following state transitions are allowed:
1441 	 * IDLE -> FROZEN -> IDLE
1442 	 * FROZEN -> RUNNING -> FROZEN
1443 	 * FROZEN -> SLEEPING -> FROZEN
1444 	 */
1445 	switch (vcpu->state) {
1446 	case VCPU_IDLE:
1447 	case VCPU_RUNNING:
1448 	case VCPU_SLEEPING:
1449 		error = (newstate != VCPU_FROZEN);
1450 		break;
1451 	case VCPU_FROZEN:
1452 		error = (newstate == VCPU_FROZEN);
1453 		break;
1454 	default:
1455 		error = 1;
1456 		break;
1457 	}
1458 
1459 	if (error)
1460 		return (EBUSY);
1461 
1462 	vcpu->state = newstate;
1463 	if (newstate == VCPU_RUNNING)
1464 		vcpu->hostcpu = curcpu;
1465 	else
1466 		vcpu->hostcpu = NOCPU;
1467 
1468 	if (newstate == VCPU_IDLE) {
1469 		cv_broadcast(&vcpu->state_cv);
1470 	}
1471 
1472 	return (0);
1473 }
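
/*
 * Illustrative sketch (not from the original source): an ioctl path takes
 * exclusive ownership of a vCPU by walking it through IDLE -> FROZEN and
 * back, per the transition rules above.
 *
 *	if (vcpu_set_state(vm, vcpuid, VCPU_FROZEN, true) == 0) {
 *		...		(operate on the quiesced vCPU)
 *		(void) vcpu_set_state(vm, vcpuid, VCPU_IDLE, false);
 *	}
 */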
1474 
1475 static void
1476 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1477 {
1478 	int error;
1479 
1480 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1481 		panic("Error %d setting state to %d\n", error, newstate);
1482 }
1483 
1484 static void
1485 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1486 {
1487 	int error;
1488 
1489 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1490 		panic("Error %d setting state to %d", error, newstate);
1491 }
1492 
1493 /*
1494  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1495  */
1496 static int
1497 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1498 {
1499 	struct vcpu *vcpu;
1500 	int vcpu_halted, vm_halted;
1501 	bool userspace_exit = false;
1502 
1503 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1504 
1505 	vcpu = &vm->vcpu[vcpuid];
1506 	vcpu_halted = 0;
1507 	vm_halted = 0;
1508 
1509 	vcpu_lock(vcpu);
1510 	while (1) {
1511 		/*
1512 		 * Do a final check for pending interrupts (including NMI and
1513 		 * INIT) before putting this thread to sleep.
1514 		 */
1515 		if (vm_nmi_pending(vm, vcpuid))
1516 			break;
1517 		if (vcpu_run_state_pending(vm, vcpuid))
1518 			break;
1519 		if (!intr_disabled) {
1520 			if (vm_extint_pending(vm, vcpuid) ||
1521 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1522 				break;
1523 			}
1524 		}
1525 
1526 		/*
1527 		 * Also check for software events which would cause a wake-up.
1528 		 * This will set the appropriate exitcode directly, rather than
1529 		 * requiring a trip through VM_RUN().
1530 		 */
1531 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1532 			userspace_exit = true;
1533 			break;
1534 		}
1535 
1536 		/*
1537 		 * Some Linux guests implement "halt" by having all vcpus
1538 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1539 		 * track of the vcpus that have entered this state. When all
1540 		 * vcpus enter the halted state the virtual machine is halted.
1541 		 */
1542 		if (intr_disabled) {
1543 			if (!vcpu_halted && halt_detection_enabled) {
1544 				vcpu_halted = 1;
1545 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1546 			}
1547 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1548 				vm_halted = 1;
1549 				break;
1550 			}
1551 		}
1552 
1553 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1554 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1555 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1556 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1557 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1558 	}
1559 
1560 	if (vcpu_halted)
1561 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1562 
1563 	vcpu_unlock(vcpu);
1564 
1565 	if (vm_halted) {
1566 		(void) vm_suspend(vm, VM_SUSPEND_HALT);
1567 	}
1568 
1569 	return (userspace_exit ? -1 : 0);
1570 }
1571 
1572 static int
1573 vm_handle_paging(struct vm *vm, int vcpuid)
1574 {
1575 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1576 	vm_client_t *vmc = vcpu->vmclient;
1577 	struct vm_exit *vme = &vcpu->exitinfo;
1578 	const int ftype = vme->u.paging.fault_type;
1579 
1580 	ASSERT0(vme->inst_length);
1581 	ASSERT(ftype == PROT_READ || ftype == PROT_WRITE || ftype == PROT_EXEC);
1582 
1583 	if (vmc_fault(vmc, vme->u.paging.gpa, ftype) != 0) {
1584 		/*
1585 		 * If the fault cannot be serviced, kick it out to userspace for
1586 		 * handling (or more likely, halting the instance).
1587 		 */
1588 		return (-1);
1589 	}
1590 
1591 	return (0);
1592 }
1593 
1594 int
1595 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1596     int rsize)
1597 {
1598 	int err = ESRCH;
1599 
1600 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1601 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1602 
1603 		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
1604 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1605 		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1606 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1607 		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1608 	}
1609 
1610 	return (err);
1611 }
1612 
1613 int
1614 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1615     int wsize)
1616 {
1617 	int err = ESRCH;
1618 
1619 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1620 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1621 
1622 		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
1623 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1624 		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1625 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1626 		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1627 	}
1628 
1629 	return (err);
1630 }
1631 
1632 static int
1633 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1634 {
1635 	struct vie *vie;
1636 	struct vcpu *vcpu;
1637 	struct vm_exit *vme;
1638 	uint64_t inst_addr;
1639 	int error, fault, cs_d;
1640 
1641 	vcpu = &vm->vcpu[vcpuid];
1642 	vme = &vcpu->exitinfo;
1643 	vie = vcpu->vie_ctx;
1644 
1645 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1646 	    __func__, vme->inst_length));
1647 
1648 	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1649 	cs_d = vme->u.mmio_emul.cs_d;
1650 
1651 	/* Fetch the faulting instruction */
1652 	if (vie_needs_fetch(vie)) {
1653 		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1654 		    &fault);
1655 		if (error != 0) {
1656 			return (error);
1657 		} else if (fault) {
1658 			/*
1659 			 * If a fault during instruction fetch was encountered,
1660 			 * it will have asserted that the appropriate exception
1661 			 * be injected at next entry.
1662 			 * No further work is required.
1663 			 */
1664 			return (0);
1665 		}
1666 	}
1667 
1668 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1669 		/* Dump (unrecognized) instruction bytes in userspace */
1670 		vie_fallback_exitinfo(vie, vme);
1671 		return (-1);
1672 	}
1673 	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1674 	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1675 		/* Decoded GLA does not match GLA from VM exit state */
1676 		vie_fallback_exitinfo(vie, vme);
1677 		return (-1);
1678 	}
1679 
1680 repeat:
1681 	error = vie_emulate_mmio(vie, vm, vcpuid);
1682 	if (error < 0) {
1683 		/*
1684 		 * MMIO not handled by any of the in-kernel-emulated devices, so
1685 		 * make a trip out to userspace for it.
1686 		 */
1687 		vie_exitinfo(vie, vme);
1688 	} else if (error == EAGAIN) {
1689 		/*
1690 		 * Continue emulating the rep-prefixed instruction, which has
1691 		 * not completed its iterations.
1692 		 *
1693 		 * In case this can be emulated in-kernel and has a high
1694 		 * repetition count (causing a tight spin), it should be
1695 		 * repetition count (causing a tight spin), it should defer
1696 		 * to any pending yield conditions.
1697 		if (!vcpu_should_yield(vm, vcpuid)) {
1698 			goto repeat;
1699 		} else {
1700 			/*
1701 			 * Defer to the contending load by making a trip to
1702 			 * userspace with a no-op (BOGUS) exit reason.
1703 			 */
1704 			vie_reset(vie);
1705 			vme->exitcode = VM_EXITCODE_BOGUS;
1706 			return (-1);
1707 		}
1708 	} else if (error == 0) {
1709 		/* Update %rip now that instruction has been emulated */
1710 		vie_advance_pc(vie, &vcpu->nextrip);
1711 	}
1712 	return (error);
1713 }
1714 
1715 static int
1716 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1717 {
1718 	struct vcpu *vcpu;
1719 	struct vie *vie;
1720 	int err;
1721 
1722 	vcpu = &vm->vcpu[vcpuid];
1723 	vie = vcpu->vie_ctx;
1724 
1725 repeat:
1726 	err = vie_emulate_inout(vie, vm, vcpuid);
1727 
1728 	if (err < 0) {
1729 		/*
1730 		 * In/out not handled by any of the in-kernel-emulated devices,
1731 		 * so make a trip out to userspace for it.
1732 		 */
1733 		vie_exitinfo(vie, vme);
1734 		return (err);
1735 	} else if (err == EAGAIN) {
1736 		/*
1737 		 * Continue emulating the rep-prefixed ins/outs instruction,
1738 		 * which has not completed its iterations.
1739 		 *
1740 		 * In case this can be emulated in-kernel and has a high
1741 		 * repetition count (causing a tight spin), it should defer
1742 		 * to any pending yield conditions.
1743 		 */
1744 		if (!vcpu_should_yield(vm, vcpuid)) {
1745 			goto repeat;
1746 		} else {
1747 			/*
1748 			 * Defer to the contending load by making a trip to
1749 			 * userspace with a no-op (BOGUS) exit reason.
1750 			 */
1751 			vie_reset(vie);
1752 			vme->exitcode = VM_EXITCODE_BOGUS;
1753 			return (-1);
1754 		}
1755 	} else if (err != 0) {
1756 		/* Emulation failure.  Bail all the way out to userspace. */
1757 		vme->exitcode = VM_EXITCODE_INST_EMUL;
1758 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1759 		return (-1);
1760 	}
1761 
1762 	vie_advance_pc(vie, &vcpu->nextrip);
1763 	return (0);
1764 }
1765 
1766 static int
1767 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1768 {
1769 	struct vie *vie;
1770 	struct vcpu *vcpu;
1771 	struct vm_exit *vme;
1772 	uint64_t cs_base;
1773 	int error, fault, cs_d;
1774 
1775 	vcpu = &vm->vcpu[vcpuid];
1776 	vme = &vcpu->exitinfo;
1777 	vie = vcpu->vie_ctx;
1778 
1779 	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1780 
1781 	/* Fetch the faulting instruction */
1782 	ASSERT(vie_needs_fetch(vie));
1783 	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1784 	    &fault);
1785 	if (error != 0) {
1786 		return (error);
1787 	} else if (fault) {
1788 		/*
1789 		 * If a fault during instruction fetch was encountered, it will
1790 		 * have asserted that the appropriate exception be injected at
1791 		 * next entry.  No further work is required.
1792 		 */
1793 		return (0);
1794 	}
1795 
1796 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1797 		/* Dump (unrecognized) instruction bytes in userspace */
1798 		vie_fallback_exitinfo(vie, vme);
1799 		return (-1);
1800 	}
1801 
1802 	error = vie_emulate_other(vie, vm, vcpuid);
1803 	if (error != 0) {
1804 		/*
1805 		 * Instruction emulation was unable to complete successfully, so
1806 		 * kick it out to userspace for handling.
1807 		 */
1808 		vie_fallback_exitinfo(vie, vme);
1809 	} else {
1810 		/* Update %rip now that instruction has been emulated */
1811 		vie_advance_pc(vie, &vcpu->nextrip);
1812 	}
1813 	return (error);
1814 }
1815 
1816 static int
1817 vm_handle_suspend(struct vm *vm, int vcpuid)
1818 {
1819 	int i;
1820 	struct vcpu *vcpu;
1821 
1822 	vcpu = &vm->vcpu[vcpuid];
1823 
1824 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1825 
1826 	/*
1827 	 * Wait until all 'active_cpus' have suspended themselves.
1828 	 */
1829 	vcpu_lock(vcpu);
1830 	vcpu_ustate_change(vm, vcpuid, VU_INIT);
1831 	while (1) {
1832 		int rc;
1833 
1834 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1835 			break;
1836 		}
1837 
1838 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1839 		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
1840 		    TR_CLOCK_TICK);
1841 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1842 
1843 		/*
1844 		 * If the userspace process driving the instance is killed, any
1845 		 * vCPUs yet to be marked suspended (because they are not
1846 		 * VM_RUN-ing in the kernel presently) will never reach that
1847 		 * state.
1848 		 *
1849 		 * To avoid vm_handle_suspend() getting stuck in the kernel
1850 		 * waiting for those vCPUs, offer a bail-out even though it
1851 		 * means returning without all vCPUs in a suspended state.
1852 		 */
1853 		if (rc <= 0) {
1854 			if ((curproc->p_flag & SEXITING) != 0) {
1855 				break;
1856 			}
1857 		}
1858 	}
1859 	vcpu_unlock(vcpu);
1860 
1861 	/*
1862 	 * Wakeup the other sleeping vcpus and return to userspace.
1863 	 */
1864 	for (i = 0; i < vm->maxcpus; i++) {
1865 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1866 			vcpu_notify_event(vm, i);
1867 		}
1868 	}
1869 
1870 	return (-1);
1871 }
1872 
1873 static int
1874 vm_handle_reqidle(struct vm *vm, int vcpuid)
1875 {
1876 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1877 
1878 	vcpu_lock(vcpu);
1879 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1880 	vcpu->reqidle = 0;
1881 	vcpu_unlock(vcpu);
1882 	return (-1);
1883 }
1884 
1885 static int
1886 vm_handle_run_state(struct vm *vm, int vcpuid)
1887 {
1888 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1889 	bool handled = false;
1890 
1891 	vcpu_lock(vcpu);
1892 	while (1) {
1893 		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1894 			vcpu_unlock(vcpu);
1895 			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1896 			vcpu_lock(vcpu);
1897 
1898 			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1899 			vcpu->run_state |= VRS_INIT;
1900 		}
1901 
1902 		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1903 		    (VRS_INIT | VRS_PEND_SIPI)) {
1904 			const uint8_t vector = vcpu->sipi_vector;
1905 
1906 			vcpu_unlock(vcpu);
1907 			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1908 			vcpu_lock(vcpu);
1909 
1910 			vcpu->run_state &= ~VRS_PEND_SIPI;
1911 			vcpu->run_state |= VRS_RUN;
1912 		}
1913 
1914 		/*
1915 		 * If the vCPU is now in the running state, there is no need to
1916 		 * wait for anything prior to re-entry.
1917 		 */
1918 		if ((vcpu->run_state & VRS_RUN) != 0) {
1919 			handled = true;
1920 			break;
1921 		}
1922 
1923 		/*
1924 		 * Also check for software events which would cause a wake-up.
1925 		 * This will set the appropriate exitcode directly, rather than
1926 		 * requiring a trip through VM_RUN().
1927 		 */
1928 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1929 			break;
1930 		}
1931 
1932 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1933 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1934 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1935 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1936 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1937 	}
1938 	vcpu_unlock(vcpu);
1939 
1940 	return (handled ? 0 : -1);
1941 }
1942 
1943 static int
1944 vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
1945 {
1946 	switch (num) {
1947 	case MSR_MTRRcap:
1948 		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
1949 		break;
1950 	case MSR_MTRRdefType:
1951 		*val = mtrr->def_type;
1952 		break;
1953 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1954 		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
1955 		break;
1956 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1957 		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
1958 		break;
1959 	case MSR_MTRR64kBase:
1960 		*val = mtrr->fixed64k;
1961 		break;
1962 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
1963 		uint_t offset = num - MSR_MTRRVarBase;
1964 		if (offset % 2 == 0) {
1965 			*val = mtrr->var[offset / 2].base;
1966 		} else {
1967 			*val = mtrr->var[offset / 2].mask;
1968 		}
1969 		break;
1970 	}
1971 	default:
1972 		return (-1);
1973 	}
1974 
1975 	return (0);
1976 }
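
/*
 * Illustrative layout (not from the original source): the variable-range
 * MTRR MSRs alternate base and mask registers, so with
 * MSR_MTRRVarBase (0x200) and VMM_MTRR_VAR_MAX == 10:
 *
 *	0x200 -> var[0].base	0x201 -> var[0].mask
 *	0x202 -> var[1].base	0x203 -> var[1].mask
 *	...
 *	0x212 -> var[9].base	0x213 -> var[9].mask
 *
 * which is why vm_rdmtrr() and vm_wrmtrr() divide the MSR offset by two
 * and use its parity to select the field.
 */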
1977 
1978 static int
1979 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
1980 {
1981 	switch (num) {
1982 	case MSR_MTRRcap:
1983 		/* MTRRCAP is read only */
1984 		return (-1);
1985 	case MSR_MTRRdefType:
1986 		if (val & ~VMM_MTRR_DEF_MASK) {
1987 			/* generate #GP on writes to reserved fields */
1988 			return (-1);
1989 		}
1990 		mtrr->def_type = val;
1991 		break;
1992 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1993 		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
1994 		break;
1995 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1996 		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
1997 		break;
1998 	case MSR_MTRR64kBase:
1999 		mtrr->fixed64k = val;
2000 		break;
2001 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
2002 		uint_t offset = num - MSR_MTRRVarBase;
2003 		if (offset % 2 == 0) {
2004 			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
2005 				/* generate #GP on writes to reserved fields */
2006 				return (-1);
2007 			}
2008 			mtrr->var[offset / 2].base = val;
2009 		} else {
2010 			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
2011 				/* generate #GP on writes to reserved fields */
2012 				return (-1);
2013 			}
2014 			mtrr->var[offset / 2].mask = val;
2015 		}
2016 		break;
2017 	}
2018 	default:
2019 		return (-1);
2020 	}
2021 
2022 	return (0);
2023 }
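
/*
 * For illustration only: a minimal sketch of how the variable-range MTRR MSR
 * numbers handled above decompose into a (pair index, base-or-mask) selector.
 * The helper name is hypothetical and is not part of this file.
 */
static inline void
example_mtrr_var_decode(uint32_t msr, uint_t *pairp, bool *is_maskp)
{
	const uint_t offset = msr - MSR_MTRRVarBase;

	/* Even offsets address MTRRphysBaseN, odd offsets MTRRphysMaskN */
	*pairp = offset / 2;
	*is_maskp = (offset % 2) != 0;
}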
2024 
2025 static bool
2026 is_mtrr_msr(uint32_t msr)
2027 {
2028 	switch (msr) {
2029 	case MSR_MTRRcap:
2030 	case MSR_MTRRdefType:
2031 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2032 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2033 	case MSR_MTRR64kBase:
2034 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2035 		return (true);
2036 	default:
2037 		return (false);
2038 	}
2039 }
2040 
2041 static int
2042 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
2043 {
2044 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2045 	const uint32_t code = vme->u.msr.code;
2046 	uint64_t val = 0;
2047 
2048 	switch (code) {
2049 	case MSR_MCG_CAP:
2050 	case MSR_MCG_STATUS:
2051 		val = 0;
2052 		break;
2053 
2054 	case MSR_MTRRcap:
2055 	case MSR_MTRRdefType:
2056 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2057 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2058 	case MSR_MTRR64kBase:
2059 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2060 		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
2061 			vm_inject_gp(vm, vcpuid);
2062 		break;
2063 
2064 	case MSR_TSC:
2065 		/*
2066 		 * In all likelihood, this should always be handled in guest
2067 		 * context by VMX/SVM rather than taking an exit.  (Both VMX and
2068 		 * SVM pass through read-only access to MSR_TSC to the guest.)
2069 		 *
2070 		 * No physical offset is requested of vcpu_tsc_offset() since
2071 		 * rdtsc_offset() takes care of that instead.
2072 		 */
2073 		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
2074 		break;
2075 
2076 	default:
2077 		/*
2078 		 * Anything not handled at this point will be kicked out to
2079 		 * userspace for attempted processing there.
2080 		 */
2081 		return (-1);
2082 	}
2083 
2084 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2085 	    val & 0xffffffff));
2086 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
2087 	    val >> 32));
2088 	return (0);
2089 }
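
/*
 * For illustration only: the RDMSR result above is returned through the
 * EDX:EAX convention, with the low half in %rax and the high half in %rdx.
 * A sketch of the split and its inverse, using hypothetical helper names:
 */
static inline void
example_msr_split(uint64_t val, uint64_t *raxp, uint64_t *rdxp)
{
	*raxp = val & 0xffffffff;	/* low 32 bits, guest %eax */
	*rdxp = val >> 32;		/* high 32 bits, guest %edx */
}

static inline uint64_t
example_msr_join(uint64_t rax, uint64_t rdx)
{
	return ((rdx << 32) | (rax & 0xffffffff));
}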
2090 
2091 static int
2092 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
2093 {
2094 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2095 	const uint32_t code = vme->u.msr.code;
2096 	const uint64_t val = vme->u.msr.wval;
2097 
2098 	switch (code) {
2099 	case MSR_MCG_CAP:
2100 	case MSR_MCG_STATUS:
2101 		/* Ignore writes */
2102 		break;
2103 
2104 	case MSR_MTRRcap:
2105 	case MSR_MTRRdefType:
2106 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2107 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2108 	case MSR_MTRR64kBase:
2109 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2110 		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
2111 			vm_inject_gp(vm, vcpuid);
2112 		break;
2113 
2114 	case MSR_TSC:
2115 		/*
2116 		 * The effect of writing the TSC MSR is that a subsequent read
2117 		 * of the TSC would report that value written (plus any time
2118 		 * elapsed between the write and the read).  The guest TSC value
2119 		 * is calculated from a global offset for the guest (which
2120 		 * effectively makes its TSC read 0 at guest boot) and a
2121 		 * per-vCPU offset to handle these writes to the MSR.
2122 		 *
2123 		 * To calculate that per-vCPU offset, we can work backwards from
2124 		 * the guest value at the time of write:
2125 		 *
2126 		 * value = host TSC + VM boot offset + vCPU offset
2127 		 *
2128 		 * so therefore:
2129 		 *
2130 		 * value - host TSC - VM boot offset = vCPU offset
2131 		 */
2132 		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
2133 		break;
2134 
2135 	default:
2136 		/*
2137 		 * Anything not handled at this point will be kicked out to
2138 		 * userspace for attempted processing there.
2139 		 */
2140 		return (-1);
2141 	}
2142 
2143 	return (0);
2144 }
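
/*
 * For illustration only: the per-vCPU offset arithmetic described in the
 * MSR_TSC case above, written out as a stand-alone sketch.  The helper name
 * and the host_tsc/boot_offset parameters are hypothetical stand-ins for
 * rdtsc_offset() and vm->boot_tsc_offset.
 */
static inline uint64_t
example_tsc_write_to_offset(uint64_t wval, uint64_t host_tsc,
    uint64_t boot_offset)
{
	/* wval = host_tsc + boot_offset + vcpu_offset, solved for the last */
	return (wval - boot_offset - host_tsc);
}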
2145 
2146 int
2147 vm_suspend(struct vm *vm, enum vm_suspend_how how)
2148 {
2149 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
2150 		return (EINVAL);
2151 
2152 	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
2153 		return (EALREADY);
2154 	}
2155 
2156 	/*
2157 	 * Notify all active vcpus that they are now suspended.
2158 	 */
2159 	for (uint_t i = 0; i < vm->maxcpus; i++) {
2160 		struct vcpu *vcpu = &vm->vcpu[i];
2161 
2162 		vcpu_lock(vcpu);
2163 		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
2164 			/*
2165 			 * Any vCPUs not actively running or in HLT can be
2166 			 * marked as suspended immediately.
2167 			 */
2168 			if (CPU_ISSET(i, &vm->active_cpus)) {
2169 				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
2170 			}
2171 		} else {
2172 			/*
2173 			 * Those which are running or in HLT will pick up the
2174 			 * suspended state after notification.
2175 			 */
2176 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2177 		}
2178 		vcpu_unlock(vcpu);
2179 	}
2180 	return (0);
2181 }
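
/*
 * For illustration only: the atomic_cmpset_int() above acts as a
 * first-writer-wins latch.  Only the transition from VM_SUSPEND_NONE (0) to a
 * concrete `how` succeeds; later attempts observe a non-zero value and fail,
 * which vm_suspend() reports as EALREADY.  Sketch with a hypothetical name:
 */
static inline bool
example_suspend_latch(struct vm *vm, enum vm_suspend_how how)
{
	return (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) != 0);
}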
2182 
2183 void
2184 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
2185 {
2186 	struct vm_exit *vmexit;
2187 
2188 	vmexit = vm_exitinfo(vm, vcpuid);
2189 	vmexit->rip = rip;
2190 	vmexit->inst_length = 0;
2191 	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2192 	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2193 }
2194 
2195 /*
2196  * Some vmm resources, such as the lapic, may have CPU-specific allocations
2197  * which would benefit from migration onto the host CPU that is processing
2198  * the vcpu state.
2199  */
2200 static void
2201 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2202 {
2203 	/*
2204 	 * Localizing cyclic resources requires acquisition of cpu_lock, and
2205 	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2206 	 */
2207 	VERIFY(curthread->t_preempt == 0);
2208 
2209 	/*
2210 	 * Do not bother with localization if this vCPU is about to return to
2211 	 * the host CPU it was last localized to.
2212 	 */
2213 	if (vcpu->lastloccpu == curcpu)
2214 		return;
2215 
2216 	/*
2217 	 * Localize system-wide resources to the primary boot vCPU.  While any
2218 	 * of the other vCPUs may access them, it keeps the potential interrupt
2219 	 * footprint constrained to CPUs involved with this instance.
2220 	 */
2221 	if (vcpu == &vm->vcpu[0]) {
2222 		vhpet_localize_resources(vm->vhpet);
2223 		vrtc_localize_resources(vm->vrtc);
2224 		vatpit_localize_resources(vm->vatpit);
2225 	}
2226 
2227 	vlapic_localize_resources(vcpu->vlapic);
2228 
2229 	vcpu->lastloccpu = curcpu;
2230 }
2231 
2232 static void
2233 vmm_savectx(void *arg)
2234 {
2235 	vm_thread_ctx_t *vtc = arg;
2236 	struct vm *vm = vtc->vtc_vm;
2237 	const int vcpuid = vtc->vtc_vcpuid;
2238 
2239 	if (ops->vmsavectx != NULL) {
2240 		ops->vmsavectx(vm->cookie, vcpuid);
2241 	}
2242 
2243 	/*
2244 	 * Account for going off-cpu, unless the vCPU is idled, in which case
2245 	 * being off-cpu is the explicit point.
2246 	 */
2247 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2248 		vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2249 		vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2250 	}
2251 
2252 	/*
2253 	 * If the CPU holds the restored guest FPU state, save it and restore
2254 	 * the host FPU state before this thread goes off-cpu.
2255 	 */
2256 	if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2257 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2258 
2259 		save_guest_fpustate(vcpu);
2260 		vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2261 	}
2262 }
2263 
2264 static void
2265 vmm_restorectx(void *arg)
2266 {
2267 	vm_thread_ctx_t *vtc = arg;
2268 	struct vm *vm = vtc->vtc_vm;
2269 	const int vcpuid = vtc->vtc_vcpuid;
2270 
2271 	/* Complete microstate accounting for vCPU being off-cpu */
2272 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2273 		vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2274 	}
2275 
2276 	/*
2277 	 * When coming back on-cpu, only restore the guest FPU status if the
2278 	 * thread is in a context marked as requiring it.  This should be rare,
2279 	 * occurring only when a future logic error results in a voluntary
2280 	 * sleep during the VMRUN critical section.
2281 	 *
2282 	 * The common case will result in elision of the guest FPU state
2283 	 * restoration, deferring that action until it is clearly necessary
2284 	 * during vm_run.
2285 	 */
2286 	VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2287 	if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2288 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2289 
2290 		restore_guest_fpustate(vcpu);
2291 		vtc->vtc_status |= VTCS_FPU_RESTORED;
2292 	}
2293 
2294 	if (ops->vmrestorectx != NULL) {
2295 		ops->vmrestorectx(vm->cookie, vcpuid);
2296 	}
2297 
2298 }
2299 
2300 static int
2301 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2302     struct vm_exit *vme)
2303 {
2304 	struct vcpu *vcpu;
2305 	struct vie *vie;
2306 	int err;
2307 
2308 	vcpu = &vm->vcpu[vcpuid];
2309 	vie = vcpu->vie_ctx;
2310 	err = 0;
2311 
2312 	switch (entry->cmd) {
2313 	case VEC_DEFAULT:
2314 		return (0);
2315 	case VEC_DISCARD_INSTR:
2316 		vie_reset(vie);
2317 		return (0);
2318 	case VEC_FULFILL_MMIO:
2319 		err = vie_fulfill_mmio(vie, &entry->u.mmio);
2320 		if (err == 0) {
2321 			err = vie_emulate_mmio(vie, vm, vcpuid);
2322 			if (err == 0) {
2323 				vie_advance_pc(vie, &vcpu->nextrip);
2324 			} else if (err < 0) {
2325 				vie_exitinfo(vie, vme);
2326 			} else if (err == EAGAIN) {
2327 				/*
2328 				 * Clear the instruction emulation state in
2329 				 * order to re-enter VM context and continue
2330 				 * this 'rep <instruction>'
2331 				 */
2332 				vie_reset(vie);
2333 				err = 0;
2334 			}
2335 		}
2336 		break;
2337 	case VEC_FULFILL_INOUT:
2338 		err = vie_fulfill_inout(vie, &entry->u.inout);
2339 		if (err == 0) {
2340 			err = vie_emulate_inout(vie, vm, vcpuid);
2341 			if (err == 0) {
2342 				vie_advance_pc(vie, &vcpu->nextrip);
2343 			} else if (err < 0) {
2344 				vie_exitinfo(vie, vme);
2345 			} else if (err == EAGAIN) {
2346 				/*
2347 				 * Clear the instruction emulation state in
2348 				 * order to re-enter VM context and continue
2349 				 * this 'rep ins/outs'
2350 				 */
2351 				vie_reset(vie);
2352 				err = 0;
2353 			}
2354 		}
2355 		break;
2356 	default:
2357 		return (EINVAL);
2358 	}
2359 	return (err);
2360 }
2361 
2362 static int
2363 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2364 {
2365 	struct vie *vie;
2366 
2367 	vie = vm->vcpu[vcpuid].vie_ctx;
2368 
2369 	if (vie_pending(vie)) {
2370 		/*
2371 		 * Userspace has not fulfilled the pending needs of the
2372 		 * instruction emulation, so bail back out.
2373 		 */
2374 		vie_exitinfo(vie, vme);
2375 		return (-1);
2376 	}
2377 
2378 	return (0);
2379 }
2380 
2381 int
2382 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2383 {
2384 	int error;
2385 	struct vcpu *vcpu;
2386 	struct vm_exit *vme;
2387 	bool intr_disabled;
2388 	int affinity_type = CPU_CURRENT;
2389 
2390 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2391 		return (EINVAL);
2392 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2393 		return (EINVAL);
2394 
2395 	vcpu = &vm->vcpu[vcpuid];
2396 	vme = &vcpu->exitinfo;
2397 
2398 	vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2399 
2400 	vcpu->vtc.vtc_status = 0;
2401 	ctxop_attach(curthread, vcpu->ctxop);
2402 
2403 	error = vm_entry_actions(vm, vcpuid, entry, vme);
2404 	if (error != 0) {
2405 		goto exit;
2406 	}
2407 
2408 restart:
2409 	error = vm_loop_checks(vm, vcpuid, vme);
2410 	if (error != 0) {
2411 		goto exit;
2412 	}
2413 
2414 	thread_affinity_set(curthread, affinity_type);
2415 	/*
2416 	 * Resource localization should happen after the CPU affinity for the
2417 	 * thread has been set to ensure that access from restricted contexts,
2418 	 * such as VMX-accelerated APIC operations, can occur without inducing
2419 	 * cyclic cross-calls.
2420 	 *
2421 	 * This must be done prior to disabling kpreempt via critical_enter().
2422 	 */
2423 	vm_localize_resources(vm, vcpu);
2424 	affinity_type = CPU_CURRENT;
2425 	critical_enter();
2426 
2427 	/* Force a trip through update_sregs to reload %fs/%gs and friends */
2428 	PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2429 
2430 	if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2431 		restore_guest_fpustate(vcpu);
2432 		vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED;
2433 	}
2434 	vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2435 
2436 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2437 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
2438 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2439 
2440 	/*
2441 	 * Once clear of the delicate contexts comprising the VM_RUN handler,
2442 	 * thread CPU affinity can be loosened while other processing occurs.
2443 	 */
2444 	vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2445 	thread_affinity_clear(curthread);
2446 	critical_exit();
2447 
2448 	if (error != 0) {
2449 		/* Communicate out any error from VMRUN() above */
2450 		goto exit;
2451 	}
2452 
2453 	vcpu->nextrip = vme->rip + vme->inst_length;
2454 	switch (vme->exitcode) {
2455 	case VM_EXITCODE_REQIDLE:
2456 		error = vm_handle_reqidle(vm, vcpuid);
2457 		break;
2458 	case VM_EXITCODE_RUN_STATE:
2459 		error = vm_handle_run_state(vm, vcpuid);
2460 		break;
2461 	case VM_EXITCODE_SUSPENDED:
2462 		error = vm_handle_suspend(vm, vcpuid);
2463 		break;
2464 	case VM_EXITCODE_IOAPIC_EOI:
2465 		vioapic_process_eoi(vm, vcpuid,
2466 		    vme->u.ioapic_eoi.vector);
2467 		break;
2468 	case VM_EXITCODE_HLT:
2469 		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2470 		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2471 		break;
2472 	case VM_EXITCODE_PAGING:
2473 		error = vm_handle_paging(vm, vcpuid);
2474 		break;
2475 	case VM_EXITCODE_MMIO_EMUL:
2476 		error = vm_handle_mmio_emul(vm, vcpuid);
2477 		break;
2478 	case VM_EXITCODE_INOUT:
2479 		error = vm_handle_inout(vm, vcpuid, vme);
2480 		break;
2481 	case VM_EXITCODE_INST_EMUL:
2482 		error = vm_handle_inst_emul(vm, vcpuid);
2483 		break;
2484 	case VM_EXITCODE_MONITOR:
2485 	case VM_EXITCODE_MWAIT:
2486 	case VM_EXITCODE_VMINSN:
2487 		vm_inject_ud(vm, vcpuid);
2488 		break;
2489 	case VM_EXITCODE_RDMSR:
2490 		error = vm_handle_rdmsr(vm, vcpuid, vme);
2491 		break;
2492 	case VM_EXITCODE_WRMSR:
2493 		error = vm_handle_wrmsr(vm, vcpuid, vme);
2494 		break;
2495 	case VM_EXITCODE_HT:
2496 		affinity_type = CPU_BEST;
2497 		break;
2498 	case VM_EXITCODE_MTRAP:
2499 		VERIFY0(vm_suspend_cpu(vm, vcpuid));
2500 		error = -1;
2501 		break;
2502 	default:
2503 		/* handled in userland */
2504 		error = -1;
2505 		break;
2506 	}
2507 
2508 	if (error == 0) {
2509 		/* VM exit conditions handled in-kernel, continue running */
2510 		goto restart;
2511 	}
2512 
2513 exit:
2514 	kpreempt_disable();
2515 	ctxop_detach(curthread, vcpu->ctxop);
2516 	/* Make sure all of the needed vCPU context state is saved */
2517 	vmm_savectx(&vcpu->vtc);
2518 	kpreempt_enable();
2519 
2520 	vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2521 	return (error);
2522 }
2523 
2524 int
2525 vm_restart_instruction(void *arg, int vcpuid)
2526 {
2527 	struct vm *vm;
2528 	struct vcpu *vcpu;
2529 	enum vcpu_state state;
2530 	uint64_t rip;
2531 	int error;
2532 
2533 	vm = arg;
2534 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2535 		return (EINVAL);
2536 
2537 	vcpu = &vm->vcpu[vcpuid];
2538 	state = vcpu_get_state(vm, vcpuid, NULL);
2539 	if (state == VCPU_RUNNING) {
2540 		/*
2541 		 * When a vcpu is "running" the next instruction is determined
2542 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2543 		 * Thus setting 'inst_length' to zero will cause the current
2544 		 * instruction to be restarted.
2545 		 */
2546 		vcpu->exitinfo.inst_length = 0;
2547 	} else if (state == VCPU_FROZEN) {
2548 		/*
2549 		 * When a vcpu is "frozen" it is outside the critical section
2550 		 * around VMRUN() and 'nextrip' points to the next instruction.
2551 		 * Thus instruction restart is achieved by setting 'nextrip'
2552 		 * to the vcpu's %rip.
2553 		 */
2554 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2555 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2556 		vcpu->nextrip = rip;
2557 	} else {
2558 		panic("%s: invalid state %d", __func__, state);
2559 	}
2560 	return (0);
2561 }
2562 
2563 int
2564 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2565 {
2566 	struct vcpu *vcpu;
2567 
2568 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2569 		return (EINVAL);
2570 
2571 	vcpu = &vm->vcpu[vcpuid];
2572 
2573 	if (VM_INTINFO_PENDING(info)) {
2574 		const uint32_t type = VM_INTINFO_TYPE(info);
2575 		const uint8_t vector = VM_INTINFO_VECTOR(info);
2576 
2577 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2578 			return (EINVAL);
2579 		if (type == VM_INTINFO_HWEXCP && vector >= 32)
2580 			return (EINVAL);
2581 		if (info & VM_INTINFO_MASK_RSVD)
2582 			return (EINVAL);
2583 	} else {
2584 		info = 0;
2585 	}
2586 	vcpu->exit_intinfo = info;
2587 	return (0);
2588 }
2589 
2590 enum exc_class {
2591 	EXC_BENIGN,
2592 	EXC_CONTRIBUTORY,
2593 	EXC_PAGEFAULT
2594 };
2595 
2596 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
2597 
2598 static enum exc_class
2599 exception_class(uint64_t info)
2600 {
2601 	ASSERT(VM_INTINFO_PENDING(info));
2602 
2603 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2604 	switch (VM_INTINFO_TYPE(info)) {
2605 	case VM_INTINFO_HWINTR:
2606 	case VM_INTINFO_SWINTR:
2607 	case VM_INTINFO_NMI:
2608 		return (EXC_BENIGN);
2609 	default:
2610 		/*
2611 		 * Hardware exception.
2612 		 *
2613 		 * SVM and VT-x use identical type values to represent NMI,
2614 		 * hardware interrupt and software interrupt.
2615 		 *
2616 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2617 		 * for exceptions except #BP and #OF. #BP and #OF use a type
2618 		 * value of '5' or '6'. Therefore we don't check for explicit
2619 		 * values of 'type' to classify 'intinfo' into a hardware
2620 		 * exception.
2621 		 */
2622 		break;
2623 	}
2624 
2625 	switch (VM_INTINFO_VECTOR(info)) {
2626 	case IDT_PF:
2627 	case IDT_VE:
2628 		return (EXC_PAGEFAULT);
2629 	case IDT_DE:
2630 	case IDT_TS:
2631 	case IDT_NP:
2632 	case IDT_SS:
2633 	case IDT_GP:
2634 		return (EXC_CONTRIBUTORY);
2635 	default:
2636 		return (EXC_BENIGN);
2637 	}
2638 }
2639 
2640 /*
2641  * Fetch the event which is pending injection into the guest, if one exists.
2642  *
2643  * Returns true if an event is to be injected (which is placed in `retinfo`).
2644  */
2645 bool
2646 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2647 {
2648 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2649 	const uint64_t info1 = vcpu->exit_intinfo;
2650 	vcpu->exit_intinfo = 0;
2651 	const uint64_t info2 = vcpu->exc_pending;
2652 	vcpu->exc_pending = 0;
2653 
2654 	if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) {
2655 		/*
2656 		 * If an exception occurs while attempting to call the
2657 		 * double-fault handler the processor enters shutdown mode
2658 		 * (aka triple fault).
2659 		 */
2660 		if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
2661 		    VM_INTINFO_VECTOR(info1) == IDT_DF) {
2662 			(void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2663 			*retinfo = 0;
2664 			return (false);
2665 		}
2666 		/*
2667 		 * "Conditions for Generating a Double Fault"
2668 		 *  Intel SDM, Vol3, Table 6-5
2669 		 */
2670 		const enum exc_class exc1 = exception_class(info1);
2671 		const enum exc_class exc2 = exception_class(info2);
2672 		if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2673 		    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2674 			/* Convert nested fault into a double fault. */
2675 			*retinfo =
2676 			    VM_INTINFO_VALID |
2677 			    VM_INTINFO_DEL_ERRCODE |
2678 			    VM_INTINFO_HWEXCP |
2679 			    IDT_DF;
2680 		} else {
2681 			/* Handle exceptions serially */
2682 			vcpu->exit_intinfo = info1;
2683 			*retinfo = info2;
2684 		}
2685 		return (true);
2686 	} else if (VM_INTINFO_PENDING(info1)) {
2687 		*retinfo = info1;
2688 		return (true);
2689 	} else if (VM_INTINFO_PENDING(info2)) {
2690 		*retinfo = info2;
2691 		return (true);
2692 	}
2693 
2694 	return (false);
2695 }
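
/*
 * For illustration only: the Table 6-5 decision made above, reduced to the
 * question "does a second event of class exc2, raised while delivering an
 * event of class exc1, escalate to #DF?".  The helper name is hypothetical.
 */
static inline bool
example_becomes_double_fault(enum exc_class exc1, enum exc_class exc2)
{
	return ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN));
}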
2696 
2697 int
2698 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2699 {
2700 	struct vcpu *vcpu;
2701 
2702 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2703 		return (EINVAL);
2704 
2705 	vcpu = &vm->vcpu[vcpuid];
2706 	*info1 = vcpu->exit_intinfo;
2707 	*info2 = vcpu->exc_pending;
2708 	return (0);
2709 }
2710 
2711 int
2712 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
2713     bool errcode_valid, uint32_t errcode, bool restart_instruction)
2714 {
2715 	struct vcpu *vcpu;
2716 	uint64_t regval;
2717 	int error;
2718 
2719 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2720 		return (EINVAL);
2721 
2722 	if (vector >= 32)
2723 		return (EINVAL);
2724 
2725 	/*
2726 	 * NMIs are to be injected via their own specialized path using
2727 	 * vm_inject_nmi().
2728 	 */
2729 	if (vector == IDT_NMI) {
2730 		return (EINVAL);
2731 	}
2732 
2733 	/*
2734 	 * A double fault exception should never be injected directly into
2735 	 * the guest. It is a derived exception that results from specific
2736 	 * combinations of nested faults.
2737 	 */
2738 	if (vector == IDT_DF) {
2739 		return (EINVAL);
2740 	}
2741 
2742 	vcpu = &vm->vcpu[vcpuid];
2743 
2744 	if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
2745 		/* Unable to inject exception due to one already pending */
2746 		return (EBUSY);
2747 	}
2748 
2749 	if (errcode_valid) {
2750 		/*
2751 		 * Exceptions don't deliver an error code in real mode.
2752 		 */
2753 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2754 		VERIFY0(error);
2755 		if ((regval & CR0_PE) == 0) {
2756 			errcode_valid = false;
2757 		}
2758 	}
2759 
2760 	/*
2761 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2762 	 *
2763 	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2764 	 * one instruction or incurs an exception.
2765 	 */
2766 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2767 	VERIFY0(error);
2768 
2769 	if (restart_instruction) {
2770 		VERIFY0(vm_restart_instruction(vm, vcpuid));
2771 	}
2772 
2773 	uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector;
2774 	if (errcode_valid) {
2775 		val |= VM_INTINFO_DEL_ERRCODE;
2776 		val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE;
2777 	}
2778 	vcpu->exc_pending = val;
2779 	return (0);
2780 }
2781 
2782 void
2783 vm_inject_ud(struct vm *vm, int vcpuid)
2784 {
2785 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true));
2786 }
2787 
2788 void
2789 vm_inject_gp(struct vm *vm, int vcpuid)
2790 {
2791 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true));
2792 }
2793 
2794 void
2795 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode)
2796 {
2797 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true));
2798 }
2799 
2800 void
2801 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode)
2802 {
2803 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true));
2804 }
2805 
2806 void
2807 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2)
2808 {
2809 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2));
2810 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true));
2811 }
2812 
2813 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2814 
2815 int
2816 vm_inject_nmi(struct vm *vm, int vcpuid)
2817 {
2818 	struct vcpu *vcpu;
2819 
2820 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2821 		return (EINVAL);
2822 
2823 	vcpu = &vm->vcpu[vcpuid];
2824 
2825 	vcpu->nmi_pending = true;
2826 	vcpu_notify_event(vm, vcpuid);
2827 	return (0);
2828 }
2829 
2830 bool
2831 vm_nmi_pending(struct vm *vm, int vcpuid)
2832 {
2833 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2834 
2835 	return (vcpu->nmi_pending);
2836 }
2837 
2838 void
2839 vm_nmi_clear(struct vm *vm, int vcpuid)
2840 {
2841 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2842 
2843 	ASSERT(vcpu->nmi_pending);
2844 
2845 	vcpu->nmi_pending = false;
2846 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2847 }
2848 
2849 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2850 
2851 int
2852 vm_inject_extint(struct vm *vm, int vcpuid)
2853 {
2854 	struct vcpu *vcpu;
2855 
2856 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2857 		return (EINVAL);
2858 
2859 	vcpu = &vm->vcpu[vcpuid];
2860 
2861 	vcpu->extint_pending = true;
2862 	vcpu_notify_event(vm, vcpuid);
2863 	return (0);
2864 }
2865 
2866 bool
2867 vm_extint_pending(struct vm *vm, int vcpuid)
2868 {
2869 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2870 
2871 	return (vcpu->extint_pending);
2872 }
2873 
2874 void
2875 vm_extint_clear(struct vm *vm, int vcpuid)
2876 {
2877 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2878 
2879 	ASSERT(vcpu->extint_pending);
2880 
2881 	vcpu->extint_pending = false;
2882 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2883 }
2884 
2885 int
2886 vm_inject_init(struct vm *vm, int vcpuid)
2887 {
2888 	struct vcpu *vcpu;
2889 
2890 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2891 		return (EINVAL);
2892 
2893 	vcpu = &vm->vcpu[vcpuid];
2894 	vcpu_lock(vcpu);
2895 	vcpu->run_state |= VRS_PEND_INIT;
2896 	/*
2897 	 * As part of queuing the INIT request, clear any pending SIPI.  It
2898 	 * would not otherwise survive across the reset of the vCPU when it
2899 	 * undergoes the requested INIT.  We would not want it to linger when it
2900 	 * could be mistaken for a subsequent (after the INIT) SIPI request.
2901 	 */
2902 	vcpu->run_state &= ~VRS_PEND_SIPI;
2903 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2904 
2905 	vcpu_unlock(vcpu);
2906 	return (0);
2907 }
2908 
2909 int
2910 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2911 {
2912 	struct vcpu *vcpu;
2913 
2914 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2915 		return (EINVAL);
2916 
2917 	vcpu = &vm->vcpu[vcpuid];
2918 	vcpu_lock(vcpu);
2919 	vcpu->run_state |= VRS_PEND_SIPI;
2920 	vcpu->sipi_vector = vector;
2921 	/* SIPI is only actionable if the CPU is waiting in INIT state */
2922 	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2923 		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2924 	}
2925 	vcpu_unlock(vcpu);
2926 	return (0);
2927 }
2928 
2929 bool
2930 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2931 {
2932 	struct vcpu *vcpu;
2933 
2934 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2935 	vcpu = &vm->vcpu[vcpuid];
2936 
2937 	/* Of interest: vCPU not in running state or with pending INIT */
2938 	return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2939 }
2940 
2941 int
2942 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2943 {
2944 	struct seg_desc desc;
2945 	const enum vm_reg_name clear_regs[] = {
2946 		VM_REG_GUEST_CR2,
2947 		VM_REG_GUEST_CR3,
2948 		VM_REG_GUEST_CR4,
2949 		VM_REG_GUEST_RAX,
2950 		VM_REG_GUEST_RBX,
2951 		VM_REG_GUEST_RCX,
2952 		VM_REG_GUEST_RSI,
2953 		VM_REG_GUEST_RDI,
2954 		VM_REG_GUEST_RBP,
2955 		VM_REG_GUEST_RSP,
2956 		VM_REG_GUEST_R8,
2957 		VM_REG_GUEST_R9,
2958 		VM_REG_GUEST_R10,
2959 		VM_REG_GUEST_R11,
2960 		VM_REG_GUEST_R12,
2961 		VM_REG_GUEST_R13,
2962 		VM_REG_GUEST_R14,
2963 		VM_REG_GUEST_R15,
2964 		VM_REG_GUEST_DR0,
2965 		VM_REG_GUEST_DR1,
2966 		VM_REG_GUEST_DR2,
2967 		VM_REG_GUEST_DR3,
2968 		VM_REG_GUEST_EFER,
2969 	};
2970 	const enum vm_reg_name data_segs[] = {
2971 		VM_REG_GUEST_SS,
2972 		VM_REG_GUEST_DS,
2973 		VM_REG_GUEST_ES,
2974 		VM_REG_GUEST_FS,
2975 		VM_REG_GUEST_GS,
2976 	};
2977 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2978 
2979 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2980 		return (EINVAL);
2981 
2982 	for (uint_t i = 0; i < nitems(clear_regs); i++) {
2983 		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2984 	}
2985 
2986 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2987 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2988 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2989 
2990 	/*
2991 	 * The prescribed contents of %rdx differ slightly between the Intel and
2992 	 * AMD architectural definitions.  The former expects the Extended Model
2993 	 * in bits 16-19, while the latter expects all of the Family, Model, and
2994 	 * Stepping to be there.  Common boot ROMs appear to disregard this
2995 	 * anyway, so we stick with a compromise value similar to what is
2996 	 * spelled out in the Intel SDM.
2997 	 */
2998 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2999 
3000 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
3001 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
3002 
3003 	/* CS: Present, R/W, Accessed */
3004 	desc.access = 0x0093;
3005 	desc.base = 0xffff0000;
3006 	desc.limit = 0xffff;
3007 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
3008 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
3009 
3010 	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
3011 	desc.access = 0x0093;
3012 	desc.base = 0;
3013 	desc.limit = 0xffff;
3014 	for (uint_t i = 0; i < nitems(data_segs); i++) {
3015 		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
3016 		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
3017 	}
3018 
3019 	/* GDTR, IDTR */
3020 	desc.base = 0;
3021 	desc.limit = 0xffff;
3022 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
3023 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
3024 
3025 	/* LDTR: Present, LDT */
3026 	desc.access = 0x0082;
3027 	desc.base = 0;
3028 	desc.limit = 0xffff;
3029 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
3030 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
3031 
3032 	/* TR: Present, 32-bit TSS */
3033 	desc.access = 0x008b;
3034 	desc.base = 0;
3035 	desc.limit = 0xffff;
3036 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
3037 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
3038 
3039 	vlapic_reset(vm_lapic(vm, vcpuid));
3040 
3041 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
3042 
3043 	vcpu->exit_intinfo = 0;
3044 	vcpu->exc_pending = 0;
3045 	vcpu->nmi_pending = false;
3046 	vcpu->extint_pending = false;
3047 
3048 	/*
3049 	 * A CPU reset caused by power-on or system reset clears more state than
3050 	 * one which is triggered from an INIT IPI.
3051 	 */
3052 	if (!init_only) {
3053 		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
3054 		(void) hma_fpu_init(vcpu->guestfpu);
3055 
3056 		/* XXX: clear MSRs and other pieces */
3057 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
3058 	}
3059 
3060 	return (0);
3061 }
3062 
3063 static int
3064 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
3065 {
3066 	struct seg_desc desc;
3067 
3068 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3069 		return (EINVAL);
3070 
3071 	/* CS: Present, R/W, Accessed */
3072 	desc.access = 0x0093;
3073 	desc.base = (uint64_t)vector << 12;
3074 	desc.limit = 0xffff;
3075 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
3076 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
3077 	    (uint64_t)vector << 8));
3078 
3079 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
3080 
3081 	return (0);
3082 }
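
/*
 * For illustration only: the start address implied by the SIPI segment setup
 * above.  With %rip forced to 0 and the CS base set to vector << 12, the vCPU
 * begins fetching at the 4 KiB-aligned page selected by the SIPI vector.
 * The helper name is hypothetical.
 */
static inline uint64_t
example_sipi_start_addr(uint8_t vector)
{
	const uint64_t cs_base = (uint64_t)vector << 12;
	const uint64_t rip = 0;

	return (cs_base + rip);
}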
3083 
3084 int
3085 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
3086 {
3087 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3088 		return (EINVAL);
3089 
3090 	if (type < 0 || type >= VM_CAP_MAX)
3091 		return (EINVAL);
3092 
3093 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
3094 }
3095 
3096 int
3097 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3098 {
3099 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3100 		return (EINVAL);
3101 
3102 	if (type < 0 || type >= VM_CAP_MAX)
3103 		return (EINVAL);
3104 
3105 	return (VMSETCAP(vm->cookie, vcpu, type, val));
3106 }
3107 
3108 vcpu_cpuid_config_t *
3109 vm_cpuid_config(struct vm *vm, int vcpuid)
3110 {
3111 	ASSERT3S(vcpuid, >=, 0);
3112 	ASSERT3S(vcpuid, <, VM_MAXCPU);
3113 
3114 	return (&vm->vcpu[vcpuid].cpuid_cfg);
3115 }
3116 
3117 struct vlapic *
3118 vm_lapic(struct vm *vm, int cpu)
3119 {
3120 	ASSERT3S(cpu, >=, 0);
3121 	ASSERT3S(cpu, <, VM_MAXCPU);
3122 
3123 	return (vm->vcpu[cpu].vlapic);
3124 }
3125 
3126 struct vioapic *
3127 vm_ioapic(struct vm *vm)
3128 {
3129 
3130 	return (vm->vioapic);
3131 }
3132 
3133 struct vhpet *
3134 vm_hpet(struct vm *vm)
3135 {
3136 
3137 	return (vm->vhpet);
3138 }
3139 
3140 void *
3141 vm_iommu_domain(struct vm *vm)
3142 {
3143 
3144 	return (vm->iommu);
3145 }
3146 
3147 int
3148 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3149     bool from_idle)
3150 {
3151 	int error;
3152 	struct vcpu *vcpu;
3153 
3154 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3155 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3156 
3157 	vcpu = &vm->vcpu[vcpuid];
3158 
3159 	vcpu_lock(vcpu);
3160 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3161 	vcpu_unlock(vcpu);
3162 
3163 	return (error);
3164 }
3165 
3166 enum vcpu_state
3167 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3168 {
3169 	struct vcpu *vcpu;
3170 	enum vcpu_state state;
3171 
3172 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3173 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3174 
3175 	vcpu = &vm->vcpu[vcpuid];
3176 
3177 	vcpu_lock(vcpu);
3178 	state = vcpu->state;
3179 	if (hostcpu != NULL)
3180 		*hostcpu = vcpu->hostcpu;
3181 	vcpu_unlock(vcpu);
3182 
3183 	return (state);
3184 }
3185 
3186 uint64_t
3187 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3188 {
3189 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3190 
3191 	uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3192 
3193 	if (phys_adj) {
3194 		/* Include any offset for the current physical CPU too */
3195 		extern hrtime_t tsc_gethrtime_tick_delta(void);
3196 		vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3197 	}
3198 
3199 	return (vcpu_off);
3200 }
3201 
3202 /* Normalize hrtime against the boot time for a VM */
3203 hrtime_t
3204 vm_normalize_hrtime(struct vm *vm, hrtime_t hrt)
3205 {
3206 	/* To avoid underflow/overflow UB, perform math as unsigned */
3207 	return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime));
3208 }
3209 
3210 /* Denormalize hrtime against the boot time for a VM */
3211 hrtime_t
3212 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt)
3213 {
3214 	/* To avoid underflow/overflow UB, perform math as unsigned */
3215 	return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime));
3216 }
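
/*
 * For illustration only: since the two conversions above are exact inverses
 * (both performed as unsigned, wrap-around arithmetic), a normalize followed
 * by a denormalize returns the original value.  Sketch, hypothetical name:
 */
static inline void
example_hrtime_round_trip(struct vm *vm, hrtime_t hrt)
{
	const hrtime_t normalized = vm_normalize_hrtime(vm, hrt);

	VERIFY(vm_denormalize_hrtime(vm, normalized) == hrt);
}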
3217 
3218 int
3219 vm_activate_cpu(struct vm *vm, int vcpuid)
3220 {
3221 
3222 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3223 		return (EINVAL);
3224 
3225 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
3226 		return (EBUSY);
3227 
3228 	if (vm->suspend != 0) {
3229 		return (EBUSY);
3230 	}
3231 
3232 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3233 
3234 	/*
3235 	 * It is possible that this vCPU was undergoing activation at the same
3236 	 * time that the VM was being suspended.  If that happens to be the
3237 	 * case, it should reflect the suspended state immediately.
3238 	 */
3239 	if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) {
3240 		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
3241 	}
3242 
3243 	return (0);
3244 }
3245 
3246 int
3247 vm_suspend_cpu(struct vm *vm, int vcpuid)
3248 {
3249 	int i;
3250 
3251 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3252 		return (EINVAL);
3253 
3254 	if (vcpuid == -1) {
3255 		vm->debug_cpus = vm->active_cpus;
3256 		for (i = 0; i < vm->maxcpus; i++) {
3257 			if (CPU_ISSET(i, &vm->active_cpus))
3258 				vcpu_notify_event(vm, i);
3259 		}
3260 	} else {
3261 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3262 			return (EINVAL);
3263 
3264 		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3265 		vcpu_notify_event(vm, vcpuid);
3266 	}
3267 	return (0);
3268 }
3269 
3270 int
3271 vm_resume_cpu(struct vm *vm, int vcpuid)
3272 {
3273 
3274 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3275 		return (EINVAL);
3276 
3277 	if (vcpuid == -1) {
3278 		CPU_ZERO(&vm->debug_cpus);
3279 	} else {
3280 		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3281 			return (EINVAL);
3282 
3283 		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3284 	}
3285 	return (0);
3286 }
3287 
3288 static bool
3289 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3290     uint64_t entry_rip)
3291 {
3292 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3293 	struct vm_exit *vme = &vcpu->exitinfo;
3294 	bool bail = false;
3295 
3296 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3297 
3298 	if (vm->suspend) {
3299 		if (on_entry) {
3300 			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3301 			    vm->suspend < VM_SUSPEND_LAST);
3302 
3303 			vme->exitcode = VM_EXITCODE_SUSPENDED;
3304 			vme->u.suspended.how = vm->suspend;
3305 		} else {
3306 			/*
3307 			 * Handling VM suspend is complicated, so if that
3308 			 * condition is detected outside of VM-entry itself,
3309 			 * just emit a BOGUS exitcode so we take a lap to pick
3310 			 * up the event during an entry and are directed into
3311 			 * the vm_handle_suspend() logic.
3312 			 */
3313 			vme->exitcode = VM_EXITCODE_BOGUS;
3314 		}
3315 		bail = true;
3316 	}
3317 	if (vcpu->reqidle) {
3318 		vme->exitcode = VM_EXITCODE_REQIDLE;
3319 		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3320 
3321 		if (!on_entry) {
3322 			/*
3323 			 * A reqidle request detected outside of VM-entry can be
3324 			 * handled directly by clearing the request (and taking
3325 			 * a lap to userspace).
3326 			 */
3327 			vcpu_assert_locked(vcpu);
3328 			vcpu->reqidle = 0;
3329 		}
3330 		bail = true;
3331 	}
3332 	if (vcpu_should_yield(vm, vcpuid)) {
3333 		vme->exitcode = VM_EXITCODE_BOGUS;
3334 		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3335 		bail = true;
3336 	}
3337 	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3338 		vme->exitcode = VM_EXITCODE_DEBUG;
3339 		bail = true;
3340 	}
3341 
3342 	if (bail) {
3343 		if (on_entry) {
3344 			/*
3345 			 * If bailing out during VM-entry, the current %rip must
3346 			 * be recorded in the exitinfo.
3347 			 */
3348 			vme->rip = entry_rip;
3349 		}
3350 		vme->inst_length = 0;
3351 	}
3352 	return (bail);
3353 }
3354 
3355 static bool
3356 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3357 {
3358 	/*
3359 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3360 	 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3361 	 * structure, so they only modify the exitcode.
3362 	 */
3363 	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3364 }
3365 
3366 bool
3367 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3368 {
3369 	/*
3370 	 * Bail-out checks done as part of VM entry require an updated %rip to
3371 	 * populate the vm_exit struct if any of the conditions of interest are
3372 	 * matched in the check.
3373 	 */
3374 	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3375 }
3376 
3377 cpuset_t
3378 vm_active_cpus(struct vm *vm)
3379 {
3380 
3381 	return (vm->active_cpus);
3382 }
3383 
3384 cpuset_t
3385 vm_debug_cpus(struct vm *vm)
3386 {
3387 
3388 	return (vm->debug_cpus);
3389 }
3390 
3391 cpuset_t
3392 vm_suspended_cpus(struct vm *vm)
3393 {
3394 
3395 	return (vm->suspended_cpus);
3396 }
3397 
3398 void *
3399 vcpu_stats(struct vm *vm, int vcpuid)
3400 {
3401 
3402 	return (vm->vcpu[vcpuid].stats);
3403 }
3404 
3405 int
3406 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3407 {
3408 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3409 		return (EINVAL);
3410 
3411 	*state = vm->vcpu[vcpuid].x2apic_state;
3412 
3413 	return (0);
3414 }
3415 
3416 int
3417 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3418 {
3419 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3420 		return (EINVAL);
3421 
3422 	if (state >= X2APIC_STATE_LAST)
3423 		return (EINVAL);
3424 
3425 	vm->vcpu[vcpuid].x2apic_state = state;
3426 
3427 	vlapic_set_x2apic_state(vm, vcpuid, state);
3428 
3429 	return (0);
3430 }
3431 
3432 /*
3433  * This function is called to ensure that a vcpu "sees" a pending event
3434  * as soon as possible:
3435  * - If the vcpu thread is sleeping then it is woken up.
3436  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3437  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3438  */
3439 static void
3440 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3441 {
3442 	int hostcpu;
3443 
3444 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3445 
3446 	hostcpu = vcpu->hostcpu;
3447 	if (vcpu->state == VCPU_RUNNING) {
3448 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3449 		if (hostcpu != curcpu) {
3450 			if (ntype == VCPU_NOTIFY_APIC) {
3451 				vlapic_post_intr(vcpu->vlapic, hostcpu);
3452 			} else {
3453 				poke_cpu(hostcpu);
3454 			}
3455 		} else {
3456 			/*
3457 			 * If the 'vcpu' is running on 'curcpu' then it must
3458 			 * be sending a notification to itself (e.g. SELF_IPI).
3459 			 * The pending event will be picked up when the vcpu
3460 			 * transitions back to guest context.
3461 			 */
3462 		}
3463 	} else {
3464 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3465 		    "with hostcpu %d", vcpu->state, hostcpu));
3466 		if (vcpu->state == VCPU_SLEEPING) {
3467 			cv_signal(&vcpu->vcpu_cv);
3468 		}
3469 	}
3470 }
3471 
3472 void
3473 vcpu_notify_event(struct vm *vm, int vcpuid)
3474 {
3475 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3476 
3477 	vcpu_lock(vcpu);
3478 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3479 	vcpu_unlock(vcpu);
3480 }
3481 
3482 void
3483 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3484 {
3485 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3486 
3487 	if (ntype == VCPU_NOTIFY_NONE) {
3488 		return;
3489 	}
3490 
3491 	vcpu_lock(vcpu);
3492 	vcpu_notify_event_locked(vcpu, ntype);
3493 	vcpu_unlock(vcpu);
3494 }
3495 
3496 void
3497 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3498 {
3499 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3500 	hrtime_t now = gethrtime();
3501 
3502 	ASSERT3U(ustate, !=, vcpu->ustate);
3503 	ASSERT3S(ustate, <, VU_MAX);
3504 	ASSERT3S(ustate, >=, VU_INIT);
3505 
3506 	hrtime_t delta = now - vcpu->ustate_when;
3507 	vcpu->ustate_total[vcpu->ustate] += delta;
3508 
3509 	membar_producer();
3510 
3511 	vcpu->ustate_when = now;
3512 	vcpu->ustate = ustate;
3513 }
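
/*
 * For illustration only: a consumer of the accounting maintained above could
 * include the in-progress interval for the state a vCPU currently occupies,
 * rather than only the stored total.  Hypothetical helper; the kstat code
 * later in this file reports the stored totals as-is.
 */
static inline hrtime_t
example_ustate_elapsed(const struct vcpu *vcpu, enum vcpu_ustate state)
{
	hrtime_t total = vcpu->ustate_total[state];

	if (vcpu->ustate == state)
		total += gethrtime() - vcpu->ustate_when;
	return (total);
}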
3514 
3515 struct vmspace *
3516 vm_get_vmspace(struct vm *vm)
3517 {
3518 
3519 	return (vm->vmspace);
3520 }
3521 
3522 struct vm_client *
3523 vm_get_vmclient(struct vm *vm, int vcpuid)
3524 {
3525 	return (vm->vcpu[vcpuid].vmclient);
3526 }
3527 
3528 int
3529 vm_apicid2vcpuid(struct vm *vm, int apicid)
3530 {
3531 	/*
3532 	 * XXX apic id is assumed to be numerically identical to vcpu id
3533 	 */
3534 	return (apicid);
3535 }
3536 
3537 struct vatpic *
3538 vm_atpic(struct vm *vm)
3539 {
3540 	return (vm->vatpic);
3541 }
3542 
3543 struct vatpit *
3544 vm_atpit(struct vm *vm)
3545 {
3546 	return (vm->vatpit);
3547 }
3548 
3549 struct vpmtmr *
3550 vm_pmtmr(struct vm *vm)
3551 {
3552 
3553 	return (vm->vpmtmr);
3554 }
3555 
3556 struct vrtc *
3557 vm_rtc(struct vm *vm)
3558 {
3559 
3560 	return (vm->vrtc);
3561 }
3562 
3563 enum vm_reg_name
3564 vm_segment_name(int seg)
3565 {
3566 	static enum vm_reg_name seg_names[] = {
3567 		VM_REG_GUEST_ES,
3568 		VM_REG_GUEST_CS,
3569 		VM_REG_GUEST_SS,
3570 		VM_REG_GUEST_DS,
3571 		VM_REG_GUEST_FS,
3572 		VM_REG_GUEST_GS
3573 	};
3574 
3575 	KASSERT(seg >= 0 && seg < nitems(seg_names),
3576 	    ("%s: invalid segment encoding %d", __func__, seg));
3577 	return (seg_names[seg]);
3578 }
3579 
3580 void
3581 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3582     uint_t num_copyinfo)
3583 {
3584 	for (uint_t idx = 0; idx < num_copyinfo; idx++) {
3585 		if (copyinfo[idx].cookie != NULL) {
3586 			(void) vmp_release((vm_page_t *)copyinfo[idx].cookie);
3587 		}
3588 	}
3589 	bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3590 }
3591 
3592 int
3593 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3594     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3595     uint_t num_copyinfo, int *fault)
3596 {
3597 	uint_t idx, nused;
3598 	size_t n, off, remaining;
3599 	vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);
3600 
3601 	bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3602 
3603 	nused = 0;
3604 	remaining = len;
3605 	while (remaining > 0) {
3606 		uint64_t gpa;
3607 		int error;
3608 
3609 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3610 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3611 		if (error || *fault)
3612 			return (error);
3613 		off = gpa & PAGEOFFSET;
3614 		n = min(remaining, PAGESIZE - off);
3615 		copyinfo[nused].gpa = gpa;
3616 		copyinfo[nused].len = n;
3617 		remaining -= n;
3618 		gla += n;
3619 		nused++;
3620 	}
3621 
3622 	for (idx = 0; idx < nused; idx++) {
3623 		vm_page_t *vmp;
3624 		caddr_t hva;
3625 
3626 		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
3627 		if (vmp == NULL) {
3628 			break;
3629 		}
3630 		if ((prot & PROT_WRITE) != 0) {
3631 			hva = (caddr_t)vmp_get_writable(vmp);
3632 		} else {
3633 			hva = (caddr_t)vmp_get_readable(vmp);
3634 		}
3635 		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
3636 		copyinfo[idx].cookie = vmp;
3637 		copyinfo[idx].prot = prot;
3638 	}
3639 
3640 	if (idx != nused) {
3641 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3642 		return (EFAULT);
3643 	} else {
3644 		*fault = 0;
3645 		return (0);
3646 	}
3647 }
3648 
3649 void
3650 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3651     size_t len)
3652 {
3653 	char *dst;
3654 	int idx;
3655 
3656 	dst = kaddr;
3657 	idx = 0;
3658 	while (len > 0) {
3659 		ASSERT(copyinfo[idx].prot & PROT_READ);
3660 
3661 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3662 		len -= copyinfo[idx].len;
3663 		dst += copyinfo[idx].len;
3664 		idx++;
3665 	}
3666 }
3667 
3668 void
3669 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3670     struct vm_copyinfo *copyinfo, size_t len)
3671 {
3672 	const char *src;
3673 	int idx;
3674 
3675 	src = kaddr;
3676 	idx = 0;
3677 	while (len > 0) {
3678 		ASSERT(copyinfo[idx].prot & PROT_WRITE);
3679 
3680 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3681 		len -= copyinfo[idx].len;
3682 		src += copyinfo[idx].len;
3683 		idx++;
3684 	}
3685 }
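
/*
 * For illustration only: a typical calling pattern for the copy helpers
 * above, reading `len` bytes from a guest linear address into a kernel
 * buffer.  This is a sketch with a hypothetical name and minimal error
 * handling; the copyinfo array is sized for a transfer which crosses at most
 * one page boundary.
 */
static inline int
example_copy_from_guest(struct vm *vm, int vcpuid,
    struct vm_guest_paging *paging, uint64_t gla, void *buf, size_t len)
{
	struct vm_copyinfo copyinfo[2];
	int error, fault;

	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error != 0 || fault != 0)
		return (error);

	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}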
3686 
3687 /*
3688  * Return the amount of in-use and wired memory for the VM. Since
3689  * these are global stats, only return the values for vCPU 0.
3690  */
3691 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3692 
3693 static void
3694 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3695 {
3696 	if (vcpu == 0) {
3697 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3698 		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3699 	}
3700 }
3701 
3702 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3703 
3704 int
3705 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3706     uint8_t bytes, uint32_t *val)
3707 {
3708 	return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3709 }
3710 
3711 /*
3712  * bhyve-internal interfaces to attach or detach IO port handlers.
3713  * Must be called with VM write lock held for safety.
3714  */
3715 int
3716 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3717     void **cookie)
3718 {
3719 	int err;
3720 	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3721 	if (err == 0) {
3722 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3723 	}
3724 	return (err);
3725 }
3726 int
3727 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3728     void **old_arg)
3729 {
3730 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3731 	int err;
3732 
3733 	err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3734 	if (err == 0) {
3735 		*cookie = NULL;
3736 	}
3737 	return (err);
3738 }
3739 
3740 /*
3741  * External driver interfaces to attach or detach IO port handlers.
3742  * Must be called with VM write lock held for safety.
3743  */
3744 int
3745 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3746     void *arg, void **cookie)
3747 {
3748 	int err;
3749 
3750 	if (port == 0) {
3751 		return (EINVAL);
3752 	}
3753 
3754 	err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3755 	if (err == 0) {
3756 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3757 	}
3758 	return (err);
3759 }
3760 void
3761 vm_ioport_unhook(struct vm *vm, void **cookie)
3762 {
3763 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3764 	ioport_handler_t old_func;
3765 	void *old_arg;
3766 	int err;
3767 
3768 	err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3769 
3770 	/* ioport-hook-using drivers are expected to be well-behaved */
3771 	VERIFY0(err);
3772 	VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3773 
3774 	*cookie = NULL;
3775 }
3776 
3777 int
3778 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3779 {
3780 	struct vm *vm = ksp->ks_private;
3781 	vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3782 	const int vcpuid = vvk->vvk_vcpu.value.ui32;
3783 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3784 
3785 	ASSERT3U(vcpuid, <, VM_MAXCPU);
3786 
3787 	vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3788 	vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3789 	vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3790 	vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3791 	vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3792 	vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3793 
3794 	return (0);
3795 }
3796 
3797 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t);
3798 
3799 static inline bool
3800 vmm_data_is_cpu_specific(uint16_t data_class)
3801 {
3802 	switch (data_class) {
3803 	case VDC_REGISTER:
3804 	case VDC_MSR:
3805 	case VDC_FPU:
3806 	case VDC_LAPIC:
3807 		return (true);
3808 	default:
3809 		return (false);
3810 	}
3811 }
3812 
3813 static int
3814 vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp)
3815 {
3816 	const vmm_data_version_entry_t **vdpp, *vdp;
3817 
3818 	ASSERT(resp != NULL);
3819 	ASSERT(req->vdr_result_len != NULL);
3820 
3821 	SET_FOREACH(vdpp, vmm_data_version_entries) {
3822 		vdp = *vdpp;
3823 		if (vdp->vdve_class == req->vdr_class &&
3824 		    vdp->vdve_version == req->vdr_version) {
3825 			/*
3826 			 * Enforce any data length expectation expressed by the
3827 			 * provider for this data.
3828 			 */
3829 			if (vdp->vdve_len_expect != 0 &&
3830 			    vdp->vdve_len_expect > req->vdr_len) {
3831 				*req->vdr_result_len = vdp->vdve_len_expect;
3832 				return (ENOSPC);
3833 			}
3834 			*resp = vdp;
3835 			return (0);
3836 		}
3837 	}
3838 	return (EINVAL);
3839 }
3840 
3841 static void *
3842 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid)
3843 {
3844 	switch (req->vdr_class) {
3845 		/* per-cpu data/devices */
3846 	case VDC_LAPIC:
3847 		return (vm_lapic(vm, vcpuid));
3848 	case VDC_VMM_ARCH:
3849 		return (vm);
3850 
3851 	case VDC_FPU:
3852 	case VDC_REGISTER:
3853 	case VDC_MSR:
3854 		/*
3855 		 * These have per-CPU handling which is dispatched outside
3856 		 * the vmm_data_version_entries listing.
3857 		 */
3858 		return (NULL);
3859 
3860 		/* system-wide data/devices */
3861 	case VDC_IOAPIC:
3862 		return (vm->vioapic);
3863 	case VDC_ATPIT:
3864 		return (vm->vatpit);
3865 	case VDC_ATPIC:
3866 		return (vm->vatpic);
3867 	case VDC_HPET:
3868 		return (vm->vhpet);
3869 	case VDC_PM_TIMER:
3870 		return (vm->vpmtmr);
3871 	case VDC_RTC:
3872 		return (vm->vrtc);
3873 
3874 	default:
3875 		/* The data class will have been validated by now */
3876 		panic("Unexpected class %u", req->vdr_class);
3877 	}
3878 }
3879 
3880 const uint32_t arch_msr_iter[] = {
3881 	MSR_EFER,
3882 
3883 	/*
3884 	 * While gsbase and fsbase are accessible via the MSR accessors, they
3885 	 * are not included in MSR iteration since they are covered by the
3886 	 * segment descriptor interface too.
3887 	 */
3888 	MSR_KGSBASE,
3889 
3890 	MSR_STAR,
3891 	MSR_LSTAR,
3892 	MSR_CSTAR,
3893 	MSR_SF_MASK,
3894 
3895 	MSR_SYSENTER_CS_MSR,
3896 	MSR_SYSENTER_ESP_MSR,
3897 	MSR_SYSENTER_EIP_MSR,
3898 	MSR_PAT,
3899 };
3900 const uint32_t generic_msr_iter[] = {
3901 	MSR_TSC,
3902 	MSR_MTRRcap,
3903 	MSR_MTRRdefType,
3904 
3905 	MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2,
3906 	MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5,
3907 	MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7,
3908 
3909 	MSR_MTRR16kBase, MSR_MTRR16kBase + 1,
3910 
3911 	MSR_MTRR64kBase,
3912 };
3913 
3914 static int
3915 vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
3916 {
3917 	VERIFY3U(req->vdr_class, ==, VDC_MSR);
3918 	VERIFY3U(req->vdr_version, ==, 1);
3919 
3920 	const uint_t num_msrs = nitems(arch_msr_iter) + nitems(generic_msr_iter)
3921 	    + (VMM_MTRR_VAR_MAX * 2);
3922 	const uint32_t output_len =
3923 	    num_msrs * sizeof (struct vdi_field_entry_v1);
3924 	*req->vdr_result_len = output_len;
3925 
3926 	if (req->vdr_len < output_len) {
3927 		return (ENOSPC);
3928 	}
3929 
3930 	struct vdi_field_entry_v1 *entryp = req->vdr_data;
3931 	for (uint_t i = 0; i < nitems(arch_msr_iter); i++, entryp++) {
3932 		const uint32_t msr = arch_msr_iter[i];
3933 		uint64_t val = 0;
3934 
3935 		int err = ops->vmgetmsr(vm->cookie, vcpuid, msr, &val);
3936 		/* All of these MSRs are expected to work */
3937 		VERIFY0(err);
3938 		entryp->vfe_ident = msr;
3939 		entryp->vfe_value = val;
3940 	}
3941 
3942 	struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr;
3943 	for (uint_t i = 0; i < nitems(generic_msr_iter); i++, entryp++) {
3944 		const uint32_t msr = generic_msr_iter[i];
3945 
3946 		entryp->vfe_ident = msr;
3947 		switch (msr) {
3948 		case MSR_TSC:
3949 			/*
3950 			 * Communicate this as the difference from the VM-wide
3951 			 * offset of the boot time.
3952 			 */
3953 			entryp->vfe_value = vm->vcpu[vcpuid].tsc_offset;
3954 			break;
3955 		case MSR_MTRRcap:
3956 		case MSR_MTRRdefType:
3957 		case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
3958 		case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
3959 		case MSR_MTRR64kBase: {
3960 			int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value);
3961 			VERIFY0(err);
3962 			break;
3963 		}
3964 		default:
3965 			panic("unexpected msr export %x", msr);
3966 		}
3967 	}
3968 	/* Copy the variable MTRRs */
3969 	for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) {
3970 		const uint32_t msr = MSR_MTRRVarBase + i;
3971 
3972 		entryp->vfe_ident = msr;
3973 		int err = vm_rdmtrr(mtrr, msr, &entryp->vfe_value);
3974 		VERIFY0(err);
3975 	}
3976 	return (0);
3977 }
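
/*
 * For illustration only: the sizing contract followed above (and by the other
 * readers in this file).  The required payload size is always reported back
 * through vdr_result_len, and data is copied out only when the caller
 * supplied enough room; otherwise ENOSPC tells the caller to retry with a
 * larger buffer.  The helper name is hypothetical.
 */
static inline int
example_sized_read(const vmm_data_req_t *req, const void *src, uint32_t len)
{
	*req->vdr_result_len = len;
	if (req->vdr_len < len) {
		return (ENOSPC);
	}
	bcopy(src, req->vdr_data, len);
	return (0);
}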
3978 
3979 static int
3980 vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
3981 {
3982 	VERIFY3U(req->vdr_class, ==, VDC_MSR);
3983 	VERIFY3U(req->vdr_version, ==, 1);
3984 
3985 	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
3986 	const uint_t entry_count =
3987 	    req->vdr_len / sizeof (struct vdi_field_entry_v1);
3988 	struct vm_mtrr *mtrr = &vm->vcpu[vcpuid].mtrr;
3989 
3990 	/*
3991 	 * First make sure that all of the MSRs can be manipulated.
3992 	 * For now, this check is done by going through the getmsr handler.
3993 	 */
3994 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
3995 		const uint32_t msr = entryp->vfe_ident;
3996 		uint64_t val;
3997 		int err = 0;
3998 
3999 		switch (msr) {
4000 		case MSR_TSC:
4001 			break;
4002 		default:
4003 			if (is_mtrr_msr(msr)) {
4004 				err = vm_rdmtrr(mtrr, msr, &val);
4005 			} else {
4006 				err = ops->vmgetmsr(vm->cookie, vcpuid, msr,
4007 				    &val);
4008 			}
4009 			break;
4010 		}
4011 		if (err != 0) {
4012 			return (err);
4013 		}
4014 	}
4015 
4016 	/*
4017 	 * Fairly confident that all of the 'set' operations are at least
4018 	 * targeting valid MSRs, continue on.
4019 	 */
4020 	entryp = req->vdr_data;
4021 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
4022 		const uint32_t msr = entryp->vfe_ident;
4023 		const uint64_t val = entryp->vfe_value;
4024 		int err = 0;
4025 
4026 		switch (msr) {
4027 		case MSR_TSC:
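			/*
			 * Same per-vCPU offset representation as produced by
			 * the MSR_TSC export in vmm_data_read_msrs().
			 */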
4028 			vm->vcpu[vcpuid].tsc_offset = entryp->vfe_value;
4029 			break;
4030 		default:
4031 			if (is_mtrr_msr(msr)) {
4032 				if (msr == MSR_MTRRcap) {
4033 					/*
4034 					 * MTRRcap is read-only.  If the current
4035 					 * value matches the incoming one,
4036 					 * consider it a success.
4037 					 */
4038 					uint64_t comp;
4039 					err = vm_rdmtrr(mtrr, msr, &comp);
4040 					if (err != 0 || comp != val) {
4041 						err = EINVAL;
4042 					}
4043 				} else {
4044 					err = vm_wrmtrr(mtrr, msr, val);
4045 				}
4046 			} else {
4047 				err = ops->vmsetmsr(vm->cookie, vcpuid, msr,
4048 				    val);
4049 			}
4050 			break;
4051 		}
4052 		if (err != 0) {
4053 			return (err);
4054 		}
4055 	}
4056 	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
4057 
4058 	return (0);
4059 }
4060 
4061 static const vmm_data_version_entry_t msr_v1 = {
4062 	.vdve_class = VDC_MSR,
4063 	.vdve_version = 1,
4064 	.vdve_len_per_item = sizeof (struct vdi_field_entry_v1),
4065 	/* Requires backend-specific dispatch */
4066 	.vdve_readf = NULL,
4067 	.vdve_writef = NULL,
4068 };
4069 VMM_DATA_VERSION(msr_v1);
4070 
4071 static const uint32_t vmm_arch_v1_fields[] = {
4072 	VAI_TSC_BOOT_OFFSET,
4073 	VAI_BOOT_HRTIME,
4074 	VAI_TSC_FREQ,
4075 };
4076 
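/*
 * Look up a single VDC_VMM_ARCH field, returning false if the identifier is
 * not recognized.
 */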
4077 static bool
4078 vmm_read_arch_field(struct vm *vm, uint32_t ident, uint64_t *valp)
4079 {
4080 	ASSERT(valp != NULL);
4081 
4082 	switch (ident) {
4083 	case VAI_TSC_BOOT_OFFSET:
4084 		*valp = vm->boot_tsc_offset;
4085 		return (true);
4086 	case VAI_BOOT_HRTIME:
4087 		*valp = vm->boot_hrtime;
4088 		return (true);
4089 	case VAI_TSC_FREQ:
4090 		/*
4091 		 * The calibrated system TSC frequency is not exposed directly,
4092 		 * so derive it from the available scaling functions.
4093 		 */
4094 		*valp = unscalehrtime(NANOSEC);
4095 		return (true);
4096 	default:
4097 		break;
4098 	}
4099 	return (false);
4100 }
4101 
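/*
 * Read VDC_VMM_ARCH fields.  When VDX_FLAG_READ_COPYIN is set, only the
 * identifiers provided by the caller are returned; otherwise every known
 * field is emitted.
 */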
4102 static int
4103 vmm_data_read_vmm_arch(void *arg, const vmm_data_req_t *req)
4104 {
4105 	struct vm *vm = arg;
4106 
4107 	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
4108 	VERIFY3U(req->vdr_version, ==, 1);
4109 
4110 	struct vdi_field_entry_v1 *entryp = req->vdr_data;
4111 
4112 	/* Specific fields requested */
4113 	if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) {
4114 		const uint_t count =
4115 		    req->vdr_len / sizeof (struct vdi_field_entry_v1);
4116 
4117 		for (uint_t i = 0; i < count; i++, entryp++) {
4118 			if (!vmm_read_arch_field(vm, entryp->vfe_ident,
4119 			    &entryp->vfe_value)) {
4120 				return (EINVAL);
4121 			}
4122 		}
4123 		*req->vdr_result_len =
4124 		    count * sizeof (struct vdi_field_entry_v1);
4125 		return (0);
4126 	}
4127 
4128 	/* Emit all of the possible values */
4129 	const uint32_t total_size = nitems(vmm_arch_v1_fields) *
4130 	    sizeof (struct vdi_field_entry_v1);
4131 	*req->vdr_result_len = total_size;
4132 	if (req->vdr_len < total_size) {
4133 		return (ENOSPC);
4134 	}
4135 	for (uint_t i = 0; i < nitems(vmm_arch_v1_fields); i++, entryp++) {
4136 		entryp->vfe_ident = vmm_arch_v1_fields[i];
4137 		VERIFY(vmm_read_arch_field(vm, entryp->vfe_ident,
4138 		    &entryp->vfe_value));
4139 	}
4140 	return (0);
4141 }
4142 
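/*
 * Write VDC_VMM_ARCH fields.  Only the boot-time TSC offset and boot hrtime
 * may be updated; the guest TSC frequency is presently read-only.
 */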
4143 static int
4144 vmm_data_write_vmm_arch(void *arg, const vmm_data_req_t *req)
4145 {
4146 	struct vm *vm = arg;
4147 
4148 	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
4149 	VERIFY3U(req->vdr_version, ==, 1);
4150 
4151 	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
4152 	const uint_t entry_count =
4153 	    req->vdr_len / sizeof (struct vdi_field_entry_v1);
4154 
4155 	for (uint_t i = 0; i < entry_count; i++, entryp++) {
4156 		const uint64_t val = entryp->vfe_value;
4157 
4158 		switch (entryp->vfe_ident) {
4159 		case VAI_TSC_BOOT_OFFSET:
4160 			vm->boot_tsc_offset = val;
4161 			break;
4162 		case VAI_BOOT_HRTIME:
4163 			vm->boot_hrtime = val;
4164 			break;
4165 		case VAI_TSC_FREQ:
4166 			/* Guest TSC frequency not (currently) adjustable */
4167 			return (EPERM);
4168 		default:
4169 			return (EINVAL);
4170 		}
4171 	}
4172 	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
4173 	return (0);
4174 }
4175 
4176 static const vmm_data_version_entry_t vmm_arch_v1 = {
4177 	.vdve_class = VDC_VMM_ARCH,
4178 	.vdve_version = 1,
4179 	.vdve_len_per_item = sizeof (struct vdi_field_entry_v1),
4180 	.vdve_readf = vmm_data_read_vmm_arch,
4181 	.vdve_writef = vmm_data_write_vmm_arch,
4182 };
4183 VMM_DATA_VERSION(vmm_arch_v1);
4184 
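/*
 * Emit one vdi_version_entry_v1 record for each handler registered via
 * VMM_DATA_VERSION(), describing its class, version, and payload sizing.
 */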
4185 static int
4186 vmm_data_read_versions(void *arg, const vmm_data_req_t *req)
4187 {
4188 	VERIFY3U(req->vdr_class, ==, VDC_VERSION);
4189 	VERIFY3U(req->vdr_version, ==, 1);
4190 
4191 	const uint32_t total_size = SET_COUNT(vmm_data_version_entries) *
4192 	    sizeof (struct vdi_version_entry_v1);
4193 
4194 	/* Make sure there is room for all of the entries */
4195 	*req->vdr_result_len = total_size;
4196 	if (req->vdr_len < *req->vdr_result_len) {
4197 		return (ENOSPC);
4198 	}
4199 
4200 	struct vdi_version_entry_v1 *entryp = req->vdr_data;
4201 	const vmm_data_version_entry_t **vdpp;
4202 	SET_FOREACH(vdpp, vmm_data_version_entries) {
4203 		const vmm_data_version_entry_t *vdp = *vdpp;
4204 
4205 		entryp->vve_class = vdp->vdve_class;
4206 		entryp->vve_version = vdp->vdve_version;
4207 		entryp->vve_len_expect = vdp->vdve_len_expect;
4208 		entryp->vve_len_per_item = vdp->vdve_len_per_item;
4209 		entryp++;
4210 	}
4211 	return (0);
4212 }
4213 
4214 static int
4215 vmm_data_write_versions(void *arg, const vmm_data_req_t *req)
4216 {
4217 	/* Writing to the version information makes no sense */
4218 	/* The version information is read-only; writes are not permitted. */
4219 }
4220 
4221 static const vmm_data_version_entry_t versions_v1 = {
4222 	.vdve_class = VDC_VERSION,
4223 	.vdve_version = 1,
4224 	.vdve_len_per_item = sizeof (struct vdi_version_entry_v1),
4225 	.vdve_readf = vmm_data_read_versions,
4226 	.vdve_writef = vmm_data_write_versions,
4227 };
4228 VMM_DATA_VERSION(versions_v1);
4229 
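/*
 * Read a vmm-data payload.  The class/version in the request selects a
 * registered handler; classes without a common read routine (such as VDC_MSR)
 * are dispatched explicitly below.
 */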
4230 int
4231 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4232 {
4233 	int err = 0;
4234 
4235 	if (vmm_data_is_cpu_specific(req->vdr_class)) {
4236 		if (vcpuid >= VM_MAXCPU) {
4237 			return (EINVAL);
4238 		}
4239 	}
4240 
4241 	const vmm_data_version_entry_t *entry = NULL;
4242 	err = vmm_data_find(req, &entry);
4243 	if (err != 0) {
4244 		return (err);
4245 	}
4246 	ASSERT(entry != NULL);
4247 
4248 	void *datap = vmm_data_from_class(req, vm, vcpuid);
4249 	if (datap != NULL) {
4250 		err = entry->vdve_readf(datap, req);
4251 
4252 		/*
4253 		 * Successful reads of fixed-length data should populate the
4254 		 * length of that result.
4255 		 */
4256 		if (err == 0 && entry->vdve_len_expect != 0) {
4257 			*req->vdr_result_len = entry->vdve_len_expect;
4258 		}
4259 	} else {
4260 		switch (req->vdr_class) {
4261 		case VDC_MSR:
4262 			err = vmm_data_read_msrs(vm, vcpuid, req);
4263 			break;
4264 		case VDC_FPU:
4265 			/* TODO: wire up to xsave export via hma_fpu iface */
4266 			err = EINVAL;
4267 			break;
4268 		case VDC_REGISTER:
4269 		default:
4270 			err = EINVAL;
4271 			break;
4272 		}
4273 	}
4274 
4275 	return (err);
4276 }
4277 
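/*
 * Write a vmm-data payload.  The class/version in the request selects a
 * registered handler; classes without a common write routine (such as
 * VDC_MSR) are dispatched explicitly below.
 */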
4278 int
4279 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
4280 {
4281 	int err = 0;
4282 
4283 	if (vmm_data_is_cpu_specific(req->vdr_class)) {
4284 		if (vcpuid >= VM_MAXCPU) {
4285 			return (EINVAL);
4286 		}
4287 	}
4288 
4289 	const vmm_data_version_entry_t *entry = NULL;
4290 	err = vmm_data_find(req, &entry);
4291 	if (err != 0) {
4292 		return (err);
4293 	}
4294 	ASSERT(entry != NULL);
4295 
4296 	void *datap = vmm_data_from_class(req, vm, vcpuid);
4297 	if (datap != NULL) {
4298 		err = entry->vdve_writef(datap, req);
4299 		/*
4300 		 * Successful writes of fixed-length data should populate the
4301 		 * length of that result.
4302 		 */
4303 		if (err == 0 && entry->vdve_len_expect != 0) {
4304 			*req->vdr_result_len = entry->vdve_len_expect;
4305 		}
4306 	} else {
4307 		switch (req->vdr_class) {
4308 		case VDC_MSR:
4309 			err = vmm_data_write_msrs(vm, vcpuid, req);
4310 			break;
4311 		case VDC_FPU:
4312 			/* TODO: wire up to xsave import via hma_fpu iface */
4313 			err = EINVAL;
4314 			break;
4315 		case VDC_REGISTER:
4316 		default:
4317 			err = EINVAL;
4318 			break;
4319 		}
4320 	}
4321 
4322 	return (err);
4323 }
4324