xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm.c (revision 29621f011bb9ea4f2d9ea887e971bbb3910ee931)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 /*
31  * This file and its contents are supplied under the terms of the
32  * Common Development and Distribution License ("CDDL"), version 1.0.
33  * You may only use this file in accordance with the terms of version
34  * 1.0 of the CDDL.
35  *
36  * A full copy of the text of the CDDL should have accompanied this
37  * source.  A copy of the CDDL is also available via the Internet at
38  * http://www.illumos.org/license/CDDL.
39  *
40  * Copyright 2015 Pluribus Networks Inc.
41  * Copyright 2018 Joyent, Inc.
42  * Copyright 2022 Oxide Computer Company
43  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
44  */
45 
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/module.h>
53 #include <sys/sysctl.h>
54 #include <sys/kmem.h>
55 #include <sys/pcpu.h>
56 #include <sys/mutex.h>
57 #include <sys/proc.h>
58 #include <sys/rwlock.h>
59 #include <sys/sched.h>
60 #include <sys/systm.h>
61 #include <sys/sunddi.h>
62 #include <sys/hma.h>
63 
64 #include <machine/md_var.h>
65 #include <x86/psl.h>
66 #include <x86/apicreg.h>
67 
68 #include <machine/specialreg.h>
69 #include <machine/vmm.h>
70 #include <machine/vmm_dev.h>
71 #include <machine/vmparam.h>
72 #include <sys/vmm_instruction_emul.h>
73 #include <sys/vmm_vm.h>
74 #include <sys/vmm_gpt.h>
75 #include <sys/vmm_data.h>
76 
77 #include "vmm_ioport.h"
78 #include "vmm_host.h"
79 #include "vmm_util.h"
80 #include "vatpic.h"
81 #include "vatpit.h"
82 #include "vhpet.h"
83 #include "vioapic.h"
84 #include "vlapic.h"
85 #include "vpmtmr.h"
86 #include "vrtc.h"
87 #include "vmm_stat.h"
88 #include "vmm_lapic.h"
89 
90 #include "io/ppt.h"
91 #include "io/iommu.h"
92 
93 struct vlapic;
94 
95 /* Flags for vtc_status */
96 #define	VTCS_FPU_RESTORED	1 /* guest FPU restored, host FPU saved */
97 #define	VTCS_FPU_CTX_CRITICAL	2 /* in ctx where FPU restore cannot be lazy */
98 
99 typedef struct vm_thread_ctx {
100 	struct vm	*vtc_vm;
101 	int		vtc_vcpuid;
102 	uint_t		vtc_status;
103 	enum vcpu_ustate vtc_ustate;
104 } vm_thread_ctx_t;
105 
106 #define	VMM_MTRR_VAR_MAX 10
107 #define	VMM_MTRR_DEF_MASK \
108 	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
109 #define	VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
110 #define	VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)
111 struct vm_mtrr {
112 	uint64_t def_type;
113 	uint64_t fixed4k[8];
114 	uint64_t fixed16k[2];
115 	uint64_t fixed64k;
116 	struct {
117 		uint64_t base;
118 		uint64_t mask;
119 	} var[VMM_MTRR_VAR_MAX];
120 };
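
/*
 * This layout mirrors the architectural MTRR register file as it is emulated
 * for the guest: one 64KB fixed-range MSR, two 16KB fixed-range MSRs, eight
 * 4KB fixed-range MSRs, and up to VMM_MTRR_VAR_MAX variable-range base/mask
 * pairs.  Guest reads and writes of these MSRs are serviced by vm_rdmtrr()
 * and vm_wrmtrr() below.
 */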
121 
122 /*
123  * Initialization:
124  * (a) allocated when vcpu is created
125  * (i) initialized when vcpu is created and when it is reinitialized
126  * (o) initialized the first time the vcpu is created
127  * (x) initialized before use
128  */
129 struct vcpu {
130 	/* (o) protects state, run_state, hostcpu, sipi_vector */
131 	kmutex_t	lock;
132 
133 	enum vcpu_state	state;		/* (o) vcpu state */
134 	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
135 	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
136 	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
137 	int		hostcpu;	/* (o) vcpu's current host cpu */
138 	int		lastloccpu;	/* (o) last host cpu localized to */
139 	int		reqidle;	/* (i) request vcpu to idle */
140 	struct vlapic	*vlapic;	/* (i) APIC device model */
141 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
142 	uint64_t	exit_intinfo;	/* (i) events pending at VM exit */
143 	uint64_t	exc_pending;	/* (i) exception pending */
144 	bool		nmi_pending;	/* (i) NMI pending */
145 	bool		extint_pending;	/* (i) INTR pending */
146 
147 	uint8_t		sipi_vector;	/* (i) SIPI vector */
148 	hma_fpu_t	*guestfpu;	/* (a,i) guest fpu state */
149 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
150 	void		*stats;		/* (a,i) statistics */
151 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
152 	uint64_t	nextrip;	/* (x) next instruction to execute */
153 	struct vie	*vie_ctx;	/* (x) instruction emulation context */
154 	vm_client_t	*vmclient;	/* (a) VM-system client */
155 	uint64_t	tsc_offset;	/* (x) offset from host TSC */
156 	struct vm_mtrr	mtrr;		/* (i) vcpu's MTRR */
157 
158 	enum vcpu_ustate ustate;	/* (i) microstate for the vcpu */
159 	hrtime_t	ustate_when;	/* (i) time of last ustate change */
160 	uint64_t ustate_total[VU_MAX];	/* (o) total time spent in ustates */
161 	vm_thread_ctx_t	vtc;		/* (o) thread state for ctxops */
162 	struct ctxop	*ctxop;		/* (o) ctxop storage for vcpu */
163 };
164 
165 #define	vcpu_lock(v)		mutex_enter(&((v)->lock))
166 #define	vcpu_unlock(v)		mutex_exit(&((v)->lock))
167 #define	vcpu_assert_locked(v)	ASSERT(MUTEX_HELD(&((v)->lock)))
168 
169 struct mem_seg {
170 	size_t	len;
171 	bool	sysmem;
172 	vm_object_t *object;
173 };
174 #define	VM_MAX_MEMSEGS	5
175 
176 struct mem_map {
177 	vm_paddr_t	gpa;
178 	size_t		len;
179 	vm_ooffset_t	segoff;
180 	int		segid;
181 	int		prot;
182 	int		flags;
183 };
184 #define	VM_MAX_MEMMAPS	8
185 
186 /*
187  * Initialization:
188  * (o) initialized the first time the VM is created
189  * (i) initialized when VM is created and when it is reinitialized
190  * (x) initialized before use
191  */
192 struct vm {
193 	void		*cookie;		/* (i) cpu-specific data */
194 	void		*iommu;			/* (x) iommu-specific data */
195 	struct vhpet	*vhpet;			/* (i) virtual HPET */
196 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
197 	struct vatpic	*vatpic;		/* (i) virtual atpic */
198 	struct vatpit	*vatpit;		/* (i) virtual atpit */
199 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
200 	struct vrtc	*vrtc;			/* (o) virtual RTC */
201 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
202 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for dbg */
203 	int		suspend;		/* (i) stop VM execution */
204 	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
205 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
206 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
207 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
208 	struct vmspace	*vmspace;		/* (o) guest's address space */
209 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
210 	/* The following describe the vm cpu topology */
211 	uint16_t	sockets;		/* (o) num of sockets */
212 	uint16_t	cores;			/* (o) num of cores/socket */
213 	uint16_t	threads;		/* (o) num of threads/core */
214 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
215 
216 	uint64_t	boot_tsc_offset;	/* (i) TSC offset at VM boot */
217 	hrtime_t	boot_hrtime;		/* (i) hrtime at VM boot */
218 
219 	struct ioport_config ioports;		/* (o) ioport handling */
220 
221 	bool		mem_transient;		/* (o) alloc transient memory */
222 };
223 
224 static int vmm_initialized;
225 
226 
227 static void
228 nullop_panic(void)
229 {
230 	panic("null vmm operation call");
231 }
232 
233 /* Do not allow use of an un-set `ops` to do anything but panic */
234 static struct vmm_ops vmm_ops_null = {
235 	.init		= (vmm_init_func_t)nullop_panic,
236 	.cleanup	= (vmm_cleanup_func_t)nullop_panic,
237 	.resume		= (vmm_resume_func_t)nullop_panic,
238 	.vminit		= (vmi_init_func_t)nullop_panic,
239 	.vmrun		= (vmi_run_func_t)nullop_panic,
240 	.vmcleanup	= (vmi_cleanup_func_t)nullop_panic,
241 	.vmgetreg	= (vmi_get_register_t)nullop_panic,
242 	.vmsetreg	= (vmi_set_register_t)nullop_panic,
243 	.vmgetdesc	= (vmi_get_desc_t)nullop_panic,
244 	.vmsetdesc	= (vmi_set_desc_t)nullop_panic,
245 	.vmgetcap	= (vmi_get_cap_t)nullop_panic,
246 	.vmsetcap	= (vmi_set_cap_t)nullop_panic,
247 	.vlapic_init	= (vmi_vlapic_init)nullop_panic,
248 	.vlapic_cleanup	= (vmi_vlapic_cleanup)nullop_panic,
249 	.vmsavectx	= (vmi_savectx)nullop_panic,
250 	.vmrestorectx	= (vmi_restorectx)nullop_panic,
251 };
252 
253 static struct vmm_ops *ops = &vmm_ops_null;
254 static vmm_pte_ops_t *pte_ops = NULL;
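
/*
 * `ops` begins pointing at the panic-only stubs above (and `pte_ops` at
 * NULL); both are switched to the Intel (VMX/EPT) or AMD (SVM/RVI) backends
 * in vmm_init() once the host CPU vendor has been identified.  Any dispatch
 * through the macros below before that point is a programming error and
 * panics immediately rather than wandering through uninitialized state.
 */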
255 
256 #define	VMM_INIT()			((*ops->init)())
257 #define	VMM_CLEANUP()			((*ops->cleanup)())
258 #define	VMM_RESUME()			((*ops->resume)())
259 
260 #define	VMINIT(vm)		((*ops->vminit)(vm))
261 #define	VMRUN(vmi, vcpu, rip)	((*ops->vmrun)(vmi, vcpu, rip))
262 #define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))
263 
264 #define	VMGETREG(vmi, vcpu, num, rv)	((*ops->vmgetreg)(vmi, vcpu, num, rv))
265 #define	VMSETREG(vmi, vcpu, num, val)	((*ops->vmsetreg)(vmi, vcpu, num, val))
266 #define	VMGETDESC(vmi, vcpu, num, dsc)	((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
267 #define	VMSETDESC(vmi, vcpu, num, dsc)	((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
268 #define	VMGETCAP(vmi, vcpu, num, rv)	((*ops->vmgetcap)(vmi, vcpu, num, rv))
269 #define	VMSETCAP(vmi, vcpu, num, val)	((*ops->vmsetcap)(vmi, vcpu, num, val))
270 #define	VLAPIC_INIT(vmi, vcpu)		((*ops->vlapic_init)(vmi, vcpu))
271 #define	VLAPIC_CLEANUP(vmi, vlapic)	((*ops->vlapic_cleanup)(vmi, vlapic))
272 
273 #define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
274 #define	fpu_stop_emulating()	clts()
275 
276 SDT_PROVIDER_DEFINE(vmm);
277 
278 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
279     NULL);
280 
281 /*
282  * Halt the guest if all vcpus are executing a HLT instruction with
283  * interrupts disabled.
284  */
285 static int halt_detection_enabled = 1;
286 
287 /* Trap into hypervisor on all guest exceptions and reflect them back */
288 static int trace_guest_exceptions;
289 
290 static void vm_free_memmap(struct vm *vm, int ident);
291 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
292 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
293 static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
294 static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
295 
296 static void vmm_savectx(void *);
297 static void vmm_restorectx(void *);
298 static const struct ctxop_template vmm_ctxop_tpl = {
299 	.ct_rev		= CTXOP_TPL_REV,
300 	.ct_save	= vmm_savectx,
301 	.ct_restore	= vmm_restorectx,
302 };
303 
304 #ifdef KTR
305 static const char *
306 vcpu_state2str(enum vcpu_state state)
307 {
308 
309 	switch (state) {
310 	case VCPU_IDLE:
311 		return ("idle");
312 	case VCPU_FROZEN:
313 		return ("frozen");
314 	case VCPU_RUNNING:
315 		return ("running");
316 	case VCPU_SLEEPING:
317 		return ("sleeping");
318 	default:
319 		return ("unknown");
320 	}
321 }
322 #endif
323 
324 static void
325 vcpu_cleanup(struct vm *vm, int i, bool destroy)
326 {
327 	struct vcpu *vcpu = &vm->vcpu[i];
328 
329 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
330 	if (destroy) {
331 		vmm_stat_free(vcpu->stats);
332 
333 		hma_fpu_free(vcpu->guestfpu);
334 		vcpu->guestfpu = NULL;
335 
336 		vie_free(vcpu->vie_ctx);
337 		vcpu->vie_ctx = NULL;
338 
339 		vmc_destroy(vcpu->vmclient);
340 		vcpu->vmclient = NULL;
341 
342 		ctxop_free(vcpu->ctxop);
343 		mutex_destroy(&vcpu->lock);
344 	}
345 }
346 
347 static void
348 vcpu_init(struct vm *vm, int vcpu_id, bool create)
349 {
350 	struct vcpu *vcpu;
351 
352 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
353 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
354 
355 	vcpu = &vm->vcpu[vcpu_id];
356 
357 	if (create) {
358 		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);
359 
360 		vcpu->state = VCPU_IDLE;
361 		vcpu->hostcpu = NOCPU;
362 		vcpu->lastloccpu = NOCPU;
363 		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
364 		vcpu->stats = vmm_stat_alloc();
365 		vcpu->vie_ctx = vie_alloc();
366 
367 		vcpu->ustate = VU_INIT;
368 		vcpu->ustate_when = gethrtime();
369 
370 		vcpu->vtc.vtc_vm = vm;
371 		vcpu->vtc.vtc_vcpuid = vcpu_id;
372 		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
373 	} else {
374 		vie_reset(vcpu->vie_ctx);
375 		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
376 		if (vcpu->ustate != VU_INIT) {
377 			vcpu_ustate_change(vm, vcpu_id, VU_INIT);
378 		}
379 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
380 	}
381 
382 	vcpu->run_state = VRS_HALT;
383 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
384 	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
385 	vcpu->reqidle = 0;
386 	vcpu->exit_intinfo = 0;
387 	vcpu->nmi_pending = false;
388 	vcpu->extint_pending = false;
389 	vcpu->exc_pending = 0;
390 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
391 	(void) hma_fpu_init(vcpu->guestfpu);
392 	vmm_stat_init(vcpu->stats);
393 	vcpu->tsc_offset = 0;
394 }
395 
396 int
397 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
398 {
399 
400 	return (trace_guest_exceptions);
401 }
402 
403 struct vm_exit *
404 vm_exitinfo(struct vm *vm, int cpuid)
405 {
406 	struct vcpu *vcpu;
407 
408 	if (cpuid < 0 || cpuid >= vm->maxcpus)
409 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
410 
411 	vcpu = &vm->vcpu[cpuid];
412 
413 	return (&vcpu->exitinfo);
414 }
415 
416 struct vie *
417 vm_vie_ctx(struct vm *vm, int cpuid)
418 {
419 	if (cpuid < 0 || cpuid >= vm->maxcpus)
420 		panic("vm_vie_ctx: invalid cpuid %d", cpuid);
421 
422 	return (vm->vcpu[cpuid].vie_ctx);
423 }
424 
425 static int
426 vmm_init(void)
427 {
428 	vmm_host_state_init();
429 
430 	if (vmm_is_intel()) {
431 		ops = &vmm_ops_intel;
432 		pte_ops = &ept_pte_ops;
433 	} else if (vmm_is_svm()) {
434 		ops = &vmm_ops_amd;
435 		pte_ops = &rvi_pte_ops;
436 	} else {
437 		return (ENXIO);
438 	}
439 
440 	return (VMM_INIT());
441 }
442 
443 int
444 vmm_mod_load()
445 {
446 	int	error;
447 
448 	VERIFY(vmm_initialized == 0);
449 
450 	error = vmm_init();
451 	if (error == 0)
452 		vmm_initialized = 1;
453 
454 	return (error);
455 }
456 
457 int
458 vmm_mod_unload()
459 {
460 	int	error;
461 
462 	VERIFY(vmm_initialized == 1);
463 
464 	error = VMM_CLEANUP();
465 	if (error)
466 		return (error);
467 	vmm_initialized = 0;
468 
469 	return (0);
470 }
471 
472 /*
473  * Create a test IOMMU domain to see if the host system has the necessary
474  * hardware and drivers to do so.
475  */
476 bool
477 vmm_check_iommu(void)
478 {
479 	void *domain;
480 	const size_t arb_test_sz = (1UL << 32);
481 
482 	domain = iommu_create_domain(arb_test_sz);
483 	if (domain == NULL) {
484 		return (false);
485 	}
486 	iommu_destroy_domain(domain);
487 	return (true);
488 }
489 
490 static void
491 vm_init(struct vm *vm, bool create)
492 {
493 	int i;
494 
495 	vm->cookie = VMINIT(vm);
496 	vm->iommu = NULL;
497 	vm->vioapic = vioapic_init(vm);
498 	vm->vhpet = vhpet_init(vm);
499 	vm->vatpic = vatpic_init(vm);
500 	vm->vatpit = vatpit_init(vm);
501 	vm->vpmtmr = vpmtmr_init(vm);
502 	if (create)
503 		vm->vrtc = vrtc_init(vm);
504 
505 	vm_inout_init(vm, &vm->ioports);
506 
507 	CPU_ZERO(&vm->active_cpus);
508 	CPU_ZERO(&vm->debug_cpus);
509 
510 	vm->suspend = 0;
511 	CPU_ZERO(&vm->suspended_cpus);
512 
513 	for (i = 0; i < vm->maxcpus; i++)
514 		vcpu_init(vm, i, create);
515 
516 	/*
517 	 * Configure the VM-wide TSC offset so that the call to vm_init()
518 	 * represents the boot time (when the TSC(s) read 0).  Each vCPU will
519 	 * have its own offset from this, which is altered if/when the guest
520 	 * writes to MSR_TSC.
521 	 *
522 	 * The TSC offsetting math is all unsigned, using overflow for negative
523 	 * offsets.  A reading of the TSC is negated to form the boot offset.
524 	 */
525 	const uint64_t boot_tsc = rdtsc_offset();
526 	vm->boot_tsc_offset = (uint64_t)(-(int64_t)boot_tsc);
527 
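
	/*
	 * For example, if the host TSC reads 1000000 here, boot_tsc_offset
	 * becomes 2^64 - 1000000.  A guest TSC formed as
	 * (host TSC + boot_tsc_offset) then wraps to (host TSC - 1000000):
	 * it reads 0 at boot and counts up from there.
	 */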
528 	/* Convert the boot TSC reading to hrtime */
529 	vm->boot_hrtime = (hrtime_t)boot_tsc;
530 	scalehrtime(&vm->boot_hrtime);
531 }
532 
533 /*
534  * The default CPU topology is a single thread per package.
535  */
536 uint_t cores_per_package = 1;
537 uint_t threads_per_core = 1;
538 
539 /*
540  * Debugging tunable to enable dirty-page-tracking.
541  * (Remains off by default for now)
542  */
543 bool gpt_track_dirty = false;
544 
545 int
546 vm_create(uint64_t flags, struct vm **retvm)
547 {
548 	struct vm *vm;
549 	struct vmspace *vmspace;
550 
551 	/*
552 	 * If the vmm module could not be successfully initialized then don't
553 	 * attempt to create the virtual machine.
554 	 */
555 	if (!vmm_initialized)
556 		return (ENXIO);
557 
558 	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS, pte_ops, gpt_track_dirty);
559 	if (vmspace == NULL)
560 		return (ENOMEM);
561 
562 	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);
563 
564 	vm->vmspace = vmspace;
565 	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
566 	for (uint_t i = 0; i < VM_MAXCPU; i++) {
567 		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
568 	}
569 
570 	vm->sockets = 1;
571 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
572 	vm->threads = threads_per_core;	/* XXX backwards compatibility */
573 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
574 
575 	vm_init(vm, true);
576 
577 	*retvm = vm;
578 	return (0);
579 }
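
/*
 * An illustrative creation sequence, of the sort driven by the vmm device
 * ioctl path, might look like:
 *
 *	struct vm *vm;
 *
 *	if (vm_create(VCF_RESERVOIR_MEM, &vm) == 0) {
 *		... configure memory, topology, and vcpus, then run ...
 *		vm_destroy(vm);
 *	}
 *
 * Passing 0 for the flags instead requests transient (non-reservoir) backing
 * for guest memory, per the mem_transient handling above.
 */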
580 
581 void
582 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
583     uint16_t *threads, uint16_t *maxcpus)
584 {
585 	*sockets = vm->sockets;
586 	*cores = vm->cores;
587 	*threads = vm->threads;
588 	*maxcpus = vm->maxcpus;
589 }
590 
591 uint16_t
592 vm_get_maxcpus(struct vm *vm)
593 {
594 	return (vm->maxcpus);
595 }
596 
597 int
598 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
599     uint16_t threads, uint16_t maxcpus)
600 {
601 	if (maxcpus != 0)
602 		return (EINVAL);	/* XXX remove when supported */
603 	if ((sockets * cores * threads) > vm->maxcpus)
604 		return (EINVAL);
605 	/* XXX need to check sockets * cores * threads == vCPU, how? */
606 	vm->sockets = sockets;
607 	vm->cores = cores;
608 	vm->threads = threads;
609 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
610 	return (0);
611 }
612 
613 static void
614 vm_cleanup(struct vm *vm, bool destroy)
615 {
616 	struct mem_map *mm;
617 	int i;
618 
619 	ppt_unassign_all(vm);
620 
621 	if (vm->iommu != NULL)
622 		iommu_destroy_domain(vm->iommu);
623 
624 	/*
625 	 * Devices which attach their own ioport hooks should be cleaned up
626 	 * first so they can tear down those registrations.
627 	 */
628 	vpmtmr_cleanup(vm->vpmtmr);
629 
630 	vm_inout_cleanup(vm, &vm->ioports);
631 
632 	if (destroy)
633 		vrtc_cleanup(vm->vrtc);
634 	else
635 		vrtc_reset(vm->vrtc);
636 
637 	vatpit_cleanup(vm->vatpit);
638 	vhpet_cleanup(vm->vhpet);
639 	vatpic_cleanup(vm->vatpic);
640 	vioapic_cleanup(vm->vioapic);
641 
642 	for (i = 0; i < vm->maxcpus; i++)
643 		vcpu_cleanup(vm, i, destroy);
644 
645 	VMCLEANUP(vm->cookie);
646 
647 	/*
648 	 * System memory is removed from the guest address space only when
649 	 * the VM is destroyed. This is because the mapping remains the same
650 	 * across VM reset.
651 	 *
652 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
653 	 * so those mappings are removed on a VM reset.
654 	 */
655 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
656 		mm = &vm->mem_maps[i];
657 		if (destroy || !sysmem_mapping(vm, mm)) {
658 			vm_free_memmap(vm, i);
659 		} else {
660 			/*
661 			 * We need to reset the IOMMU flag so this mapping can
662 			 * be reused when a VM is rebooted. Since the IOMMU
663 			 * domain has already been destroyed we can just reset
664 			 * the flag here.
665 			 */
666 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
667 		}
668 	}
669 
670 	if (destroy) {
671 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
672 			vm_free_memseg(vm, i);
673 
674 		vmspace_destroy(vm->vmspace);
675 		vm->vmspace = NULL;
676 	}
677 }
678 
679 void
680 vm_destroy(struct vm *vm)
681 {
682 	vm_cleanup(vm, true);
683 	kmem_free(vm, sizeof (*vm));
684 }
685 
686 int
687 vm_reinit(struct vm *vm, uint64_t flags)
688 {
689 	/* A virtual machine can be reset only if all vcpus are suspended. */
690 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) != 0) {
691 		if ((flags & VM_REINIT_F_FORCE_SUSPEND) == 0) {
692 			return (EBUSY);
693 		}
694 
695 		/*
696 		 * Force the VM (and all its vCPUs) into a suspended state.
697 		 * This should be quick and easy, since the vm_reinit() call is
698 		 * made while holding the VM write lock, which requires holding
699 		 * all of the vCPUs in the VCPU_FROZEN state.
700 		 */
701 		(void) atomic_cmpset_int((uint_t *)&vm->suspend, 0,
702 		    VM_SUSPEND_RESET);
703 		for (uint_t i = 0; i < vm->maxcpus; i++) {
704 			struct vcpu *vcpu = &vm->vcpu[i];
705 
706 			if (CPU_ISSET(i, &vm->suspended_cpus) ||
707 			    !CPU_ISSET(i, &vm->active_cpus)) {
708 				continue;
709 			}
710 
711 			vcpu_lock(vcpu);
712 			VERIFY3U(vcpu->state, ==, VCPU_FROZEN);
713 			CPU_SET_ATOMIC(i, &vm->suspended_cpus);
714 			vcpu_unlock(vcpu);
715 		}
716 
717 		VERIFY0(CPU_CMP(&vm->suspended_cpus, &vm->active_cpus));
718 	}
719 
720 	vm_cleanup(vm, false);
721 	vm_init(vm, false);
722 	return (0);
723 }
724 
725 int
726 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
727 {
728 	vm_object_t *obj;
729 
730 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
731 		return (ENOMEM);
732 	else
733 		return (0);
734 }
735 
736 int
737 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
738 {
739 	return (vmspace_unmap(vm->vmspace, gpa, gpa + len));
740 }
741 
742 /*
743  * Return 'true' if 'gpa' is allocated in the guest address space.
744  *
745  * This function is called in the context of a running vcpu which acts as
746  * an implicit lock on 'vm->mem_maps[]'.
747  */
748 bool
749 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
750 {
751 	struct mem_map *mm;
752 	int i;
753 
754 #ifdef INVARIANTS
755 	int hostcpu, state;
756 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
757 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
758 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
759 #endif
760 
761 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
762 		mm = &vm->mem_maps[i];
763 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
764 			return (true);		/* 'gpa' is sysmem or devmem */
765 	}
766 
767 	if (ppt_is_mmio(vm, gpa))
768 		return (true);			/* 'gpa' is pci passthru mmio */
769 
770 	return (false);
771 }
772 
773 int
774 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
775 {
776 	struct mem_seg *seg;
777 	vm_object_t *obj;
778 
779 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
780 		return (EINVAL);
781 
782 	if (len == 0 || (len & PAGE_MASK))
783 		return (EINVAL);
784 
785 	seg = &vm->mem_segs[ident];
786 	if (seg->object != NULL) {
787 		if (seg->len == len && seg->sysmem == sysmem)
788 			return (EEXIST);
789 		else
790 			return (EINVAL);
791 	}
792 
793 	obj = vm_object_mem_allocate(len, vm->mem_transient);
794 	if (obj == NULL)
795 		return (ENOMEM);
796 
797 	seg->len = len;
798 	seg->object = obj;
799 	seg->sysmem = sysmem;
800 	return (0);
801 }
802 
803 int
804 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
805     vm_object_t **objptr)
806 {
807 	struct mem_seg *seg;
808 
809 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
810 		return (EINVAL);
811 
812 	seg = &vm->mem_segs[ident];
813 	if (len)
814 		*len = seg->len;
815 	if (sysmem)
816 		*sysmem = seg->sysmem;
817 	if (objptr)
818 		*objptr = seg->object;
819 	return (0);
820 }
821 
822 void
823 vm_free_memseg(struct vm *vm, int ident)
824 {
825 	struct mem_seg *seg;
826 
827 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
828 	    ("%s: invalid memseg ident %d", __func__, ident));
829 
830 	seg = &vm->mem_segs[ident];
831 	if (seg->object != NULL) {
832 		vm_object_release(seg->object);
833 		bzero(seg, sizeof (struct mem_seg));
834 	}
835 }
836 
837 int
838 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
839     size_t len, int prot, int flags)
840 {
841 	struct mem_seg *seg;
842 	struct mem_map *m, *map;
843 	vm_ooffset_t last;
844 	int i, error;
845 
846 	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
847 		return (EINVAL);
848 
849 	if (flags & ~VM_MEMMAP_F_WIRED)
850 		return (EINVAL);
851 
852 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
853 		return (EINVAL);
854 
855 	seg = &vm->mem_segs[segid];
856 	if (seg->object == NULL)
857 		return (EINVAL);
858 
859 	last = first + len;
860 	if (first < 0 || first >= last || last > seg->len)
861 		return (EINVAL);
862 
863 	if ((gpa | first | last) & PAGE_MASK)
864 		return (EINVAL);
865 
866 	map = NULL;
867 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
868 		m = &vm->mem_maps[i];
869 		if (m->len == 0) {
870 			map = m;
871 			break;
872 		}
873 	}
874 
875 	if (map == NULL)
876 		return (ENOSPC);
877 
878 	error = vmspace_map(vm->vmspace, seg->object, first, gpa, len, prot);
879 	if (error != 0)
880 		return (EFAULT);
881 
882 	vm_object_reference(seg->object);
883 
884 	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
885 		error = vmspace_populate(vm->vmspace, gpa, gpa + len);
886 		if (error != 0) {
887 			VERIFY0(vmspace_unmap(vm->vmspace, gpa, gpa + len));
888 			return (EFAULT);
889 		}
890 	}
891 
892 	map->gpa = gpa;
893 	map->len = len;
894 	map->segoff = first;
895 	map->segid = segid;
896 	map->prot = prot;
897 	map->flags = flags;
898 	return (0);
899 }
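
/*
 * Together with vm_alloc_memseg(), this implements the two-step model for
 * giving a guest its memory: a backing segment is allocated first, then some
 * or all of it is mapped into the guest physical address space.  An
 * illustrative sketch for a wired 1 GiB region at GPA 0 (using segment
 * ident 0, chosen arbitrarily for the example):
 *
 *	const size_t sz = 1024UL * 1024 * 1024;
 *
 *	if (vm_alloc_memseg(vm, 0, sz, true) == 0) {
 *		(void) vm_mmap_memseg(vm, 0, 0, 0, sz, PROT_ALL,
 *		    VM_MEMMAP_F_WIRED);
 *	}
 */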
900 
901 int
902 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
903 {
904 	struct mem_map *m;
905 	int i;
906 
907 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
908 		m = &vm->mem_maps[i];
909 		if (m->gpa == gpa && m->len == len &&
910 		    (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
911 			vm_free_memmap(vm, i);
912 			return (0);
913 		}
914 	}
915 
916 	return (EINVAL);
917 }
918 
919 int
920 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
921     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
922 {
923 	struct mem_map *mm, *mmnext;
924 	int i;
925 
926 	mmnext = NULL;
927 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
928 		mm = &vm->mem_maps[i];
929 		if (mm->len == 0 || mm->gpa < *gpa)
930 			continue;
931 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
932 			mmnext = mm;
933 	}
934 
935 	if (mmnext != NULL) {
936 		*gpa = mmnext->gpa;
937 		if (segid)
938 			*segid = mmnext->segid;
939 		if (segoff)
940 			*segoff = mmnext->segoff;
941 		if (len)
942 			*len = mmnext->len;
943 		if (prot)
944 			*prot = mmnext->prot;
945 		if (flags)
946 			*flags = mmnext->flags;
947 		return (0);
948 	} else {
949 		return (ENOENT);
950 	}
951 }
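
/*
 * Note that this returns the mapping with the lowest guest-physical address
 * that is >= *gpa.  A caller can therefore enumerate every mapping by
 * starting with *gpa = 0 and, after each successful call, advancing *gpa
 * past the returned mapping (gpa + len) before calling again; ENOENT marks
 * the end of the walk.
 */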
952 
953 static void
954 vm_free_memmap(struct vm *vm, int ident)
955 {
956 	struct mem_map *mm;
957 	int error;
958 
959 	mm = &vm->mem_maps[ident];
960 	if (mm->len) {
961 		error = vmspace_unmap(vm->vmspace, mm->gpa,
962 		    mm->gpa + mm->len);
963 		KASSERT(error == 0, ("%s: vmspace_unmap error %d",
964 		    __func__, error));
965 		bzero(mm, sizeof (struct mem_map));
966 	}
967 }
968 
969 static __inline bool
970 sysmem_mapping(struct vm *vm, struct mem_map *mm)
971 {
972 
973 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
974 		return (true);
975 	else
976 		return (false);
977 }
978 
979 vm_paddr_t
980 vmm_sysmem_maxaddr(struct vm *vm)
981 {
982 	struct mem_map *mm;
983 	vm_paddr_t maxaddr;
984 	int i;
985 
986 	maxaddr = 0;
987 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
988 		mm = &vm->mem_maps[i];
989 		if (sysmem_mapping(vm, mm)) {
990 			if (maxaddr < mm->gpa + mm->len)
991 				maxaddr = mm->gpa + mm->len;
992 		}
993 	}
994 	return (maxaddr);
995 }
996 
997 static void
998 vm_iommu_modify(struct vm *vm, bool map)
999 {
1000 	int i, sz;
1001 	vm_paddr_t gpa, hpa;
1002 	struct mem_map *mm;
1003 	vm_client_t *vmc;
1004 
1005 	sz = PAGE_SIZE;
1006 	vmc = vmspace_client_alloc(vm->vmspace);
1007 
1008 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
1009 		mm = &vm->mem_maps[i];
1010 		if (!sysmem_mapping(vm, mm))
1011 			continue;
1012 
1013 		if (map) {
1014 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
1015 			    ("iommu map found invalid memmap %lx/%lx/%x",
1016 			    mm->gpa, mm->len, mm->flags));
1017 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
1018 				continue;
1019 			mm->flags |= VM_MEMMAP_F_IOMMU;
1020 		} else {
1021 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
1022 				continue;
1023 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
1024 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
1025 			    ("iommu unmap found invalid memmap %lx/%lx/%x",
1026 			    mm->gpa, mm->len, mm->flags));
1027 		}
1028 
1029 		gpa = mm->gpa;
1030 		while (gpa < mm->gpa + mm->len) {
1031 			vm_page_t *vmp;
1032 
1033 			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
1034 			ASSERT(vmp != NULL);
1035 			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
1036 			(void) vmp_release(vmp);
1037 
1038 			/*
1039 			 * When originally ported from FreeBSD, the logic for
1040 			 * adding memory to the guest domain would
1041 			 * simultaneously remove it from the host domain.  The
1042 			 * justification for that is not clear, and FreeBSD has
1043 			 * subsequently changed the behavior to not remove the
1044 			 * memory from the host domain.
1045 			 *
1046 			 * Leaving the guest memory in the host domain for the
1047 			 * life of the VM is necessary to make it available for
1048 			 * DMA, such as through viona in the TX path.
1049 			 */
1050 			if (map) {
1051 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
1052 			} else {
1053 				iommu_remove_mapping(vm->iommu, gpa, sz);
1054 			}
1055 
1056 			gpa += PAGE_SIZE;
1057 		}
1058 	}
1059 	vmc_destroy(vmc);
1060 
1061 	/*
1062 	 * Invalidate the cached translations associated with the domain
1063 	 * from which pages were removed.
1064 	 */
1065 	iommu_invalidate_tlb(vm->iommu);
1066 }
1067 
1068 int
1069 vm_unassign_pptdev(struct vm *vm, int pptfd)
1070 {
1071 	int error;
1072 
1073 	error = ppt_unassign_device(vm, pptfd);
1074 	if (error)
1075 		return (error);
1076 
1077 	if (ppt_assigned_devices(vm) == 0)
1078 		vm_iommu_modify(vm, false);
1079 
1080 	return (0);
1081 }
1082 
1083 int
1084 vm_assign_pptdev(struct vm *vm, int pptfd)
1085 {
1086 	int error;
1087 	vm_paddr_t maxaddr;
1088 
1089 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
1090 	if (ppt_assigned_devices(vm) == 0) {
1091 		KASSERT(vm->iommu == NULL,
1092 		    ("vm_assign_pptdev: iommu must be NULL"));
1093 		maxaddr = vmm_sysmem_maxaddr(vm);
1094 		vm->iommu = iommu_create_domain(maxaddr);
1095 		if (vm->iommu == NULL)
1096 			return (ENXIO);
1097 		vm_iommu_modify(vm, true);
1098 	}
1099 
1100 	error = ppt_assign_device(vm, pptfd);
1101 	return (error);
1102 }
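
/*
 * The IOMMU domain for a VM is thus created lazily, on the first passthru
 * device assignment: that assignment builds the domain and installs the
 * wired sysmem mappings via vm_iommu_modify().  vm_unassign_pptdev() removes
 * those mappings once the last device goes away, while the domain itself is
 * not destroyed until vm_cleanup().
 */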
1103 
1104 int
1105 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1106 {
1107 
1108 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1109 		return (EINVAL);
1110 
1111 	if (reg >= VM_REG_LAST)
1112 		return (EINVAL);
1113 
1114 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
1115 }
1116 
1117 int
1118 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1119 {
1120 	struct vcpu *vcpu;
1121 	int error;
1122 
1123 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1124 		return (EINVAL);
1125 
1126 	if (reg >= VM_REG_LAST)
1127 		return (EINVAL);
1128 
1129 	error = VMSETREG(vm->cookie, vcpuid, reg, val);
1130 	if (error || reg != VM_REG_GUEST_RIP)
1131 		return (error);
1132 
1133 	/* Set 'nextrip' to match the value of %rip */
1134 	vcpu = &vm->vcpu[vcpuid];
1135 	vcpu->nextrip = val;
1136 	return (0);
1137 }
1138 
1139 static bool
1140 is_descriptor_table(int reg)
1141 {
1142 	switch (reg) {
1143 	case VM_REG_GUEST_IDTR:
1144 	case VM_REG_GUEST_GDTR:
1145 		return (true);
1146 	default:
1147 		return (false);
1148 	}
1149 }
1150 
1151 static bool
1152 is_segment_register(int reg)
1153 {
1154 	switch (reg) {
1155 	case VM_REG_GUEST_ES:
1156 	case VM_REG_GUEST_CS:
1157 	case VM_REG_GUEST_SS:
1158 	case VM_REG_GUEST_DS:
1159 	case VM_REG_GUEST_FS:
1160 	case VM_REG_GUEST_GS:
1161 	case VM_REG_GUEST_TR:
1162 	case VM_REG_GUEST_LDTR:
1163 		return (true);
1164 	default:
1165 		return (false);
1166 	}
1167 }
1168 
1169 int
1170 vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
1171 {
1172 
1173 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1174 		return (EINVAL);
1175 
1176 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1177 		return (EINVAL);
1178 
1179 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1180 }
1181 
1182 int
1183 vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
1184 {
1185 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1186 		return (EINVAL);
1187 
1188 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1189 		return (EINVAL);
1190 
1191 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1192 }
1193 
1194 static int
1195 translate_hma_xsave_result(hma_fpu_xsave_result_t res)
1196 {
1197 	switch (res) {
1198 	case HFXR_OK:
1199 		return (0);
1200 	case HFXR_NO_SPACE:
1201 		return (ENOSPC);
1202 	case HFXR_BAD_ALIGN:
1203 	case HFXR_UNSUP_FMT:
1204 	case HFXR_UNSUP_FEAT:
1205 	case HFXR_INVALID_DATA:
1206 		return (EINVAL);
1207 	default:
1208 		panic("unexpected xsave result");
1209 	}
1210 }
1211 
1212 int
1213 vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1214 {
1215 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1216 		return (EINVAL);
1217 
1218 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1219 	hma_fpu_xsave_result_t res;
1220 
1221 	res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len);
1222 	return (translate_hma_xsave_result(res));
1223 }
1224 
1225 int
1226 vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
1227 {
1228 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1229 		return (EINVAL);
1230 
1231 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1232 	hma_fpu_xsave_result_t res;
1233 
1234 	res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len);
1235 	return (translate_hma_xsave_result(res));
1236 }
1237 
1238 int
1239 vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
1240 {
1241 	struct vcpu *vcpu;
1242 
1243 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1244 		return (EINVAL);
1245 	}
1246 
1247 	vcpu = &vm->vcpu[vcpuid];
1248 
1249 	vcpu_lock(vcpu);
1250 	*state = vcpu->run_state;
1251 	*sipi_vec = vcpu->sipi_vector;
1252 	vcpu_unlock(vcpu);
1253 
1254 	return (0);
1255 }
1256 
1257 int
1258 vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
1259 {
1260 	struct vcpu *vcpu;
1261 
1262 	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
1263 		return (EINVAL);
1264 	}
1265 	if (!VRS_IS_VALID(state)) {
1266 		return (EINVAL);
1267 	}
1268 
1269 	vcpu = &vm->vcpu[vcpuid];
1270 
1271 	vcpu_lock(vcpu);
1272 	vcpu->run_state = state;
1273 	vcpu->sipi_vector = sipi_vec;
1274 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1275 	vcpu_unlock(vcpu);
1276 
1277 	return (0);
1278 }
1279 
1280 void
1281 vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
1282 {
1283 	vmspace_t *vms = vm_get_vmspace(vm);
1284 	vmspace_track_dirty(vms, gpa, len, bitmap);
1285 }
1286 
1287 static void
1288 restore_guest_fpustate(struct vcpu *vcpu)
1289 {
1290 	/* Save host FPU and restore guest FPU */
1291 	fpu_stop_emulating();
1292 	hma_fpu_start_guest(vcpu->guestfpu);
1293 
1294 	/* restore guest XCR0 if XSAVE is enabled in the host */
1295 	if (rcr4() & CR4_XSAVE)
1296 		load_xcr(0, vcpu->guest_xcr0);
1297 
1298 	/*
1299 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1300 	 * to trap any access to the FPU by the host.
1301 	 */
1302 	fpu_start_emulating();
1303 }
1304 
1305 static void
1306 save_guest_fpustate(struct vcpu *vcpu)
1307 {
1308 
1309 	if ((rcr0() & CR0_TS) == 0)
1310 		panic("fpu emulation not enabled in host!");
1311 
1312 	/* save guest XCR0 and restore host XCR0 */
1313 	if (rcr4() & CR4_XSAVE) {
1314 		vcpu->guest_xcr0 = rxcr(0);
1315 		load_xcr(0, vmm_get_host_xcr0());
1316 	}
1317 
1318 	/* save guest FPU and restore host FPU */
1319 	fpu_stop_emulating();
1320 	hma_fpu_stop_guest(vcpu->guestfpu);
1321 	/*
1322 	 * When the host state has been restored, we should not re-enable
1323 	 * CR0.TS on illumos for eager FPU.
1324 	 */
1325 }
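
/*
 * The CR0.TS manipulation in the two functions above is what keeps host and
 * guest FPU state from mixing: while the guest's state is loaded, TS is set
 * so any stray host FPU use traps with #NM rather than silently clobbering
 * guest registers.  The VTCS_FPU_RESTORED flag in the per-thread context
 * records which direction the state is currently swapped, allowing the
 * vmm_savectx()/vmm_restorectx() ctxops to swap it back and forth as the
 * vcpu thread is scheduled off of and onto host CPUs.
 */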
1326 
1327 static int
1328 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1329     bool from_idle)
1330 {
1331 	struct vcpu *vcpu;
1332 	int error;
1333 
1334 	vcpu = &vm->vcpu[vcpuid];
1335 	vcpu_assert_locked(vcpu);
1336 
1337 	/*
1338 	 * State transitions from the vmmdev_ioctl() must always begin from
1339 	 * the VCPU_IDLE state. This guarantees that there is only a single
1340 	 * ioctl() operating on a vcpu at any point.
1341 	 */
1342 	if (from_idle) {
1343 		while (vcpu->state != VCPU_IDLE) {
1344 			vcpu->reqidle = 1;
1345 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
1346 			cv_wait(&vcpu->state_cv, &vcpu->lock);
1347 		}
1348 	} else {
1349 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1350 		    "vcpu idle state"));
1351 	}
1352 
1353 	if (vcpu->state == VCPU_RUNNING) {
1354 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1355 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1356 	} else {
1357 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1358 		    "vcpu that is not running", vcpu->hostcpu));
1359 	}
1360 
1361 	/*
1362 	 * The following state transitions are allowed:
1363 	 * IDLE -> FROZEN -> IDLE
1364 	 * FROZEN -> RUNNING -> FROZEN
1365 	 * FROZEN -> SLEEPING -> FROZEN
1366 	 */
1367 	switch (vcpu->state) {
1368 	case VCPU_IDLE:
1369 	case VCPU_RUNNING:
1370 	case VCPU_SLEEPING:
1371 		error = (newstate != VCPU_FROZEN);
1372 		break;
1373 	case VCPU_FROZEN:
1374 		error = (newstate == VCPU_FROZEN);
1375 		break;
1376 	default:
1377 		error = 1;
1378 		break;
1379 	}
1380 
1381 	if (error)
1382 		return (EBUSY);
1383 
1384 	vcpu->state = newstate;
1385 	if (newstate == VCPU_RUNNING)
1386 		vcpu->hostcpu = curcpu;
1387 	else
1388 		vcpu->hostcpu = NOCPU;
1389 
1390 	if (newstate == VCPU_IDLE) {
1391 		cv_broadcast(&vcpu->state_cv);
1392 	}
1393 
1394 	return (0);
1395 }
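
/*
 * As an illustration of the rules above: a single VM_RUN ioctl takes a vcpu
 * through IDLE -> FROZEN on entry, FROZEN -> RUNNING when entering the
 * guest, RUNNING -> FROZEN on VM exit, and FROZEN -> IDLE when the ioctl
 * returns to userspace.  A HLT-induced sleep substitutes
 * FROZEN -> SLEEPING -> FROZEN for the RUNNING leg.
 */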
1396 
1397 static void
1398 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1399 {
1400 	int error;
1401 
1402 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1403 		panic("Error %d setting state to %d\n", error, newstate);
1404 }
1405 
1406 static void
1407 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1408 {
1409 	int error;
1410 
1411 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1412 		panic("Error %d setting state to %d", error, newstate);
1413 }
1414 
1415 /*
1416  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1417  */
1418 static int
1419 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
1420 {
1421 	struct vcpu *vcpu;
1422 	int vcpu_halted, vm_halted;
1423 	bool userspace_exit = false;
1424 
1425 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1426 
1427 	vcpu = &vm->vcpu[vcpuid];
1428 	vcpu_halted = 0;
1429 	vm_halted = 0;
1430 
1431 	vcpu_lock(vcpu);
1432 	while (1) {
1433 		/*
1434 		 * Do a final check for pending interrupts (including NMI and
1435 		 * INIT) before putting this thread to sleep.
1436 		 */
1437 		if (vm_nmi_pending(vm, vcpuid))
1438 			break;
1439 		if (vcpu_run_state_pending(vm, vcpuid))
1440 			break;
1441 		if (!intr_disabled) {
1442 			if (vm_extint_pending(vm, vcpuid) ||
1443 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1444 				break;
1445 			}
1446 		}
1447 
1448 		/*
1449 		 * Also check for software events which would cause a wake-up.
1450 		 * This will set the appropriate exitcode directly, rather than
1451 		 * requiring a trip through VM_RUN().
1452 		 */
1453 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1454 			userspace_exit = true;
1455 			break;
1456 		}
1457 
1458 		/*
1459 		 * Some Linux guests implement "halt" by having all vcpus
1460 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1461 		 * track of the vcpus that have entered this state. When all
1462 		 * vcpus enter the halted state the virtual machine is halted.
1463 		 */
1464 		if (intr_disabled) {
1465 			if (!vcpu_halted && halt_detection_enabled) {
1466 				vcpu_halted = 1;
1467 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1468 			}
1469 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1470 				vm_halted = 1;
1471 				break;
1472 			}
1473 		}
1474 
1475 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1476 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1477 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1478 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1479 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1480 	}
1481 
1482 	if (vcpu_halted)
1483 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1484 
1485 	vcpu_unlock(vcpu);
1486 
1487 	if (vm_halted) {
1488 		(void) vm_suspend(vm, VM_SUSPEND_HALT);
1489 	}
1490 
1491 	return (userspace_exit ? -1 : 0);
1492 }
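
/*
 * As with the other vm_handle_*() helpers, the return convention here is
 * that 0 means the exit was handled entirely in-kernel and the vcpu may
 * re-enter the guest, while -1 requests a trip out to userspace with the
 * exit details recorded in the vcpu's vm_exit.
 */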
1493 
1494 static int
1495 vm_handle_paging(struct vm *vm, int vcpuid)
1496 {
1497 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1498 	vm_client_t *vmc = vcpu->vmclient;
1499 	struct vm_exit *vme = &vcpu->exitinfo;
1500 	int rv, ftype;
1501 
1502 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1503 	    __func__, vme->inst_length));
1504 
1505 	ftype = vme->u.paging.fault_type;
1506 	KASSERT(ftype == PROT_READ ||
1507 	    ftype == PROT_WRITE || ftype == PROT_EXEC,
1508 	    ("vm_handle_paging: invalid fault_type %d", ftype));
1509 
1510 	rv = vmc_fault(vmc, vme->u.paging.gpa, ftype);
1511 
1512 	if (rv != 0)
1513 		return (EFAULT);
1514 	return (0);
1515 }
1516 
1517 int
1518 vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
1519     int rsize)
1520 {
1521 	int err = ESRCH;
1522 
1523 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1524 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1525 
1526 		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
1527 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1528 		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
1529 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1530 		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
1531 	}
1532 
1533 	return (err);
1534 }
1535 
1536 int
1537 vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
1538     int wsize)
1539 {
1540 	int err = ESRCH;
1541 
1542 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1543 		struct vlapic *vlapic = vm_lapic(vm, cpuid);
1544 
1545 		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
1546 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1547 		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
1548 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1549 		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
1550 	}
1551 
1552 	return (err);
1553 }
1554 
1555 static int
1556 vm_handle_mmio_emul(struct vm *vm, int vcpuid)
1557 {
1558 	struct vie *vie;
1559 	struct vcpu *vcpu;
1560 	struct vm_exit *vme;
1561 	uint64_t inst_addr;
1562 	int error, fault, cs_d;
1563 
1564 	vcpu = &vm->vcpu[vcpuid];
1565 	vme = &vcpu->exitinfo;
1566 	vie = vcpu->vie_ctx;
1567 
1568 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1569 	    __func__, vme->inst_length));
1570 
1571 	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
1572 	cs_d = vme->u.mmio_emul.cs_d;
1573 
1574 	/* Fetch the faulting instruction */
1575 	if (vie_needs_fetch(vie)) {
1576 		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
1577 		    &fault);
1578 		if (error != 0) {
1579 			return (error);
1580 		} else if (fault) {
1581 			/*
1582 			 * If a fault during instruction fetch was encountered,
1583 			 * it will have asserted that the appropriate exception
1584 			 * be injected at next entry.
1585 			 * No further work is required.
1586 			 */
1587 			return (0);
1588 		}
1589 	}
1590 
1591 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1592 		/* Dump (unrecognized) instruction bytes in userspace */
1593 		vie_fallback_exitinfo(vie, vme);
1594 		return (-1);
1595 	}
1596 	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
1597 	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
1598 		/* Decoded GLA does not match GLA from VM exit state */
1599 		vie_fallback_exitinfo(vie, vme);
1600 		return (-1);
1601 	}
1602 
1603 repeat:
1604 	error = vie_emulate_mmio(vie, vm, vcpuid);
1605 	if (error < 0) {
1606 		/*
1607 		 * MMIO not handled by any of the in-kernel-emulated devices, so
1608 		 * make a trip out to userspace for it.
1609 		 */
1610 		vie_exitinfo(vie, vme);
1611 	} else if (error == EAGAIN) {
1612 		/*
1613 		 * Continue emulating the rep-prefixed instruction, which has
1614 		 * not completed its iterations.
1615 		 *
1616 		 * In case this can be emulated in-kernel and has a high
1617 		 * repetition count (causing a tight spin), it should be
1618 		 * deferential to yield conditions.
1619 		 */
1620 		if (!vcpu_should_yield(vm, vcpuid)) {
1621 			goto repeat;
1622 		} else {
1623 			/*
1624 			 * Defer to the contending load by making a trip to
1625 			 * userspace with a no-op (BOGUS) exit reason.
1626 			 */
1627 			vie_reset(vie);
1628 			vme->exitcode = VM_EXITCODE_BOGUS;
1629 			return (-1);
1630 		}
1631 	} else if (error == 0) {
1632 		/* Update %rip now that instruction has been emulated */
1633 		vie_advance_pc(vie, &vcpu->nextrip);
1634 	}
1635 	return (error);
1636 }
1637 
1638 static int
1639 vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
1640 {
1641 	struct vcpu *vcpu;
1642 	struct vie *vie;
1643 	int err;
1644 
1645 	vcpu = &vm->vcpu[vcpuid];
1646 	vie = vcpu->vie_ctx;
1647 
1648 repeat:
1649 	err = vie_emulate_inout(vie, vm, vcpuid);
1650 
1651 	if (err < 0) {
1652 		/*
1653 		 * In/out not handled by any of the in-kernel-emulated devices,
1654 		 * so make a trip out to userspace for it.
1655 		 */
1656 		vie_exitinfo(vie, vme);
1657 		return (err);
1658 	} else if (err == EAGAIN) {
1659 		/*
1660 		 * Continue emulating the rep-prefixed ins/outs, which have not
1661 		 * completed their iterations.
1662 		 *
1663 		 * In case this can be emulated in-kernel and has a high
1664 		 * repetition count (causing a tight spin), it should be
1665 		 * deferential to yield conditions.
1666 		 */
1667 		if (!vcpu_should_yield(vm, vcpuid)) {
1668 			goto repeat;
1669 		} else {
1670 			/*
1671 			 * Defer to the contending load by making a trip to
1672 			 * userspace with a no-op (BOGUS) exit reason.
1673 			 */
1674 			vie_reset(vie);
1675 			vme->exitcode = VM_EXITCODE_BOGUS;
1676 			return (-1);
1677 		}
1678 	} else if (err != 0) {
1679 		/* Emulation failure.  Bail all the way out to userspace. */
1680 		vme->exitcode = VM_EXITCODE_INST_EMUL;
1681 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
1682 		return (-1);
1683 	}
1684 
1685 	vie_advance_pc(vie, &vcpu->nextrip);
1686 	return (0);
1687 }
1688 
1689 static int
1690 vm_handle_inst_emul(struct vm *vm, int vcpuid)
1691 {
1692 	struct vie *vie;
1693 	struct vcpu *vcpu;
1694 	struct vm_exit *vme;
1695 	uint64_t cs_base;
1696 	int error, fault, cs_d;
1697 
1698 	vcpu = &vm->vcpu[vcpuid];
1699 	vme = &vcpu->exitinfo;
1700 	vie = vcpu->vie_ctx;
1701 
1702 	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);
1703 
1704 	/* Fetch the faulting instruction */
1705 	ASSERT(vie_needs_fetch(vie));
1706 	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
1707 	    &fault);
1708 	if (error != 0) {
1709 		return (error);
1710 	} else if (fault) {
1711 		/*
1712 		 * If a fault during instruction fetch was encountered, it will
1713 		 * have asserted that the appropriate exception be injected at
1714 		 * next entry.  No further work is required.
1715 		 */
1716 		return (0);
1717 	}
1718 
1719 	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
1720 		/* Dump (unrecognized) instruction bytes in userspace */
1721 		vie_fallback_exitinfo(vie, vme);
1722 		return (-1);
1723 	}
1724 
1725 	error = vie_emulate_other(vie, vm, vcpuid);
1726 	if (error != 0) {
1727 		/*
1728 		 * Instruction emulation was unable to complete successfully, so
1729 		 * kick it out to userspace for handling.
1730 		 */
1731 		vie_fallback_exitinfo(vie, vme);
1732 	} else {
1733 		/* Update %rip now that instruction has been emulated */
1734 		vie_advance_pc(vie, &vcpu->nextrip);
1735 	}
1736 	return (error);
1737 }
1738 
1739 static int
1740 vm_handle_suspend(struct vm *vm, int vcpuid)
1741 {
1742 	int i;
1743 	struct vcpu *vcpu;
1744 
1745 	vcpu = &vm->vcpu[vcpuid];
1746 
1747 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1748 
1749 	/*
1750 	 * Wait until all 'active_cpus' have suspended themselves.
1751 	 */
1752 	vcpu_lock(vcpu);
1753 	vcpu_ustate_change(vm, vcpuid, VU_INIT);
1754 	while (1) {
1755 		int rc;
1756 
1757 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1758 			break;
1759 		}
1760 
1761 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1762 		rc = cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->lock, hz,
1763 		    TR_CLOCK_TICK);
1764 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1765 
1766 		/*
1767 		 * If the userspace process driving the instance is killed, any
1768 		 * vCPUs yet to be marked suspended (because they are not
1769 		 * VM_RUN-ing in the kernel presently) will never reach that
1770 		 * state.
1771 		 *
1772 		 * To avoid vm_handle_suspend() getting stuck in the kernel
1773 		 * waiting for those vCPUs, offer a bail-out even though it
1774 		 * means returning without all vCPUs in a suspended state.
1775 		 */
1776 		if (rc <= 0) {
1777 			if ((curproc->p_flag & SEXITING) != 0) {
1778 				break;
1779 			}
1780 		}
1781 	}
1782 	vcpu_unlock(vcpu);
1783 
1784 	/*
1785 	 * Wakeup the other sleeping vcpus and return to userspace.
1786 	 */
1787 	for (i = 0; i < vm->maxcpus; i++) {
1788 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1789 			vcpu_notify_event(vm, i);
1790 		}
1791 	}
1792 
1793 	return (-1);
1794 }
1795 
1796 static int
1797 vm_handle_reqidle(struct vm *vm, int vcpuid)
1798 {
1799 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1800 
1801 	vcpu_lock(vcpu);
1802 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1803 	vcpu->reqidle = 0;
1804 	vcpu_unlock(vcpu);
1805 	return (-1);
1806 }
1807 
1808 static int
1809 vm_handle_run_state(struct vm *vm, int vcpuid)
1810 {
1811 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1812 	bool handled = false;
1813 
1814 	vcpu_lock(vcpu);
1815 	while (1) {
1816 		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
1817 			vcpu_unlock(vcpu);
1818 			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
1819 			vcpu_lock(vcpu);
1820 
1821 			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
1822 			vcpu->run_state |= VRS_INIT;
1823 		}
1824 
1825 		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
1826 		    (VRS_INIT | VRS_PEND_SIPI)) {
1827 			const uint8_t vector = vcpu->sipi_vector;
1828 
1829 			vcpu_unlock(vcpu);
1830 			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
1831 			vcpu_lock(vcpu);
1832 
1833 			vcpu->run_state &= ~VRS_PEND_SIPI;
1834 			vcpu->run_state |= VRS_RUN;
1835 		}
1836 
1837 		/*
1838 		 * If the vCPU is now in the running state, there is no need to
1839 		 * wait for anything prior to re-entry.
1840 		 */
1841 		if ((vcpu->run_state & VRS_RUN) != 0) {
1842 			handled = true;
1843 			break;
1844 		}
1845 
1846 		/*
1847 		 * Also check for software events which would cause a wake-up.
1848 		 * This will set the appropriate exitcode directly, rather than
1849 		 * requiring a trip through VM_RUN().
1850 		 */
1851 		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
1852 			break;
1853 		}
1854 
1855 		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
1856 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1857 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
1858 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1859 		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
1860 	}
1861 	vcpu_unlock(vcpu);
1862 
1863 	return (handled ? 0 : -1);
1864 }
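
/*
 * This mirrors the architectural INIT/SIPI start-up handshake: a pending
 * INIT resets the vcpu and parks it waiting for SIPI, and a subsequent SIPI
 * (with the vector latched via vm_set_run_state()) supplies the start-up
 * vector and moves the vcpu to VRS_RUN.  Until then the loop above simply
 * sleeps, waking only for notifications or the bail-out checks.
 */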
1865 
1866 static int
1867 vm_rdmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
1868 {
1869 	switch (num) {
1870 	case MSR_MTRRcap:
1871 		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
1872 		break;
1873 	case MSR_MTRRdefType:
1874 		*val = mtrr->def_type;
1875 		break;
1876 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1877 		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
1878 		break;
1879 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1880 		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
1881 		break;
1882 	case MSR_MTRR64kBase:
1883 		*val = mtrr->fixed64k;
1884 		break;
1885 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
1886 		uint_t offset = num - MSR_MTRRVarBase;
1887 		if (offset % 2 == 0) {
1888 			*val = mtrr->var[offset / 2].base;
1889 		} else {
1890 			*val = mtrr->var[offset / 2].mask;
1891 		}
1892 		break;
1893 	}
1894 	default:
1895 		return (-1);
1896 	}
1897 
1898 	return (0);
1899 }
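
/*
 * The variable-range case above relies on the architectural MSR layout, in
 * which base and mask registers alternate: MSR_MTRRVarBase + 0 is
 * var[0].base, + 1 is var[0].mask, + 2 is var[1].base, and so on.  An even
 * offset therefore selects .base and an odd offset selects .mask.
 */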
1900 
1901 static int
1902 vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
1903 {
1904 	switch (num) {
1905 	case MSR_MTRRcap:
1906 		/* MTRRCAP is read only */
1907 		return (-1);
1908 	case MSR_MTRRdefType:
1909 		if (val & ~VMM_MTRR_DEF_MASK) {
1910 			/* generate #GP on writes to reserved fields */
1911 			return (-1);
1912 		}
1913 		mtrr->def_type = val;
1914 		break;
1915 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1916 		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
1917 		break;
1918 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1919 		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
1920 		break;
1921 	case MSR_MTRR64kBase:
1922 		mtrr->fixed64k = val;
1923 		break;
1924 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
1925 		uint_t offset = num - MSR_MTRRVarBase;
1926 		if (offset % 2 == 0) {
1927 			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
1928 				/* generate #GP on writes to reserved fields */
1929 				return (-1);
1930 			}
1931 			mtrr->var[offset / 2].base = val;
1932 		} else {
1933 			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
1934 				/* generate #GP on writes to reserved fields */
1935 				return (-1);
1936 			}
1937 			mtrr->var[offset / 2].mask = val;
1938 		}
1939 		break;
1940 	}
1941 	default:
1942 		return (-1);
1943 	}
1944 
1945 	return (0);
1946 }
1947 
1948 static int
1949 vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
1950 {
1951 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1952 	const uint32_t code = vme->u.msr.code;
1953 	uint64_t val = 0;
1954 
1955 	switch (code) {
1956 	case MSR_MCG_CAP:
1957 	case MSR_MCG_STATUS:
1958 		val = 0;
1959 		break;
1960 
1961 	case MSR_MTRRcap:
1962 	case MSR_MTRRdefType:
1963 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
1964 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
1965 	case MSR_MTRR64kBase:
1966 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
1967 		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
1968 			vm_inject_gp(vm, vcpuid);
1969 		break;
1970 
1971 	case MSR_TSC:
1972 		/*
1973 		 * In all likelihood, this should always be handled in guest
1974 		 * context by VMX/SVM rather than taking an exit.  (Both VMX and
1975 		 * SVM pass through read-only access to MSR_TSC to the guest.)
1976 		 *
1977 		 * No physical offset is requested of vcpu_tsc_offset() since
1978 		 * rdtsc_offset() takes care of that instead.
1979 		 */
1980 		val = vcpu_tsc_offset(vm, vcpuid, false) + rdtsc_offset();
1981 		break;
1982 
1983 	default:
1984 		/*
1985 		 * Anything not handled at this point will be kicked out to
1986 		 * userspace for attempted processing there.
1987 		 */
1988 		return (-1);
1989 	}
1990 
1991 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
1992 	    val & 0xffffffff));
1993 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
1994 	    val >> 32));
1995 	return (0);
1996 }
1997 
1998 static int
1999 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
2000 {
2001 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2002 	const uint32_t code = vme->u.msr.code;
2003 	const uint64_t val = vme->u.msr.wval;
2004 
2005 	switch (code) {
2006 	case MSR_MCG_CAP:
2007 	case MSR_MCG_STATUS:
2008 		/* Ignore writes */
2009 		break;
2010 
2011 	case MSR_MTRRcap:
2012 	case MSR_MTRRdefType:
2013 	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
2014 	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
2015 	case MSR_MTRR64kBase:
2016 	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
2017 		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
2018 			vm_inject_gp(vm, vcpuid);
2019 		break;
2020 
2021 	case MSR_TSC:
2022 		/*
2023 		 * The effect of writing the TSC MSR is that a subsequent read
2024 		 * of the TSC would report that value written (plus any time
2025 		 * elapsed between the write and the read).  The guest TSC value
2026 		 * is calculated from a global offset for the guest (which
2027 		 * effectively makes its TSC read 0 at guest boot) and a
2028 		 * per-vCPU offset to handle these writes to the MSR.
2029 		 *
2030 		 * To calculate that per-vCPU offset, we can work backwards from
2031 		 * the guest value at the time of write:
2032 		 *
2033 		 * value = host TSC + VM boot offset + vCPU offset
2034 		 *
2035 		 * so therefore:
2036 		 *
2037 		 * value - host TSC - VM boot offset = vCPU offset
2038 		 */
2039 		vcpu->tsc_offset = val - vm->boot_tsc_offset - rdtsc_offset();
2040 		break;
2041 
2042 	default:
2043 		/*
2044 		 * Anything not handled at this point will be kicked out to
2045 		 * userspace for attempted processing there.
2046 		 */
2047 		return (-1);
2048 	}
2049 
2050 	return (0);
2051 }
2052 
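/*
 * Illustrative sketch: per the comments in vm_handle_rdmsr() and
 * vm_handle_wrmsr() above, the guest-visible TSC is composed as
 *
 *	guest TSC = host TSC + VM boot offset + per-vCPU offset
 *
 * so a guest WRMSR of MSR_TSC is absorbed by solving for the per-vCPU term.
 * The helper below is hypothetical and simply restates that algebra.
 */
static inline uint64_t
vmm_tsc_offset_from_write(uint64_t wval, uint64_t boot_offset,
    uint64_t host_tsc)
{
	/* wval = host_tsc + boot_offset + vcpu_offset => solve for the offset */
	return (wval - boot_offset - host_tsc);
}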
2053 int
2054 vm_suspend(struct vm *vm, enum vm_suspend_how how)
2055 {
2056 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
2057 		return (EINVAL);
2058 
2059 	if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) {
2060 		return (EALREADY);
2061 	}
2062 
2063 	/*
2064 	 * Notify all active vcpus that they are now suspended.
2065 	 */
2066 	for (uint_t i = 0; i < vm->maxcpus; i++) {
2067 		struct vcpu *vcpu = &vm->vcpu[i];
2068 
2069 		vcpu_lock(vcpu);
2070 		if (vcpu->state == VCPU_IDLE || vcpu->state == VCPU_FROZEN) {
2071 			/*
2072 			 * Any vCPUs not actively running or in HLT can be
2073 			 * marked as suspended immediately.
2074 			 */
2075 			if (CPU_ISSET(i, &vm->active_cpus)) {
2076 				CPU_SET_ATOMIC(i, &vm->suspended_cpus);
2077 			}
2078 		} else {
2079 			/*
2080 			 * Those which are running or in HLT will pick up the
2081 			 * suspended state after notification.
2082 			 */
2083 			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2084 		}
2085 		vcpu_unlock(vcpu);
2086 	}
2087 	return (0);
2088 }
2089 
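/*
 * Suspension is thus a two-phase affair: idle or frozen vCPUs are recorded in
 * suspended_cpus immediately, while running (or HLT-sleeping) vCPUs are merely
 * notified and pick up the suspended state once their run loop observes
 * vm->suspend and takes the VM_EXITCODE_SUSPENDED exit path.
 */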
2090 void
2091 vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
2092 {
2093 	struct vm_exit *vmexit;
2094 
2095 	vmexit = vm_exitinfo(vm, vcpuid);
2096 	vmexit->rip = rip;
2097 	vmexit->inst_length = 0;
2098 	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
2099 	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
2100 }
2101 
2102 /*
2103  * Some vmm resources, such as the lapic, may have CPU-specific state
2104  * allocated to them which would benefit from migration onto the host CPU
2105  * currently processing that vcpu.
2106  */
2107 static void
2108 vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
2109 {
2110 	/*
2111 	 * Localizing cyclic resources requires acquisition of cpu_lock, and
2112 	 * doing so with kpreempt disabled is a recipe for deadlock disaster.
2113 	 */
2114 	VERIFY(curthread->t_preempt == 0);
2115 
2116 	/*
2117 	 * Do not bother with localization if this vCPU is about to return to
2118 	 * the host CPU it was last localized to.
2119 	 */
2120 	if (vcpu->lastloccpu == curcpu)
2121 		return;
2122 
2123 	/*
2124 	 * Localize system-wide resources to the primary boot vCPU.  While any
2125 	 * of the other vCPUs may access them, it keeps the potential interrupt
2126 	 * footprint constrained to CPUs involved with this instance.
2127 	 */
2128 	if (vcpu == &vm->vcpu[0]) {
2129 		vhpet_localize_resources(vm->vhpet);
2130 		vrtc_localize_resources(vm->vrtc);
2131 		vatpit_localize_resources(vm->vatpit);
2132 	}
2133 
2134 	vlapic_localize_resources(vcpu->vlapic);
2135 
2136 	vcpu->lastloccpu = curcpu;
2137 }
2138 
2139 static void
2140 vmm_savectx(void *arg)
2141 {
2142 	vm_thread_ctx_t *vtc = arg;
2143 	struct vm *vm = vtc->vtc_vm;
2144 	const int vcpuid = vtc->vtc_vcpuid;
2145 
2146 	if (ops->vmsavectx != NULL) {
2147 		ops->vmsavectx(vm->cookie, vcpuid);
2148 	}
2149 
2150 	/*
2151 	 * Account for going off-cpu, unless the vCPU is idle, in which case
2152 	 * being off-cpu is the explicit point.
2153 	 */
2154 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2155 		vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
2156 		vcpu_ustate_change(vm, vcpuid, VU_SCHED);
2157 	}
2158 
2159 	/*
2160 	 * If the CPU holds the restored guest FPU state, save it and restore
2161 	 * the host FPU state before this thread goes off-cpu.
2162 	 */
2163 	if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
2164 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2165 
2166 		save_guest_fpustate(vcpu);
2167 		vtc->vtc_status &= ~VTCS_FPU_RESTORED;
2168 	}
2169 }
2170 
2171 static void
2172 vmm_restorectx(void *arg)
2173 {
2174 	vm_thread_ctx_t *vtc = arg;
2175 	struct vm *vm = vtc->vtc_vm;
2176 	const int vcpuid = vtc->vtc_vcpuid;
2177 
2178 	/* Complete microstate accounting for vCPU being off-cpu */
2179 	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
2180 		vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
2181 	}
2182 
2183 	/*
2184 	 * When coming back on-cpu, only restore the guest FPU status if the
2185 	 * thread is in a context marked as requiring it.  This should be rare,
2186 	 * occurring only when a future logic error results in a voluntary
2187 	 * sleep during the VMRUN critical section.
2188 	 *
2189 	 * The common case will result in elision of the guest FPU state
2190 	 * restoration, deferring that action until it is clearly necessary
2191 	 * during vm_run.
2192 	 */
2193 	VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
2194 	if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
2195 		struct vcpu *vcpu = &vm->vcpu[vcpuid];
2196 
2197 		restore_guest_fpustate(vcpu);
2198 		vtc->vtc_status |= VTCS_FPU_RESTORED;
2199 	}
2200 
2201 	if (ops->vmrestorectx != NULL) {
2202 		ops->vmrestorectx(vm->cookie, vcpuid);
2203 	}
2204 
2205 }
2206 
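/*
 * The context hooks above cooperate with vm_run() through two vtc_status
 * flags:
 *
 * - VTCS_FPU_RESTORED: guest FPU state is currently loaded on the CPU, so it
 *   must be saved (and host FPU state restored) whenever the thread goes
 *   off-cpu.
 * - VTCS_FPU_CTX_CRITICAL: the thread is inside the VMRUN critical section,
 *   so guest FPU state must be reloaded eagerly if it comes back on-cpu there.
 *
 * In the common case, restoring guest FPU state is deferred to vm_run()
 * itself, immediately prior to VM entry.
 */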
2207 static int
2208 vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
2209     struct vm_exit *vme)
2210 {
2211 	struct vcpu *vcpu;
2212 	struct vie *vie;
2213 	int err;
2214 
2215 	vcpu = &vm->vcpu[vcpuid];
2216 	vie = vcpu->vie_ctx;
2217 	err = 0;
2218 
2219 	switch (entry->cmd) {
2220 	case VEC_DEFAULT:
2221 		return (0);
2222 	case VEC_DISCARD_INSTR:
2223 		vie_reset(vie);
2224 		return (0);
2225 	case VEC_FULFILL_MMIO:
2226 		err = vie_fulfill_mmio(vie, &entry->u.mmio);
2227 		if (err == 0) {
2228 			err = vie_emulate_mmio(vie, vm, vcpuid);
2229 			if (err == 0) {
2230 				vie_advance_pc(vie, &vcpu->nextrip);
2231 			} else if (err < 0) {
2232 				vie_exitinfo(vie, vme);
2233 			} else if (err == EAGAIN) {
2234 				/*
2235 				 * Clear the instruction emulation state in
2236 				 * order to re-enter VM context and continue
2237 				 * this 'rep <instruction>'
2238 				 */
2239 				vie_reset(vie);
2240 				err = 0;
2241 			}
2242 		}
2243 		break;
2244 	case VEC_FULFILL_INOUT:
2245 		err = vie_fulfill_inout(vie, &entry->u.inout);
2246 		if (err == 0) {
2247 			err = vie_emulate_inout(vie, vm, vcpuid);
2248 			if (err == 0) {
2249 				vie_advance_pc(vie, &vcpu->nextrip);
2250 			} else if (err < 0) {
2251 				vie_exitinfo(vie, vme);
2252 			} else if (err == EAGAIN) {
2253 				/*
2254 				 * Clear the instruction emulation state in
2255 				 * order to re-enter VM context and continue
2256 				 * this 'rep ins/outs'
2257 				 */
2258 				vie_reset(vie);
2259 				err = 0;
2260 			}
2261 		}
2262 		break;
2263 	default:
2264 		return (EINVAL);
2265 	}
2266 	return (err);
2267 }
2268 
2269 static int
2270 vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
2271 {
2272 	struct vie *vie;
2273 
2274 	vie = vm->vcpu[vcpuid].vie_ctx;
2275 
2276 	if (vie_pending(vie)) {
2277 		/*
2278 		 * Userspace has not fulfilled the pending needs of the
2279 		 * instruction emulation, so bail back out.
2280 		 */
2281 		vie_exitinfo(vie, vme);
2282 		return (-1);
2283 	}
2284 
2285 	return (0);
2286 }
2287 
2288 int
2289 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
2290 {
2291 	int error;
2292 	struct vcpu *vcpu;
2293 	struct vm_exit *vme;
2294 	bool intr_disabled;
2295 	int affinity_type = CPU_CURRENT;
2296 
2297 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2298 		return (EINVAL);
2299 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
2300 		return (EINVAL);
2301 
2302 	vcpu = &vm->vcpu[vcpuid];
2303 	vme = &vcpu->exitinfo;
2304 
2305 	vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
2306 
2307 	vcpu->vtc.vtc_status = 0;
2308 	ctxop_attach(curthread, vcpu->ctxop);
2309 
2310 	error = vm_entry_actions(vm, vcpuid, entry, vme);
2311 	if (error != 0) {
2312 		goto exit;
2313 	}
2314 
2315 restart:
2316 	error = vm_loop_checks(vm, vcpuid, vme);
2317 	if (error != 0) {
2318 		goto exit;
2319 	}
2320 
2321 	thread_affinity_set(curthread, affinity_type);
2322 	/*
2323 	 * Resource localization should happen after the CPU affinity for the
2324 	 * thread has been set to ensure that access from restricted contexts,
2325 	 * such as VMX-accelerated APIC operations, can occur without inducing
2326 	 * cyclic cross-calls.
2327 	 *
2328 	 * This must be done prior to disabling kpreempt via critical_enter().
2329 	 */
2330 	vm_localize_resources(vm, vcpu);
2331 	affinity_type = CPU_CURRENT;
2332 	critical_enter();
2333 
2334 	/* Force a trip through update_sregs to reload %fs/%gs and friends */
2335 	PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);
2336 
2337 	if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
2338 		restore_guest_fpustate(vcpu);
2339 		vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED;
2340 	}
2341 	vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;
2342 
2343 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
2344 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
2345 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
2346 
2347 	/*
2348 	 * Once clear of the delicate contexts comprising the VM_RUN handler,
2349 	 * thread CPU affinity can be loosened while other processing occurs.
2350 	 */
2351 	vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;
2352 	thread_affinity_clear(curthread);
2353 	critical_exit();
2354 
2355 	if (error != 0) {
2356 		/* Communicate out any error from VMRUN() above */
2357 		goto exit;
2358 	}
2359 
2360 	vcpu->nextrip = vme->rip + vme->inst_length;
2361 	switch (vme->exitcode) {
2362 	case VM_EXITCODE_REQIDLE:
2363 		error = vm_handle_reqidle(vm, vcpuid);
2364 		break;
2365 	case VM_EXITCODE_RUN_STATE:
2366 		error = vm_handle_run_state(vm, vcpuid);
2367 		break;
2368 	case VM_EXITCODE_SUSPENDED:
2369 		error = vm_handle_suspend(vm, vcpuid);
2370 		break;
2371 	case VM_EXITCODE_IOAPIC_EOI:
2372 		vioapic_process_eoi(vm, vcpuid,
2373 		    vme->u.ioapic_eoi.vector);
2374 		break;
2375 	case VM_EXITCODE_HLT:
2376 		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
2377 		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
2378 		break;
2379 	case VM_EXITCODE_PAGING:
2380 		error = vm_handle_paging(vm, vcpuid);
2381 		break;
2382 	case VM_EXITCODE_MMIO_EMUL:
2383 		error = vm_handle_mmio_emul(vm, vcpuid);
2384 		break;
2385 	case VM_EXITCODE_INOUT:
2386 		error = vm_handle_inout(vm, vcpuid, vme);
2387 		break;
2388 	case VM_EXITCODE_INST_EMUL:
2389 		error = vm_handle_inst_emul(vm, vcpuid);
2390 		break;
2391 	case VM_EXITCODE_MONITOR:
2392 	case VM_EXITCODE_MWAIT:
2393 	case VM_EXITCODE_VMINSN:
2394 		vm_inject_ud(vm, vcpuid);
2395 		break;
2396 	case VM_EXITCODE_RDMSR:
2397 		error = vm_handle_rdmsr(vm, vcpuid, vme);
2398 		break;
2399 	case VM_EXITCODE_WRMSR:
2400 		error = vm_handle_wrmsr(vm, vcpuid, vme);
2401 		break;
2402 	case VM_EXITCODE_HT:
2403 		affinity_type = CPU_BEST;
2404 		break;
2405 	case VM_EXITCODE_MTRAP:
2406 		VERIFY0(vm_suspend_cpu(vm, vcpuid));
2407 		error = -1;
2408 		break;
2409 	default:
2410 		/* handled in userland */
2411 		error = -1;
2412 		break;
2413 	}
2414 
2415 	if (error == 0) {
2416 		/* VM exit conditions handled in-kernel, continue running */
2417 		goto restart;
2418 	}
2419 
2420 exit:
2421 	kpreempt_disable();
2422 	ctxop_detach(curthread, vcpu->ctxop);
2423 	/* Make sure all of the needed vCPU context state is saved */
2424 	vmm_savectx(&vcpu->vtc);
2425 	kpreempt_enable();
2426 
2427 	vcpu_ustate_change(vm, vcpuid, VU_EMU_USER);
2428 	return (error);
2429 }
2430 
2431 int
2432 vm_restart_instruction(void *arg, int vcpuid)
2433 {
2434 	struct vm *vm;
2435 	struct vcpu *vcpu;
2436 	enum vcpu_state state;
2437 	uint64_t rip;
2438 	int error;
2439 
2440 	vm = arg;
2441 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2442 		return (EINVAL);
2443 
2444 	vcpu = &vm->vcpu[vcpuid];
2445 	state = vcpu_get_state(vm, vcpuid, NULL);
2446 	if (state == VCPU_RUNNING) {
2447 		/*
2448 		 * When a vcpu is "running" the next instruction is determined
2449 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
2450 		 * Thus setting 'inst_length' to zero will cause the current
2451 		 * instruction to be restarted.
2452 		 */
2453 		vcpu->exitinfo.inst_length = 0;
2454 	} else if (state == VCPU_FROZEN) {
2455 		/*
2456 		 * When a vcpu is "frozen" it is outside the critical section
2457 		 * around VMRUN() and 'nextrip' points to the next instruction.
2458 		 * Thus instruction restart is achieved by setting 'nextrip'
2459 		 * to the vcpu's %rip.
2460 		 */
2461 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
2462 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
2463 		vcpu->nextrip = rip;
2464 	} else {
2465 		panic("%s: invalid state %d", __func__, state);
2466 	}
2467 	return (0);
2468 }
2469 
2470 int
2471 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
2472 {
2473 	struct vcpu *vcpu;
2474 
2475 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2476 		return (EINVAL);
2477 
2478 	vcpu = &vm->vcpu[vcpuid];
2479 
2480 	if (VM_INTINFO_PENDING(info)) {
2481 		const uint32_t type = VM_INTINFO_TYPE(info);
2482 		const uint8_t vector = VM_INTINFO_VECTOR(info);
2483 
2484 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
2485 			return (EINVAL);
2486 		if (type == VM_INTINFO_HWEXCP && vector >= 32)
2487 			return (EINVAL);
2488 		if (info & VM_INTINFO_MASK_RSVD)
2489 			return (EINVAL);
2490 	} else {
2491 		info = 0;
2492 	}
2493 	vcpu->exit_intinfo = info;
2494 	return (0);
2495 }
2496 
2497 enum exc_class {
2498 	EXC_BENIGN,
2499 	EXC_CONTRIBUTORY,
2500 	EXC_PAGEFAULT
2501 };
2502 
2503 #define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
2504 
2505 static enum exc_class
2506 exception_class(uint64_t info)
2507 {
2508 	ASSERT(VM_INTINFO_PENDING(info));
2509 
2510 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
2511 	switch (VM_INTINFO_TYPE(info)) {
2512 	case VM_INTINFO_HWINTR:
2513 	case VM_INTINFO_SWINTR:
2514 	case VM_INTINFO_NMI:
2515 		return (EXC_BENIGN);
2516 	default:
2517 		/*
2518 		 * Hardware exception.
2519 		 *
2520 		 * SVM and VT-x use identical type values to represent NMI,
2521 		 * hardware interrupt and software interrupt.
2522 		 *
2523 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
2524 		 * for exceptions except #BP and #OF. #BP and #OF use a type
2525 		 * value of '5' or '6'. Therefore we don't check for explicit
2526 		 * values of 'type' to classify 'intinfo' into a hardware
2527 		 * exception.
2528 		 */
2529 		break;
2530 	}
2531 
2532 	switch (VM_INTINFO_VECTOR(info)) {
2533 	case IDT_PF:
2534 	case IDT_VE:
2535 		return (EXC_PAGEFAULT);
2536 	case IDT_DE:
2537 	case IDT_TS:
2538 	case IDT_NP:
2539 	case IDT_SS:
2540 	case IDT_GP:
2541 		return (EXC_CONTRIBUTORY);
2542 	default:
2543 		return (EXC_BENIGN);
2544 	}
2545 }
2546 
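/*
 * The double-fault conditions (Intel SDM, Vol 3, Table 6-5) applied by
 * vm_entry_intinfo() below reduce to:
 *
 *	first \ second		benign		contributory	page fault
 *	benign			serial		serial		serial
 *	contributory		serial		#DF		serial
 *	page fault		serial		#DF		#DF
 *
 * where "serial" means the first event is re-queued and the second event is
 * delivered on its own.
 */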
2547 /*
2548  * Fetch event pending injection into the guest, if one exists.
2549  *
2550  * Returns true if an event is to be injected (which is placed in `retinfo`).
2551  */
2552 bool
2553 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
2554 {
2555 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2556 	const uint64_t info1 = vcpu->exit_intinfo;
2557 	vcpu->exit_intinfo = 0;
2558 	const uint64_t info2 = vcpu->exc_pending;
2559 	vcpu->exc_pending = 0;
2560 
2561 	if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) {
2562 		/*
2563 		 * If an exception occurs while attempting to call the
2564 		 * double-fault handler the processor enters shutdown mode
2565 		 * (aka triple fault).
2566 		 */
2567 		if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
2568 		    VM_INTINFO_VECTOR(info1) == IDT_DF) {
2569 			(void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
2570 			*retinfo = 0;
2571 			return (false);
2572 		}
2573 		/*
2574 		 * "Conditions for Generating a Double Fault"
2575 		 *  Intel SDM, Vol3, Table 6-5
2576 		 */
2577 		const enum exc_class exc1 = exception_class(info1);
2578 		const enum exc_class exc2 = exception_class(info2);
2579 		if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
2580 		    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
2581 			/* Convert nested fault into a double fault. */
2582 			*retinfo =
2583 			    VM_INTINFO_VALID |
2584 			    VM_INTINFO_DEL_ERRCODE |
2585 			    VM_INTINFO_HWEXCP |
2586 			    IDT_DF;
2587 		} else {
2588 			/* Handle exceptions serially */
2589 			vcpu->exit_intinfo = info1;
2590 			*retinfo = info2;
2591 		}
2592 		return (true);
2593 	} else if (VM_INTINFO_PENDING(info1)) {
2594 		*retinfo = info1;
2595 		return (true);
2596 	} else if (VM_INTINFO_PENDING(info2)) {
2597 		*retinfo = info2;
2598 		return (true);
2599 	}
2600 
2601 	return (false);
2602 }
2603 
2604 int
2605 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
2606 {
2607 	struct vcpu *vcpu;
2608 
2609 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2610 		return (EINVAL);
2611 
2612 	vcpu = &vm->vcpu[vcpuid];
2613 	*info1 = vcpu->exit_intinfo;
2614 	*info2 = vcpu->exc_pending;
2615 	return (0);
2616 }
2617 
2618 int
2619 vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
2620     bool errcode_valid, uint32_t errcode, bool restart_instruction)
2621 {
2622 	struct vcpu *vcpu;
2623 	uint64_t regval;
2624 	int error;
2625 
2626 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2627 		return (EINVAL);
2628 
2629 	if (vector >= 32)
2630 		return (EINVAL);
2631 
2632 	/*
2633 	 * NMIs are to be injected via their own specialized path using
2634 	 * vm_inject_nmi().
2635 	 */
2636 	if (vector == IDT_NMI) {
2637 		return (EINVAL);
2638 	}
2639 
2640 	/*
2641 	 * A double fault exception should never be injected directly into
2642 	 * the guest. It is a derived exception that results from specific
2643 	 * combinations of nested faults.
2644 	 */
2645 	if (vector == IDT_DF) {
2646 		return (EINVAL);
2647 	}
2648 
2649 	vcpu = &vm->vcpu[vcpuid];
2650 
2651 	if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
2652 		/* Unable to inject exception due to one already pending */
2653 		return (EBUSY);
2654 	}
2655 
2656 	if (errcode_valid) {
2657 		/*
2658 		 * Exceptions don't deliver an error code in real mode.
2659 		 */
2660 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2661 		VERIFY0(error);
2662 		if ((regval & CR0_PE) == 0) {
2663 			errcode_valid = false;
2664 		}
2665 	}
2666 
2667 	/*
2668 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2669 	 *
2670 	 * Event blocking by "STI" or "MOV SS" is cleared after the guest
2671 	 * executes one instruction or incurs an exception.
2672 	 */
2673 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2674 	VERIFY0(error);
2675 
2676 	if (restart_instruction) {
2677 		VERIFY0(vm_restart_instruction(vm, vcpuid));
2678 	}
2679 
2680 	uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector;
2681 	if (errcode_valid) {
2682 		val |= VM_INTINFO_DEL_ERRCODE;
2683 		val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE;
2684 	}
2685 	vcpu->exc_pending = val;
2686 	return (0);
2687 }
2688 
2689 void
2690 vm_inject_ud(struct vm *vm, int vcpuid)
2691 {
2692 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true));
2693 }
2694 
2695 void
2696 vm_inject_gp(struct vm *vm, int vcpuid)
2697 {
2698 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true));
2699 }
2700 
2701 void
2702 vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode)
2703 {
2704 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true));
2705 }
2706 
2707 void
2708 vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode)
2709 {
2710 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true));
2711 }
2712 
2713 void
2714 vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2)
2715 {
2716 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2));
2717 	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true));
2718 }
2719 
2720 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2721 
2722 int
2723 vm_inject_nmi(struct vm *vm, int vcpuid)
2724 {
2725 	struct vcpu *vcpu;
2726 
2727 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2728 		return (EINVAL);
2729 
2730 	vcpu = &vm->vcpu[vcpuid];
2731 
2732 	vcpu->nmi_pending = true;
2733 	vcpu_notify_event(vm, vcpuid);
2734 	return (0);
2735 }
2736 
2737 bool
2738 vm_nmi_pending(struct vm *vm, int vcpuid)
2739 {
2740 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2741 
2742 	return (vcpu->nmi_pending);
2743 }
2744 
2745 void
2746 vm_nmi_clear(struct vm *vm, int vcpuid)
2747 {
2748 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2749 
2750 	ASSERT(vcpu->nmi_pending);
2751 
2752 	vcpu->nmi_pending = false;
2753 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2754 }
2755 
2756 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2757 
2758 int
2759 vm_inject_extint(struct vm *vm, int vcpuid)
2760 {
2761 	struct vcpu *vcpu;
2762 
2763 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2764 		return (EINVAL);
2765 
2766 	vcpu = &vm->vcpu[vcpuid];
2767 
2768 	vcpu->extint_pending = true;
2769 	vcpu_notify_event(vm, vcpuid);
2770 	return (0);
2771 }
2772 
2773 bool
2774 vm_extint_pending(struct vm *vm, int vcpuid)
2775 {
2776 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2777 
2778 	return (vcpu->extint_pending);
2779 }
2780 
2781 void
2782 vm_extint_clear(struct vm *vm, int vcpuid)
2783 {
2784 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2785 
2786 	ASSERT(vcpu->extint_pending);
2787 
2788 	vcpu->extint_pending = false;
2789 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2790 }
2791 
2792 int
2793 vm_inject_init(struct vm *vm, int vcpuid)
2794 {
2795 	struct vcpu *vcpu;
2796 
2797 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2798 		return (EINVAL);
2799 
2800 	vcpu = &vm->vcpu[vcpuid];
2801 	vcpu_lock(vcpu);
2802 	vcpu->run_state |= VRS_PEND_INIT;
2803 	/*
2804 	 * As part of queuing the INIT request, clear any pending SIPI.  It
2805 	 * would not otherwise survive across the reset of the vCPU when it
2806 	 * undergoes the requested INIT.  We would not want it to linger when it
2807 	 * could be mistaken as a subsequent (after the INIT) SIPI request.
2808 	 * could be mistaken for a subsequent (after the INIT) SIPI request.
2809 	vcpu->run_state &= ~VRS_PEND_SIPI;
2810 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2811 
2812 	vcpu_unlock(vcpu);
2813 	return (0);
2814 }
2815 
2816 int
2817 vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2818 {
2819 	struct vcpu *vcpu;
2820 
2821 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2822 		return (EINVAL);
2823 
2824 	vcpu = &vm->vcpu[vcpuid];
2825 	vcpu_lock(vcpu);
2826 	vcpu->run_state |= VRS_PEND_SIPI;
2827 	vcpu->sipi_vector = vector;
2828 	/* SIPI is only actionable if the CPU is waiting in INIT state */
2829 	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
2830 		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
2831 	}
2832 	vcpu_unlock(vcpu);
2833 	return (0);
2834 }
2835 
2836 bool
2837 vcpu_run_state_pending(struct vm *vm, int vcpuid)
2838 {
2839 	struct vcpu *vcpu;
2840 
2841 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
2842 	vcpu = &vm->vcpu[vcpuid];
2843 
2844 	/* Of interest: vCPU not in running state or with pending INIT */
2845 	return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
2846 }
2847 
2848 int
2849 vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
2850 {
2851 	struct seg_desc desc;
2852 	const enum vm_reg_name clear_regs[] = {
2853 		VM_REG_GUEST_CR2,
2854 		VM_REG_GUEST_CR3,
2855 		VM_REG_GUEST_CR4,
2856 		VM_REG_GUEST_RAX,
2857 		VM_REG_GUEST_RBX,
2858 		VM_REG_GUEST_RCX,
2859 		VM_REG_GUEST_RSI,
2860 		VM_REG_GUEST_RDI,
2861 		VM_REG_GUEST_RBP,
2862 		VM_REG_GUEST_RSP,
2863 		VM_REG_GUEST_R8,
2864 		VM_REG_GUEST_R9,
2865 		VM_REG_GUEST_R10,
2866 		VM_REG_GUEST_R11,
2867 		VM_REG_GUEST_R12,
2868 		VM_REG_GUEST_R13,
2869 		VM_REG_GUEST_R14,
2870 		VM_REG_GUEST_R15,
2871 		VM_REG_GUEST_DR0,
2872 		VM_REG_GUEST_DR1,
2873 		VM_REG_GUEST_DR2,
2874 		VM_REG_GUEST_DR3,
2875 		VM_REG_GUEST_EFER,
2876 	};
2877 	const enum vm_reg_name data_segs[] = {
2878 		VM_REG_GUEST_SS,
2879 		VM_REG_GUEST_DS,
2880 		VM_REG_GUEST_ES,
2881 		VM_REG_GUEST_FS,
2882 		VM_REG_GUEST_GS,
2883 	};
2884 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2885 
2886 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2887 		return (EINVAL);
2888 
2889 	for (uint_t i = 0; i < nitems(clear_regs); i++) {
2890 		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
2891 	}
2892 
2893 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
2894 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
2895 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
2896 
2897 	/*
2898 	 * The prescribed contents of %rdx differ slightly between the Intel and
2899 	 * AMD architectural definitions.  The former expects the Extended Model
2900 	 * in bits 16-19, whereas the latter expects all of the Family, Model,
2901 	 * and Stepping to be there.  Common boot ROMs appear to disregard this
2902 	 * anyway, so we stick with a compromise value similar to what is
2903 	 * spelled out in the Intel SDM.
2904 	 */
2905 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
2906 
2907 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
2908 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
2909 
2910 	/* CS: Present, R/W, Accessed */
2911 	desc.access = 0x0093;
2912 	desc.base = 0xffff0000;
2913 	desc.limit = 0xffff;
2914 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2915 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
2916 
2917 	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
2918 	desc.access = 0x0093;
2919 	desc.base = 0;
2920 	desc.limit = 0xffff;
2921 	for (uint_t i = 0; i < nitems(data_segs); i++) {
2922 		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
2923 		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
2924 	}
2925 
2926 	/* GDTR, IDTR */
2927 	desc.base = 0;
2928 	desc.limit = 0xffff;
2929 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
2930 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
2931 
2932 	/* LDTR: Present, LDT */
2933 	desc.access = 0x0082;
2934 	desc.base = 0;
2935 	desc.limit = 0xffff;
2936 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
2937 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
2938 
2939 	/* TR: Present, 32-bit TSS */
2940 	desc.access = 0x008b;
2941 	desc.base = 0;
2942 	desc.limit = 0xffff;
2943 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
2944 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
2945 
2946 	vlapic_reset(vm_lapic(vm, vcpuid));
2947 
2948 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
2949 
2950 	vcpu->exit_intinfo = 0;
2951 	vcpu->exc_pending = 0;
2952 	vcpu->nmi_pending = false;
2953 	vcpu->extint_pending = false;
2954 
2955 	/*
2956 	 * A CPU reset caused by power-on or system reset clears more state than
2957 	 * one which is triggered from an INIT IPI.
2958 	 */
2959 	if (!init_only) {
2960 		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
2961 		(void) hma_fpu_init(vcpu->guestfpu);
2962 
2963 		/* XXX: clear MSRs and other pieces */
2964 		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
2965 	}
2966 
2967 	return (0);
2968 }
2969 
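/*
 * Note that the values programmed above reflect the architectural power-on
 * state: a CS base of 0xffff0000 with selector 0xf000 and %rip of 0xfff0
 * places the first instruction fetch at the reset vector (0xfffffff0), while
 * the CR0 value of 0x60000010 corresponds to CD | NW | ET, with protection
 * and paging disabled.
 */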
2970 static int
2971 vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
2972 {
2973 	struct seg_desc desc;
2974 
2975 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2976 		return (EINVAL);
2977 
2978 	/* CS: Present, R/W, Accessed */
2979 	desc.access = 0x0093;
2980 	desc.base = (uint64_t)vector << 12;
2981 	desc.limit = 0xffff;
2982 	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
2983 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
2984 	    (uint64_t)vector << 8));
2985 
2986 	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
2987 
2988 	return (0);
2989 }
2990 
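/*
 * As an example of the arithmetic above: a SIPI vector of 0x9f yields a CS
 * selector of 0x9f00 and a CS base of 0x9f000 with %rip of 0, so the vCPU
 * begins real-mode execution at guest-physical address 0x9f000
 * (vector * 4 KiB), per the MP startup protocol.
 */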
2991 int
2992 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2993 {
2994 	if (vcpu < 0 || vcpu >= vm->maxcpus)
2995 		return (EINVAL);
2996 
2997 	if (type < 0 || type >= VM_CAP_MAX)
2998 		return (EINVAL);
2999 
3000 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
3001 }
3002 
3003 int
3004 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
3005 {
3006 	if (vcpu < 0 || vcpu >= vm->maxcpus)
3007 		return (EINVAL);
3008 
3009 	if (type < 0 || type >= VM_CAP_MAX)
3010 		return (EINVAL);
3011 
3012 	return (VMSETCAP(vm->cookie, vcpu, type, val));
3013 }
3014 
3015 struct vlapic *
3016 vm_lapic(struct vm *vm, int cpu)
3017 {
3018 	ASSERT3S(cpu, >=, 0);
3019 	ASSERT3S(cpu, <, VM_MAXCPU);
3020 
3021 	return (vm->vcpu[cpu].vlapic);
3022 }
3023 
3024 struct vioapic *
3025 vm_ioapic(struct vm *vm)
3026 {
3027 
3028 	return (vm->vioapic);
3029 }
3030 
3031 struct vhpet *
3032 vm_hpet(struct vm *vm)
3033 {
3034 
3035 	return (vm->vhpet);
3036 }
3037 
3038 void *
3039 vm_iommu_domain(struct vm *vm)
3040 {
3041 
3042 	return (vm->iommu);
3043 }
3044 
3045 int
3046 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
3047     bool from_idle)
3048 {
3049 	int error;
3050 	struct vcpu *vcpu;
3051 
3052 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3053 		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
3054 
3055 	vcpu = &vm->vcpu[vcpuid];
3056 
3057 	vcpu_lock(vcpu);
3058 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
3059 	vcpu_unlock(vcpu);
3060 
3061 	return (error);
3062 }
3063 
3064 enum vcpu_state
3065 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
3066 {
3067 	struct vcpu *vcpu;
3068 	enum vcpu_state state;
3069 
3070 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3071 		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
3072 
3073 	vcpu = &vm->vcpu[vcpuid];
3074 
3075 	vcpu_lock(vcpu);
3076 	state = vcpu->state;
3077 	if (hostcpu != NULL)
3078 		*hostcpu = vcpu->hostcpu;
3079 	vcpu_unlock(vcpu);
3080 
3081 	return (state);
3082 }
3083 
3084 uint64_t
3085 vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
3086 {
3087 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3088 
3089 	uint64_t vcpu_off = vm->boot_tsc_offset + vm->vcpu[vcpuid].tsc_offset;
3090 
3091 	if (phys_adj) {
3092 		/* Include any offset for the current physical CPU too */
3093 		extern hrtime_t tsc_gethrtime_tick_delta(void);
3094 		vcpu_off += (uint64_t)tsc_gethrtime_tick_delta();
3095 	}
3096 
3097 	return (vcpu_off);
3098 }
3099 
3100 /* Normalize hrtime against the boot time for a VM */
3101 hrtime_t
3102 vm_normalize_hrtime(struct vm *vm, hrtime_t hrt)
3103 {
3104 	/* To avoid underflow/overflow UB, perform math as unsigned */
3105 	return ((hrtime_t)((uint64_t)hrt - (uint64_t)vm->boot_hrtime));
3106 }
3107 
3108 /* Denormalize hrtime against the boot time for a VM */
3109 hrtime_t
3110 vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt)
3111 {
3112 	/* To avoid underflow/overflow UB, perform math as unsigned */
3113 	return ((hrtime_t)((uint64_t)hrt + (uint64_t)vm->boot_hrtime));
3114 }
3115 
3116 int
3117 vm_activate_cpu(struct vm *vm, int vcpuid)
3118 {
3119 
3120 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3121 		return (EINVAL);
3122 
3123 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
3124 		return (EBUSY);
3125 
3126 	if (vm->suspend != 0) {
3127 		return (EBUSY);
3128 	}
3129 
3130 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
3131 
3132 	/*
3133 	 * It is possible that this vCPU was undergoing activation at the same
3134 	 * time that the VM was being suspended.  If that happens to be the
3135 	 * case, it should reflect the suspended state immediately.
3136 	 */
3137 	if (atomic_load_acq_int((uint_t *)&vm->suspend) != 0) {
3138 		CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
3139 	}
3140 
3141 	return (0);
3142 }
3143 
3144 int
3145 vm_suspend_cpu(struct vm *vm, int vcpuid)
3146 {
3147 	int i;
3148 
3149 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3150 		return (EINVAL);
3151 
3152 	if (vcpuid == -1) {
3153 		vm->debug_cpus = vm->active_cpus;
3154 		for (i = 0; i < vm->maxcpus; i++) {
3155 			if (CPU_ISSET(i, &vm->active_cpus))
3156 				vcpu_notify_event(vm, i);
3157 		}
3158 	} else {
3159 		if (!CPU_ISSET(vcpuid, &vm->active_cpus))
3160 			return (EINVAL);
3161 
3162 		CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
3163 		vcpu_notify_event(vm, vcpuid);
3164 	}
3165 	return (0);
3166 }
3167 
3168 int
3169 vm_resume_cpu(struct vm *vm, int vcpuid)
3170 {
3171 
3172 	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
3173 		return (EINVAL);
3174 
3175 	if (vcpuid == -1) {
3176 		CPU_ZERO(&vm->debug_cpus);
3177 	} else {
3178 		if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
3179 			return (EINVAL);
3180 
3181 		CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
3182 	}
3183 	return (0);
3184 }
3185 
3186 static bool
3187 vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
3188     uint64_t entry_rip)
3189 {
3190 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3191 	struct vm_exit *vme = &vcpu->exitinfo;
3192 	bool bail = false;
3193 
3194 	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
3195 
3196 	if (vm->suspend) {
3197 		if (on_entry) {
3198 			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
3199 			    vm->suspend < VM_SUSPEND_LAST);
3200 
3201 			vme->exitcode = VM_EXITCODE_SUSPENDED;
3202 			vme->u.suspended.how = vm->suspend;
3203 		} else {
3204 			/*
3205 			 * Handling VM suspend is complicated, so if that
3206 			 * condition is detected outside of VM-entry itself,
3207 			 * just emit a BOGUS exitcode so we take a lap to pick
3208 			 * up the event during an entry and are directed into
3209 			 * the vm_handle_suspend() logic.
3210 			 */
3211 			vme->exitcode = VM_EXITCODE_BOGUS;
3212 		}
3213 		bail = true;
3214 	}
3215 	if (vcpu->reqidle) {
3216 		vme->exitcode = VM_EXITCODE_REQIDLE;
3217 		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
3218 
3219 		if (!on_entry) {
3220 			/*
3221 			 * A reqidle request detected outside of VM-entry can be
3222 			 * handled directly by clearing the request (and taking
3223 			 * a lap to userspace).
3224 			 */
3225 			vcpu_assert_locked(vcpu);
3226 			vcpu->reqidle = 0;
3227 		}
3228 		bail = true;
3229 	}
3230 	if (vcpu_should_yield(vm, vcpuid)) {
3231 		vme->exitcode = VM_EXITCODE_BOGUS;
3232 		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
3233 		bail = true;
3234 	}
3235 	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
3236 		vme->exitcode = VM_EXITCODE_DEBUG;
3237 		bail = true;
3238 	}
3239 
3240 	if (bail) {
3241 		if (on_entry) {
3242 			/*
3243 			 * If bailing out during VM-entry, the current %rip must
3244 			 * be recorded in the exitinfo.
3245 			 */
3246 			vme->rip = entry_rip;
3247 		}
3248 		vme->inst_length = 0;
3249 	}
3250 	return (bail);
3251 }
3252 
3253 static bool
3254 vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
3255 {
3256 	/*
3257 	 * Bail-out checks done prior to sleeping (in vCPU contexts like HLT or
3258 	 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
3259 	 * structure, so only the exitcode needs to be modified.
3260 	 */
3261 	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
3262 }
3263 
3264 bool
3265 vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
3266 {
3267 	/*
3268 	 * Bail-out checks done as part of VM entry require an updated %rip to
3269 	 * populate the vm_exit struct if any of the conditions of interest are
3270 	 * matched in the check.
3271 	 */
3272 	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
3273 }
3274 
3275 cpuset_t
3276 vm_active_cpus(struct vm *vm)
3277 {
3278 
3279 	return (vm->active_cpus);
3280 }
3281 
3282 cpuset_t
3283 vm_debug_cpus(struct vm *vm)
3284 {
3285 
3286 	return (vm->debug_cpus);
3287 }
3288 
3289 cpuset_t
3290 vm_suspended_cpus(struct vm *vm)
3291 {
3292 
3293 	return (vm->suspended_cpus);
3294 }
3295 
3296 void *
3297 vcpu_stats(struct vm *vm, int vcpuid)
3298 {
3299 
3300 	return (vm->vcpu[vcpuid].stats);
3301 }
3302 
3303 int
3304 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
3305 {
3306 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3307 		return (EINVAL);
3308 
3309 	*state = vm->vcpu[vcpuid].x2apic_state;
3310 
3311 	return (0);
3312 }
3313 
3314 int
3315 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
3316 {
3317 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
3318 		return (EINVAL);
3319 
3320 	if (state >= X2APIC_STATE_LAST)
3321 		return (EINVAL);
3322 
3323 	vm->vcpu[vcpuid].x2apic_state = state;
3324 
3325 	vlapic_set_x2apic_state(vm, vcpuid, state);
3326 
3327 	return (0);
3328 }
3329 
3330 /*
3331  * This function is called to ensure that a vcpu "sees" a pending event
3332  * as soon as possible:
3333  * - If the vcpu thread is sleeping then it is woken up.
3334  * - If the vcpu is running on a different host_cpu then an IPI will be directed
3335  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
3336  */
3337 static void
3338 vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
3339 {
3340 	int hostcpu;
3341 
3342 	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);
3343 
3344 	hostcpu = vcpu->hostcpu;
3345 	if (vcpu->state == VCPU_RUNNING) {
3346 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
3347 		if (hostcpu != curcpu) {
3348 			if (ntype == VCPU_NOTIFY_APIC) {
3349 				vlapic_post_intr(vcpu->vlapic, hostcpu);
3350 			} else {
3351 				poke_cpu(hostcpu);
3352 			}
3353 		} else {
3354 			/*
3355 			 * If the 'vcpu' is running on 'curcpu' then it must
3356 			 * be sending a notification to itself (e.g. SELF_IPI).
3357 			 * The pending event will be picked up when the vcpu
3358 			 * transitions back to guest context.
3359 			 */
3360 		}
3361 	} else {
3362 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
3363 		    "with hostcpu %d", vcpu->state, hostcpu));
3364 		if (vcpu->state == VCPU_SLEEPING) {
3365 			cv_signal(&vcpu->vcpu_cv);
3366 		}
3367 	}
3368 }
3369 
3370 void
3371 vcpu_notify_event(struct vm *vm, int vcpuid)
3372 {
3373 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3374 
3375 	vcpu_lock(vcpu);
3376 	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
3377 	vcpu_unlock(vcpu);
3378 }
3379 
3380 void
3381 vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
3382 {
3383 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3384 
3385 	if (ntype == VCPU_NOTIFY_NONE) {
3386 		return;
3387 	}
3388 
3389 	vcpu_lock(vcpu);
3390 	vcpu_notify_event_locked(vcpu, ntype);
3391 	vcpu_unlock(vcpu);
3392 }
3393 
3394 void
3395 vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
3396 {
3397 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3398 	hrtime_t now = gethrtime();
3399 
3400 	ASSERT3U(ustate, !=, vcpu->ustate);
3401 	ASSERT3S(ustate, <, VU_MAX);
3402 	ASSERT3S(ustate, >=, VU_INIT);
3403 
3404 	hrtime_t delta = now - vcpu->ustate_when;
3405 	vcpu->ustate_total[vcpu->ustate] += delta;
3406 
3407 	membar_producer();
3408 
3409 	vcpu->ustate_when = now;
3410 	vcpu->ustate = ustate;
3411 }
3412 
3413 struct vmspace *
3414 vm_get_vmspace(struct vm *vm)
3415 {
3416 
3417 	return (vm->vmspace);
3418 }
3419 
3420 struct vm_client *
3421 vm_get_vmclient(struct vm *vm, int vcpuid)
3422 {
3423 	return (vm->vcpu[vcpuid].vmclient);
3424 }
3425 
3426 int
3427 vm_apicid2vcpuid(struct vm *vm, int apicid)
3428 {
3429 	/*
3430 	 * XXX apic id is assumed to be numerically identical to vcpu id
3431 	 */
3432 	return (apicid);
3433 }
3434 
3435 struct vatpic *
3436 vm_atpic(struct vm *vm)
3437 {
3438 	return (vm->vatpic);
3439 }
3440 
3441 struct vatpit *
3442 vm_atpit(struct vm *vm)
3443 {
3444 	return (vm->vatpit);
3445 }
3446 
3447 struct vpmtmr *
3448 vm_pmtmr(struct vm *vm)
3449 {
3450 
3451 	return (vm->vpmtmr);
3452 }
3453 
3454 struct vrtc *
3455 vm_rtc(struct vm *vm)
3456 {
3457 
3458 	return (vm->vrtc);
3459 }
3460 
3461 enum vm_reg_name
3462 vm_segment_name(int seg)
3463 {
3464 	static enum vm_reg_name seg_names[] = {
3465 		VM_REG_GUEST_ES,
3466 		VM_REG_GUEST_CS,
3467 		VM_REG_GUEST_SS,
3468 		VM_REG_GUEST_DS,
3469 		VM_REG_GUEST_FS,
3470 		VM_REG_GUEST_GS
3471 	};
3472 
3473 	KASSERT(seg >= 0 && seg < nitems(seg_names),
3474 	    ("%s: invalid segment encoding %d", __func__, seg));
3475 	return (seg_names[seg]);
3476 }
3477 
3478 void
3479 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
3480     uint_t num_copyinfo)
3481 {
3482 	for (uint_t idx = 0; idx < num_copyinfo; idx++) {
3483 		if (copyinfo[idx].cookie != NULL) {
3484 			(void) vmp_release((vm_page_t *)copyinfo[idx].cookie);
3485 		}
3486 	}
3487 	bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
3488 }
3489 
3490 int
3491 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3492     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
3493     uint_t num_copyinfo, int *fault)
3494 {
3495 	uint_t idx, nused;
3496 	size_t n, off, remaining;
3497 	vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);
3498 
3499 	bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);
3500 
3501 	nused = 0;
3502 	remaining = len;
3503 	while (remaining > 0) {
3504 		uint64_t gpa;
3505 		int error;
3506 
3507 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
3508 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
3509 		if (error || *fault)
3510 			return (error);
3511 		off = gpa & PAGEOFFSET;
3512 		n = min(remaining, PAGESIZE - off);
3513 		copyinfo[nused].gpa = gpa;
3514 		copyinfo[nused].len = n;
3515 		remaining -= n;
3516 		gla += n;
3517 		nused++;
3518 	}
3519 
3520 	for (idx = 0; idx < nused; idx++) {
3521 		vm_page_t *vmp;
3522 		caddr_t hva;
3523 
3524 		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
3525 		if (vmp == NULL) {
3526 			break;
3527 		}
3528 		if ((prot & PROT_WRITE) != 0) {
3529 			hva = (caddr_t)vmp_get_writable(vmp);
3530 		} else {
3531 			hva = (caddr_t)vmp_get_readable(vmp);
3532 		}
3533 		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
3534 		copyinfo[idx].cookie = vmp;
3535 		copyinfo[idx].prot = prot;
3536 	}
3537 
3538 	if (idx != nused) {
3539 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
3540 		return (EFAULT);
3541 	} else {
3542 		*fault = 0;
3543 		return (0);
3544 	}
3545 }
3546 
3547 void
3548 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
3549     size_t len)
3550 {
3551 	char *dst;
3552 	int idx;
3553 
3554 	dst = kaddr;
3555 	idx = 0;
3556 	while (len > 0) {
3557 		ASSERT(copyinfo[idx].prot & PROT_READ);
3558 
3559 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
3560 		len -= copyinfo[idx].len;
3561 		dst += copyinfo[idx].len;
3562 		idx++;
3563 	}
3564 }
3565 
3566 void
3567 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
3568     struct vm_copyinfo *copyinfo, size_t len)
3569 {
3570 	const char *src;
3571 	int idx;
3572 
3573 	src = kaddr;
3574 	idx = 0;
3575 	while (len > 0) {
3576 		ASSERT(copyinfo[idx].prot & PROT_WRITE);
3577 
3578 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
3579 		len -= copyinfo[idx].len;
3580 		src += copyinfo[idx].len;
3581 		idx++;
3582 	}
3583 }
3584 
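/*
 * Illustrative sketch: a typical caller pairs vm_copy_setup() with
 * vm_copyin()/vm_copyout() and vm_copy_teardown() to access guest-virtual
 * data which may straddle a page boundary.  The helper below is hypothetical;
 * its name and fixed two-entry copyinfo array are for illustration only.
 */
static int
vmm_copyin_gla_sketch(struct vm *vm, int vcpuid,
    struct vm_guest_paging *paging, uint64_t gla, void *buf, size_t len,
    int *fault)
{
	struct vm_copyinfo copyinfo[2];
	int error;

	/* A transfer of at most PAGESIZE bytes spans at most two pages */
	ASSERT3U(len, <=, PAGESIZE);

	error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
	    copyinfo, nitems(copyinfo), fault);
	if (error != 0 || *fault != 0)
		return (error);

	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}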
3585 /*
3586  * Return the amount of in-use and wired memory for the VM. Since
3587  * these are global stats, only return the values for vCPU 0.
3588  */
3589 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
3590 
3591 static void
3592 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
3593 {
3594 	if (vcpu == 0) {
3595 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
3596 		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
3597 	}
3598 }
3599 
3600 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
3601 
3602 int
3603 vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
3604     uint8_t bytes, uint32_t *val)
3605 {
3606 	return (vm_inout_access(&vm->ioports, in, port, bytes, val));
3607 }
3608 
3609 /*
3610  * bhyve-internal interfaces to attach or detach IO port handlers.
3611  * Must be called with VM write lock held for safety.
3612  */
3613 int
3614 vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
3615     void **cookie)
3616 {
3617 	int err;
3618 	err = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT, func, arg);
3619 	if (err == 0) {
3620 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3621 	}
3622 	return (err);
3623 }
3624 int
3625 vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
3626     void **old_arg)
3627 {
3628 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3629 	int err;
3630 
3631 	err = vm_inout_detach(&vm->ioports, port, false, old_func, old_arg);
3632 	if (err == 0) {
3633 		*cookie = NULL;
3634 	}
3635 	return (err);
3636 }
3637 
3638 /*
3639  * External driver interfaces to attach or detach IO port handlers.
3640  * Must be called with VM write lock held for safety.
3641  */
3642 int
3643 vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
3644     void *arg, void **cookie)
3645 {
3646 	int err;
3647 
3648 	if (port == 0) {
3649 		return (EINVAL);
3650 	}
3651 
3652 	err = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK, func, arg);
3653 	if (err == 0) {
3654 		*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
3655 	}
3656 	return (err);
3657 }
3658 void
3659 vm_ioport_unhook(struct vm *vm, void **cookie)
3660 {
3661 	uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
3662 	ioport_handler_t old_func;
3663 	void *old_arg;
3664 	int err;
3665 
3666 	err = vm_inout_detach(&vm->ioports, port, true, &old_func, &old_arg);
3667 
3668 	/* ioport-hook-using drivers are expected to be well-behaved */
3669 	VERIFY0(err);
3670 	VERIFY(IOP_GEN_COOKIE(old_func, old_arg, port) == (uintptr_t)*cookie);
3671 
3672 	*cookie = NULL;
3673 }
3674 
3675 int
3676 vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
3677 {
3678 	struct vm *vm = ksp->ks_private;
3679 	vmm_vcpu_kstats_t *vvk = ksp->ks_data;
3680 	const int vcpuid = vvk->vvk_vcpu.value.ui32;
3681 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
3682 
3683 	ASSERT3U(vcpuid, <, VM_MAXCPU);
3684 
3685 	vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
3686 	vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
3687 	vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
3688 	vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
3689 	vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
3690 	vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];
3691 
3692 	return (0);
3693 }
3694 
3695 SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t);
3696 
3697 static inline bool
3698 vmm_data_is_cpu_specific(uint16_t data_class)
3699 {
3700 	switch (data_class) {
3701 	case VDC_REGISTER:
3702 	case VDC_MSR:
3703 	case VDC_FPU:
3704 	case VDC_LAPIC:
3705 	case VDC_VMM_ARCH:
3706 		return (true);
3707 	default:
3708 		return (false);
3709 	}
3710 }
3711 
3712 static const vmm_data_version_entry_t *
3713 vmm_data_find(const vmm_data_req_t *req, int *err)
3714 {
3715 	const vmm_data_version_entry_t **vdpp, *vdp;
3716 	SET_FOREACH(vdpp, vmm_data_version_entries) {
3717 		vdp = *vdpp;
3718 		if (vdp->vdve_class == req->vdr_class &&
3719 		    vdp->vdve_version == req->vdr_version) {
3720 			/*
3721 			 * Enforce any data length expectation expressed by the
3722 			 * provider for this data.
3723 			 */
3724 			if (vdp->vdve_len_expect != 0 &&
3725 			    vdp->vdve_len_expect != req->vdr_len) {
3726 				*err = ENOSPC;
3727 				return (NULL);
3728 			}
3729 			return (vdp);
3730 		}
3731 	}
3732 	*err = EINVAL;
3733 	return (NULL);
3734 }
3735 
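/*
 * Data providers register a vmm_data_version_entry_t in the
 * 'vmm_data_version_entries' linker set.  vmm_data_find() walks that set with
 * SET_FOREACH(), matching on class and version, and rejects requests whose
 * length does not satisfy a provider's non-zero vdve_len_expect.
 */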
3736 static void *
3737 vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm, int vcpuid)
3738 {
3739 	switch (req->vdr_class) {
3740 		/* per-cpu data/devices */
3741 	case VDC_LAPIC:
3742 		return (vm_lapic(vm, vcpuid));
3743 
3744 	case VDC_FPU:
3745 	case VDC_REGISTER:
3746 	case VDC_VMM_ARCH:
3747 	case VDC_MSR:
3748 		/*
3749 		 * These have per-CPU handling which is dispatched outside the
3750 		 * vmm_data_version_entries listing.
3751 		 */
3752 		return (NULL);
3753 
3754 		/* system-wide data/devices */
3755 	case VDC_IOAPIC:
3756 		return (vm->vioapic);
3757 	case VDC_ATPIT:
3758 		return (vm->vatpit);
3759 	case VDC_ATPIC:
3760 		return (vm->vatpic);
3761 	case VDC_HPET:
3762 		return (vm->vhpet);
3763 	case VDC_PM_TIMER:
3764 		return (vm->vpmtmr);
3765 	case VDC_RTC:
3766 		return (vm->vrtc);
3767 
3768 	default:
3769 		/* The data class will have been validated by now */
3770 		panic("Unexpected class %u", req->vdr_class);
3771 	}
3772 }
3773 
3774 int
3775 vmm_data_read(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
3776 {
3777 	int err = 0;
3778 
3779 	if (vmm_data_is_cpu_specific(req->vdr_class)) {
3780 		if (vcpuid >= VM_MAXCPU) {
3781 			return (EINVAL);
3782 		}
3783 	}
3784 
3785 	const vmm_data_version_entry_t *entry;
3786 	entry = vmm_data_find(req, &err);
3787 	if (entry == NULL) {
3788 		ASSERT(err != 0);
3789 		return (err);
3790 	}
3791 
3792 	void *datap = vmm_data_from_class(req, vm, vcpuid);
3793 	if (datap != NULL) {
3794 		err = entry->vdve_readf(datap, req);
3795 	} else {
3796 		switch (req->vdr_class) {
3797 		case VDC_FPU:
3798 			/* TODO: wire up to xsave export via hma_fpu iface */
3799 			err = EINVAL;
3800 			break;
3801 		case VDC_REGISTER:
3802 		case VDC_VMM_ARCH:
3803 		case VDC_MSR:
3804 			/* TODO: implement */
3805 			err = EINVAL;
3806 			break;
3807 		default:
3808 			err = EINVAL;
3809 			break;
3810 		}
3811 	}
3812 
3813 	return (err);
3814 }
3815 
3816 int
3817 vmm_data_write(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
3818 {
3819 	int err = 0;
3820 
3821 	if (vmm_data_is_cpu_specific(req->vdr_class)) {
3822 		if (vcpuid >= VM_MAXCPU) {
3823 			return (EINVAL);
3824 		}
3825 	}
3826 
3827 	const vmm_data_version_entry_t *entry;
3828 	entry = vmm_data_find(req, &err);
3829 	if (entry == NULL) {
3830 		ASSERT(err != 0);
3831 		return (err);
3832 	}
3833 
3834 	void *datap = vmm_data_from_class(req, vm, vcpuid);
3835 	if (datap != NULL) {
3836 		err = entry->vdve_writef(datap, req);
3837 	} else {
3838 		switch (req->vdr_class) {
3839 		case VDC_FPU:
3840 			/* TODO: wire up to xsave import via hma_fpu iface */
3841 			err = EINVAL;
3842 			break;
3843 		case VDC_REGISTER:
3844 		case VDC_VMM_ARCH:
3845 		case VDC_MSR:
3846 			/* TODO: implement */
3847 			err = EINVAL;
3848 			break;
3849 		default:
3850 			err = EINVAL;
3851 			break;
3852 		}
3853 	}
3854 
3855 	return (err);
3856 }
3857