xref: /illumos-gate/usr/src/uts/intel/io/vmm/amd/svm.c (revision 1677a13522f801f59117c9fb50212af5fb87a872)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  *
39  * Copyright 2018 Joyent, Inc.
40  * Copyright 2023 Oxide Computer Company
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/kmem.h>
50 #include <sys/pcpu.h>
51 #include <sys/proc.h>
52 #include <sys/sysctl.h>
53 
54 #include <sys/x86_archext.h>
55 #include <sys/trap.h>
56 
57 #include <machine/cpufunc.h>
58 #include <machine/psl.h>
59 #include <machine/md_var.h>
60 #include <machine/reg.h>
61 #include <machine/specialreg.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <sys/vmm_instruction_emul.h>
65 #include <sys/vmm_vm.h>
66 #include <sys/vmm_kernel.h>
67 
68 #include "vmm_lapic.h"
69 #include "vmm_stat.h"
70 #include "vmm_ioport.h"
71 #include "vatpic.h"
72 #include "vlapic.h"
73 #include "vlapic_priv.h"
74 
75 #include "vmcb.h"
76 #include "svm.h"
77 #include "svm_softc.h"
78 #include "svm_msr.h"
79 
/*
 * Reference the hw.vmm sysctl parent and hang an "svm" node off of it.  The
 * SVM tunables declared in this file (such as vmcb_clean) attach to that node.
 */
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);
83 
/*
 * SVM CPUID function 0x8000_000A, edx bit decoding.
 *
 * These masks are tested against the cached 'svm_feature' word by the small
 * predicate helpers in this file (flush_by_asid(), decode_assist() and
 * svm_tsc_freq_ctl()).
 */
#define	AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
#define	AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
#define	AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
#define	AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
#define	AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
#define	AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
#define	AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
#define	AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
#define	AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
#define	AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
#define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
98 
/*
 * Default set of VMCB state-cache bits.  It is the initial value of the
 * 'vmcb_clean' tunable, and svm_init() masks that tunable with this value so
 * that no bits outside of this set can remain set.
 */
#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
				VMCB_CACHE_IOPM		|	\
				VMCB_CACHE_I		|	\
				VMCB_CACHE_TPR		|	\
				VMCB_CACHE_CR2		|	\
				VMCB_CACHE_CR		|	\
				VMCB_CACHE_DR		|	\
				VMCB_CACHE_DT		|	\
				VMCB_CACHE_SEG		|	\
				VMCB_CACHE_NP)
109 
/*
 * Guardrails for supported guest TSC frequencies.
 *
 * A minimum of 0.5 GHz, which should be sufficient for all recent AMD CPUs, and
 * a maximum ratio of (15 * host frequency), which is sufficient to prevent
 * overflowing frequency calculations and give plenty of bandwidth for future
 * CPU frequency increases.
 */
#define	AMD_TSC_MIN_FREQ	500000000	/* in Hz (0.5 GHz) */
#define	AMD_TSC_MAX_FREQ_RATIO	15		/* multiple of host frequency */
120 
/* Result of svm_tsc_freq_ctl(), recorded once by svm_init(). */
static bool svm_has_tsc_freq_ctl;

/*
 * VMCB state-cache bits in use.  Starts out as VMCB_CACHE_DEFAULT, and
 * svm_init() strips any bits which are not part of VMCB_CACHE_DEFAULT.
 */
static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
    0, NULL);

/* SVM features advertised by CPUID.8000000AH:EDX */
static uint32_t svm_feature = ~0U;	/* AMD SVM features. */

/*
 * When non-zero, svm_handle_mmio_emul() does not hand the instruction bytes
 * recorded in the VMCB to the instruction emulation context, even when the
 * decode assist feature bit is set.
 */
static int disable_npf_assist;

static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

/* Forward declarations for routines which are referenced before definition. */
static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
static void flush_asid(struct svm_softc *sc, int vcpuid);
139 
140 static __inline bool
141 flush_by_asid(void)
142 {
143 	return ((svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID) != 0);
144 }
145 
146 static __inline bool
147 decode_assist(void)
148 {
149 	return ((svm_feature & AMD_CPUID_SVM_DECODE_ASSIST) != 0);
150 }
151 
152 static bool
153 svm_tsc_freq_ctl(void)
154 {
155 	return ((svm_feature & AMD_CPUID_SVM_TSC_RATE) != 0);
156 }
157 
/*
 * Module cleanup hook.  There is nothing to undo here, since that work is
 * taken care of by the hma registration.  Always returns 0.
 */
static int
svm_cleanup(void)
{
	const int rv = 0;

	return (rv);
}
164 
165 static int
166 svm_init(void)
167 {
168 	vmcb_clean &= VMCB_CACHE_DEFAULT;
169 
170 	svm_has_tsc_freq_ctl = svm_tsc_freq_ctl();
171 	svm_msr_init();
172 
173 	return (0);
174 }
175 
/*
 * Restore hook.  Nothing needs to be done for it on illumos.
 */
static void
svm_restore(void)
{
	return;
}
181 
/* Pentium compatible MSRs */
#define	MSR_PENTIUM_START	0
#define	MSR_PENTIUM_END		0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define	MSR_AMD6TH_START	0xC0000000UL
#define	MSR_AMD6TH_END		0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define	MSR_AMD7TH_START	0xC0010000UL
#define	MSR_AMD7TH_END		0xC0011FFFUL

/*
 * Translate an MSR number into its location within the MSR permission bitmap.
 *
 * The three MSR ranges above are laid out back-to-back in the bitmap, and each
 * MSR is given two adjacent bits (the lower one for reads, the higher one for
 * writes), so that four MSRs share each byte.
 *
 * On success 0 is returned with '*index' holding the byte offset into the
 * bitmap and '*bit' the position of the read bit within that byte (0, 2, 4 or
 * 6).  For an MSR outside of the known ranges EINVAL is returned and '*index'
 * is set to -1.
 */
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
	const uint32_t pentium_cnt = MSR_PENTIUM_END - MSR_PENTIUM_START + 1;
	const uint32_t amd6th_cnt = MSR_AMD6TH_END - MSR_AMD6TH_START + 1;
	uint32_t slot;

	*index = -1;
	*bit = (msr & 3) << 1;

	if (msr <= MSR_PENTIUM_END) {
		/* The first range starts at the very top of the bitmap */
		slot = (uint32_t)msr;
	} else if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		slot = pentium_cnt + (uint32_t)(msr - MSR_AMD6TH_START);
	} else if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		slot = pentium_cnt + amd6th_cnt +
		    (uint32_t)(msr - MSR_AMD7TH_START);
	} else {
		return (EINVAL);
	}

	*index = slot / 4;
	return (0);
}
226 
227 /*
228  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
229  */
230 static void
231 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
232 {
233 	int index, bit, error;
234 
235 	error = svm_msr_index(msr, &index, &bit);
236 	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
237 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
238 	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
239 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
240 	    "msr %lx", __func__, bit, msr));
241 
242 	if (read)
243 		perm_bitmap[index] &= ~(1UL << bit);
244 
245 	if (write)
246 		perm_bitmap[index] &= ~(2UL << bit);
247 }
248 
/*
 * Let the guest both read and write 'msr' without interception.
 */
static void
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	const bool allow_read = true;
	const bool allow_write = true;

	svm_msr_perm(perm_bitmap, msr, allow_read, allow_write);
}
255 
/*
 * Let the guest read 'msr' without interception.  The write permission bit is
 * left untouched.
 */
static void
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	const bool allow_read = true;
	const bool allow_write = false;

	svm_msr_perm(perm_bitmap, msr, allow_read, allow_write);
}
262 
263 static __inline int
264 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
265 {
266 	struct vmcb_ctrl *ctrl;
267 
268 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
269 
270 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
271 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
272 }
273 
274 static __inline void
275 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
276     int enabled)
277 {
278 	struct vmcb_ctrl *ctrl;
279 	uint32_t oldval;
280 
281 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
282 
283 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
284 	oldval = ctrl->intercept[idx];
285 
286 	if (enabled)
287 		ctrl->intercept[idx] |= bitmask;
288 	else
289 		ctrl->intercept[idx] &= ~bitmask;
290 
291 	if (ctrl->intercept[idx] != oldval) {
292 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
293 	}
294 }
295 
/*
 * Clear the bits of 'bitmask' in intercept word 'off' of the vcpu's VMCB.
 */
static __inline void
svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
	const int enabled = 0;

	svm_set_intercept(sc, vcpu, off, bitmask, enabled);
}
302 
/*
 * Set the bits of 'bitmask' in intercept word 'off' of the vcpu's VMCB.
 */
static __inline void
svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
	const int enabled = 1;

	svm_set_intercept(sc, vcpu, off, bitmask, enabled);
}
309 
310 static void
311 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
312     uint64_t msrpm_base_pa, uint64_t np_pml4)
313 {
314 	struct vmcb_ctrl *ctrl;
315 	struct vmcb_state *state;
316 	uint32_t mask;
317 	int n;
318 
319 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
320 	state = svm_get_vmcb_state(sc, vcpu);
321 
322 	ctrl->iopm_base_pa = iopm_base_pa;
323 	ctrl->msrpm_base_pa = msrpm_base_pa;
324 
325 	/* Enable nested paging */
326 	ctrl->np_ctrl = NP_ENABLE;
327 	ctrl->n_cr3 = np_pml4;
328 
329 	/*
330 	 * Intercept accesses to the control registers that are not shadowed
331 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
332 	 */
333 	for (n = 0; n < 16; n++) {
334 		mask = (BIT(n) << 16) | BIT(n);
335 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
336 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
337 		else
338 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
339 	}
340 
341 	/*
342 	 * Selectively intercept writes to %cr0.  This triggers on operations
343 	 * which would change bits other than TS or MP.
344 	 */
345 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
346 	    VMCB_INTCPT_CR0_WRITE);
347 
348 	/*
349 	 * Intercept everything when tracing guest exceptions otherwise
350 	 * just intercept machine check exception.
351 	 */
352 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
353 		for (n = 0; n < 32; n++) {
354 			/*
355 			 * Skip unimplemented vectors in the exception bitmap.
356 			 */
357 			if (n == 2 || n == 9) {
358 				continue;
359 			}
360 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
361 		}
362 	} else {
363 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
364 	}
365 
366 	/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
367 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
368 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
369 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
370 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
371 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
372 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
373 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
374 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
375 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
376 	    VMCB_INTCPT_FERR_FREEZE);
377 
378 	/* Enable exit-on-hlt by default */
379 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);
380 
381 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
382 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
383 
384 	/* Intercept privileged invalidation instructions. */
385 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
386 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
387 
388 	/*
389 	 * Intercept all virtualization-related instructions.
390 	 *
391 	 * From section "Canonicalization and Consistency Checks" in APMv2
392 	 * the VMRUN intercept bit must be set to pass the consistency check.
393 	 */
394 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
395 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
396 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
397 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
398 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
399 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
400 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
401 	if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
402 		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
403 		    VMCB_INTCPT_WBINVD);
404 	}
405 
406 	/*
407 	 * The ASID will be set to a non-zero value just before VMRUN.
408 	 */
409 	ctrl->asid = 0;
410 
411 	/*
412 	 * Section 15.21.1, Interrupt Masking in EFLAGS
413 	 * Section 15.21.2, Virtualizing APIC.TPR
414 	 *
415 	 * This must be set for %rflag and %cr8 isolation of guest and host.
416 	 */
417 	ctrl->v_intr_ctrl |= V_INTR_MASKING;
418 
419 	/* Enable Last Branch Record aka LBR for debugging */
420 	ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
421 	state->dbgctl = BIT(0);
422 
423 	/* EFER_SVM must always be set when the guest is executing */
424 	state->efer = EFER_SVM;
425 
426 	/* Set up the PAT to power-on state */
427 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
428 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
429 	    PAT_VALUE(2, PAT_UNCACHED)		|
430 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
431 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
432 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
433 	    PAT_VALUE(6, PAT_UNCACHED)		|
434 	    PAT_VALUE(7, PAT_UNCACHEABLE);
435 
436 	/* Set up DR6/7 to power-on state */
437 	state->dr6 = DBREG_DR6_RESERVED1;
438 	state->dr7 = DBREG_DR7_RESERVED1;
439 }
440 
441 /*
442  * Initialize a virtual machine.
443  */
444 static void *
445 svm_vminit(struct vm *vm)
446 {
447 	struct svm_softc *svm_sc;
448 	struct svm_vcpu *vcpu;
449 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
450 	int i;
451 	uint16_t maxcpus;
452 
453 	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
454 	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK),  ==,  0);
455 
456 	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
457 	if (svm_sc->msr_bitmap == NULL)
458 		panic("contigmalloc of SVM MSR bitmap failed");
459 	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
460 	if (svm_sc->iopm_bitmap == NULL)
461 		panic("contigmalloc of SVM IO bitmap failed");
462 
463 	svm_sc->vm = vm;
464 	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
465 
466 	/*
467 	 * Intercept read and write accesses to all MSRs.
468 	 */
469 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
470 
471 	/*
472 	 * Access to the following MSRs is redirected to the VMCB when the
473 	 * guest is executing. Therefore it is safe to allow the guest to
474 	 * read/write these MSRs directly without hypervisor involvement.
475 	 */
476 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
477 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
478 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
479 
480 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
481 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
482 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
483 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
484 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
485 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
486 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
487 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
488 
489 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
490 
491 	/*
492 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
493 	 */
494 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
495 
496 	/* Intercept access to all I/O ports. */
497 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
498 
499 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
500 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
501 	pml4_pa = svm_sc->nptp;
502 	maxcpus = vm_get_maxcpus(svm_sc->vm);
503 	for (i = 0; i < maxcpus; i++) {
504 		vcpu = svm_get_vcpu(svm_sc, i);
505 		vcpu->nextrip = ~0;
506 		vcpu->lastcpu = NOCPU;
507 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
508 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
509 		svm_msr_guest_init(svm_sc, i);
510 	}
511 	return (svm_sc);
512 }
513 
514 /*
515  * Collateral for a generic SVM VM-exit.
516  */
517 static void
518 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
519 {
520 
521 	vme->exitcode = VM_EXITCODE_SVM;
522 	vme->u.svm.exitcode = code;
523 	vme->u.svm.exitinfo1 = info1;
524 	vme->u.svm.exitinfo2 = info2;
525 }
526 
527 static enum vm_cpu_mode
528 svm_vcpu_mode(struct vmcb *vmcb)
529 {
530 	struct vmcb_state *state;
531 
532 	state = &vmcb->state;
533 
534 	if (state->efer & EFER_LMA) {
535 		struct vmcb_segment *seg;
536 
537 		/*
538 		 * Section 4.8.1 for APM2, check if Code Segment has
539 		 * Long attribute set in descriptor.
540 		 */
541 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
542 		if (seg->attrib & VMCB_CS_ATTRIB_L)
543 			return (CPU_MODE_64BIT);
544 		else
545 			return (CPU_MODE_COMPATIBILITY);
546 	} else  if (state->cr0 & CR0_PE) {
547 		return (CPU_MODE_PROTECTED);
548 	} else {
549 		return (CPU_MODE_REAL);
550 	}
551 }
552 
553 static enum vm_paging_mode
554 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
555 {
556 
557 	if ((cr0 & CR0_PG) == 0)
558 		return (PAGING_MODE_FLAT);
559 	if ((cr4 & CR4_PAE) == 0)
560 		return (PAGING_MODE_32);
561 	if (efer & EFER_LME)
562 		return (PAGING_MODE_64);
563 	else
564 		return (PAGING_MODE_PAE);
565 }
566 
567 static void
568 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
569 {
570 	struct vmcb_state *state;
571 
572 	state = &vmcb->state;
573 	paging->cr3 = state->cr3;
574 	paging->cpl = state->cpl;
575 	paging->cpu_mode = svm_vcpu_mode(vmcb);
576 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
577 	    state->efer);
578 }
579 
/*
 * Value returned by the exit handlers in this file (svm_handle_inout(), for
 * example) once they have filled in the vm_exit for further processing.
 */
#define	UNHANDLED 0
581 
582 /*
583  * Handle guest I/O intercept.
584  */
585 static int
586 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
587 {
588 	struct vmcb_ctrl *ctrl;
589 	struct vmcb_state *state;
590 	struct vm_inout *inout;
591 	struct vie *vie;
592 	uint64_t info1;
593 	struct vm_guest_paging paging;
594 
595 	state = svm_get_vmcb_state(svm_sc, vcpu);
596 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
597 	inout = &vmexit->u.inout;
598 	info1 = ctrl->exitinfo1;
599 
600 	inout->bytes = (info1 >> 4) & 0x7;
601 	inout->flags = 0;
602 	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
603 	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
604 	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
605 	inout->port = (uint16_t)(info1 >> 16);
606 	inout->eax = (uint32_t)(state->rax);
607 
608 	if ((inout->flags & INOUT_STR) != 0) {
609 		/*
610 		 * The effective segment number in EXITINFO1[12:10] is populated
611 		 * only if the processor has the DecodeAssist capability.
612 		 *
613 		 * This is not specified explicitly in APMv2 but can be verified
614 		 * empirically.
615 		 */
616 		if (!decode_assist()) {
617 			/*
618 			 * Without decoding assistance, force the task of
619 			 * emulating the ins/outs on userspace.
620 			 */
621 			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
622 			bzero(&vmexit->u.inst_emul,
623 			    sizeof (vmexit->u.inst_emul));
624 			return (UNHANDLED);
625 		}
626 
627 		/*
628 		 * Bits 7-9 encode the address size of ins/outs operations where
629 		 * the 1/2/4 values correspond to 16/32/64 bit sizes.
630 		 */
631 		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
632 		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
633 		    inout->addrsize == 8);
634 
635 		if (inout->flags & INOUT_IN) {
636 			/*
637 			 * For INS instructions, %es (encoded as 0) is the
638 			 * implied segment for the operation.
639 			 */
640 			inout->segment = 0;
641 		} else {
642 			/*
643 			 * Bits 10-12 encode the segment for OUTS.
644 			 * This value follows the standard x86 segment order.
645 			 */
646 			inout->segment = (info1 >> 10) & 0x7;
647 		}
648 	}
649 
650 	vmexit->exitcode = VM_EXITCODE_INOUT;
651 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
652 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
653 	vie_init_inout(vie, inout, vmexit->inst_length, &paging);
654 
655 	/* The in/out emulation will handle advancing %rip */
656 	vmexit->inst_length = 0;
657 
658 	return (UNHANDLED);
659 }
660 
661 static int
662 npf_fault_type(uint64_t exitinfo1)
663 {
664 
665 	if (exitinfo1 & VMCB_NPF_INFO1_W)
666 		return (PROT_WRITE);
667 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
668 		return (PROT_EXEC);
669 	else
670 		return (PROT_READ);
671 }
672 
673 static bool
674 svm_npf_emul_fault(uint64_t exitinfo1)
675 {
676 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
677 		return (false);
678 	}
679 
680 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
681 		return (false);
682 	}
683 
684 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
685 		return (false);
686 	}
687 
688 	return (true);
689 }
690 
691 static void
692 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
693     uint64_t gpa)
694 {
695 	struct vmcb_ctrl *ctrl;
696 	struct vmcb *vmcb;
697 	struct vie *vie;
698 	struct vm_guest_paging paging;
699 	struct vmcb_segment *seg;
700 	char *inst_bytes = NULL;
701 	uint8_t inst_len = 0;
702 
703 	vmcb = svm_get_vmcb(svm_sc, vcpu);
704 	ctrl = &vmcb->ctrl;
705 
706 	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
707 	vmexit->u.mmio_emul.gpa = gpa;
708 	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
709 	svm_paging_info(vmcb, &paging);
710 
711 	switch (paging.cpu_mode) {
712 	case CPU_MODE_REAL:
713 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
714 		vmexit->u.mmio_emul.cs_base = seg->base;
715 		vmexit->u.mmio_emul.cs_d = 0;
716 		break;
717 	case CPU_MODE_PROTECTED:
718 	case CPU_MODE_COMPATIBILITY:
719 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
720 		vmexit->u.mmio_emul.cs_base = seg->base;
721 
722 		/*
723 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
724 		 */
725 		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
726 		    1 : 0;
727 		break;
728 	default:
729 		vmexit->u.mmio_emul.cs_base = 0;
730 		vmexit->u.mmio_emul.cs_d = 0;
731 		break;
732 	}
733 
734 	/*
735 	 * Copy the instruction bytes into 'vie' if available.
736 	 */
737 	if (decode_assist() && !disable_npf_assist) {
738 		inst_len = ctrl->inst_len;
739 		inst_bytes = (char *)ctrl->inst_bytes;
740 	}
741 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
742 	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
743 }
744 
/*
 * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
 * which is live in the guest.  They are visible via the shadow instead.
 *
 * The mask selects the %cr0 bits which are kept in the VMCB.  Its complement
 * selects the bits which svm_set_cr0() and svm_get_cr0() keep only in the
 * shadow copy (sctx_cr0_shadow).
 */
#define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)
750 
751 static void
752 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
753 {
754 	struct vmcb_state *state;
755 	struct svm_regctx *regctx;
756 	uint64_t masked, old, diff;
757 
758 	state = svm_get_vmcb_state(svm_sc, vcpu);
759 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
760 
761 	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
762 	diff = old ^ val;
763 
764 	/* No further work needed if register contents remain the same */
765 	if (diff == 0) {
766 		return;
767 	}
768 
769 	/* Flush the TLB if the paging or write-protect bits are changing */
770 	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
771 		flush_asid(svm_sc, vcpu);
772 	}
773 
774 	/*
775 	 * If the change in %cr0 is due to a guest action (via interception)
776 	 * then other CPU state updates may be required.
777 	 */
778 	if (guest_write) {
779 		if ((diff & CR0_PG) != 0) {
780 			uint64_t efer = state->efer;
781 
782 			/* Keep the long-mode state in EFER in sync */
783 			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
784 				state->efer |= EFER_LMA;
785 			}
786 			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
787 				state->efer &= ~EFER_LMA;
788 			}
789 		}
790 	}
791 
792 	masked = val & SVM_CR0_MASK;
793 	regctx->sctx_cr0_shadow = val;
794 	state->cr0 = masked;
795 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
796 
797 	if ((masked ^ val) != 0) {
798 		/*
799 		 * The guest has set bits in %cr0 which we are masking out and
800 		 * exposing via shadow.
801 		 *
802 		 * We must intercept %cr0 reads in order to make the shadowed
803 		 * view available to the guest.
804 		 *
805 		 * Writes to %cr0 must also be intercepted (unconditionally,
806 		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
807 		 * if/when the guest clears those shadowed bits.
808 		 */
809 		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
810 		    BIT(0) | BIT(16));
811 	} else {
812 		/*
813 		 * When no bits remain in %cr0 which require shadowing, the
814 		 * unconditional intercept of reads/writes to %cr0 can be
815 		 * disabled.
816 		 *
817 		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
818 		 * in place so we can be notified of operations which change
819 		 * bits other than TS or MP.
820 		 */
821 		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
822 		    BIT(0) | BIT(16));
823 	}
824 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
825 }
826 
827 static void
828 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
829 {
830 	struct vmcb *vmcb;
831 	struct svm_regctx *regctx;
832 
833 	vmcb = svm_get_vmcb(svm_sc, vcpu);
834 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
835 
836 	/*
837 	 * Include the %cr0 bits which exist only in the shadow along with those
838 	 * in the running vCPU state.
839 	 */
840 	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
841 }
842 
843 static void
844 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
845 {
846 	uint64_t val;
847 	int err __maybe_unused;
848 
849 	svm_get_cr0(svm_sc, vcpu, &val);
850 	err = svm_setreg(svm_sc, vcpu, reg, val);
851 	ASSERT(err == 0);
852 }
853 
854 static void
855 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
856 {
857 	struct vmcb_state *state;
858 	uint64_t val;
859 	int err __maybe_unused;
860 
861 	state = svm_get_vmcb_state(svm_sc, vcpu);
862 
863 	err = svm_getreg(svm_sc, vcpu, reg, &val);
864 	ASSERT(err == 0);
865 
866 	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
867 		/* NW without CD is nonsensical */
868 		vm_inject_gp(svm_sc->vm, vcpu);
869 		return;
870 	}
871 	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
872 		/* PG requires PE */
873 		vm_inject_gp(svm_sc->vm, vcpu);
874 		return;
875 	}
876 	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
877 		/* When enabling paging, PAE must be enabled if LME is. */
878 		if ((state->efer & EFER_LME) != 0 &&
879 		    (state->cr4 & CR4_PAE) == 0) {
880 			vm_inject_gp(svm_sc->vm, vcpu);
881 			return;
882 		}
883 	}
884 
885 	svm_set_cr0(svm_sc, vcpu, val, true);
886 }
887 
888 static void
889 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
890 {
891 	struct vie *vie;
892 	struct vm_guest_paging paging;
893 
894 	/* Let the instruction emulation (hopefully in-kernel) handle it */
895 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
896 	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
897 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
898 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
899 	vie_init_other(vie, &paging);
900 
901 	/* The instruction emulation will handle advancing %rip */
902 	vmexit->inst_length = 0;
903 }
904 
905 static void
906 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
907 {
908 	struct vm *vm;
909 	struct vlapic *vlapic;
910 	struct vmcb_ctrl *ctrl;
911 
912 	vm = sc->vm;
913 	vlapic = vm_lapic(vm, vcpu);
914 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
915 
916 	/* Update %cr8 in the emulated vlapic */
917 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
918 
919 	/* Virtual interrupt injection is not used. */
920 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
921 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
922 }
923 
/*
 * Compile-time checks that the VMCB event-injection encodings are equal to
 * the corresponding VM_INTINFO_* definitions.  svm_stash_intinfo() passes the
 * VMCB-format value to vm_exit_intinfo() on the strength of this.
 */
CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);
930 
931 /*
932  * Store SVM-specific event injection info for later handling.  This depends on
933  * the bhyve-internal event definitions matching those in the VMCB, as ensured
934  * by the above CTASSERTs.
935  */
936 static void
937 svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
938 {
939 	ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
940 
941 	/*
942 	 * If stashing an NMI pending injection, ensure that it bears the
943 	 * correct vector which exit_intinfo expects.
944 	 */
945 	if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
946 		intinfo &= ~VM_INTINFO_MASK_VECTOR;
947 		intinfo |= IDT_NMI;
948 	}
949 
950 	VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
951 }
952 
953 static void
954 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
955 {
956 	struct vmcb_ctrl *ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
957 	uint64_t intinfo = ctrl->exitintinfo;
958 
959 	if (VMCB_EXITINTINFO_VALID(intinfo)) {
960 		/*
961 		 * If a #VMEXIT happened during event delivery then record the
962 		 * event that was being delivered.
963 		 */
964 		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
965 
966 		svm_stash_intinfo(svm_sc, vcpu, intinfo);
967 	}
968 }
969 
970 static __inline int
971 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
972 {
973 
974 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
975 	    VMCB_INTCPT_VINTR));
976 }
977 
978 static void
979 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
980 {
981 	struct vmcb_ctrl *ctrl;
982 	struct vmcb_state *state;
983 
984 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
985 	state = svm_get_vmcb_state(sc, vcpu);
986 
987 	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
988 		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
989 		    ("%s: invalid v_ign_tpr", __func__));
990 		KASSERT(vintr_intercept_enabled(sc, vcpu),
991 		    ("%s: vintr intercept should be enabled", __func__));
992 		return;
993 	}
994 
995 	/*
996 	 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
997 	 * hypervisor as soon as a virtual interrupt can be delivered.
998 	 *
999 	 * Since injected events are not subject to intercept checks we need to
1000 	 * ensure that the V_IRQ is not actually going to be delivered on VM
1001 	 * entry.
1002 	 */
1003 	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
1004 	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
1005 
1006 	ctrl->v_irq |= V_IRQ;
1007 	ctrl->v_intr_prio |= V_IGN_TPR;
1008 	ctrl->v_intr_vector = 0;
1009 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1010 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1011 }
1012 
1013 static void
1014 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
1015 {
1016 	struct vmcb_ctrl *ctrl;
1017 
1018 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1019 
1020 	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
1021 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
1022 		    ("%s: vintr intercept should be disabled", __func__));
1023 		return;
1024 	}
1025 
1026 	ctrl->v_irq &= ~V_IRQ;
1027 	ctrl->v_intr_vector = 0;
1028 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1029 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1030 }
1031 
1032 /*
1033  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1034  * executes an IRET. The IRET intercept is enabled when an NMI is injected to
1035  * to track when the vcpu is done handling the NMI.
1036  */
1037 static int
1038 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1039 {
1040 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1041 	    VMCB_INTCPT_IRET));
1042 }
1043 
1044 static void
1045 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1046 {
1047 	struct vmcb_ctrl *ctrl;
1048 
1049 	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1050 	/*
1051 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1052 	 * the "iret" when it runs next. However, it is possible to inject
1053 	 * another NMI into the vcpu before the "iret" has actually executed.
1054 	 *
1055 	 * For e.g. if the "iret" encounters a #NPF when accessing the stack
1056 	 * it will trap back into the hypervisor. If an NMI is pending for
1057 	 * the vcpu it will be injected into the guest.
1058 	 *
1059 	 * XXX this needs to be fixed
1060 	 */
1061 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1062 
1063 	/*
1064 	 * Set an interrupt shadow to prevent an NMI from being immediately
1065 	 * injected on the next VMRUN.
1066 	 */
1067 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1068 	ctrl->intr_shadow = 1;
1069 }
1070 
1071 static void
1072 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1073 {
1074 	ASSERT(VM_INTINFO_PENDING(info));
1075 
1076 	uint8_t vector = VM_INTINFO_VECTOR(info);
1077 	uint32_t type = VM_INTINFO_TYPE(info);
1078 
1079 	/*
1080 	 * Correct behavior depends on bhyve intinfo event types lining up with
1081 	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
1082 	 * above svm_save_exitintinfo() ensure it.
1083 	 */
1084 	switch (type) {
1085 	case VM_INTINFO_NMI:
1086 		/* Ensure vector for injected event matches its type (NMI) */
1087 		vector = IDT_NMI;
1088 		break;
1089 	case VM_INTINFO_HWINTR:
1090 	case VM_INTINFO_SWINTR:
1091 		break;
1092 	case VM_INTINFO_HWEXCP:
1093 		if (vector == IDT_NMI) {
1094 			/*
1095 			 * NMIs are expected to be injected with
1096 			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1097 			 * with the NMI vector.
1098 			 */
1099 			type = VM_INTINFO_NMI;
1100 		}
1101 		VERIFY(vector < 32);
1102 		break;
1103 	default:
1104 		/*
1105 		 * Since there is not strong validation for injected event types
1106 		 * at this point, fall back to software interrupt for those we
1107 		 * do not recognized.
1108 		 */
1109 		type = VM_INTINFO_SWINTR;
1110 		break;
1111 	}
1112 
1113 	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1114 	if (VM_INTINFO_HAS_ERRCODE(info)) {
1115 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1116 		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1117 	}
1118 }
1119 
1120 static void
1121 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1122 {
1123 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1124 
1125 	ASSERT(!svm_nmi_blocked(sc, vcpu));
1126 
1127 	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1128 	vm_nmi_clear(sc->vm, vcpu);
1129 
1130 	/*
1131 	 * Virtual NMI blocking is now in effect.
1132 	 *
1133 	 * Not only does this block a subsequent NMI injection from taking
1134 	 * place, it also configures an intercept on the IRET so we can track
1135 	 * when the next injection can take place.
1136 	 */
1137 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1138 }
1139 
1140 static void
1141 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1142 {
1143 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1144 
1145 	ASSERT(vector >= 0 && vector <= 255);
1146 
1147 	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1148 }
1149 
1150 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
1151 
1152 static vm_msr_result_t
1153 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1154 {
1155 	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1156 	uint64_t lma;
1157 	int error;
1158 
1159 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
1160 
1161 	if (newval & EFER_MBZ_BITS) {
1162 		return (VMR_GP);
1163 	}
1164 
1165 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1166 	const uint64_t changed = state->efer ^ newval;
1167 	if (changed & EFER_LME) {
1168 		if (state->cr0 & CR0_PG) {
1169 			return (VMR_GP);
1170 		}
1171 	}
1172 
1173 	/* EFER.LMA = EFER.LME & CR0.PG */
1174 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1175 		lma = EFER_LMA;
1176 	} else {
1177 		lma = 0;
1178 	}
1179 	if ((newval & EFER_LMA) != lma) {
1180 		return (VMR_GP);
1181 	}
1182 
1183 	if ((newval & EFER_NXE) != 0 &&
1184 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1185 		return (VMR_GP);
1186 	}
1187 	if ((newval & EFER_FFXSR) != 0 &&
1188 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1189 		return (VMR_GP);
1190 	}
1191 	if ((newval & EFER_TCE) != 0 &&
1192 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1193 		return (VMR_GP);
1194 	}
1195 
1196 	/*
1197 	 * Until bhyve has proper support for long-mode segment limits, just
1198 	 * toss a #GP at the guest if they attempt to use it.
1199 	 */
1200 	if (newval & EFER_LMSLE) {
1201 		return (VMR_GP);
1202 	}
1203 
1204 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1205 	VERIFY0(error);
1206 	return (VMR_OK);
1207 }
1208 
1209 static int
1210 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1211     bool is_wrmsr)
1212 {
1213 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1214 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1215 	const uint32_t ecx = ctx->sctx_rcx;
1216 	vm_msr_result_t res;
1217 	uint64_t val = 0;
1218 
1219 	if (is_wrmsr) {
1220 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1221 		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1222 
1223 		if (vlapic_owned_msr(ecx)) {
1224 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1225 
1226 			res = vlapic_wrmsr(vlapic, ecx, val);
1227 		} else if (ecx == MSR_EFER) {
1228 			res = svm_write_efer(svm_sc, vcpu, val);
1229 		} else {
1230 			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1231 		}
1232 	} else {
1233 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1234 
1235 		if (vlapic_owned_msr(ecx)) {
1236 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1237 
1238 			res = vlapic_rdmsr(vlapic, ecx, &val);
1239 		} else {
1240 			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1241 		}
1242 	}
1243 
1244 	switch (res) {
1245 	case VMR_OK:
1246 		/* Store rdmsr result in the appropriate registers */
1247 		if (!is_wrmsr) {
1248 			state->rax = (uint32_t)val;
1249 			ctx->sctx_rdx = val >> 32;
1250 		}
1251 		return (1);
1252 	case VMR_GP:
1253 		vm_inject_gp(svm_sc->vm, vcpu);
1254 		return (1);
1255 	case VMR_UNHANLDED:
1256 		vmexit->exitcode = is_wrmsr ?
1257 		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1258 		vmexit->u.msr.code = ecx;
1259 		vmexit->u.msr.wval = val;
1260 		return (0);
1261 	default:
1262 		panic("unexpected msr result %u\n", res);
1263 	}
1264 }
1265 
/*
 * Per the "State Saved on Exit" section of APMv2, nRIP is recorded for every
 * #VMEXIT caused by an instruction intercept, by an MSR or IOIO intercept, and
 * by the exceptions raised from the INT3, INTO and BOUND instructions.
 *
 * Returns 1 when 'exitcode' is one for which nRIP is valid, 0 otherwise.
 */
static int
nrip_valid(uint64_t exitcode)
{
	/* 0x00-0x3F: reads and writes of CR0-CR15 and DR0-DR15 */
	if (exitcode <= 0x3F) {
		return (1);
	}

	/* 0x43-0x45: INT3, INTO, BOUND */
	if (exitcode >= 0x43 && exitcode <= 0x45) {
		return (1);
	}

	/* 0x65-0x7C: VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
	if (exitcode >= 0x65 && exitcode <= 0x7C) {
		return (1);
	}

	/* 0x80-0x8D: VMEXIT_VMRUN ... VMEXIT_XSETBV */
	if (exitcode >= 0x80 && exitcode <= 0x8D) {
		return (1);
	}

	return (0);
}
1291 
/*
 * Decode and process a #VMEXIT for the given vCPU.
 *
 * The exit code and both exit-info words are read from the VMCB control area,
 * 'vmexit' is pre-populated (rip and, when nRIP is valid for the exit code,
 * the instruction length), and the exit is then dispatched by exit code.
 *
 * Returns non-zero when the exit was fully handled here; in that case the
 * guest %rip is advanced by 'inst_length' and written back to the VMCB.
 * Returns 0 when the exit needs further processing: 'vmexit' then carries
 * either the specific exit code set by a handler below or, if no handler
 * claimed it, the generic SVM exit populated by vm_exit_svm().
 */
static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb *vmcb;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct svm_regctx *ctx;
	uint64_t code, info1, info2;
	int handled;

	ctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb = svm_get_vmcb(svm_sc, vcpu);
	state = &vmcb->state;
	ctrl = &vmcb->ctrl;

	handled = 0;
	code = ctrl->exitcode;
	info1 = ctrl->exitinfo1;
	info2 = ctrl->exitinfo2;

	/* Start out unclaimed; handlers below overwrite the exit code. */
	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmexit->rip = state->rip;
	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;

	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);

	/*
	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
	 * in an inconsistent state and can trigger assertions that would
	 * never happen otherwise.
	 */
	if (code == VMCB_EXIT_INVALID) {
		vm_exit_svm(vmexit, code, info1, info2);
		return (0);
	}

	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
	    "injection valid bit is set %lx", __func__, ctrl->eventinj));

	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
	    vmexit->inst_length, code, info1, info2));

	svm_update_virqinfo(svm_sc, vcpu);
	svm_save_exitintinfo(svm_sc, vcpu);

	switch (code) {
	case VMCB_EXIT_CR0_READ:
		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
			svm_handle_cr0_read(svm_sc, vcpu,
			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
			handled = 1;
		} else {
			/*
			 * If SMSW is used to read the contents of %cr0, then
			 * the VALID bit will not be set in `info1`, since the
			 * handling is different from the mov-to-reg case.
			 *
			 * Punt to the instruction emulation to handle it.
			 */
			svm_inst_emul_other(svm_sc, vcpu, vmexit);
		}
		break;
	case VMCB_EXIT_CR0_WRITE:
	case VMCB_EXIT_CR0_SEL_WRITE:
		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
			svm_handle_cr0_write(svm_sc, vcpu,
			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
			handled = 1;
		} else {
			/*
			 * Writes to %cr0 without VALID being set in `info1` are
			 * initiated by the LMSW and CLTS instructions.  While
			 * LMSW (like SMSW) sees little use in modern OSes and
			 * bootloaders, CLTS is still used for handling FPU
			 * state transitions.
			 *
			 * Punt to the instruction emulation to handle them.
			 */
			svm_inst_emul_other(svm_sc, vcpu, vmexit);
		}
		break;
	case VMCB_EXIT_IRET:
		/*
		 * Restart execution at "iret" but with the intercept cleared.
		 */
		vmexit->inst_length = 0;
		svm_clear_nmi_blocking(svm_sc, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
		svm_disable_intr_window_exiting(svm_sc, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_INTR:	/* external interrupt */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
		handled = 1;
		break;
	case VMCB_EXIT_NMI:
	case VMCB_EXIT_SMI:
	case VMCB_EXIT_INIT:
		/*
		 * For external NMI/SMI and physical INIT interrupts, simply
		 * continue execution, as those host events will be handled by
		 * the physical CPU.
		 */
		handled = 1;
		break;
	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);

		/* The exception vector is the offset from the first EXCP code */
		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
		uint32_t errcode = 0;
		bool reflect = true;
		bool errcode_valid = false;

		switch (idtvec) {
		case IDT_MC:
			/* The host will handle the MCE itself. */
			reflect = false;
			vmm_call_trap(T_MCE);
			break;
		case IDT_PF:
			/* For #PF, EXITINFO2 is stored into the guest %cr2 */
			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
			    info2));
			/* fallthru */
		case IDT_NP:
		case IDT_SS:
		case IDT_GP:
		case IDT_AC:
		case IDT_TS:
			/* These exceptions carry an error code in EXITINFO1 */
			errcode_valid = true;
			errcode = info1;
			break;

		case IDT_DF:
			/* #DF has an error code, left at its initial zero */
			errcode_valid = true;
			break;

		case IDT_BP:
		case IDT_OF:
		case IDT_BR:
			/*
			 * The 'nrip' field is populated for INT3, INTO and
			 * BOUND exceptions and this also implies that
			 * 'inst_length' is non-zero.
			 *
			 * Reset 'inst_length' to zero so the guest %rip at
			 * event injection is identical to what it was when
			 * the exception originally happened.
			 */
			vmexit->inst_length = 0;
			/* fallthru */
		default:
			errcode_valid = false;
			break;
		}
		VERIFY0(vmexit->inst_length);

		if (reflect) {
			/* Reflect the exception back into the guest */
			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
			    errcode_valid, errcode, false));
		}
		handled = 1;
		break;
		}
	case VMCB_EXIT_MSR:
		/* A non-zero EXITINFO1 is treated as WRMSR, zero as RDMSR */
		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
		break;
	case VMCB_EXIT_IO:
		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
		break;
	case VMCB_EXIT_SHUTDOWN:
		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT);
		handled = 1;
		break;
	case VMCB_EXIT_INVLPGA:
		/* privileged invalidation instructions */
		vm_inject_ud(svm_sc->vm, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_VMRUN:
	case VMCB_EXIT_VMLOAD:
	case VMCB_EXIT_VMSAVE:
	case VMCB_EXIT_STGI:
	case VMCB_EXIT_CLGI:
	case VMCB_EXIT_SKINIT:
		/* privileged vmm instructions */
		vm_inject_ud(svm_sc->vm, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_INVD:
	case VMCB_EXIT_WBINVD:
		/* ignore exit */
		handled = 1;
		break;
	case VMCB_EXIT_VMMCALL:
		/* No handlers make use of VMMCALL for now */
		vm_inject_ud(svm_sc->vm, vcpu);
		handled = 1;
		break;
	case VMCB_EXIT_CPUID:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
		vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
		handled = 1;
		break;
	case VMCB_EXIT_HLT:
		/* Left unhandled here: the HLT exit is passed to the caller */
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
		vmexit->exitcode = VM_EXITCODE_HLT;
		vmexit->u.hlt.rflags = state->rflags;
		break;
	case VMCB_EXIT_PAUSE:
		vmexit->exitcode = VM_EXITCODE_PAUSE;
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
		break;
	case VMCB_EXIT_NPF:
		/* EXITINFO2 contains the faulting guest physical address */
		if (info1 & VMCB_NPF_INFO1_RSV) {
			/* nested fault with reserved bits set */
		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
			vmexit->exitcode = VM_EXITCODE_PAGING;
			vmexit->u.paging.gpa = info2;
			vmexit->u.paging.fault_type = npf_fault_type(info1);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
		} else if (svm_npf_emul_fault(info1)) {
			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
		}
		break;
	case VMCB_EXIT_MONITOR:
		vmexit->exitcode = VM_EXITCODE_MONITOR;
		break;
	case VMCB_EXIT_MWAIT:
		vmexit->exitcode = VM_EXITCODE_MWAIT;
		break;
	default:
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
		break;
	}

	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
	    code);

	if (handled) {
		/* Step past the handled instruction and resume from there. */
		vmexit->rip += vmexit->inst_length;
		vmexit->inst_length = 0;
		state->rip = vmexit->rip;
	} else {
		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
			/*
			 * If this VM exit was not claimed by anybody then
			 * treat it as a generic SVM exit.
			 */
			vm_exit_svm(vmexit, code, info1, info2);
		} else {
			/*
			 * The exitcode and collateral have been populated.
			 * The VM exit will be processed further in userland.
			 */
		}
	}
	return (handled);
}
1559 
/*
 * Inject exceptions, NMIs, and ExtINTs.
 *
 * The logic behind these are complicated and may involve mutex contention, so
 * the injection is performed without the protection of host CPU interrupts
 * being disabled.  This means a racing notification could be "lost",
 * necessitating a later call to svm_inject_recheck() to close that window
 * of opportunity.
 *
 * Sources are considered in a fixed order: a pending intinfo event first, then
 * an NMI, then an ExtINT.  Only one event is placed in EVENTINJ per call.
 *
 * The returned state is EIS_CAN_INJECT when nothing was injected,
 * EIS_EV_INJECTED when an event was placed in EVENTINJ, EIS_GI_BLOCK when a
 * pending NMI/ExtINT could not be injected because of guest interrupt state,
 * and EIS_EV_EXISTING when EVENTINJ was already valid on entry.  EIS_REQ_EXIT
 * is OR-ed in when a further event is waiting behind the one in EVENTINJ.
 */
static enum event_inject_state
svm_inject_events(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct svm_vcpu *vcpustate;
	uint64_t intinfo;
	enum event_inject_state ev_state;

	state = svm_get_vmcb_state(sc, vcpu);
	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
	vcpustate = svm_get_vcpu(sc, vcpu);
	ev_state = EIS_CAN_INJECT;

	/* Clear any interrupt shadow if guest %rip has changed */
	if (vcpustate->nextrip != state->rip) {
		ctrl->intr_shadow = 0;
	}

	/*
	 * An event is already pending for injection.  This can occur when the
	 * vCPU exits prior to VM entry (like for an AST).
	 */
	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
	}

	/*
	 * Inject pending events or exceptions for this vcpu.
	 *
	 * An event might be pending because the previous #VMEXIT happened
	 * during event delivery (i.e. ctrl->exitintinfo).
	 *
	 * An event might also be pending because an exception was injected
	 * by the hypervisor (e.g. #PF during instruction emulation).
	 */
	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
		svm_inject_event(ctrl, intinfo);
		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
		ev_state = EIS_EV_INJECTED;
	}

	/* NMI event has priority over interrupts. */
	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
		if (ev_state == EIS_CAN_INJECT) {
			/* Can't inject NMI if vcpu is in an intr_shadow. */
			if (ctrl->intr_shadow) {
				return (EIS_GI_BLOCK);
			}

			svm_inject_nmi(sc, vcpu);
			ev_state = EIS_EV_INJECTED;
		} else {
			/* EVENTINJ is occupied; ask for an exit to retry. */
			return (ev_state | EIS_REQ_EXIT);
		}
	}

	if (vm_extint_pending(sc->vm, vcpu)) {
		int vector;

		/* EVENTINJ is occupied; ask for an exit to retry. */
		if (ev_state != EIS_CAN_INJECT) {
			return (ev_state | EIS_REQ_EXIT);
		}

		/*
		 * If the guest has disabled interrupts or is in an interrupt
		 * shadow then we cannot inject the pending interrupt.
		 */
		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
			return (EIS_GI_BLOCK);
		}

		/* Ask the legacy pic for a vector to inject */
		vatpic_pending_intr(sc->vm, &vector);
		KASSERT(vector >= 0 && vector <= 255,
		    ("invalid vector %d from INTR", vector));

		svm_inject_irq(sc, vcpu, vector);
		vm_extint_clear(sc->vm, vcpu);
		vatpic_intr_accepted(sc->vm, vector);
		ev_state = EIS_EV_INJECTED;
	}

	return (ev_state);
}
1654 
1655 /*
1656  * Synchronize vLAPIC state and inject any interrupts pending on it.
1657  *
1658  * This is done with host CPU interrupts disabled so notification IPIs will be
1659  * queued on the host APIC and recognized when entering SVM guest context.
1660  */
1661 static enum event_inject_state
1662 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1663     enum event_inject_state ev_state)
1664 {
1665 	struct vmcb_ctrl *ctrl;
1666 	struct vmcb_state *state;
1667 	int vector;
1668 	uint8_t v_tpr;
1669 
1670 	state = svm_get_vmcb_state(sc, vcpu);
1671 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1672 
1673 	/*
1674 	 * The guest can modify the TPR by writing to %cr8. In guest mode the
1675 	 * CPU reflects this write to V_TPR without hypervisor intervention.
1676 	 *
1677 	 * The guest can also modify the TPR by writing to it via the memory
1678 	 * mapped APIC page. In this case, the write will be emulated by the
1679 	 * hypervisor. For this reason V_TPR must be updated before every
1680 	 * VMRUN.
1681 	 */
1682 	v_tpr = vlapic_get_cr8(vlapic);
1683 	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1684 	if (ctrl->v_tpr != v_tpr) {
1685 		ctrl->v_tpr = v_tpr;
1686 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1687 	}
1688 
1689 	/* If an event cannot otherwise be injected, we are done for now */
1690 	if (ev_state != EIS_CAN_INJECT) {
1691 		return (ev_state);
1692 	}
1693 
1694 	if (!vlapic_pending_intr(vlapic, &vector)) {
1695 		return (EIS_CAN_INJECT);
1696 	}
1697 	KASSERT(vector >= 16 && vector <= 255,
1698 	    ("invalid vector %d from local APIC", vector));
1699 
1700 	/*
1701 	 * If the guest has disabled interrupts or is in an interrupt shadow
1702 	 * then we cannot inject the pending interrupt.
1703 	 */
1704 	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1705 		return (EIS_GI_BLOCK);
1706 	}
1707 
1708 	svm_inject_irq(sc, vcpu, vector);
1709 	vlapic_intr_accepted(vlapic, vector);
1710 	return (EIS_EV_INJECTED);
1711 }
1712 
1713 /*
1714  * Re-check for events to be injected.
1715  *
1716  * Once host CPU interrupts are disabled, check for the presence of any events
1717  * which require injection processing.  If an exit is required upon injection,
1718  * or once the guest becomes interruptable, that will be configured too.
1719  */
1720 static bool
1721 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1722     enum event_inject_state ev_state)
1723 {
1724 	struct vmcb_ctrl *ctrl;
1725 
1726 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1727 
1728 	if (ev_state == EIS_CAN_INJECT) {
1729 		/*
1730 		 * An active interrupt shadow would preclude us from injecting
1731 		 * any events picked up during a re-check.
1732 		 */
1733 		if (ctrl->intr_shadow != 0) {
1734 			return (false);
1735 		}
1736 
1737 		if (vm_nmi_pending(sc->vm, vcpu) &&
1738 		    !svm_nmi_blocked(sc, vcpu)) {
1739 			/* queued NMI not blocked by NMI-window-exiting */
1740 			return (true);
1741 		}
1742 		if (vm_extint_pending(sc->vm, vcpu)) {
1743 			/* queued ExtINT not blocked by existing injection */
1744 			return (true);
1745 		}
1746 	} else {
1747 		if ((ev_state & EIS_REQ_EXIT) != 0) {
1748 			/*
1749 			 * Use a self-IPI to force an immediate exit after
1750 			 * event injection has occurred.
1751 			 */
1752 			poke_cpu(CPU->cpu_id);
1753 		} else {
1754 			/*
1755 			 * If any event is being injected, an exit immediately
1756 			 * upon becoming interruptable again will allow pending
1757 			 * or newly queued events to be injected in a timely
1758 			 * manner.
1759 			 */
1760 			svm_enable_intr_window_exiting(sc, vcpu);
1761 		}
1762 	}
1763 	return (false);
1764 }
1765 
1766 
1767 static void
1768 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1769 {
1770 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1771 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1772 	uint8_t flush;
1773 
1774 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1775 	    vcpustate->nptgen != nptgen);
1776 
1777 	if (flush != VMCB_TLB_FLUSH_NOTHING) {
1778 		ctrl->asid = vcpustate->hma_asid.hsa_asid;
1779 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1780 	}
1781 	ctrl->tlb_ctrl = flush;
1782 	vcpustate->nptgen = nptgen;
1783 }
1784 
1785 static void
1786 flush_asid(struct svm_softc *sc, int vcpuid)
1787 {
1788 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1789 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1790 	uint8_t flush;
1791 
1792 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1793 	    true);
1794 
1795 	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1796 	ctrl->asid = vcpustate->hma_asid.hsa_asid;
1797 	ctrl->tlb_ctrl = flush;
1798 	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1799 	/*
1800 	 * A potential future optimization: We could choose to update the nptgen
1801 	 * associated with the vCPU, since any pending nptgen change requiring a
1802 	 * flush will be satisfied by the one which has just now been queued.
1803 	 */
1804 }
1805 
/*
 * Disable global interrupts on the current CPU by executing the "clgi"
 * instruction.  Paired with enable_gintr().
 */
static __inline void
disable_gintr(void)
{
	__asm __volatile("clgi");
}
1811 
/*
 * Re-enable global interrupts on the current CPU by executing the "stgi"
 * instruction.  Paired with disable_gintr().
 */
static __inline void
enable_gintr(void)
{
	__asm __volatile("stgi");
}
1817 
/*
 * Switch debug register state from host to guest before entering the guest.
 *
 * Host DR7 and DEBUGCTL are saved and zeroed first, then host DR0-DR3 and DR6
 * are saved into 'gctx', and finally the guest DR0-DR3 values held in 'gctx'
 * are loaded.  svm_dr_leave_guest() reverses this.
 */
static __inline void
svm_dr_enter_guest(struct svm_regctx *gctx)
{

	/* Save host control debug registers. */
	gctx->host_dr7 = rdr7();
	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);

	/*
	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
	 * exceptions in the host based on the guest DRx values.  The
	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
	 * VMCB.
	 */
	load_dr7(0);
	wrmsr(MSR_DEBUGCTLMSR, 0);

	/* Save host debug registers. */
	gctx->host_dr0 = rdr0();
	gctx->host_dr1 = rdr1();
	gctx->host_dr2 = rdr2();
	gctx->host_dr3 = rdr3();
	gctx->host_dr6 = rdr6();

	/* Restore guest debug registers. */
	load_dr0(gctx->sctx_dr0);
	load_dr1(gctx->sctx_dr1);
	load_dr2(gctx->sctx_dr2);
	load_dr3(gctx->sctx_dr3);
}
1848 
/*
 * Switch debug register state from guest back to host after leaving the guest.
 *
 * The guest DR0-DR3 values are stashed in 'gctx', then the host DR0-DR3, DR6,
 * DEBUGCTL and DR7 values saved by svm_dr_enter_guest() are put back.
 */
static __inline void
svm_dr_leave_guest(struct svm_regctx *gctx)
{

	/* Save guest debug registers. */
	gctx->sctx_dr0 = rdr0();
	gctx->sctx_dr1 = rdr1();
	gctx->sctx_dr2 = rdr2();
	gctx->sctx_dr3 = rdr3();

	/*
	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
	 * last.
	 */
	load_dr0(gctx->host_dr0);
	load_dr1(gctx->host_dr1);
	load_dr2(gctx->host_dr2);
	load_dr3(gctx->host_dr3);
	load_dr6(gctx->host_dr6);
	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
	load_dr7(gctx->host_dr7);
}
1871 
1872 /*
1873  * Apply the TSC offset for a vCPU, including physical CPU and per-vCPU offsets.
1874  */
1875 static void
1876 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1877 {
1878 	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1879 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1880 
1881 	if (ctrl->tsc_offset != offset) {
1882 		ctrl->tsc_offset = offset;
1883 		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1884 	}
1885 }
1886 
/*
 * Start vcpu with specified RIP.
 *
 * 'arg' is the svm_softc for the VM.  After per-entry setup (migration
 * handling, TSC offset, guest MSR state) the function loops: inject events,
 * disable global interrupts, perform the bail-out and re-check tests, launch
 * the guest, and process the resulting #VMEXIT through svm_vmexit().  The loop
 * repeats for as long as exits are handled in-kernel.
 *
 * Always returns 0; the reason for leaving the loop is conveyed through the
 * vm_exit structure obtained from vm_exitinfo().
 */
static int
svm_vmrun(void *arg, int vcpu, uint64_t rip)
{
	struct svm_regctx *gctx;
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpustate;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct vm_exit *vmexit;
	struct vlapic *vlapic;
	vm_client_t *vmc;
	struct vm *vm;
	uint64_t vmcb_pa;
	int handled;
	uint16_t ldt_sel;

	svm_sc = arg;
	vm = svm_sc->vm;

	vcpustate = svm_get_vcpu(svm_sc, vcpu);
	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	vmexit = vm_exitinfo(vm, vcpu);
	vlapic = vm_lapic(vm, vcpu);
	vmc = vm_get_vmclient(vm, vcpu);

	gctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;

	/* The vCPU last ran on a different host CPU (or has never run). */
	if (vcpustate->lastcpu != curcpu) {
		/*
		 * Force new ASID allocation by invalidating the generation.
		 */
		vcpustate->hma_asid.hsa_gen = 0;

		/*
		 * Invalidate the VMCB state cache by marking all fields dirty.
		 */
		svm_set_dirty(svm_sc, vcpu, 0xffffffff);

		/*
		 * XXX
		 * Setting 'vcpustate->lastcpu' here is bit premature because
		 * we may return from this function without actually executing
		 * the VMRUN  instruction. This could happen if an AST or yield
		 * condition is pending on the first time through the loop.
		 *
		 * This works for now but any new side-effects of vcpu
		 * migration should take this case into account.
		 */
		vcpustate->lastcpu = curcpu;
		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
	}

	svm_apply_tsc_adjust(svm_sc, vcpu);

	svm_msr_guest_enter(svm_sc, vcpu);

	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
	vcpustate->loaded = B_TRUE;

	/* Update Guest RIP */
	state->rip = rip;

	do {
		enum event_inject_state inject_state;
		uint64_t nptgen;

		/*
		 * Initial event injection is complex and may involve mutex
		 * contention, so it must be performed with global interrupts
		 * still enabled.
		 */
		inject_state = svm_inject_events(svm_sc, vcpu);
		handled = 0;

		/*
		 * Disable global interrupts to guarantee atomicity during
		 * loading of guest state. This includes not only the state
		 * loaded by the "vmrun" instruction but also software state
		 * maintained by the hypervisor: suspended and rendezvous
		 * state, NPT generation number, vlapic interrupts etc.
		 */
		disable_gintr();

		/*
		 * Synchronizing and injecting vlapic state is lock-free and is
		 * safe (and prudent) to perform with interrupts disabled.
		 */
		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
		    inject_state);

		/*
		 * Check for vCPU bail-out conditions.  This must be done after
		 * svm_inject_events() to detect a triple-fault condition.
		 */
		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
			enable_gintr();
			break;
		}

		if (vcpu_run_state_pending(vm, vcpu)) {
			enable_gintr();
			vm_exit_run_state(vm, vcpu, state->rip);
			break;
		}

		/*
		 * If subsequent activity queued events which require injection
		 * handling, take another lap to handle them.
		 */
		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
			enable_gintr();
			/* Keep the loop condition true for the extra lap. */
			handled = 1;
			continue;
		}

		/*
		 * #VMEXIT resumes the host with the guest LDTR, so
		 * save the current LDT selector so it can be restored
		 * after an exit.  The userspace hypervisor probably
		 * doesn't use a LDT, but save and restore it to be
		 * safe.
		 */
		ldt_sel = sldt();

		/*
		 * Check the vmspace and ASID generations to ensure that the
		 * vcpu does not use stale TLB mappings.
		 */
		nptgen = vmc_table_enter(vmc);
		check_asid(svm_sc, vcpu, curcpu, nptgen);

		/* Fields marked dirty since the last entry are not "clean". */
		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
		vcpustate->dirty = 0;

		/* Launch Virtual Machine. */
		vcpu_ustate_change(vm, vcpu, VU_RUN);
		svm_dr_enter_guest(gctx);
		svm_launch(vmcb_pa, gctx, get_pcpu());
		svm_dr_leave_guest(gctx);
		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);

		/* Restore host LDTR. */
		lldt(ldt_sel);

		/* #VMEXIT disables interrupts so re-enable them here. */
		enable_gintr();

		vmc_table_exit(vmc);

		/* Update 'nextrip' */
		vcpustate->nextrip = state->rip;

		/* Handle #VMEXIT and if required return to user space. */
		handled = svm_vmexit(svm_sc, vcpu, vmexit);
	} while (handled);

	svm_msr_guest_exit(svm_sc, vcpu);

	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
	vcpustate->loaded = B_FALSE;

	return (0);
}
2055 
2056 static void
2057 svm_vmcleanup(void *arg)
2058 {
2059 	struct svm_softc *sc = arg;
2060 
2061 	vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
2062 	vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
2063 	kmem_free(sc, sizeof (*sc));
2064 }
2065 
2066 static uint64_t *
2067 swctx_regptr(struct svm_regctx *regctx, int reg)
2068 {
2069 	switch (reg) {
2070 	case VM_REG_GUEST_RBX:
2071 		return (&regctx->sctx_rbx);
2072 	case VM_REG_GUEST_RCX:
2073 		return (&regctx->sctx_rcx);
2074 	case VM_REG_GUEST_RDX:
2075 		return (&regctx->sctx_rdx);
2076 	case VM_REG_GUEST_RDI:
2077 		return (&regctx->sctx_rdi);
2078 	case VM_REG_GUEST_RSI:
2079 		return (&regctx->sctx_rsi);
2080 	case VM_REG_GUEST_RBP:
2081 		return (&regctx->sctx_rbp);
2082 	case VM_REG_GUEST_R8:
2083 		return (&regctx->sctx_r8);
2084 	case VM_REG_GUEST_R9:
2085 		return (&regctx->sctx_r9);
2086 	case VM_REG_GUEST_R10:
2087 		return (&regctx->sctx_r10);
2088 	case VM_REG_GUEST_R11:
2089 		return (&regctx->sctx_r11);
2090 	case VM_REG_GUEST_R12:
2091 		return (&regctx->sctx_r12);
2092 	case VM_REG_GUEST_R13:
2093 		return (&regctx->sctx_r13);
2094 	case VM_REG_GUEST_R14:
2095 		return (&regctx->sctx_r14);
2096 	case VM_REG_GUEST_R15:
2097 		return (&regctx->sctx_r15);
2098 	case VM_REG_GUEST_DR0:
2099 		return (&regctx->sctx_dr0);
2100 	case VM_REG_GUEST_DR1:
2101 		return (&regctx->sctx_dr1);
2102 	case VM_REG_GUEST_DR2:
2103 		return (&regctx->sctx_dr2);
2104 	case VM_REG_GUEST_DR3:
2105 		return (&regctx->sctx_dr3);
2106 	default:
2107 		return (NULL);
2108 	}
2109 }
2110 
2111 static int
2112 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2113 {
2114 	struct svm_softc *sc;
2115 	struct vmcb *vmcb;
2116 	uint64_t *regp;
2117 	uint64_t *fieldp;
2118 	struct vmcb_segment *seg;
2119 
2120 	sc = arg;
2121 	vmcb = svm_get_vmcb(sc, vcpu);
2122 
2123 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2124 	if (regp != NULL) {
2125 		*val = *regp;
2126 		return (0);
2127 	}
2128 
2129 	switch (ident) {
2130 	case VM_REG_GUEST_INTR_SHADOW:
2131 		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2132 		break;
2133 
2134 	case VM_REG_GUEST_CR0:
2135 		svm_get_cr0(sc, vcpu, val);
2136 		break;
2137 	case VM_REG_GUEST_CR2:
2138 	case VM_REG_GUEST_CR3:
2139 	case VM_REG_GUEST_CR4:
2140 	case VM_REG_GUEST_DR6:
2141 	case VM_REG_GUEST_DR7:
2142 	case VM_REG_GUEST_EFER:
2143 	case VM_REG_GUEST_RAX:
2144 	case VM_REG_GUEST_RFLAGS:
2145 	case VM_REG_GUEST_RIP:
2146 	case VM_REG_GUEST_RSP:
2147 		fieldp = vmcb_regptr(vmcb, ident, NULL);
2148 		*val = *fieldp;
2149 		break;
2150 
2151 	case VM_REG_GUEST_CS:
2152 	case VM_REG_GUEST_DS:
2153 	case VM_REG_GUEST_ES:
2154 	case VM_REG_GUEST_FS:
2155 	case VM_REG_GUEST_GS:
2156 	case VM_REG_GUEST_SS:
2157 	case VM_REG_GUEST_LDTR:
2158 	case VM_REG_GUEST_TR:
2159 		seg = vmcb_segptr(vmcb, ident);
2160 		*val = seg->selector;
2161 		break;
2162 
2163 	case VM_REG_GUEST_GDTR:
2164 	case VM_REG_GUEST_IDTR:
2165 		/* GDTR and IDTR don't have segment selectors */
2166 		return (EINVAL);
2167 
2168 	case VM_REG_GUEST_PDPTE0:
2169 	case VM_REG_GUEST_PDPTE1:
2170 	case VM_REG_GUEST_PDPTE2:
2171 	case VM_REG_GUEST_PDPTE3:
2172 		/*
2173 		 * Unlike VMX, where the PDPTEs are explicitly cached as part of
2174 		 * several well-defined events related to paging (such as
2175 		 * loading %cr3), SVM walks the PDPEs (their PDPTE) as part of
2176 		 * nested paging lookups.  This makes these registers
2177 		 * effectively irrelevant on SVM.
2178 		 *
2179 		 * Rather than tossing an error, emit zeroed values so casual
2180 		 * consumers do not need to be as careful about that difference.
2181 		 */
2182 		*val = 0;
2183 		break;
2184 
2185 	default:
2186 		return (EINVAL);
2187 	}
2188 
2189 	return (0);
2190 }
2191 
2192 static int
2193 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2194 {
2195 	struct svm_softc *sc;
2196 	struct vmcb *vmcb;
2197 	uint64_t *regp;
2198 	uint64_t *fieldp;
2199 	uint32_t dirty;
2200 	struct vmcb_segment *seg;
2201 
2202 	sc = arg;
2203 	vmcb = svm_get_vmcb(sc, vcpu);
2204 
2205 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2206 	if (regp != NULL) {
2207 		*regp = val;
2208 		return (0);
2209 	}
2210 
2211 	dirty = VMCB_CACHE_NONE;
2212 	switch (ident) {
2213 	case VM_REG_GUEST_INTR_SHADOW:
2214 		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2215 		break;
2216 
2217 	case VM_REG_GUEST_EFER:
2218 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2219 		/* EFER_SVM must always be set when the guest is executing */
2220 		*fieldp = val | EFER_SVM;
2221 		dirty |= VMCB_CACHE_CR;
2222 		break;
2223 
2224 	case VM_REG_GUEST_CR0:
2225 		svm_set_cr0(sc, vcpu, val, false);
2226 		break;
2227 	case VM_REG_GUEST_CR2:
2228 	case VM_REG_GUEST_CR3:
2229 	case VM_REG_GUEST_CR4:
2230 	case VM_REG_GUEST_DR6:
2231 	case VM_REG_GUEST_DR7:
2232 	case VM_REG_GUEST_RAX:
2233 	case VM_REG_GUEST_RFLAGS:
2234 	case VM_REG_GUEST_RIP:
2235 	case VM_REG_GUEST_RSP:
2236 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2237 		*fieldp = val;
2238 		break;
2239 
2240 	case VM_REG_GUEST_CS:
2241 	case VM_REG_GUEST_DS:
2242 	case VM_REG_GUEST_ES:
2243 	case VM_REG_GUEST_SS:
2244 	case VM_REG_GUEST_FS:
2245 	case VM_REG_GUEST_GS:
2246 	case VM_REG_GUEST_LDTR:
2247 	case VM_REG_GUEST_TR:
2248 		dirty |= VMCB_CACHE_SEG;
2249 		seg = vmcb_segptr(vmcb, ident);
2250 		seg->selector = (uint16_t)val;
2251 		break;
2252 
2253 	case VM_REG_GUEST_GDTR:
2254 	case VM_REG_GUEST_IDTR:
2255 		/* GDTR and IDTR don't have segment selectors */
2256 		return (EINVAL);
2257 
2258 	case VM_REG_GUEST_PDPTE0:
2259 	case VM_REG_GUEST_PDPTE1:
2260 	case VM_REG_GUEST_PDPTE2:
2261 	case VM_REG_GUEST_PDPTE3:
2262 		/*
2263 		 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can
2264 		 * ignore attempts to set them.  See handler in svm_getreg() for
2265 		 * more details.
2266 		 */
2267 		break;
2268 
2269 	default:
2270 		return (EINVAL);
2271 	}
2272 
2273 	if (dirty != VMCB_CACHE_NONE) {
2274 		svm_set_dirty(sc, vcpu, dirty);
2275 	}
2276 
2277 	/*
2278 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2279 	 * vcpu's ASID. This needs to be treated differently depending on
2280 	 * whether 'running' is true/false.
2281 	 */
2282 
2283 	return (0);
2284 }
2285 
2286 static int
2287 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2288 {
2289 	struct vmcb *vmcb;
2290 	struct svm_softc *sc;
2291 	struct vmcb_segment *seg;
2292 
2293 	sc = arg;
2294 	vmcb = svm_get_vmcb(sc, vcpu);
2295 
2296 	switch (reg) {
2297 	case VM_REG_GUEST_CS:
2298 	case VM_REG_GUEST_DS:
2299 	case VM_REG_GUEST_ES:
2300 	case VM_REG_GUEST_SS:
2301 	case VM_REG_GUEST_FS:
2302 	case VM_REG_GUEST_GS:
2303 	case VM_REG_GUEST_LDTR:
2304 	case VM_REG_GUEST_TR:
2305 		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2306 		seg = vmcb_segptr(vmcb, reg);
2307 		/*
2308 		 * Map seg_desc access to VMCB attribute format.
2309 		 *
2310 		 * SVM uses the 'P' bit in the segment attributes to indicate a
2311 		 * NULL segment so clear it if the segment is marked unusable.
2312 		 */
2313 		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2314 		if (SEG_DESC_UNUSABLE(desc->access)) {
2315 			seg->attrib &= ~0x80;
2316 		}
2317 		/*
2318 		 * Keep CPL synced with the DPL specified for %ss.
2319 		 *
2320 		 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD
2321 		 * (unlike Intel), but accepts such a possible deviation for
2322 		 * what is otherwise unreasonable behavior for a guest OS, since
2323 		 * they do the same synchronization.
2324 		 */
2325 		if (reg == VM_REG_GUEST_SS) {
2326 			vmcb->state.cpl = SEG_DESC_DPL(desc->access);
2327 		}
2328 		break;
2329 
2330 	case VM_REG_GUEST_GDTR:
2331 	case VM_REG_GUEST_IDTR:
2332 		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2333 		seg = vmcb_segptr(vmcb, reg);
2334 		break;
2335 
2336 	default:
2337 		return (EINVAL);
2338 	}
2339 
2340 	ASSERT(seg != NULL);
2341 	seg->base = desc->base;
2342 	seg->limit = desc->limit;
2343 
2344 	return (0);
2345 }
2346 
2347 static int
2348 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2349 {
2350 	struct vmcb *vmcb;
2351 	struct svm_softc *sc;
2352 	struct vmcb_segment *seg;
2353 
2354 	sc = arg;
2355 	vmcb = svm_get_vmcb(sc, vcpu);
2356 
2357 	switch (reg) {
2358 	case VM_REG_GUEST_DS:
2359 	case VM_REG_GUEST_ES:
2360 	case VM_REG_GUEST_FS:
2361 	case VM_REG_GUEST_GS:
2362 	case VM_REG_GUEST_SS:
2363 	case VM_REG_GUEST_LDTR:
2364 		seg = vmcb_segptr(vmcb, reg);
2365 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2366 		/*
2367 		 * VT-x uses bit 16 to indicate a segment that has been loaded
2368 		 * with a NULL selector (aka unusable). The 'desc->access'
2369 		 * field is interpreted in the VT-x format by the
2370 		 * processor-independent code.
2371 		 *
2372 		 * SVM uses the 'P' bit to convey the same information so
2373 		 * convert it into the VT-x format. For more details refer to
2374 		 * section "Segment State in the VMCB" in APMv2.
2375 		 */
2376 		if ((desc->access & 0x80) == 0) {
2377 			/* Unusable segment */
2378 			desc->access |= 0x10000;
2379 		}
2380 
2381 		/*
2382 		 * Just as CPL (in the VMCB) is kept synced to SS when the
2383 		 * segment is written, so too shall the segment sync from CPL
2384 		 * when it is read.
2385 		 */
2386 		if (reg == VM_REG_GUEST_SS) {
2387 			desc->access &=
2388 			    ~(SEG_DESC_DPL_MASK << SEG_DESC_DPL_SHIFT);
2389 			desc->access |=
2390 			    (vmcb->state.cpl & SEG_DESC_DPL_MASK) <<
2391 			    SEG_DESC_DPL_SHIFT;
2392 		}
2393 		break;
2394 
2395 	case VM_REG_GUEST_CS:
2396 	case VM_REG_GUEST_TR:
2397 		seg = vmcb_segptr(vmcb, reg);
2398 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2399 		break;
2400 
2401 	case VM_REG_GUEST_GDTR:
2402 	case VM_REG_GUEST_IDTR:
2403 		seg = vmcb_segptr(vmcb, reg);
2404 		/*
2405 		 * Since there are no access bits associated with the GDTR or
2406 		 * the IDTR, zero out the field to ensure it does not contain
2407 		 * garbage which might confuse the consumer.
2408 		 */
2409 		desc->access = 0;
2410 		break;
2411 
2412 	default:
2413 		return (EINVAL);
2414 	}
2415 
2416 	ASSERT(seg != NULL);
2417 	desc->base = seg->base;
2418 	desc->limit = seg->limit;
2419 	return (0);
2420 }
2421 
2422 static int
2423 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
2424 {
2425 	struct svm_softc *sc = arg;
2426 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2427 	const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL);
2428 
2429 	if (msrp != NULL) {
2430 		*valp = *msrp;
2431 		return (0);
2432 	}
2433 
2434 	return (EINVAL);
2435 }
2436 
2437 static int
2438 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val)
2439 {
2440 	struct svm_softc *sc = arg;
2441 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2442 
2443 	uint32_t dirty = 0;
2444 	uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty);
2445 	if (msrp == NULL) {
2446 		return (EINVAL);
2447 	}
2448 	switch (msr) {
2449 	case MSR_EFER:
2450 		/*
2451 		 * For now, just clone the logic from
2452 		 * svm_setreg():
2453 		 *
2454 		 * EFER_SVM must always be set when the guest is
2455 		 * executing
2456 		 */
2457 		*msrp = val | EFER_SVM;
2458 		break;
2459 	/* TODO: other necessary MSR masking */
2460 	default:
2461 		*msrp = val;
2462 		break;
2463 	}
2464 	if (dirty != 0) {
2465 		svm_set_dirty(sc, vcpu, dirty);
2466 	}
2467 	return (0);
2468 
2469 }
2470 
2471 static int
2472 svm_setcap(void *arg, int vcpu, int type, int val)
2473 {
2474 	struct svm_softc *sc;
2475 	int error;
2476 
2477 	sc = arg;
2478 	error = 0;
2479 	switch (type) {
2480 	case VM_CAP_HALT_EXIT:
2481 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2482 		    VMCB_INTCPT_HLT, val);
2483 		break;
2484 	case VM_CAP_PAUSE_EXIT:
2485 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2486 		    VMCB_INTCPT_PAUSE, val);
2487 		break;
2488 	default:
2489 		error = ENOENT;
2490 		break;
2491 	}
2492 	return (error);
2493 }
2494 
2495 static int
2496 svm_getcap(void *arg, int vcpu, int type, int *retval)
2497 {
2498 	struct svm_softc *sc;
2499 	int error;
2500 
2501 	sc = arg;
2502 	error = 0;
2503 
2504 	switch (type) {
2505 	case VM_CAP_HALT_EXIT:
2506 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2507 		    VMCB_INTCPT_HLT);
2508 		break;
2509 	case VM_CAP_PAUSE_EXIT:
2510 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2511 		    VMCB_INTCPT_PAUSE);
2512 		break;
2513 	default:
2514 		error = ENOENT;
2515 		break;
2516 	}
2517 	return (error);
2518 }
2519 
2520 static struct vlapic *
2521 svm_vlapic_init(void *arg, int vcpuid)
2522 {
2523 	struct svm_softc *svm_sc;
2524 	struct vlapic *vlapic;
2525 
2526 	svm_sc = arg;
2527 	vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
2528 	vlapic->vm = svm_sc->vm;
2529 	vlapic->vcpuid = vcpuid;
2530 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2531 
2532 	vlapic_init(vlapic);
2533 
2534 	return (vlapic);
2535 }
2536 
2537 static void
2538 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2539 {
2540 	vlapic_cleanup(vlapic);
2541 	kmem_free(vlapic, sizeof (struct vlapic));
2542 }
2543 
2544 static void
2545 svm_pause(void *arg, int vcpu)
2546 {
2547 	struct svm_softc *sc = arg;
2548 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
2549 
2550 	/*
2551 	 * If an event is pending injection in the VMCB, stash it in
2552 	 * exit_intinfo as if it were deferred by an exit from guest context.
2553 	 */
2554 	const uint64_t intinfo = ctrl->eventinj;
2555 	if ((intinfo & VMCB_EVENTINJ_VALID) != 0) {
2556 		svm_stash_intinfo(sc, vcpu, intinfo);
2557 		ctrl->eventinj = 0;
2558 	}
2559 
2560 	/*
2561 	 * Now that no event is pending injection, interrupt-window exiting and
2562 	 * NMI-blocking can be disabled.  If/when this vCPU is made to run
2563 	 * again, those conditions will be reinstated when the now-queued events
2564 	 * are re-injected.
2565 	 */
2566 	svm_disable_intr_window_exiting(sc, vcpu);
2567 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
2568 }
2569 
2570 static void
2571 svm_savectx(void *arg, int vcpu)
2572 {
2573 	struct svm_softc *sc = arg;
2574 
2575 	if (sc->vcpu[vcpu].loaded) {
2576 		svm_msr_guest_exit(sc, vcpu);
2577 	}
2578 }
2579 
2580 static void
2581 svm_restorectx(void *arg, int vcpu)
2582 {
2583 	struct svm_softc *sc = arg;
2584 
2585 	if (sc->vcpu[vcpu].loaded) {
2586 		svm_msr_guest_enter(sc, vcpu);
2587 	}
2588 }
2589 
2590 static freqratio_res_t
2591 svm_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult)
2592 {
2593 	/*
2594 	 * Check whether scaling is needed at all before potentially erroring
2595 	 * out for other reasons.
2596 	 */
2597 	if (guest_hz == host_hz) {
2598 		return (FR_SCALING_NOT_NEEDED);
2599 	}
2600 
2601 	/*
2602 	 * Confirm that scaling is available.
2603 	 */
2604 	if (!svm_has_tsc_freq_ctl) {
2605 		return (FR_SCALING_NOT_SUPPORTED);
2606 	}
2607 
2608 	/*
2609 	 * Verify the guest_hz is within the supported range.
2610 	 */
2611 	if ((guest_hz < AMD_TSC_MIN_FREQ) ||
2612 	    (guest_hz >= (host_hz * AMD_TSC_MAX_FREQ_RATIO))) {
2613 		return (FR_OUT_OF_RANGE);
2614 	}
2615 
2616 	/* Calculate the multiplier. */
2617 	uint64_t m = vmm_calc_freq_multiplier(guest_hz, host_hz,
2618 	    AMD_TSCM_FRAC_SIZE);
2619 	*mult = m;
2620 
2621 	return (FR_VALID);
2622 }
2623 
2624 struct vmm_ops vmm_ops_amd = {
2625 	.init		= svm_init,
2626 	.cleanup	= svm_cleanup,
2627 	.resume		= svm_restore,
2628 
2629 	.vminit		= svm_vminit,
2630 	.vmrun		= svm_vmrun,
2631 	.vmcleanup	= svm_vmcleanup,
2632 	.vmgetreg	= svm_getreg,
2633 	.vmsetreg	= svm_setreg,
2634 	.vmgetdesc	= svm_getdesc,
2635 	.vmsetdesc	= svm_setdesc,
2636 	.vmgetcap	= svm_getcap,
2637 	.vmsetcap	= svm_setcap,
2638 	.vlapic_init	= svm_vlapic_init,
2639 	.vlapic_cleanup	= svm_vlapic_cleanup,
2640 	.vmpause	= svm_pause,
2641 
2642 	.vmsavectx	= svm_savectx,
2643 	.vmrestorectx	= svm_restorectx,
2644 
2645 	.vmgetmsr	= svm_get_msr,
2646 	.vmsetmsr	= svm_set_msr,
2647 
2648 	.vmfreqratio	= svm_freq_ratio,
2649 	.fr_intsize	= AMD_TSCM_INT_SIZE,
2650 	.fr_fracsize	= AMD_TSCM_FRAC_SIZE,
2651 };
2652