xref: /illumos-gate/usr/src/uts/intel/io/vmm/amd/svm.c (revision 22e991d5bb9d07bf7dd2a65bc080922753a3100b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  *
39  * Copyright 2018 Joyent, Inc.
40  * Copyright 2022 Oxide Computer Company
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/malloc.h>
50 #include <sys/pcpu.h>
51 #include <sys/proc.h>
52 #include <sys/sysctl.h>
53 
54 #include <sys/x86_archext.h>
55 #include <sys/trap.h>
56 
57 #include <machine/cpufunc.h>
58 #include <machine/psl.h>
59 #include <machine/md_var.h>
60 #include <machine/reg.h>
61 #include <machine/specialreg.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <sys/vmm_instruction_emul.h>
65 #include <sys/vmm_vm.h>
66 #include <sys/vmm_kernel.h>
67 
68 #include "vmm_lapic.h"
69 #include "vmm_stat.h"
70 #include "vmm_ktr.h"
71 #include "vmm_ioport.h"
72 #include "vatpic.h"
73 #include "vlapic.h"
74 #include "vlapic_priv.h"
75 
76 #include "x86.h"
77 #include "vmcb.h"
78 #include "svm.h"
79 #include "svm_softc.h"
80 #include "svm_msr.h"
81 
82 SYSCTL_DECL(_hw_vmm);
83 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
84     NULL);
85 
86 /*
87  * SVM CPUID function 0x8000_000A, edx bit decoding.
88  */
89 #define	AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
90 #define	AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
91 #define	AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
92 #define	AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
93 #define	AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
94 #define	AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
95 #define	AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
96 #define	AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
97 #define	AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
98 #define	AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
99 #define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
100 
101 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
102 				VMCB_CACHE_IOPM		|	\
103 				VMCB_CACHE_I		|	\
104 				VMCB_CACHE_TPR		|	\
105 				VMCB_CACHE_CR2		|	\
106 				VMCB_CACHE_CR		|	\
107 				VMCB_CACHE_DR		|	\
108 				VMCB_CACHE_DT		|	\
109 				VMCB_CACHE_SEG		|	\
110 				VMCB_CACHE_NP)
111 
112 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
113 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
114     0, NULL);
115 
116 static MALLOC_DEFINE(M_SVM, "svm", "svm");
117 static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
118 
119 /* SVM features advertised by CPUID.8000000AH:EDX */
120 static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
121 
122 static int disable_npf_assist;
123 
124 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
125 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
126 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
127 
128 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
129 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
130 static void flush_asid(struct svm_softc *sc, int vcpuid);
131 
132 static __inline bool
133 flush_by_asid(void)
134 {
135 	return ((svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID) != 0);
136 }
137 
138 static __inline bool
139 decode_assist(void)
140 {
141 	return ((svm_feature & AMD_CPUID_SVM_DECODE_ASSIST) != 0);
142 }
143 
144 static int
145 svm_cleanup(void)
146 {
147 	/* This is taken care of by the hma registration */
148 	return (0);
149 }
150 
151 static int
152 svm_init(void)
153 {
154 	vmcb_clean &= VMCB_CACHE_DEFAULT;
155 
156 	svm_msr_init();
157 
158 	return (0);
159 }
160 
161 static void
162 svm_restore(void)
163 {
164 	/* No-op on illumos */
165 }
166 
167 /* Pentium compatible MSRs */
168 #define	MSR_PENTIUM_START	0
169 #define	MSR_PENTIUM_END		0x1FFF
170 /* AMD 6th generation and Intel compatible MSRs */
171 #define	MSR_AMD6TH_START	0xC0000000UL
172 #define	MSR_AMD6TH_END		0xC0001FFFUL
173 /* AMD 7th and 8th generation compatible MSRs */
174 #define	MSR_AMD7TH_START	0xC0010000UL
175 #define	MSR_AMD7TH_END		0xC0011FFFUL
176 
177 /*
178  * Get the index and bit position for an MSR in the permission bitmap.
179  * Two bits are used for each MSR: lower bit for read and higher bit for write.
180  */
181 static int
182 svm_msr_index(uint64_t msr, int *index, int *bit)
183 {
184 	uint32_t base, off;
185 
186 	*index = -1;
187 	*bit = (msr % 4) * 2;
188 	base = 0;
189 
190 	if (msr <= MSR_PENTIUM_END) {
191 		*index = msr / 4;
192 		return (0);
193 	}
194 
195 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
196 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
197 		off = (msr - MSR_AMD6TH_START);
198 		*index = (off + base) / 4;
199 		return (0);
200 	}
201 
202 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
203 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
204 		off = (msr - MSR_AMD7TH_START);
205 		*index = (off + base) / 4;
206 		return (0);
207 	}
208 
209 	return (EINVAL);
210 }
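
/*
 * For example, MSR_LSTAR (0xC0000082) falls in the AMD 6th-generation range
 * above, so off = 0x82, index = (0x82 + 0x2000) / 4 = 0x820, and
 * bit = (0xC0000082 % 4) * 2 = 4: the read-permission bit is bit 4 and the
 * write-permission bit is bit 5 of perm_bitmap[0x820].
 */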
211 
212 /*
213  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
214  */
215 static void
216 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
217 {
218 	int index, bit, error;
219 
220 	error = svm_msr_index(msr, &index, &bit);
221 	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
222 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
223 	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
224 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
225 	    "msr %lx", __func__, bit, msr));
226 
227 	if (read)
228 		perm_bitmap[index] &= ~(1UL << bit);
229 
230 	if (write)
231 		perm_bitmap[index] &= ~(2UL << bit);
232 }
233 
234 static void
235 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
236 {
237 
238 	svm_msr_perm(perm_bitmap, msr, true, true);
239 }
240 
241 static void
242 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
243 {
244 
245 	svm_msr_perm(perm_bitmap, msr, true, false);
246 }
247 
248 static __inline int
249 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
250 {
251 	struct vmcb_ctrl *ctrl;
252 
253 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
254 
255 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
256 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
257 }
258 
259 static __inline void
260 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
261     int enabled)
262 {
263 	struct vmcb_ctrl *ctrl;
264 	uint32_t oldval;
265 
266 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
267 
268 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
269 	oldval = ctrl->intercept[idx];
270 
271 	if (enabled)
272 		ctrl->intercept[idx] |= bitmask;
273 	else
274 		ctrl->intercept[idx] &= ~bitmask;
275 
276 	if (ctrl->intercept[idx] != oldval) {
277 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
278 		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
279 		    "from %x to %x", idx, oldval, ctrl->intercept[idx]);
280 	}
281 }
282 
283 static __inline void
284 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
285 {
286 
287 	svm_set_intercept(sc, vcpu, off, bitmask, 0);
288 }
289 
290 static __inline void
291 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
292 {
293 
294 	svm_set_intercept(sc, vcpu, off, bitmask, 1);
295 }
296 
297 static void
298 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
299     uint64_t msrpm_base_pa, uint64_t np_pml4)
300 {
301 	struct vmcb_ctrl *ctrl;
302 	struct vmcb_state *state;
303 	uint32_t mask;
304 	int n;
305 
306 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
307 	state = svm_get_vmcb_state(sc, vcpu);
308 
309 	ctrl->iopm_base_pa = iopm_base_pa;
310 	ctrl->msrpm_base_pa = msrpm_base_pa;
311 
312 	/* Enable nested paging */
313 	ctrl->np_ctrl = NP_ENABLE;
314 	ctrl->n_cr3 = np_pml4;
315 
316 	/*
317 	 * Intercept accesses to the control registers that are not shadowed
318 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
319 	 */
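	/*
	 * In the CR intercept vector, bit n of the low 16 bits controls the
	 * read intercept for %crN and bit (n + 16) controls the corresponding
	 * write intercept, so each mask below covers both directions at once.
	 */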
320 	for (n = 0; n < 16; n++) {
321 		mask = (BIT(n) << 16) | BIT(n);
322 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
323 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
324 		else
325 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
326 	}
327 
328 	/*
329 	 * Selectively intercept writes to %cr0.  This triggers on operations
330 	 * which would change bits other than TS or MP.
331 	 */
332 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
333 	    VMCB_INTCPT_CR0_WRITE);
334 
335 	/*
336 	 * Intercept everything when tracing guest exceptions, otherwise
337 	 * just intercept the machine check exception.
338 	 */
339 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
340 		for (n = 0; n < 32; n++) {
341 			/*
342 			 * Skip unimplemented vectors in the exception bitmap.
343 			 */
344 			if (n == 2 || n == 9) {
345 				continue;
346 			}
347 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
348 		}
349 	} else {
350 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
351 	}
352 
353 	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
354 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
355 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
356 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
357 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
358 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
359 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
360 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
361 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
362 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
363 	    VMCB_INTCPT_FERR_FREEZE);
364 
365 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
366 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
367 
368 	/* Intercept privileged invalidation instructions. */
369 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
370 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
371 
372 	/*
373 	 * Intercept all virtualization-related instructions.
374 	 *
375 	 * From section "Canonicalization and Consistency Checks" in APMv2
376 	 * the VMRUN intercept bit must be set to pass the consistency check.
377 	 */
378 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
379 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
380 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
381 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
382 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
383 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
384 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
385 
386 	/*
387 	 * The ASID will be set to a non-zero value just before VMRUN.
388 	 */
389 	ctrl->asid = 0;
390 
391 	/*
392 	 * Section 15.21.1, Interrupt Masking in EFLAGS
393 	 * Section 15.21.2, Virtualizing APIC.TPR
394 	 *
395 	 * This must be set for %rflags and %cr8 isolation of guest and host.
396 	 */
397 	ctrl->v_intr_ctrl |= V_INTR_MASKING;
398 
399 	/* Enable Last Branch Record aka LBR for debugging */
400 	ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
401 	state->dbgctl = BIT(0);
402 
403 	/* EFER_SVM must always be set when the guest is executing */
404 	state->efer = EFER_SVM;
405 
406 	/* Set up the PAT to power-on state */
407 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
408 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
409 	    PAT_VALUE(2, PAT_UNCACHED)		|
410 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
411 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
412 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
413 	    PAT_VALUE(6, PAT_UNCACHED)		|
414 	    PAT_VALUE(7, PAT_UNCACHEABLE);
415 
416 	/* Set up DR6/7 to power-on state */
417 	state->dr6 = DBREG_DR6_RESERVED1;
418 	state->dr7 = DBREG_DR7_RESERVED1;
419 }
420 
421 /*
422  * Initialize a virtual machine.
423  */
424 static void *
425 svm_vminit(struct vm *vm)
426 {
427 	struct svm_softc *svm_sc;
428 	struct svm_vcpu *vcpu;
429 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
430 	int i;
431 	uint16_t maxcpus;
432 
433 	svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
434 	if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
435 		panic("malloc of svm_softc not aligned on page boundary");
436 
437 	svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
438 	    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
439 	if (svm_sc->msr_bitmap == NULL)
440 		panic("contigmalloc of SVM MSR bitmap failed");
441 	svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
442 	    M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
443 	if (svm_sc->iopm_bitmap == NULL)
444 		panic("contigmalloc of SVM IO bitmap failed");
445 
446 	svm_sc->vm = vm;
447 	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
448 
449 	/*
450 	 * Intercept read and write accesses to all MSRs.
451 	 */
452 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
453 
454 	/*
455 	 * Access to the following MSRs is redirected to the VMCB when the
456 	 * guest is executing. Therefore it is safe to allow the guest to
457 	 * read/write these MSRs directly without hypervisor involvement.
458 	 */
459 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
460 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
461 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
462 
463 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
464 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
465 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
466 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
467 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
468 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
469 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
470 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
471 
472 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
473 
474 	/*
475 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
476 	 */
477 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
478 
479 	/* Intercept access to all I/O ports. */
480 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
481 
482 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
483 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
484 	pml4_pa = svm_sc->nptp;
485 	maxcpus = vm_get_maxcpus(svm_sc->vm);
486 	for (i = 0; i < maxcpus; i++) {
487 		vcpu = svm_get_vcpu(svm_sc, i);
488 		vcpu->nextrip = ~0;
489 		vcpu->lastcpu = NOCPU;
490 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
491 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
492 		svm_msr_guest_init(svm_sc, i);
493 	}
494 	return (svm_sc);
495 }
496 
497 /*
498  * Collateral for a generic SVM VM-exit.
499  */
500 static void
501 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
502 {
503 
504 	vme->exitcode = VM_EXITCODE_SVM;
505 	vme->u.svm.exitcode = code;
506 	vme->u.svm.exitinfo1 = info1;
507 	vme->u.svm.exitinfo2 = info2;
508 }
509 
510 static int
511 svm_cpl(struct vmcb_state *state)
512 {
513 
514 	/*
515 	 * From APMv2:
516 	 *   "Retrieve the CPL from the CPL field in the VMCB, not
517 	 *    from any segment DPL"
518 	 */
519 	return (state->cpl);
520 }
521 
522 static enum vm_cpu_mode
523 svm_vcpu_mode(struct vmcb *vmcb)
524 {
525 	struct vmcb_state *state;
526 
527 	state = &vmcb->state;
528 
529 	if (state->efer & EFER_LMA) {
530 		struct vmcb_segment *seg;
531 
532 		/*
533 		 * Per Section 4.8.1 of APM2, check if the Code Segment has
534 		 * the Long attribute set in its descriptor.
535 		 */
536 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
537 		if (seg->attrib & VMCB_CS_ATTRIB_L)
538 			return (CPU_MODE_64BIT);
539 		else
540 			return (CPU_MODE_COMPATIBILITY);
541 	} else  if (state->cr0 & CR0_PE) {
542 		return (CPU_MODE_PROTECTED);
543 	} else {
544 		return (CPU_MODE_REAL);
545 	}
546 }
547 
548 static enum vm_paging_mode
549 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
550 {
551 
552 	if ((cr0 & CR0_PG) == 0)
553 		return (PAGING_MODE_FLAT);
554 	if ((cr4 & CR4_PAE) == 0)
555 		return (PAGING_MODE_32);
556 	if (efer & EFER_LME)
557 		return (PAGING_MODE_64);
558 	else
559 		return (PAGING_MODE_PAE);
560 }
561 
562 /*
563  * ins/outs utility routines
564  */
565 
566 static void
567 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
568 {
569 	struct vmcb_state *state;
570 
571 	state = &vmcb->state;
572 	paging->cr3 = state->cr3;
573 	paging->cpl = svm_cpl(state);
574 	paging->cpu_mode = svm_vcpu_mode(vmcb);
575 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
576 	    state->efer);
577 }
578 
579 #define	UNHANDLED 0
580 
581 /*
582  * Handle guest I/O intercept.
583  */
584 static int
585 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
586 {
587 	struct vmcb_ctrl *ctrl;
588 	struct vmcb_state *state;
589 	struct vm_inout *inout;
590 	struct vie *vie;
591 	uint64_t info1;
592 	struct vm_guest_paging paging;
593 
594 	state = svm_get_vmcb_state(svm_sc, vcpu);
595 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
596 	inout = &vmexit->u.inout;
597 	info1 = ctrl->exitinfo1;
598 
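	/*
	 * Decode the IOIO EXITINFO1 fields (per APMv2): bit 0 is the direction
	 * (1 = IN), bit 2 the string-operation flag, bit 3 the REP prefix,
	 * bits 6:4 the operand size in bytes, bits 9:7 the address size,
	 * bits 12:10 the effective segment, and bits 31:16 the port number.
	 */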
599 	inout->bytes = (info1 >> 4) & 0x7;
600 	inout->flags = 0;
601 	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
602 	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
603 	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
604 	inout->port = (uint16_t)(info1 >> 16);
605 	inout->eax = (uint32_t)(state->rax);
606 
607 	if ((inout->flags & INOUT_STR) != 0) {
608 		/*
609 		 * The effective segment number in EXITINFO1[12:10] is populated
610 		 * only if the processor has the DecodeAssist capability.
611 		 *
612 		 * This is not specified explicitly in APMv2 but can be verified
613 		 * empirically.
614 		 */
615 		if (!decode_assist()) {
616 			/*
617 			 * Without decoding assistance, force the task of
618 			 * emulating the ins/outs onto userspace.
619 			 */
620 			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
621 			bzero(&vmexit->u.inst_emul,
622 			    sizeof (vmexit->u.inst_emul));
623 			return (UNHANDLED);
624 		}
625 
626 		/*
627 		 * Bits 7-9 encode the address size of ins/outs operations where
628 		 * the 1/2/4 values correspond to 16/32/64 bit sizes.
629 		 */
630 		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
631 		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
632 		    inout->addrsize == 8);
633 
634 		if (inout->flags & INOUT_IN) {
635 			/*
636 			 * For INS instructions, %es (encoded as 0) is the
637 			 * implied segment for the operation.
638 			 */
639 			inout->segment = 0;
640 		} else {
641 			/*
642 			 * Bits 10-12 encode the segment for OUTS.
643 			 * This value follows the standard x86 segment order.
644 			 */
645 			inout->segment = (info1 >> 10) & 0x7;
646 		}
647 	}
648 
649 	vmexit->exitcode = VM_EXITCODE_INOUT;
650 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
651 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
652 	vie_init_inout(vie, inout, vmexit->inst_length, &paging);
653 
654 	/* The in/out emulation will handle advancing %rip */
655 	vmexit->inst_length = 0;
656 
657 	return (UNHANDLED);
658 }
659 
660 static int
661 npf_fault_type(uint64_t exitinfo1)
662 {
663 
664 	if (exitinfo1 & VMCB_NPF_INFO1_W)
665 		return (PROT_WRITE);
666 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
667 		return (PROT_EXEC);
668 	else
669 		return (PROT_READ);
670 }
671 
672 static bool
673 svm_npf_emul_fault(uint64_t exitinfo1)
674 {
675 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
676 		return (false);
677 	}
678 
679 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
680 		return (false);
681 	}
682 
683 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
684 		return (false);
685 	}
686 
687 	return (true);
688 }
689 
690 static void
691 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
692     uint64_t gpa)
693 {
694 	struct vmcb_ctrl *ctrl;
695 	struct vmcb *vmcb;
696 	struct vie *vie;
697 	struct vm_guest_paging paging;
698 	struct vmcb_segment *seg;
699 	char *inst_bytes = NULL;
700 	uint8_t inst_len = 0;
701 
702 	vmcb = svm_get_vmcb(svm_sc, vcpu);
703 	ctrl = &vmcb->ctrl;
704 
705 	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
706 	vmexit->u.mmio_emul.gpa = gpa;
707 	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
708 	svm_paging_info(vmcb, &paging);
709 
710 	switch (paging.cpu_mode) {
711 	case CPU_MODE_REAL:
712 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
713 		vmexit->u.mmio_emul.cs_base = seg->base;
714 		vmexit->u.mmio_emul.cs_d = 0;
715 		break;
716 	case CPU_MODE_PROTECTED:
717 	case CPU_MODE_COMPATIBILITY:
718 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
719 		vmexit->u.mmio_emul.cs_base = seg->base;
720 
721 		/*
722 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
723 		 */
724 		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
725 		    1 : 0;
726 		break;
727 	default:
728 		vmexit->u.mmio_emul.cs_base = 0;
729 		vmexit->u.mmio_emul.cs_d = 0;
730 		break;
731 	}
732 
733 	/*
734 	 * Copy the instruction bytes into 'vie' if available.
735 	 */
736 	if (decode_assist() && !disable_npf_assist) {
737 		inst_len = ctrl->inst_len;
738 		inst_bytes = (char *)ctrl->inst_bytes;
739 	}
740 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
741 	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
742 }
743 
744 /*
745  * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
746  * which is live in the guest.  They are visible via the shadow instead.
747  */
748 #define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)
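
/*
 * For instance, a guest write that sets CR0_CD is stored to the VMCB as
 * (val & SVM_CR0_MASK), i.e. with CD cleared, while sctx_cr0_shadow keeps the
 * full value; svm_set_cr0() then enables the %cr0 read/write intercepts so
 * the guest still observes CD when it reads %cr0 back.
 */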
749 
750 static void
751 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
752 {
753 	struct vmcb_state *state;
754 	struct svm_regctx *regctx;
755 	uint64_t masked, old, diff;
756 
757 	state = svm_get_vmcb_state(svm_sc, vcpu);
758 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
759 
760 	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
761 	diff = old ^ val;
762 
763 	/* No further work needed if register contents remain the same */
764 	if (diff == 0) {
765 		return;
766 	}
767 
768 	/* Flush the TLB if the paging or write-protect bits are changing */
769 	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
770 		flush_asid(svm_sc, vcpu);
771 	}
772 
773 	/*
774 	 * If the change in %cr0 is due to a guest action (via interception)
775 	 * then other CPU state updates may be required.
776 	 */
777 	if (guest_write) {
778 		if ((diff & CR0_PG) != 0) {
779 			uint64_t efer = state->efer;
780 
781 			/* Keep the long-mode state in EFER in sync */
782 			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
783 				state->efer |= EFER_LMA;
784 			}
785 			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
786 				state->efer &= ~EFER_LMA;
787 			}
788 		}
789 	}
790 
791 	masked = val & SVM_CR0_MASK;
792 	regctx->sctx_cr0_shadow = val;
793 	state->cr0 = masked;
794 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
795 
796 	if ((masked ^ val) != 0) {
797 		/*
798 		 * The guest has set bits in %cr0 which we are masking out and
799 		 * exposing via shadow.
800 		 *
801 		 * We must intercept %cr0 reads in order to make the shadowed
802 		 * view available to the guest.
803 		 *
804 		 * Writes to %cr0 must also be intercepted (unconditionally,
805 		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
806 		 * if/when the guest clears those shadowed bits.
807 		 */
808 		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
809 		    BIT(0) | BIT(16));
810 	} else {
811 		/*
812 		 * When no bits remain in %cr0 which require shadowing, the
813 		 * unconditional intercept of reads/writes to %cr0 can be
814 		 * disabled.
815 		 *
816 		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
817 		 * in place so we can be notified of operations which change
818 		 * bits other than TS or MP.
819 		 */
820 		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
821 		    BIT(0) | BIT(16));
822 	}
823 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
824 }
825 
826 static void
827 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
828 {
829 	struct vmcb *vmcb;
830 	struct svm_regctx *regctx;
831 
832 	vmcb = svm_get_vmcb(svm_sc, vcpu);
833 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
834 
835 	/*
836 	 * Include the %cr0 bits which exist only in the shadow along with those
837 	 * in the running vCPU state.
838 	 */
839 	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
840 }
841 
842 static void
843 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
844 {
845 	uint64_t val;
846 	int err;
847 
848 	svm_get_cr0(svm_sc, vcpu, &val);
849 	err = svm_setreg(svm_sc, vcpu, reg, val);
850 	ASSERT(err == 0);
851 }
852 
853 static void
854 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
855 {
856 	struct vmcb_state *state;
857 	uint64_t val;
858 	int err;
859 
860 	state = svm_get_vmcb_state(svm_sc, vcpu);
861 
862 	err = svm_getreg(svm_sc, vcpu, reg, &val);
863 	ASSERT(err == 0);
864 
865 	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
866 		/* NW without CD is nonsensical */
867 		vm_inject_gp(svm_sc->vm, vcpu);
868 		return;
869 	}
870 	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
871 		/* PG requires PE */
872 		vm_inject_gp(svm_sc->vm, vcpu);
873 		return;
874 	}
875 	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
876 		/* When enabling paging, PAE must be enabled if LME is. */
877 		if ((state->efer & EFER_LME) != 0 &&
878 		    (state->cr4 & CR4_PAE) == 0) {
879 			vm_inject_gp(svm_sc->vm, vcpu);
880 			return;
881 		}
882 	}
883 
884 	svm_set_cr0(svm_sc, vcpu, val, true);
885 }
886 
887 static void
888 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
889 {
890 	struct vie *vie;
891 	struct vm_guest_paging paging;
892 
893 	/* Let the instruction emulation (hopefully in-kernel) handle it */
894 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
895 	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
896 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
897 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
898 	vie_init_other(vie, &paging);
899 
900 	/* The instruction emulation will handle advancing %rip */
901 	vmexit->inst_length = 0;
902 }
903 
904 static void
905 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
906 {
907 	struct vm *vm;
908 	struct vlapic *vlapic;
909 	struct vmcb_ctrl *ctrl;
910 
911 	vm = sc->vm;
912 	vlapic = vm_lapic(vm, vcpu);
913 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
914 
915 	/* Update %cr8 in the emulated vlapic */
916 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
917 
918 	/* Virtual interrupt injection is not used. */
919 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
920 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
921 }
922 
923 CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
924 CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
925 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
926 CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
927 CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
928 CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);
929 
930 static void
931 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
932 {
933 	struct vmcb_ctrl *ctrl;
934 	uint64_t intinfo;
935 	int err;
936 
937 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
938 	intinfo = ctrl->exitintinfo;
939 	if (!VMCB_EXITINTINFO_VALID(intinfo))
940 		return;
941 
942 	/*
943 	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
944 	 *
945 	 * If a #VMEXIT happened during event delivery then record the event
946 	 * that was being delivered.
947 	 */
948 	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
949 	    intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
950 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
951 	/*
952 	 * Relies on match between VMCB exitintinfo format and bhyve-generic
953 	 * format, which is ensured by CTASSERTs above.
954 	 */
955 	err = vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
956 	VERIFY0(err);
957 }
958 
959 static __inline int
960 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
961 {
962 
963 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
964 	    VMCB_INTCPT_VINTR));
965 }
966 
967 static void
968 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
969 {
970 	struct vmcb_ctrl *ctrl;
971 	struct vmcb_state *state;
972 
973 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
974 	state = svm_get_vmcb_state(sc, vcpu);
975 
976 	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
977 		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
978 		    ("%s: invalid v_ign_tpr", __func__));
979 		KASSERT(vintr_intercept_enabled(sc, vcpu),
980 		    ("%s: vintr intercept should be enabled", __func__));
981 		return;
982 	}
983 
984 	/*
985 	 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
986 	 * hypervisor as soon as a virtual interrupt can be delivered.
987 	 *
988 	 * Since injected events are not subject to intercept checks we need to
989 	 * ensure that the V_IRQ is not actually going to be delivered on VM
990 	 * entry.
991 	 */
992 	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
993 	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
994 
995 	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
996 	ctrl->v_irq |= V_IRQ;
997 	ctrl->v_intr_prio |= V_IGN_TPR;
998 	ctrl->v_intr_vector = 0;
999 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1000 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1001 }
1002 
1003 static void
1004 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
1005 {
1006 	struct vmcb_ctrl *ctrl;
1007 
1008 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1009 
1010 	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
1011 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
1012 		    ("%s: vintr intercept should be disabled", __func__));
1013 		return;
1014 	}
1015 
1016 	VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
1017 	ctrl->v_irq &= ~V_IRQ;
1018 	ctrl->v_intr_vector = 0;
1019 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1020 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1021 }
1022 
1023 /*
1024  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1025  * executes an IRET. The IRET intercept is enabled when an NMI is injected
1026  * to track when the vcpu is done handling the NMI.
1027  */
1028 static int
1029 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1030 {
1031 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1032 	    VMCB_INTCPT_IRET));
1033 }
1034 
1035 static void
1036 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1037 {
1038 	struct vmcb_ctrl *ctrl;
1039 
1040 	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1041 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
1042 	/*
1043 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1044 	 * the "iret" when it runs next. However, it is possible to inject
1045 	 * another NMI into the vcpu before the "iret" has actually executed.
1046 	 *
1047 	 * For example, if the "iret" encounters a #NPF when accessing the stack
1048 	 * it will trap back into the hypervisor. If an NMI is pending for
1049 	 * the vcpu it will be injected into the guest.
1050 	 *
1051 	 * XXX this needs to be fixed
1052 	 */
1053 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1054 
1055 	/*
1056 	 * Set an interrupt shadow to prevent an NMI from being immediately
1057 	 * injected on the next VMRUN.
1058 	 */
1059 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1060 	ctrl->intr_shadow = 1;
1061 }
1062 
1063 static void
1064 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1065 {
1066 	ASSERT(VM_INTINFO_PENDING(info));
1067 
1068 	uint8_t vector = VM_INTINFO_VECTOR(info);
1069 	uint32_t type = VM_INTINFO_TYPE(info);
1070 
1071 	/*
1072 	 * Correct behavior depends on bhyve intinfo event types lining up with
1073 	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
1074 	 * above svm_save_exitintinfo() ensure it.
1075 	 */
1076 	switch (type) {
1077 	case VM_INTINFO_NMI:
1078 		/* Ensure vector for injected event matches its type (NMI) */
1079 		vector = IDT_NMI;
1080 		break;
1081 	case VM_INTINFO_HWINTR:
1082 	case VM_INTINFO_SWINTR:
1083 		break;
1084 	case VM_INTINFO_HWEXCP:
1085 		if (vector == IDT_NMI) {
1086 			/*
1087 			 * NMIs are expected to be injected with
1088 			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1089 			 * with the NMI vector.
1090 			 */
1091 			type = VM_INTINFO_NMI;
1092 		}
1093 		VERIFY(vector < 32);
1094 		break;
1095 	default:
1096 		/*
1097 		 * Since there is no strong validation of injected event types
1098 		 * at this point, fall back to a software interrupt for those we
1099 		 * do not recognize.
1100 		 */
1101 		type = VM_INTINFO_SWINTR;
1102 		break;
1103 	}
1104 
1105 	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1106 	if (VM_INTINFO_HAS_ERRCODE(info)) {
1107 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1108 		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1109 	}
1110 }
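
/*
 * As an example, reflecting a #GP (vector 13) with an error code yields
 * eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_EXCEPTION | 13 |
 * VMCB_EVENTINJ_EC_VALID | ((uint64_t)errcode << 32), matching the EVENTINJ
 * layout in the VMCB.
 */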
1111 
1112 static void
1113 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1114 {
1115 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1116 
1117 	ASSERT(!svm_nmi_blocked(sc, vcpu));
1118 
1119 	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1120 	vm_nmi_clear(sc->vm, vcpu);
1121 
1122 	/*
1123 	 * Virtual NMI blocking is now in effect.
1124 	 *
1125 	 * Not only does this block a subsequent NMI injection from taking
1126 	 * place, it also configures an intercept on the IRET so we can track
1127 	 * when the next injection can take place.
1128 	 */
1129 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1130 }
1131 
1132 static void
1133 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1134 {
1135 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1136 
1137 	ASSERT(vector >= 0 && vector <= 255);
1138 
1139 	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1140 }
1141 
1142 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
1143 
1144 static vm_msr_result_t
1145 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1146 {
1147 	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1148 	uint64_t lma;
1149 	int error;
1150 
1151 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
1152 
1153 	if (newval & EFER_MBZ_BITS) {
1154 		return (VMR_GP);
1155 	}
1156 
1157 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1158 	const uint64_t changed = state->efer ^ newval;
1159 	if (changed & EFER_LME) {
1160 		if (state->cr0 & CR0_PG) {
1161 			return (VMR_GP);
1162 		}
1163 	}
1164 
1165 	/* EFER.LMA = EFER.LME & CR0.PG */
1166 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1167 		lma = EFER_LMA;
1168 	} else {
1169 		lma = 0;
1170 	}
1171 	if ((newval & EFER_LMA) != lma) {
1172 		return (VMR_GP);
1173 	}
1174 
1175 	if ((newval & EFER_NXE) != 0 &&
1176 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1177 		return (VMR_GP);
1178 	}
1179 	if ((newval & EFER_FFXSR) != 0 &&
1180 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1181 		return (VMR_GP);
1182 	}
1183 	if ((newval & EFER_TCE) != 0 &&
1184 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1185 		return (VMR_GP);
1186 	}
1187 
1188 	/*
1189 	 * Until bhyve has proper support for long-mode segment limits, just
1190 	 * toss a #GP at the guest if they attempt to use it.
1191 	 */
1192 	if (newval & EFER_LMSLE) {
1193 		return (VMR_GP);
1194 	}
1195 
1196 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1197 	VERIFY0(error);
1198 	return (VMR_OK);
1199 }
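
/*
 * Under the checks above, for example, a guest WRMSR which toggles EFER.LME
 * while CR0.PG is already set, or which sets EFER.LMA inconsistently with
 * (EFER.LME & CR0.PG), takes the VMR_GP path and has a #GP injected by
 * svm_handle_msr() below.
 */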
1200 
1201 static int
1202 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1203     bool is_wrmsr)
1204 {
1205 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1206 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1207 	const uint32_t ecx = ctx->sctx_rcx;
1208 	vm_msr_result_t res;
1209 	uint64_t val = 0;
1210 
1211 	if (is_wrmsr) {
1212 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1213 		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1214 
1215 		if (vlapic_owned_msr(ecx)) {
1216 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1217 
1218 			res = vlapic_wrmsr(vlapic, ecx, val);
1219 		} else if (ecx == MSR_EFER) {
1220 			res = svm_write_efer(svm_sc, vcpu, val);
1221 		} else {
1222 			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1223 		}
1224 	} else {
1225 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1226 
1227 		if (vlapic_owned_msr(ecx)) {
1228 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1229 
1230 			res = vlapic_rdmsr(vlapic, ecx, &val);
1231 		} else {
1232 			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1233 		}
1234 	}
1235 
1236 	switch (res) {
1237 	case VMR_OK:
1238 		/* Store rdmsr result in the appropriate registers */
1239 		if (!is_wrmsr) {
1240 			state->rax = (uint32_t)val;
1241 			ctx->sctx_rdx = val >> 32;
1242 		}
1243 		return (1);
1244 	case VMR_GP:
1245 		vm_inject_gp(svm_sc->vm, vcpu);
1246 		return (1);
1247 	case VMR_UNHANLDED:
1248 		vmexit->exitcode = is_wrmsr ?
1249 		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1250 		vmexit->u.msr.code = ecx;
1251 		vmexit->u.msr.wval = val;
1252 		return (0);
1253 	default:
1254 		panic("unexpected msr result %u\n", res);
1255 	}
1256 }
1257 
1258 /*
1259  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1260  * that are due to instruction intercepts as well as MSR and IOIO intercepts
1261  * and exceptions caused by INT3, INTO and BOUND instructions.
1262  *
1263  * Return 1 if the nRIP is valid and 0 otherwise.
1264  */
1265 static int
1266 nrip_valid(uint64_t exitcode)
1267 {
1268 	switch (exitcode) {
1269 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1270 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1271 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1272 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1273 	case 0x43:		/* INT3 */
1274 	case 0x44:		/* INTO */
1275 	case 0x45:		/* BOUND */
1276 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1277 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1278 		return (1);
1279 	default:
1280 		return (0);
1281 	}
1282 }
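
/*
 * For example, a CPUID intercept (exitcode 0x72) falls in the 0x65 ... 0x7C
 * range above, so svm_vmexit() can compute inst_length as (nrip - rip),
 * 2 bytes in the CPUID case, instead of decoding the instruction itself.
 */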
1283 
1284 static int
1285 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1286 {
1287 	struct vmcb *vmcb;
1288 	struct vmcb_state *state;
1289 	struct vmcb_ctrl *ctrl;
1290 	struct svm_regctx *ctx;
1291 	uint64_t code, info1, info2;
1292 	int handled;
1293 
1294 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1295 	vmcb = svm_get_vmcb(svm_sc, vcpu);
1296 	state = &vmcb->state;
1297 	ctrl = &vmcb->ctrl;
1298 
1299 	handled = 0;
1300 	code = ctrl->exitcode;
1301 	info1 = ctrl->exitinfo1;
1302 	info2 = ctrl->exitinfo2;
1303 
1304 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1305 	vmexit->rip = state->rip;
1306 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1307 
1308 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1309 
1310 	/*
1311 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1312 	 * in an inconsistent state and can trigger assertions that would
1313 	 * never happen otherwise.
1314 	 */
1315 	if (code == VMCB_EXIT_INVALID) {
1316 		vm_exit_svm(vmexit, code, info1, info2);
1317 		return (0);
1318 	}
1319 
1320 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1321 	    "injection valid bit is set %lx", __func__, ctrl->eventinj));
1322 
1323 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1324 	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1325 	    vmexit->inst_length, code, info1, info2));
1326 
1327 	svm_update_virqinfo(svm_sc, vcpu);
1328 	svm_save_exitintinfo(svm_sc, vcpu);
1329 
1330 	switch (code) {
1331 	case VMCB_EXIT_CR0_READ:
1332 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1333 			svm_handle_cr0_read(svm_sc, vcpu,
1334 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1335 			handled = 1;
1336 		} else {
1337 			/*
1338 			 * If SMSW is used to read the contents of %cr0, then
1339 			 * the VALID bit will not be set in `info1`, since the
1340 			 * handling is different from the mov-to-reg case.
1341 			 *
1342 			 * Punt to the instruction emulation to handle it.
1343 			 */
1344 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1345 		}
1346 		break;
1347 	case VMCB_EXIT_CR0_WRITE:
1348 	case VMCB_EXIT_CR0_SEL_WRITE:
1349 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1350 			svm_handle_cr0_write(svm_sc, vcpu,
1351 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1352 			handled = 1;
1353 		} else {
1354 			/*
1355 			 * Writes to %cr0 without VALID being set in `info1` are
1356 			 * initiated by the LMSW and CLTS instructions.  While
1357 			 * LMSW (like SMSW) sees little use in modern OSes and
1358 			 * bootloaders, CLTS is still used for handling FPU
1359 			 * state transitions.
1360 			 *
1361 			 * Punt to the instruction emulation to handle them.
1362 			 */
1363 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1364 		}
1365 		break;
1366 	case VMCB_EXIT_IRET:
1367 		/*
1368 		 * Restart execution at "iret" but with the intercept cleared.
1369 		 */
1370 		vmexit->inst_length = 0;
1371 		svm_clear_nmi_blocking(svm_sc, vcpu);
1372 		handled = 1;
1373 		break;
1374 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1375 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1376 		svm_disable_intr_window_exiting(svm_sc, vcpu);
1377 		handled = 1;
1378 		break;
1379 	case VMCB_EXIT_INTR:	/* external interrupt */
1380 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1381 		handled = 1;
1382 		break;
1383 	case VMCB_EXIT_NMI:
1384 	case VMCB_EXIT_SMI:
1385 	case VMCB_EXIT_INIT:
1386 		/*
1387 		 * For external NMI/SMI and physical INIT interrupts, simply
1388 		 * continue execution, as those host events will be handled by
1389 		 * the physical CPU.
1390 		 */
1391 		handled = 1;
1392 		break;
1393 	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1394 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1395 
1396 		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1397 		uint32_t errcode = 0;
1398 		bool reflect = true;
1399 		bool errcode_valid = false;
1400 
1401 		switch (idtvec) {
1402 		case IDT_MC:
1403 			/* The host will handle the MCE itself. */
1404 			reflect = false;
1405 			vmm_call_trap(T_MCE);
1406 			break;
1407 		case IDT_PF:
1408 			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1409 			    info2));
1410 			/* fallthru */
1411 		case IDT_NP:
1412 		case IDT_SS:
1413 		case IDT_GP:
1414 		case IDT_AC:
1415 		case IDT_TS:
1416 			errcode_valid = true;
1417 			errcode = info1;
1418 			break;
1419 
1420 		case IDT_DF:
1421 			errcode_valid = true;
1422 			break;
1423 
1424 		case IDT_BP:
1425 		case IDT_OF:
1426 		case IDT_BR:
1427 			/*
1428 			 * The 'nrip' field is populated for INT3, INTO and
1429 			 * BOUND exceptions and this also implies that
1430 			 * 'inst_length' is non-zero.
1431 			 *
1432 			 * Reset 'inst_length' to zero so the guest %rip at
1433 			 * event injection is identical to what it was when
1434 			 * the exception originally happened.
1435 			 */
1436 			vmexit->inst_length = 0;
1437 			/* fallthru */
1438 		default:
1439 			errcode_valid = false;
1440 			break;
1441 		}
1442 		VERIFY0(vmexit->inst_length);
1443 
1444 		if (reflect) {
1445 			/* Reflect the exception back into the guest */
1446 			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1447 			    errcode_valid, errcode, false));
1448 		}
1449 		handled = 1;
1450 		break;
1451 		}
1452 	case VMCB_EXIT_MSR:
1453 		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1454 		break;
1455 	case VMCB_EXIT_IO:
1456 		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1457 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1458 		break;
1459 	case VMCB_EXIT_SHUTDOWN:
1460 		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT);
1461 		handled = 1;
1462 		break;
1463 	case VMCB_EXIT_INVD:
1464 	case VMCB_EXIT_INVLPGA:
1465 		/* privileged invalidation instructions */
1466 		vm_inject_ud(svm_sc->vm, vcpu);
1467 		handled = 1;
1468 		break;
1469 	case VMCB_EXIT_VMRUN:
1470 	case VMCB_EXIT_VMLOAD:
1471 	case VMCB_EXIT_VMSAVE:
1472 	case VMCB_EXIT_STGI:
1473 	case VMCB_EXIT_CLGI:
1474 	case VMCB_EXIT_SKINIT:
1475 		/* privileged vmm instructions */
1476 		vm_inject_ud(svm_sc->vm, vcpu);
1477 		handled = 1;
1478 		break;
1479 	case VMCB_EXIT_VMMCALL:
1480 		/* No handlers make use of VMMCALL for now */
1481 		vm_inject_ud(svm_sc->vm, vcpu);
1482 		handled = 1;
1483 		break;
1484 	case VMCB_EXIT_CPUID:
1485 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1486 		handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1487 		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1488 		break;
1489 	case VMCB_EXIT_HLT:
1490 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1491 		vmexit->exitcode = VM_EXITCODE_HLT;
1492 		vmexit->u.hlt.rflags = state->rflags;
1493 		break;
1494 	case VMCB_EXIT_PAUSE:
1495 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1496 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1497 		break;
1498 	case VMCB_EXIT_NPF:
1499 		/* EXITINFO2 contains the faulting guest physical address */
1500 		if (info1 & VMCB_NPF_INFO1_RSV) {
1501 			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
1502 			    "reserved bits set: info1(%lx) info2(%lx)",
1503 			    info1, info2);
1504 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1505 			vmexit->exitcode = VM_EXITCODE_PAGING;
1506 			vmexit->u.paging.gpa = info2;
1507 			vmexit->u.paging.fault_type = npf_fault_type(info1);
1508 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1509 			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
1510 			    "on gpa %lx/%lx at rip %lx",
1511 			    info2, info1, state->rip);
1512 		} else if (svm_npf_emul_fault(info1)) {
1513 			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1514 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1515 			VCPU_CTR3(svm_sc->vm, vcpu, "mmio_emul fault "
1516 			    "for gpa %lx/%lx at rip %lx",
1517 			    info2, info1, state->rip);
1518 		}
1519 		break;
1520 	case VMCB_EXIT_MONITOR:
1521 		vmexit->exitcode = VM_EXITCODE_MONITOR;
1522 		break;
1523 	case VMCB_EXIT_MWAIT:
1524 		vmexit->exitcode = VM_EXITCODE_MWAIT;
1525 		break;
1526 	default:
1527 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1528 		break;
1529 	}
1530 
1531 	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1532 	    code);
1533 
1534 	if (handled) {
1535 		vmexit->rip += vmexit->inst_length;
1536 		vmexit->inst_length = 0;
1537 		state->rip = vmexit->rip;
1538 	} else {
1539 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1540 			/*
1541 			 * If this VM exit was not claimed by anybody then
1542 			 * treat it as a generic SVM exit.
1543 			 */
1544 			vm_exit_svm(vmexit, code, info1, info2);
1545 		} else {
1546 			/*
1547 			 * The exitcode and collateral have been populated.
1548 			 * The VM exit will be processed further in userland.
1549 			 */
1550 		}
1551 	}
1552 	return (handled);
1553 }
1554 
1555 /*
1556  * Inject exceptions, NMIs, and ExtINTs.
1557  *
1558  * The logic behind these is complicated and may involve mutex contention, so
1559  * the injection is performed without the protection of host CPU interrupts
1560  * being disabled.  This means a racing notification could be "lost",
1561  * necessitating a later call to svm_inject_recheck() to close that window
1562  * of opportunity.
1563  */
1564 static enum event_inject_state
1565 svm_inject_events(struct svm_softc *sc, int vcpu)
1566 {
1567 	struct vmcb_ctrl *ctrl;
1568 	struct vmcb_state *state;
1569 	struct svm_vcpu *vcpustate;
1570 	uint64_t intinfo;
1571 	enum event_inject_state ev_state;
1572 
1573 	state = svm_get_vmcb_state(sc, vcpu);
1574 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1575 	vcpustate = svm_get_vcpu(sc, vcpu);
1576 	ev_state = EIS_CAN_INJECT;
1577 
1578 	/* Clear any interrupt shadow if guest %rip has changed */
1579 	if (vcpustate->nextrip != state->rip) {
1580 		ctrl->intr_shadow = 0;
1581 	}
1582 
1583 	/*
1584 	 * An event is already pending for injection.  This can occur when the
1585 	 * vCPU exits prior to VM entry (like for an AST).
1586 	 */
1587 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1588 		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1589 	}
1590 
1591 	/*
1592 	 * Inject pending events or exceptions for this vcpu.
1593 	 *
1594 	 * An event might be pending because the previous #VMEXIT happened
1595 	 * during event delivery (i.e. ctrl->exitintinfo).
1596 	 *
1597 	 * An event might also be pending because an exception was injected
1598 	 * by the hypervisor (e.g. #PF during instruction emulation).
1599 	 */
1600 	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1601 		svm_inject_event(ctrl, intinfo);
1602 		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1603 		ev_state = EIS_EV_INJECTED;
1604 	}
1605 
1606 	/* NMI event has priority over interrupts. */
1607 	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1608 		if (ev_state == EIS_CAN_INJECT) {
1609 			/* Can't inject NMI if vcpu is in an intr_shadow. */
1610 			if (ctrl->intr_shadow) {
1611 				return (EIS_GI_BLOCK);
1612 			}
1613 
1614 			svm_inject_nmi(sc, vcpu);
1615 			ev_state = EIS_EV_INJECTED;
1616 		} else {
1617 			return (ev_state | EIS_REQ_EXIT);
1618 		}
1619 	}
1620 
1621 	if (vm_extint_pending(sc->vm, vcpu)) {
1622 		int vector;
1623 
1624 		if (ev_state != EIS_CAN_INJECT) {
1625 			return (ev_state | EIS_REQ_EXIT);
1626 		}
1627 
1628 		/*
1629 		 * If the guest has disabled interrupts or is in an interrupt
1630 		 * shadow then we cannot inject the pending interrupt.
1631 		 */
1632 		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1633 			return (EIS_GI_BLOCK);
1634 		}
1635 
1636 		/* Ask the legacy pic for a vector to inject */
1637 		vatpic_pending_intr(sc->vm, &vector);
1638 		KASSERT(vector >= 0 && vector <= 255,
1639 		    ("invalid vector %d from INTR", vector));
1640 
1641 		svm_inject_irq(sc, vcpu, vector);
1642 		vm_extint_clear(sc->vm, vcpu);
1643 		vatpic_intr_accepted(sc->vm, vector);
1644 		ev_state = EIS_EV_INJECTED;
1645 	}
1646 
1647 	return (ev_state);
1648 }
1649 
1650 /*
1651  * Synchronize vLAPIC state and inject any interrupts pending on it.
1652  *
1653  * This is done with host CPU interrupts disabled so notification IPIs will be
1654  * queued on the host APIC and recognized when entering SVM guest context.
1655  */
1656 static enum event_inject_state
1657 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1658     enum event_inject_state ev_state)
1659 {
1660 	struct vmcb_ctrl *ctrl;
1661 	struct vmcb_state *state;
1662 	int vector;
1663 	uint8_t v_tpr;
1664 
1665 	state = svm_get_vmcb_state(sc, vcpu);
1666 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1667 
1668 	/*
1669 	 * The guest can modify the TPR by writing to %cr8. In guest mode the
1670 	 * CPU reflects this write to V_TPR without hypervisor intervention.
1671 	 *
1672 	 * The guest can also modify the TPR by writing to it via the memory
1673 	 * mapped APIC page. In this case, the write will be emulated by the
1674 	 * hypervisor. For this reason V_TPR must be updated before every
1675 	 * VMRUN.
1676 	 */
1677 	v_tpr = vlapic_get_cr8(vlapic);
1678 	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1679 	if (ctrl->v_tpr != v_tpr) {
1680 		ctrl->v_tpr = v_tpr;
1681 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1682 	}
1683 
1684 	/* If an event cannot otherwise be injected, we are done for now */
1685 	if (ev_state != EIS_CAN_INJECT) {
1686 		return (ev_state);
1687 	}
1688 
1689 	if (!vlapic_pending_intr(vlapic, &vector)) {
1690 		return (EIS_CAN_INJECT);
1691 	}
1692 	KASSERT(vector >= 16 && vector <= 255,
1693 	    ("invalid vector %d from local APIC", vector));
1694 
1695 	/*
1696 	 * If the guest has disabled interrupts or is in an interrupt shadow
1697 	 * then we cannot inject the pending interrupt.
1698 	 */
1699 	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1700 		return (EIS_GI_BLOCK);
1701 	}
1702 
1703 	svm_inject_irq(sc, vcpu, vector);
1704 	vlapic_intr_accepted(vlapic, vector);
1705 	return (EIS_EV_INJECTED);
1706 }
1707 
1708 /*
1709  * Re-check for events to be injected.
1710  *
1711  * Once host CPU interrupts are disabled, check for the presence of any events
1712  * which require injection processing.  If an exit is required upon injection,
1713  * or once the guest becomes interruptible, that will be configured too.
1714  */
1715 static bool
1716 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1717     enum event_inject_state ev_state)
1718 {
1719 	struct vmcb_ctrl *ctrl;
1720 
1721 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1722 
1723 	if (ev_state == EIS_CAN_INJECT) {
1724 		/*
1725 		 * An active interrupt shadow would preclude us from injecting
1726 		 * any events picked up during a re-check.
1727 		 */
1728 		if (ctrl->intr_shadow != 0) {
1729 			return (false);
1730 		}
1731 
1732 		if (vm_nmi_pending(sc->vm, vcpu) &&
1733 		    !svm_nmi_blocked(sc, vcpu)) {
1734 			/* queued NMI not blocked by NMI-window-exiting */
1735 			return (true);
1736 		}
1737 		if (vm_extint_pending(sc->vm, vcpu)) {
1738 			/* queued ExtINT not blocked by existing injection */
1739 			return (true);
1740 		}
1741 	} else {
1742 		if ((ev_state & EIS_REQ_EXIT) != 0) {
1743 			/*
1744 			 * Use a self-IPI to force an immediate exit after
1745 			 * event injection has occurred.
1746 			 */
1747 			poke_cpu(CPU->cpu_id);
1748 		} else {
1749 			/*
1750 			 * If any event is being injected, an exit immediately
1751 			 * upon becoming interruptible again will allow pending
1752 			 * or newly queued events to be injected in a timely
1753 			 * manner.
1754 			 */
1755 			svm_enable_intr_window_exiting(sc, vcpu);
1756 		}
1757 	}
1758 	return (false);
1759 }
1760 
1761 
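/*
 * Refresh this vCPU's ASID (and any required TLB flush) via HMA prior to
 * VMRUN, requesting a flush when the nested page table generation has changed
 * since the vCPU last ran.
 */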
1762 static void
1763 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1764 {
1765 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1766 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1767 	uint8_t flush;
1768 
1769 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1770 	    vcpustate->nptgen != nptgen);
1771 
1772 	if (flush != VMCB_TLB_FLUSH_NOTHING) {
1773 		ctrl->asid = vcpustate->hma_asid.hsa_asid;
1774 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1775 	}
1776 	ctrl->tlb_ctrl = flush;
1777 	vcpustate->nptgen = nptgen;
1778 }
1779 
1780 static void
1781 flush_asid(struct svm_softc *sc, int vcpuid)
1782 {
1783 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1784 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1785 	uint8_t flush;
1786 
1787 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1788 	    true);
1789 
1790 	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1791 	ctrl->asid = vcpustate->hma_asid.hsa_asid;
1792 	ctrl->tlb_ctrl = flush;
1793 	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1794 	/*
1795 	 * A potential future optimization: update the nptgen associated with the
1796 	 * vCPU here, since any pending nptgen change requiring a flush will be
1797 	 * satisfied by the flush which has just been queued.
1798 	 */
1799 }
1800 
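/*
 * CLGI/STGI manipulate the Global Interrupt Flag (GIF).  While GIF is clear,
 * physical interrupts and NMIs are held pending on the host, which is what
 * allows guest state to be loaded and VMRUN issued atomically with respect
 * to host interrupt handling.
 */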
1801 static __inline void
1802 disable_gintr(void)
1803 {
1804 	__asm __volatile("clgi");
1805 }
1806 
1807 static __inline void
1808 enable_gintr(void)
1809 {
1810 	__asm __volatile("stgi");
1811 }
1812 
1813 static __inline void
1814 svm_dr_enter_guest(struct svm_regctx *gctx)
1815 {
1816 
1817 	/* Save host control debug registers. */
1818 	gctx->host_dr7 = rdr7();
1819 	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1820 
1821 	/*
1822 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1823 	 * exceptions in the host based on the guest DRx values.  The
1824 	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1825 	 * VMCB.
1826 	 */
1827 	load_dr7(0);
1828 	wrmsr(MSR_DEBUGCTLMSR, 0);
1829 
1830 	/* Save host debug registers. */
1831 	gctx->host_dr0 = rdr0();
1832 	gctx->host_dr1 = rdr1();
1833 	gctx->host_dr2 = rdr2();
1834 	gctx->host_dr3 = rdr3();
1835 	gctx->host_dr6 = rdr6();
1836 
1837 	/* Restore guest debug registers. */
1838 	load_dr0(gctx->sctx_dr0);
1839 	load_dr1(gctx->sctx_dr1);
1840 	load_dr2(gctx->sctx_dr2);
1841 	load_dr3(gctx->sctx_dr3);
1842 }
1843 
1844 static __inline void
1845 svm_dr_leave_guest(struct svm_regctx *gctx)
1846 {
1847 
1848 	/* Save guest debug registers. */
1849 	gctx->sctx_dr0 = rdr0();
1850 	gctx->sctx_dr1 = rdr1();
1851 	gctx->sctx_dr2 = rdr2();
1852 	gctx->sctx_dr3 = rdr3();
1853 
1854 	/*
1855 	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
1856 	 * last.
1857 	 */
1858 	load_dr0(gctx->host_dr0);
1859 	load_dr1(gctx->host_dr1);
1860 	load_dr2(gctx->host_dr2);
1861 	load_dr3(gctx->host_dr3);
1862 	load_dr6(gctx->host_dr6);
1863 	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1864 	load_dr7(gctx->host_dr7);
1865 }
1866 
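/*
 * Keep the VMCB TSC offset in sync with the offset tracked for this vCPU.
 * The TSC offset lives in the VMCB region covered by the same clean-bit as
 * the intercept vectors, hence the use of VMCB_CACHE_I when it changes.
 */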
1867 static void
1868 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1869 {
1870 	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1871 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1872 
1873 	if (ctrl->tsc_offset != offset) {
1874 		ctrl->tsc_offset = offset;
1875 		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1876 	}
1877 }
1878 
1879 
1880 /*
1881  * Start vcpu with specified RIP.
1882  */
1883 static int
1884 svm_vmrun(void *arg, int vcpu, uint64_t rip)
1885 {
1886 	struct svm_regctx *gctx;
1887 	struct svm_softc *svm_sc;
1888 	struct svm_vcpu *vcpustate;
1889 	struct vmcb_state *state;
1890 	struct vmcb_ctrl *ctrl;
1891 	struct vm_exit *vmexit;
1892 	struct vlapic *vlapic;
1893 	vm_client_t *vmc;
1894 	struct vm *vm;
1895 	uint64_t vmcb_pa;
1896 	int handled;
1897 	uint16_t ldt_sel;
1898 
1899 	svm_sc = arg;
1900 	vm = svm_sc->vm;
1901 
1902 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1903 	state = svm_get_vmcb_state(svm_sc, vcpu);
1904 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1905 	vmexit = vm_exitinfo(vm, vcpu);
1906 	vlapic = vm_lapic(vm, vcpu);
1907 	vmc = vm_get_vmclient(vm, vcpu);
1908 
1909 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1910 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1911 
1912 	if (vcpustate->lastcpu != curcpu) {
1913 		/*
1914 		 * Force new ASID allocation by invalidating the generation.
1915 		 */
1916 		vcpustate->hma_asid.hsa_gen = 0;
1917 
1918 		/*
1919 		 * Invalidate the VMCB state cache by marking all fields dirty.
1920 		 */
1921 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1922 
1923 		/*
1924 		 * XXX
1925 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1926 		 * we may return from this function without actually executing
1927 		 * the VMRUN instruction. This could happen if an AST or yield
1928 		 * condition is pending on the first time through the loop.
1929 		 *
1930 		 * This works for now but any new side-effects of vcpu
1931 		 * migration should take this case into account.
1932 		 */
1933 		vcpustate->lastcpu = curcpu;
1934 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1935 	}
1936 
1937 	svm_apply_tsc_adjust(svm_sc, vcpu);
1938 
1939 	svm_msr_guest_enter(svm_sc, vcpu);
1940 
1941 	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1942 	vcpustate->loaded = B_TRUE;
1943 
1944 	/* Update Guest RIP */
1945 	state->rip = rip;
1946 
1947 	do {
1948 		enum event_inject_state inject_state;
1949 		uint64_t nptgen;
1950 
1951 		/*
1952 		 * Initial event injection is complex and may involve mutex
1953 		 * contention, so it must be performed with global interrupts
1954 		 * still enabled.
1955 		 */
1956 		inject_state = svm_inject_events(svm_sc, vcpu);
1957 		handled = 0;
1958 
1959 		/*
1960 		 * Disable global interrupts to guarantee atomicity during
1961 		 * loading of guest state. This includes not only the state
1962 		 * loaded by the "vmrun" instruction but also software state
1963 		 * maintained by the hypervisor: suspended and rendezvous
1964 		 * state, NPT generation number, vlapic interrupts etc.
1965 		 */
1966 		disable_gintr();
1967 
1968 		/*
1969 		 * Synchronizing and injecting vlapic state is lock-free and is
1970 		 * safe (and prudent) to perform with interrupts disabled.
1971 		 */
1972 		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
1973 		    inject_state);
1974 
1975 		/*
1976 		 * Check for vCPU bail-out conditions.  This must be done after
1977 		 * svm_inject_events() to detect a triple-fault condition.
1978 		 */
1979 		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
1980 			enable_gintr();
1981 			break;
1982 		}
1983 
1984 		if (vcpu_run_state_pending(vm, vcpu)) {
1985 			enable_gintr();
1986 			vm_exit_run_state(vm, vcpu, state->rip);
1987 			break;
1988 		}
1989 
1990 		/*
1991 		 * If subsequent activity queued events which require injection
1992 		 * handling, take another lap to handle them.
1993 		 */
1994 		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
1995 			enable_gintr();
1996 			handled = 1;
1997 			continue;
1998 		}
1999 
2000 		/*
2001 		 * #VMEXIT resumes the host with the guest LDTR, so
2002 		 * save the current LDT selector so it can be restored
2003 		 * after an exit.  The userspace hypervisor probably
2004 		 * doesn't use an LDT, but save and restore it to be
2005 		 * safe.
2006 		 */
2007 		ldt_sel = sldt();
2008 
2009 		/*
2010 		 * Check the vmspace and ASID generations to ensure that the
2011 		 * vcpu does not use stale TLB mappings.
2012 		 */
2013 		nptgen = vmc_table_enter(vmc);
2014 		check_asid(svm_sc, vcpu, curcpu, nptgen);
2015 
2016 		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
2017 		vcpustate->dirty = 0;
2018 		VCPU_CTR1(vm, vcpu, "vmcb clean %x", ctrl->vmcb_clean);
2019 
2020 		/* Launch Virtual Machine. */
2021 		vcpu_ustate_change(vm, vcpu, VU_RUN);
2022 		VCPU_CTR1(vm, vcpu, "Resume execution at %lx", state->rip);
2023 		svm_dr_enter_guest(gctx);
2024 		svm_launch(vmcb_pa, gctx, get_pcpu());
2025 		svm_dr_leave_guest(gctx);
2026 		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2027 
2028 		/* Restore host LDTR. */
2029 		lldt(ldt_sel);
2030 
2031 		/* #VMEXIT disables interrupts so re-enable them here. */
2032 		enable_gintr();
2033 
2034 		vmc_table_exit(vmc);
2035 
2036 		/* Update 'nextrip' */
2037 		vcpustate->nextrip = state->rip;
2038 
2039 		/* Handle #VMEXIT and if required return to user space. */
2040 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
2041 	} while (handled);
2042 
2043 	svm_msr_guest_exit(svm_sc, vcpu);
2044 
2045 	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2046 	vcpustate->loaded = B_FALSE;
2047 
2048 	return (0);
2049 }
2050 
2051 static void
2052 svm_vmcleanup(void *arg)
2053 {
2054 	struct svm_softc *sc = arg;
2055 
2056 	contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
2057 	contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
2058 	free(sc, M_SVM);
2059 }
2060 
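/*
 * Return a pointer to the software-maintained copy of a guest register, or
 * NULL if that register instead lives in the VMCB and must be accessed via
 * vmcb_regptr()/vmcb_segptr().  A sketch of the intended use, mirroring
 * svm_getreg() below:
 *
 *	uint64_t *regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
 *	if (regp != NULL) {
 *		*val = *regp;
 *	} else {
 *		(fall back to the VMCB for RIP/RSP/RAX, CRs, segments, etc.)
 *	}
 */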
2061 static uint64_t *
2062 swctx_regptr(struct svm_regctx *regctx, int reg)
2063 {
2064 	switch (reg) {
2065 	case VM_REG_GUEST_RBX:
2066 		return (&regctx->sctx_rbx);
2067 	case VM_REG_GUEST_RCX:
2068 		return (&regctx->sctx_rcx);
2069 	case VM_REG_GUEST_RDX:
2070 		return (&regctx->sctx_rdx);
2071 	case VM_REG_GUEST_RDI:
2072 		return (&regctx->sctx_rdi);
2073 	case VM_REG_GUEST_RSI:
2074 		return (&regctx->sctx_rsi);
2075 	case VM_REG_GUEST_RBP:
2076 		return (&regctx->sctx_rbp);
2077 	case VM_REG_GUEST_R8:
2078 		return (&regctx->sctx_r8);
2079 	case VM_REG_GUEST_R9:
2080 		return (&regctx->sctx_r9);
2081 	case VM_REG_GUEST_R10:
2082 		return (&regctx->sctx_r10);
2083 	case VM_REG_GUEST_R11:
2084 		return (&regctx->sctx_r11);
2085 	case VM_REG_GUEST_R12:
2086 		return (&regctx->sctx_r12);
2087 	case VM_REG_GUEST_R13:
2088 		return (&regctx->sctx_r13);
2089 	case VM_REG_GUEST_R14:
2090 		return (&regctx->sctx_r14);
2091 	case VM_REG_GUEST_R15:
2092 		return (&regctx->sctx_r15);
2093 	case VM_REG_GUEST_DR0:
2094 		return (&regctx->sctx_dr0);
2095 	case VM_REG_GUEST_DR1:
2096 		return (&regctx->sctx_dr1);
2097 	case VM_REG_GUEST_DR2:
2098 		return (&regctx->sctx_dr2);
2099 	case VM_REG_GUEST_DR3:
2100 		return (&regctx->sctx_dr3);
2101 	default:
2102 		return (NULL);
2103 	}
2104 }
2105 
2106 static int
2107 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2108 {
2109 	struct svm_softc *sc;
2110 	struct vmcb *vmcb;
2111 	uint64_t *regp;
2112 	uint64_t *fieldp;
2113 	struct vmcb_segment *seg;
2114 
2115 	sc = arg;
2116 	vmcb = svm_get_vmcb(sc, vcpu);
2117 
2118 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2119 	if (regp != NULL) {
2120 		*val = *regp;
2121 		return (0);
2122 	}
2123 
2124 	switch (ident) {
2125 	case VM_REG_GUEST_INTR_SHADOW:
2126 		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2127 		break;
2128 
2129 	case VM_REG_GUEST_CR0:
2130 		svm_get_cr0(sc, vcpu, val);
2131 		break;
2132 	case VM_REG_GUEST_CR2:
2133 	case VM_REG_GUEST_CR3:
2134 	case VM_REG_GUEST_CR4:
2135 	case VM_REG_GUEST_DR6:
2136 	case VM_REG_GUEST_DR7:
2137 	case VM_REG_GUEST_EFER:
2138 	case VM_REG_GUEST_RAX:
2139 	case VM_REG_GUEST_RFLAGS:
2140 	case VM_REG_GUEST_RIP:
2141 	case VM_REG_GUEST_RSP:
2142 		fieldp = vmcb_regptr(vmcb, ident, NULL);
2143 		*val = *fieldp;
2144 		break;
2145 
2146 	case VM_REG_GUEST_CS:
2147 	case VM_REG_GUEST_DS:
2148 	case VM_REG_GUEST_ES:
2149 	case VM_REG_GUEST_FS:
2150 	case VM_REG_GUEST_GS:
2151 	case VM_REG_GUEST_SS:
2152 	case VM_REG_GUEST_LDTR:
2153 	case VM_REG_GUEST_TR:
2154 		seg = vmcb_segptr(vmcb, ident);
2155 		*val = seg->selector;
2156 		break;
2157 
2158 	case VM_REG_GUEST_GDTR:
2159 	case VM_REG_GUEST_IDTR:
2160 		/* GDTR and IDTR don't have segment selectors */
2161 		return (EINVAL);
2162 
2163 	default:
2164 		return (EINVAL);
2165 	}
2166 
2167 	return (0);
2168 }
2169 
2170 static int
2171 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2172 {
2173 	struct svm_softc *sc;
2174 	struct vmcb *vmcb;
2175 	uint64_t *regp;
2176 	uint64_t *fieldp;
2177 	uint32_t dirty;
2178 	struct vmcb_segment *seg;
2179 
2180 	sc = arg;
2181 	vmcb = svm_get_vmcb(sc, vcpu);
2182 
2183 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2184 	if (regp != NULL) {
2185 		*regp = val;
2186 		return (0);
2187 	}
2188 
2189 	dirty = VMCB_CACHE_NONE;
2190 	switch (ident) {
2191 	case VM_REG_GUEST_INTR_SHADOW:
2192 		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2193 		break;
2194 
2195 	case VM_REG_GUEST_EFER:
2196 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2197 		/* EFER_SVM must always be set when the guest is executing */
2198 		*fieldp = val | EFER_SVM;
2199 		dirty |= VMCB_CACHE_CR;
2200 		break;
2201 
2202 	case VM_REG_GUEST_CR0:
2203 		svm_set_cr0(sc, vcpu, val, false);
2204 		break;
2205 	case VM_REG_GUEST_CR2:
2206 	case VM_REG_GUEST_CR3:
2207 	case VM_REG_GUEST_CR4:
2208 	case VM_REG_GUEST_DR6:
2209 	case VM_REG_GUEST_DR7:
2210 	case VM_REG_GUEST_RAX:
2211 	case VM_REG_GUEST_RFLAGS:
2212 	case VM_REG_GUEST_RIP:
2213 	case VM_REG_GUEST_RSP:
2214 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2215 		*fieldp = val;
2216 		break;
2217 
2218 	case VM_REG_GUEST_CS:
2219 	case VM_REG_GUEST_DS:
2220 	case VM_REG_GUEST_ES:
2221 	case VM_REG_GUEST_SS:
2222 	case VM_REG_GUEST_FS:
2223 	case VM_REG_GUEST_GS:
2224 	case VM_REG_GUEST_LDTR:
2225 	case VM_REG_GUEST_TR:
2226 		dirty |= VMCB_CACHE_SEG;
2227 		seg = vmcb_segptr(vmcb, ident);
2228 		seg->selector = (uint16_t)val;
2229 		break;
2230 
2231 	case VM_REG_GUEST_GDTR:
2232 	case VM_REG_GUEST_IDTR:
2233 		/* GDTR and IDTR don't have segment selectors */
2234 		return (EINVAL);
2235 
2236 	default:
2237 		return (EINVAL);
2238 	}
2239 
2240 	if (dirty != VMCB_CACHE_NONE) {
2241 		svm_set_dirty(sc, vcpu, dirty);
2242 	}
2243 
2244 	/*
2245 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2246 	 * vcpu's ASID. This needs to be treated differently depending on
2247 	 * whether the vcpu is currently running.
2248 	 */
2249 
2250 	return (0);
2251 }
2252 
2253 static int
2254 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2255 {
2256 	struct vmcb *vmcb;
2257 	struct svm_softc *sc;
2258 	struct vmcb_segment *seg;
2259 
2260 	sc = arg;
2261 	vmcb = svm_get_vmcb(sc, vcpu);
2262 
2263 	switch (reg) {
2264 	case VM_REG_GUEST_CS:
2265 	case VM_REG_GUEST_DS:
2266 	case VM_REG_GUEST_ES:
2267 	case VM_REG_GUEST_SS:
2268 	case VM_REG_GUEST_FS:
2269 	case VM_REG_GUEST_GS:
2270 	case VM_REG_GUEST_LDTR:
2271 	case VM_REG_GUEST_TR:
2272 		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2273 		seg = vmcb_segptr(vmcb, reg);
2274 		/*
2275 		 * Map seg_desc access to VMCB attribute format.
2276 		 *
2277 		 * SVM uses the 'P' bit in the segment attributes to indicate a
2278 		 * NULL segment so clear it if the segment is marked unusable.
2279 		 */
2280 		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2281 		if (SEG_DESC_UNUSABLE(desc->access)) {
2282 			seg->attrib &= ~0x80;
2283 		}
2284 		break;
2285 
2286 	case VM_REG_GUEST_GDTR:
2287 	case VM_REG_GUEST_IDTR:
2288 		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2289 		seg = vmcb_segptr(vmcb, reg);
2290 		break;
2291 
2292 	default:
2293 		return (EINVAL);
2294 	}
2295 
2296 	ASSERT(seg != NULL);
2297 	seg->base = desc->base;
2298 	seg->limit = desc->limit;
2299 
2300 	return (0);
2301 }
2302 
2303 static int
2304 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2305 {
2306 	struct vmcb *vmcb;
2307 	struct svm_softc *sc;
2308 	struct vmcb_segment *seg;
2309 
2310 	sc = arg;
2311 	vmcb = svm_get_vmcb(sc, vcpu);
2312 
2313 	switch (reg) {
2314 	case VM_REG_GUEST_DS:
2315 	case VM_REG_GUEST_ES:
2316 	case VM_REG_GUEST_FS:
2317 	case VM_REG_GUEST_GS:
2318 	case VM_REG_GUEST_SS:
2319 	case VM_REG_GUEST_LDTR:
2320 		seg = vmcb_segptr(vmcb, reg);
2321 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2322 		/*
2323 		 * VT-x uses bit 16 to indicate a segment that has been loaded
2324 		 * with a NULL selector (aka unusable). The 'desc->access'
2325 		 * field is interpreted in the VT-x format by the
2326 		 * processor-independent code.
2327 		 *
2328 		 * SVM uses the 'P' bit to convey the same information so
2329 		 * convert it into the VT-x format. For more details refer to
2330 		 * section "Segment State in the VMCB" in APMv2.
2331 		 */
2332 		if ((desc->access & 0x80) == 0) {
2333 			/* Unusable segment */
2334 			desc->access |= 0x10000;
2335 		}
2336 		break;
2337 
2338 	case VM_REG_GUEST_CS:
2339 	case VM_REG_GUEST_TR:
2340 		seg = vmcb_segptr(vmcb, reg);
2341 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2342 		break;
2343 
2344 	case VM_REG_GUEST_GDTR:
2345 	case VM_REG_GUEST_IDTR:
2346 		seg = vmcb_segptr(vmcb, reg);
2347 		/*
2348 		 * Since there are no access bits associated with the GDTR or
2349 		 * the IDTR, zero out the field to ensure it does not contain
2350 		 * garbage which might confuse the consumer.
2351 		 */
2352 		desc->access = 0;
2353 		break;
2354 
2355 	default:
2356 		return (EINVAL);
2357 	}
2358 
2359 	ASSERT(seg != NULL);
2360 	desc->base = seg->base;
2361 	desc->limit = seg->limit;
2362 	return (0);
2363 }
2364 
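/*
 * VM_CAP_HALT_EXIT and VM_CAP_PAUSE_EXIT are implemented purely as VMCB
 * intercepts: enabling a capability sets the corresponding HLT or PAUSE
 * intercept so the instruction causes a #VMEXIT rather than executing
 * directly in the guest.
 */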
2365 static int
2366 svm_setcap(void *arg, int vcpu, int type, int val)
2367 {
2368 	struct svm_softc *sc;
2369 	int error;
2370 
2371 	sc = arg;
2372 	error = 0;
2373 	switch (type) {
2374 	case VM_CAP_HALT_EXIT:
2375 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2376 		    VMCB_INTCPT_HLT, val);
2377 		break;
2378 	case VM_CAP_PAUSE_EXIT:
2379 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2380 		    VMCB_INTCPT_PAUSE, val);
2381 		break;
2382 	default:
2383 		error = ENOENT;
2384 		break;
2385 	}
2386 	return (error);
2387 }
2388 
2389 static int
2390 svm_getcap(void *arg, int vcpu, int type, int *retval)
2391 {
2392 	struct svm_softc *sc;
2393 	int error;
2394 
2395 	sc = arg;
2396 	error = 0;
2397 
2398 	switch (type) {
2399 	case VM_CAP_HALT_EXIT:
2400 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2401 		    VMCB_INTCPT_HLT);
2402 		break;
2403 	case VM_CAP_PAUSE_EXIT:
2404 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2405 		    VMCB_INTCPT_PAUSE);
2406 		break;
2407 	default:
2408 		error = ENOENT;
2409 		break;
2410 	}
2411 	return (error);
2412 }
2413 
2414 static struct vlapic *
2415 svm_vlapic_init(void *arg, int vcpuid)
2416 {
2417 	struct svm_softc *svm_sc;
2418 	struct vlapic *vlapic;
2419 
2420 	svm_sc = arg;
2421 	vlapic = malloc(sizeof (struct vlapic), M_SVM_VLAPIC,
2422 	    M_WAITOK | M_ZERO);
2423 	vlapic->vm = svm_sc->vm;
2424 	vlapic->vcpuid = vcpuid;
2425 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2426 
2427 	vlapic_init(vlapic);
2428 
2429 	return (vlapic);
2430 }
2431 
2432 static void
2433 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2434 {
2435 	vlapic_cleanup(vlapic);
2436 	free(vlapic, M_SVM_VLAPIC);
2437 }
2438 
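/*
 * The save/restore context hooks are expected to run when the thread backing
 * a vCPU is switched off of, or back onto, a host CPU.  Guest MSR state is
 * only live while the vCPU is loaded (between svm_msr_guest_enter() and
 * svm_msr_guest_exit() in svm_vmrun()), so these hooks only act within that
 * window.
 */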
2439 static void
2440 svm_savectx(void *arg, int vcpu)
2441 {
2442 	struct svm_softc *sc = arg;
2443 
2444 	if (sc->vcpu[vcpu].loaded) {
2445 		svm_msr_guest_exit(sc, vcpu);
2446 	}
2447 }
2448 
2449 static void
2450 svm_restorectx(void *arg, int vcpu)
2451 {
2452 	struct svm_softc *sc = arg;
2453 
2454 	if (sc->vcpu[vcpu].loaded) {
2455 		svm_msr_guest_enter(sc, vcpu);
2456 	}
2457 }
2458 
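/*
 * Entry points exported to the machine-independent vmm code for the AMD SVM
 * backend.
 */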
2459 struct vmm_ops vmm_ops_amd = {
2460 	.init		= svm_init,
2461 	.cleanup	= svm_cleanup,
2462 	.resume		= svm_restore,
2463 
2464 	.vminit		= svm_vminit,
2465 	.vmrun		= svm_vmrun,
2466 	.vmcleanup	= svm_vmcleanup,
2467 	.vmgetreg	= svm_getreg,
2468 	.vmsetreg	= svm_setreg,
2469 	.vmgetdesc	= svm_getdesc,
2470 	.vmsetdesc	= svm_setdesc,
2471 	.vmgetcap	= svm_getcap,
2472 	.vmsetcap	= svm_setcap,
2473 	.vlapic_init	= svm_vlapic_init,
2474 	.vlapic_cleanup	= svm_vlapic_cleanup,
2475 
2476 	.vmsavectx	= svm_savectx,
2477 	.vmrestorectx	= svm_restorectx,
2478 };
2479