xref: /illumos-gate/usr/src/uts/intel/io/vmm/amd/svm.c (revision ae5a8bed14db6c16225cac733ea042c27e242d18)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  *
39  * Copyright 2018 Joyent, Inc.
40  * Copyright 2022 Oxide Computer Company
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/kmem.h>
50 #include <sys/pcpu.h>
51 #include <sys/proc.h>
52 #include <sys/sysctl.h>
53 
54 #include <sys/x86_archext.h>
55 #include <sys/trap.h>
56 
57 #include <machine/cpufunc.h>
58 #include <machine/psl.h>
59 #include <machine/md_var.h>
60 #include <machine/reg.h>
61 #include <machine/specialreg.h>
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <sys/vmm_instruction_emul.h>
65 #include <sys/vmm_vm.h>
66 #include <sys/vmm_kernel.h>
67 
68 #include "vmm_lapic.h"
69 #include "vmm_stat.h"
70 #include "vmm_ioport.h"
71 #include "vatpic.h"
72 #include "vlapic.h"
73 #include "vlapic_priv.h"
74 
75 #include "x86.h"
76 #include "vmcb.h"
77 #include "svm.h"
78 #include "svm_softc.h"
79 #include "svm_msr.h"
80 
81 SYSCTL_DECL(_hw_vmm);
82 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
83     NULL);
84 
85 /*
86  * SVM CPUID function 0x8000_000A, edx bit decoding.
87  */
88 #define	AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
89 #define	AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
90 #define	AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
91 #define	AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
92 #define	AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
93 #define	AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
94 #define	AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
95 #define	AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
96 #define	AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
97 #define	AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
98 #define	AMD_CPUID_SVM_AVIC		BIT(13)	/* AVIC present */
99 
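/*
 * Default set of VMCB clean-bits which may be honored, allowing the CPU to
 * reuse cached copies of the corresponding VMCB fields across VMRUNs.
 */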
100 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID	|	\
101 				VMCB_CACHE_IOPM		|	\
102 				VMCB_CACHE_I		|	\
103 				VMCB_CACHE_TPR		|	\
104 				VMCB_CACHE_CR2		|	\
105 				VMCB_CACHE_CR		|	\
106 				VMCB_CACHE_DR		|	\
107 				VMCB_CACHE_DT		|	\
108 				VMCB_CACHE_SEG		|	\
109 				VMCB_CACHE_NP)
110 
111 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
112 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
113     0, NULL);
114 
115 /* SVM features advertised by CPUID.8000000AH:EDX */
116 static uint32_t svm_feature = ~0U;	/* AMD SVM features. */
117 
118 static int disable_npf_assist;
119 
120 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
121 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
122 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
123 
124 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
125 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
126 static void flush_asid(struct svm_softc *sc, int vcpuid);
127 
128 static __inline bool
129 flush_by_asid(void)
130 {
131 	return ((svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID) != 0);
132 }
133 
134 static __inline bool
135 decode_assist(void)
136 {
137 	return ((svm_feature & AMD_CPUID_SVM_DECODE_ASSIST) != 0);
138 }
139 
140 static int
141 svm_cleanup(void)
142 {
143 	/* This is taken care of by the hma registration */
144 	return (0);
145 }
146 
147 static int
148 svm_init(void)
149 {
150 	vmcb_clean &= VMCB_CACHE_DEFAULT;
151 
152 	svm_msr_init();
153 
154 	return (0);
155 }
156 
157 static void
158 svm_restore(void)
159 {
160 	/* No-op on illumos */
161 }
162 
163 /* Pentium compatible MSRs */
164 #define	MSR_PENTIUM_START	0
165 #define	MSR_PENTIUM_END		0x1FFF
166 /* AMD 6th generation and Intel compatible MSRs */
167 #define	MSR_AMD6TH_START	0xC0000000UL
168 #define	MSR_AMD6TH_END		0xC0001FFFUL
169 /* AMD 7th and 8th generation compatible MSRs */
170 #define	MSR_AMD7TH_START	0xC0010000UL
171 #define	MSR_AMD7TH_END		0xC0011FFFUL
172 
173 /*
174  * Get the byte index and bit position for an MSR in the permission bitmap.
175  * Two bits are used per MSR: the lower bit for reads, the higher for writes.
176  */
177 static int
178 svm_msr_index(uint64_t msr, int *index, int *bit)
179 {
180 	uint32_t base, off;
181 
182 	*index = -1;
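	/* Each bitmap byte covers 4 MSRs, at two bits per MSR. */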
183 	*bit = (msr % 4) * 2;
184 	base = 0;
185 
186 	if (msr <= MSR_PENTIUM_END) {
187 		*index = msr / 4;
188 		return (0);
189 	}
190 
191 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
192 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
193 		off = (msr - MSR_AMD6TH_START);
194 		*index = (off + base) / 4;
195 		return (0);
196 	}
197 
198 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
199 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
200 		off = (msr - MSR_AMD7TH_START);
201 		*index = (off + base) / 4;
202 		return (0);
203 	}
204 
205 	return (EINVAL);
206 }
207 
208 /*
209  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
210  */
211 static void
212 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
213 {
214 	int index, bit, error;
215 
216 	error = svm_msr_index(msr, &index, &bit);
217 	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
218 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
219 	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
220 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
221 	    "msr %lx", __func__, bit, msr));
222 
223 	if (read)
224 		perm_bitmap[index] &= ~(1UL << bit);
225 
226 	if (write)
227 		perm_bitmap[index] &= ~(2UL << bit);
228 }
229 
230 static void
231 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
232 {
233 
234 	svm_msr_perm(perm_bitmap, msr, true, true);
235 }
236 
237 static void
238 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
239 {
240 
241 	svm_msr_perm(perm_bitmap, msr, true, false);
242 }
243 
244 static __inline int
245 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
246 {
247 	struct vmcb_ctrl *ctrl;
248 
249 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
250 
251 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
252 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
253 }
254 
255 static __inline void
256 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
257     int enabled)
258 {
259 	struct vmcb_ctrl *ctrl;
260 	uint32_t oldval;
261 
262 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
263 
264 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
265 	oldval = ctrl->intercept[idx];
266 
267 	if (enabled)
268 		ctrl->intercept[idx] |= bitmask;
269 	else
270 		ctrl->intercept[idx] &= ~bitmask;
271 
272 	if (ctrl->intercept[idx] != oldval) {
273 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
274 	}
275 }
276 
277 static __inline void
278 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
279 {
280 
281 	svm_set_intercept(sc, vcpu, off, bitmask, 0);
282 }
283 
284 static __inline void
285 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
286 {
287 
288 	svm_set_intercept(sc, vcpu, off, bitmask, 1);
289 }
290 
291 static void
292 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
293     uint64_t msrpm_base_pa, uint64_t np_pml4)
294 {
295 	struct vmcb_ctrl *ctrl;
296 	struct vmcb_state *state;
297 	uint32_t mask;
298 	int n;
299 
300 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
301 	state = svm_get_vmcb_state(sc, vcpu);
302 
303 	ctrl->iopm_base_pa = iopm_base_pa;
304 	ctrl->msrpm_base_pa = msrpm_base_pa;
305 
306 	/* Enable nested paging */
307 	ctrl->np_ctrl = NP_ENABLE;
308 	ctrl->n_cr3 = np_pml4;
309 
310 	/*
311 	 * Intercept accesses to the control registers that are not shadowed
312 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
313 	 */
314 	for (n = 0; n < 16; n++) {
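		/* Low 16 bits intercept CRn reads; high 16 bits intercept writes. */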
315 		mask = (BIT(n) << 16) | BIT(n);
316 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
317 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
318 		else
319 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
320 	}
321 
322 	/*
323 	 * Selectively intercept writes to %cr0.  This triggers on operations
324 	 * which would change bits other than TS or MP.
325 	 */
326 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
327 	    VMCB_INTCPT_CR0_WRITE);
328 
329 	/*
330 	 * Intercept everything when tracing guest exceptions; otherwise
331 	 * just intercept the machine check exception.
332 	 */
333 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
334 		for (n = 0; n < 32; n++) {
335 			/*
336 			 * Skip unimplemented vectors in the exception bitmap.
337 			 */
338 			if (n == 2 || n == 9) {
339 				continue;
340 			}
341 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
342 		}
343 	} else {
344 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
345 	}
346 
347 	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
348 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
349 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
350 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
351 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
352 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
353 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
354 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
355 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
356 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
357 	    VMCB_INTCPT_FERR_FREEZE);
358 
359 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
360 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
361 
362 	/* Intercept privileged invalidation instructions. */
363 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
364 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
365 
366 	/*
367 	 * Intercept all virtualization-related instructions.
368 	 *
369 	 * From section "Canonicalization and Consistency Checks" in APMv2
370 	 * the VMRUN intercept bit must be set to pass the consistency check.
371 	 */
372 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
373 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
374 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
375 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
376 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
377 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
378 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
379 
380 	/*
381 	 * The ASID will be set to a non-zero value just before VMRUN.
382 	 */
383 	ctrl->asid = 0;
384 
385 	/*
386 	 * Section 15.21.1, Interrupt Masking in EFLAGS
387 	 * Section 15.21.2, Virtualizing APIC.TPR
388 	 *
389 	 * This must be set for %rflags and %cr8 isolation of guest and host.
390 	 */
391 	ctrl->v_intr_ctrl |= V_INTR_MASKING;
392 
393 	/* Enable Last Branch Record aka LBR for debugging */
394 	ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
395 	state->dbgctl = BIT(0);
396 
397 	/* EFER_SVM must always be set when the guest is executing */
398 	state->efer = EFER_SVM;
399 
400 	/* Set up the PAT to power-on state */
401 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
402 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
403 	    PAT_VALUE(2, PAT_UNCACHED)		|
404 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
405 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
406 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
407 	    PAT_VALUE(6, PAT_UNCACHED)		|
408 	    PAT_VALUE(7, PAT_UNCACHEABLE);
409 
410 	/* Set up DR6/7 to power-on state */
411 	state->dr6 = DBREG_DR6_RESERVED1;
412 	state->dr7 = DBREG_DR7_RESERVED1;
413 }
414 
415 /*
416  * Initialize a virtual machine.
417  */
418 static void *
419 svm_vminit(struct vm *vm)
420 {
421 	struct svm_softc *svm_sc;
422 	struct svm_vcpu *vcpu;
423 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
424 	int i;
425 	uint16_t maxcpus;
426 
427 	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
428 	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0);
429 
430 	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
431 	if (svm_sc->msr_bitmap == NULL)
432 		panic("contigmalloc of SVM MSR bitmap failed");
433 	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
434 	if (svm_sc->iopm_bitmap == NULL)
435 		panic("contigmalloc of SVM IO bitmap failed");
436 
437 	svm_sc->vm = vm;
438 	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
439 
440 	/*
441 	 * Intercept read and write accesses to all MSRs.
442 	 */
443 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
444 
445 	/*
446 	 * Access to the following MSRs is redirected to the VMCB when the
447 	 * guest is executing. Therefore it is safe to allow the guest to
448 	 * read/write these MSRs directly without hypervisor involvement.
449 	 */
450 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
451 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
452 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
453 
454 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
455 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
456 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
457 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
458 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
459 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
460 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
461 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
462 
463 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
464 
465 	/*
466 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
467 	 */
468 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
469 
470 	/* Intercept access to all I/O ports. */
471 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
472 
473 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
474 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
475 	pml4_pa = svm_sc->nptp;
476 	maxcpus = vm_get_maxcpus(svm_sc->vm);
477 	for (i = 0; i < maxcpus; i++) {
478 		vcpu = svm_get_vcpu(svm_sc, i);
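		/* Seed nextrip with a sentinel which will not match the initial %rip. */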
479 		vcpu->nextrip = ~0;
480 		vcpu->lastcpu = NOCPU;
481 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
482 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
483 		svm_msr_guest_init(svm_sc, i);
484 	}
485 	return (svm_sc);
486 }
487 
488 /*
489  * Collateral for a generic SVM VM-exit.
490  */
491 static void
492 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
493 {
494 
495 	vme->exitcode = VM_EXITCODE_SVM;
496 	vme->u.svm.exitcode = code;
497 	vme->u.svm.exitinfo1 = info1;
498 	vme->u.svm.exitinfo2 = info2;
499 }
500 
501 static int
502 svm_cpl(struct vmcb_state *state)
503 {
504 
505 	/*
506 	 * From APMv2:
507 	 *   "Retrieve the CPL from the CPL field in the VMCB, not
508 	 *    from any segment DPL"
509 	 */
510 	return (state->cpl);
511 }
512 
513 static enum vm_cpu_mode
514 svm_vcpu_mode(struct vmcb *vmcb)
515 {
516 	struct vmcb_state *state;
517 
518 	state = &vmcb->state;
519 
520 	if (state->efer & EFER_LMA) {
521 		struct vmcb_segment *seg;
522 
523 		/*
524 		 * Section 4.8.1 for APM2, check if Code Segment has
525 		 * Long attribute set in descriptor.
526 		 */
527 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
528 		if (seg->attrib & VMCB_CS_ATTRIB_L)
529 			return (CPU_MODE_64BIT);
530 		else
531 			return (CPU_MODE_COMPATIBILITY);
532 	} else  if (state->cr0 & CR0_PE) {
533 		return (CPU_MODE_PROTECTED);
534 	} else {
535 		return (CPU_MODE_REAL);
536 	}
537 }
538 
539 static enum vm_paging_mode
540 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
541 {
542 
543 	if ((cr0 & CR0_PG) == 0)
544 		return (PAGING_MODE_FLAT);
545 	if ((cr4 & CR4_PAE) == 0)
546 		return (PAGING_MODE_32);
547 	if (efer & EFER_LME)
548 		return (PAGING_MODE_64);
549 	else
550 		return (PAGING_MODE_PAE);
551 }
552 
553 /*
554  * ins/outs utility routines
555  */
556 
557 static void
558 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
559 {
560 	struct vmcb_state *state;
561 
562 	state = &vmcb->state;
563 	paging->cr3 = state->cr3;
564 	paging->cpl = svm_cpl(state);
565 	paging->cpu_mode = svm_vcpu_mode(vmcb);
566 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
567 	    state->efer);
568 }
569 
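/* Exit handlers return UNHANDLED (0) when further processing of the exit is required. */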
570 #define	UNHANDLED 0
571 
572 /*
573  * Handle guest I/O intercept.
574  */
575 static int
576 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
577 {
578 	struct vmcb_ctrl *ctrl;
579 	struct vmcb_state *state;
580 	struct vm_inout *inout;
581 	struct vie *vie;
582 	uint64_t info1;
583 	struct vm_guest_paging paging;
584 
585 	state = svm_get_vmcb_state(svm_sc, vcpu);
586 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
587 	inout = &vmexit->u.inout;
588 	info1 = ctrl->exitinfo1;
589 
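	/*
	 * EXITINFO1[6:4] is a one-hot encoding of the operand size in bytes
	 * (1, 2, or 4).
	 */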
590 	inout->bytes = (info1 >> 4) & 0x7;
591 	inout->flags = 0;
592 	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
593 	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
594 	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
595 	inout->port = (uint16_t)(info1 >> 16);
596 	inout->eax = (uint32_t)(state->rax);
597 
598 	if ((inout->flags & INOUT_STR) != 0) {
599 		/*
600 		 * The effective segment number in EXITINFO1[12:10] is populated
601 		 * only if the processor has the DecodeAssist capability.
602 		 *
603 		 * This is not specified explicitly in APMv2 but can be verified
604 		 * empirically.
605 		 */
606 		if (!decode_assist()) {
607 			/*
608 			 * Without decoding assistance, force the task of
609 			 * emulating the ins/outs on userspace.
610 			 */
611 			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
612 			bzero(&vmexit->u.inst_emul,
613 			    sizeof (vmexit->u.inst_emul));
614 			return (UNHANDLED);
615 		}
616 
617 		/*
618 		 * Bits 7-9 encode the address size of ins/outs operations where
619 		 * the 1/2/4 values correspond to 16/32/64 bit sizes.
620 		 */
621 		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
622 		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
623 		    inout->addrsize == 8);
624 
625 		if (inout->flags & INOUT_IN) {
626 			/*
627 			 * For INS instructions, %es (encoded as 0) is the
628 			 * implied segment for the operation.
629 			 */
630 			inout->segment = 0;
631 		} else {
632 			/*
633 			 * Bits 10-12 encode the segment for OUTS.
634 			 * This value follows the standard x86 segment order.
635 			 */
636 			inout->segment = (info1 >> 10) & 0x7;
637 		}
638 	}
639 
640 	vmexit->exitcode = VM_EXITCODE_INOUT;
641 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
642 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
643 	vie_init_inout(vie, inout, vmexit->inst_length, &paging);
644 
645 	/* The in/out emulation will handle advancing %rip */
646 	vmexit->inst_length = 0;
647 
648 	return (UNHANDLED);
649 }
650 
651 static int
652 npf_fault_type(uint64_t exitinfo1)
653 {
654 
655 	if (exitinfo1 & VMCB_NPF_INFO1_W)
656 		return (PROT_WRITE);
657 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
658 		return (PROT_EXEC);
659 	else
660 		return (PROT_READ);
661 }
662 
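/*
 * Determine whether a nested page fault is a candidate for MMIO emulation:
 * instruction fetches and faults taken while walking the guest page tables
 * do not qualify, and the fault must be on a final guest-physical access.
 */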
663 static bool
664 svm_npf_emul_fault(uint64_t exitinfo1)
665 {
666 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
667 		return (false);
668 	}
669 
670 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
671 		return (false);
672 	}
673 
674 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
675 		return (false);
676 	}
677 
678 	return (true);
679 }
680 
681 static void
682 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
683     uint64_t gpa)
684 {
685 	struct vmcb_ctrl *ctrl;
686 	struct vmcb *vmcb;
687 	struct vie *vie;
688 	struct vm_guest_paging paging;
689 	struct vmcb_segment *seg;
690 	char *inst_bytes = NULL;
691 	uint8_t inst_len = 0;
692 
693 	vmcb = svm_get_vmcb(svm_sc, vcpu);
694 	ctrl = &vmcb->ctrl;
695 
696 	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
697 	vmexit->u.mmio_emul.gpa = gpa;
698 	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
699 	svm_paging_info(vmcb, &paging);
700 
701 	switch (paging.cpu_mode) {
702 	case CPU_MODE_REAL:
703 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
704 		vmexit->u.mmio_emul.cs_base = seg->base;
705 		vmexit->u.mmio_emul.cs_d = 0;
706 		break;
707 	case CPU_MODE_PROTECTED:
708 	case CPU_MODE_COMPATIBILITY:
709 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
710 		vmexit->u.mmio_emul.cs_base = seg->base;
711 
712 		/*
713 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
714 		 */
715 		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
716 		    1 : 0;
717 		break;
718 	default:
719 		vmexit->u.mmio_emul.cs_base = 0;
720 		vmexit->u.mmio_emul.cs_d = 0;
721 		break;
722 	}
723 
724 	/*
725 	 * Copy the instruction bytes into 'vie' if available.
726 	 */
727 	if (decode_assist() && !disable_npf_assist) {
728 		inst_len = ctrl->inst_len;
729 		inst_bytes = (char *)ctrl->inst_bytes;
730 	}
731 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
732 	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
733 }
734 
735 /*
736  * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
737  * which is live in the guest.  They are visible via the shadow instead.
738  */
739 #define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)
740 
741 static void
742 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
743 {
744 	struct vmcb_state *state;
745 	struct svm_regctx *regctx;
746 	uint64_t masked, old, diff;
747 
748 	state = svm_get_vmcb_state(svm_sc, vcpu);
749 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
750 
751 	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
752 	diff = old ^ val;
753 
754 	/* No further work needed if register contents remain the same */
755 	if (diff == 0) {
756 		return;
757 	}
758 
759 	/* Flush the TLB if the paging or write-protect bits are changing */
760 	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
761 		flush_asid(svm_sc, vcpu);
762 	}
763 
764 	/*
765 	 * If the change in %cr0 is due to a guest action (via interception)
766 	 * then other CPU state updates may be required.
767 	 */
768 	if (guest_write) {
769 		if ((diff & CR0_PG) != 0) {
770 			uint64_t efer = state->efer;
771 
772 			/* Keep the long-mode state in EFER in sync */
773 			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
774 				state->efer |= EFER_LMA;
775 			}
776 			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
777 				state->efer &= ~EFER_LMA;
778 			}
779 		}
780 	}
781 
782 	masked = val & SVM_CR0_MASK;
783 	regctx->sctx_cr0_shadow = val;
784 	state->cr0 = masked;
785 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
786 
787 	if ((masked ^ val) != 0) {
788 		/*
789 		 * The guest has set bits in %cr0 which we are masking out and
790 		 * exposing via shadow.
791 		 *
792 		 * We must intercept %cr0 reads in order to make the shadowed
793 		 * view available to the guest.
794 		 *
795 		 * Writes to %cr0 must also be intercepted (unconditionally,
796 		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
797 		 * if/when the guest clears those shadowed bits.
798 		 */
799 		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
800 		    BIT(0) | BIT(16));
801 	} else {
802 		/*
803 		 * When no bits remain in %cr0 which require shadowing, the
804 		 * unconditional intercept of reads/writes to %cr0 can be
805 		 * disabled.
806 		 *
807 		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
808 		 * in place so we can be notified of operations which change
809 		 * bits other than TS or MP.
810 		 */
811 		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
812 		    BIT(0) | BIT(16));
813 	}
814 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
815 }
816 
817 static void
818 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
819 {
820 	struct vmcb *vmcb;
821 	struct svm_regctx *regctx;
822 
823 	vmcb = svm_get_vmcb(svm_sc, vcpu);
824 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
825 
826 	/*
827 	 * Include the %cr0 bits which exist only in the shadow along with those
828 	 * in the running vCPU state.
829 	 */
830 	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
831 }
832 
833 static void
834 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
835 {
836 	uint64_t val;
837 	int err;
838 
839 	svm_get_cr0(svm_sc, vcpu, &val);
840 	err = svm_setreg(svm_sc, vcpu, reg, val);
841 	ASSERT(err == 0);
842 }
843 
844 static void
845 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
846 {
847 	struct vmcb_state *state;
848 	uint64_t val;
849 	int err;
850 
851 	state = svm_get_vmcb_state(svm_sc, vcpu);
852 
853 	err = svm_getreg(svm_sc, vcpu, reg, &val);
854 	ASSERT(err == 0);
855 
856 	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
857 		/* NW without CD is nonsensical */
858 		vm_inject_gp(svm_sc->vm, vcpu);
859 		return;
860 	}
861 	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
862 		/* PG requires PE */
863 		vm_inject_gp(svm_sc->vm, vcpu);
864 		return;
865 	}
866 	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
867 		/* When enabling paging, PAE must be enabled if LME is. */
868 		if ((state->efer & EFER_LME) != 0 &&
869 		    (state->cr4 & CR4_PAE) == 0) {
870 			vm_inject_gp(svm_sc->vm, vcpu);
871 			return;
872 		}
873 	}
874 
875 	svm_set_cr0(svm_sc, vcpu, val, true);
876 }
877 
878 static void
879 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
880 {
881 	struct vie *vie;
882 	struct vm_guest_paging paging;
883 
884 	/* Let the instruction emulation (hopefully in-kernel) handle it */
885 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
886 	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
887 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
888 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
889 	vie_init_other(vie, &paging);
890 
891 	/* The instruction emulation will handle advancing %rip */
892 	vmexit->inst_length = 0;
893 }
894 
895 static void
896 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
897 {
898 	struct vm *vm;
899 	struct vlapic *vlapic;
900 	struct vmcb_ctrl *ctrl;
901 
902 	vm = sc->vm;
903 	vlapic = vm_lapic(vm, vcpu);
904 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
905 
906 	/* Update %cr8 in the emulated vlapic */
907 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
908 
909 	/* Virtual interrupt injection is not used. */
910 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
911 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
912 }
913 
914 CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
915 CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
916 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
917 CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
918 CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
919 CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);
920 
921 static void
922 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
923 {
924 	struct vmcb_ctrl *ctrl;
925 	uint64_t intinfo;
926 	int err;
927 
928 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
929 	intinfo = ctrl->exitintinfo;
930 	if (!VMCB_EXITINTINFO_VALID(intinfo))
931 		return;
932 
933 	/*
934 	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
935 	 *
936 	 * If a #VMEXIT happened during event delivery then record the event
937 	 * that was being delivered.
938 	 */
939 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
940 	/*
941 	 * Relies on match between VMCB exitintinfo format and bhyve-generic
942 	 * format, which is ensured by CTASSERTs above.
943 	 */
944 	err = vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
945 	VERIFY0(err);
946 }
947 
948 static __inline int
949 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
950 {
951 
952 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
953 	    VMCB_INTCPT_VINTR));
954 }
955 
956 static void
957 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
958 {
959 	struct vmcb_ctrl *ctrl;
960 	struct vmcb_state *state;
961 
962 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
963 	state = svm_get_vmcb_state(sc, vcpu);
964 
965 	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
966 		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
967 		    ("%s: invalid v_ign_tpr", __func__));
968 		KASSERT(vintr_intercept_enabled(sc, vcpu),
969 		    ("%s: vintr intercept should be enabled", __func__));
970 		return;
971 	}
972 
973 	/*
974 	 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
975 	 * hypervisor as soon as a virtual interrupt can be delivered.
976 	 *
977 	 * Since injected events are not subject to intercept checks we need to
978 	 * ensure that the V_IRQ is not actually going to be delivered on VM
979 	 * entry.
980 	 */
981 	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
982 	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
983 
984 	ctrl->v_irq |= V_IRQ;
985 	ctrl->v_intr_prio |= V_IGN_TPR;
986 	ctrl->v_intr_vector = 0;
987 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
988 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
989 }
990 
991 static void
992 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
993 {
994 	struct vmcb_ctrl *ctrl;
995 
996 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
997 
998 	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
999 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
1000 		    ("%s: vintr intercept should be disabled", __func__));
1001 		return;
1002 	}
1003 
1004 	ctrl->v_irq &= ~V_IRQ;
1005 	ctrl->v_intr_vector = 0;
1006 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1007 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1008 }
1009 
1010 /*
1011  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1012  * executes an IRET. The IRET intercept is enabled when an NMI is injected
1013  * to track when the vcpu is done handling the NMI.
1014  */
1015 static int
1016 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1017 {
1018 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1019 	    VMCB_INTCPT_IRET));
1020 }
1021 
1022 static void
1023 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1024 {
1025 	struct vmcb_ctrl *ctrl;
1026 
1027 	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1028 	/*
1029 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1030 	 * the "iret" when it runs next. However, it is possible to inject
1031 	 * another NMI into the vcpu before the "iret" has actually executed.
1032 	 *
1033 	 * For example, if the "iret" encounters a #NPF when accessing the stack
1034 	 * it will trap back into the hypervisor. If an NMI is pending for
1035 	 * the vcpu it will be injected into the guest.
1036 	 *
1037 	 * XXX this needs to be fixed
1038 	 */
1039 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1040 
1041 	/*
1042 	 * Set an interrupt shadow to prevent an NMI from being immediately
1043 	 * injected on the next VMRUN.
1044 	 */
1045 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1046 	ctrl->intr_shadow = 1;
1047 }
1048 
1049 static void
1050 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1051 {
1052 	ASSERT(VM_INTINFO_PENDING(info));
1053 
1054 	uint8_t vector = VM_INTINFO_VECTOR(info);
1055 	uint32_t type = VM_INTINFO_TYPE(info);
1056 
1057 	/*
1058 	 * Correct behavior depends on bhyve intinfo event types lining up with
1059 	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
1060 	 * above svm_save_exitintinfo() ensure it.
1061 	 */
1062 	switch (type) {
1063 	case VM_INTINFO_NMI:
1064 		/* Ensure vector for injected event matches its type (NMI) */
1065 		vector = IDT_NMI;
1066 		break;
1067 	case VM_INTINFO_HWINTR:
1068 	case VM_INTINFO_SWINTR:
1069 		break;
1070 	case VM_INTINFO_HWEXCP:
1071 		if (vector == IDT_NMI) {
1072 			/*
1073 			 * NMIs are expected to be injected with
1074 			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1075 			 * with the NMI vector.
1076 			 */
1077 			type = VM_INTINFO_NMI;
1078 		}
1079 		VERIFY(vector < 32);
1080 		break;
1081 	default:
1082 		/*
1083 		 * Since there is no strong validation of injected event types
1084 		 * at this point, fall back to software interrupt for those we
1085 		 * do not recognize.
1086 		 */
1087 		type = VM_INTINFO_SWINTR;
1088 		break;
1089 	}
1090 
1091 	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1092 	if (VM_INTINFO_HAS_ERRCODE(info)) {
1093 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1094 		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1095 	}
1096 }
1097 
1098 static void
1099 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1100 {
1101 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1102 
1103 	ASSERT(!svm_nmi_blocked(sc, vcpu));
1104 
1105 	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1106 	vm_nmi_clear(sc->vm, vcpu);
1107 
1108 	/*
1109 	 * Virtual NMI blocking is now in effect.
1110 	 *
1111 	 * Not only does this block a subsequent NMI injection from taking
1112 	 * place, it also configures an intercept on the IRET so we can track
1113 	 * when the next injection can take place.
1114 	 */
1115 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1116 }
1117 
1118 static void
1119 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1120 {
1121 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1122 
1123 	ASSERT(vector >= 0 && vector <= 255);
1124 
1125 	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
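	/* The INTR event type is 0, so only the VALID bit and vector are needed. */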
1126 }
1127 
1128 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
1129 
1130 static vm_msr_result_t
1131 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1132 {
1133 	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1134 	uint64_t lma;
1135 	int error;
1136 
1137 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
1138 
1139 	if (newval & EFER_MBZ_BITS) {
1140 		return (VMR_GP);
1141 	}
1142 
1143 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1144 	const uint64_t changed = state->efer ^ newval;
1145 	if (changed & EFER_LME) {
1146 		if (state->cr0 & CR0_PG) {
1147 			return (VMR_GP);
1148 		}
1149 	}
1150 
1151 	/* EFER.LMA = EFER.LME & CR0.PG */
1152 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1153 		lma = EFER_LMA;
1154 	} else {
1155 		lma = 0;
1156 	}
1157 	if ((newval & EFER_LMA) != lma) {
1158 		return (VMR_GP);
1159 	}
1160 
1161 	if ((newval & EFER_NXE) != 0 &&
1162 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1163 		return (VMR_GP);
1164 	}
1165 	if ((newval & EFER_FFXSR) != 0 &&
1166 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1167 		return (VMR_GP);
1168 	}
1169 	if ((newval & EFER_TCE) != 0 &&
1170 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1171 		return (VMR_GP);
1172 	}
1173 
1174 	/*
1175 	 * Until bhyve has proper support for long-mode segment limits, just
1176 	 * toss a #GP at the guest if they attempt to use it.
1177 	 */
1178 	if (newval & EFER_LMSLE) {
1179 		return (VMR_GP);
1180 	}
1181 
1182 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1183 	VERIFY0(error);
1184 	return (VMR_OK);
1185 }
1186 
1187 static int
1188 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1189     bool is_wrmsr)
1190 {
1191 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1192 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1193 	const uint32_t ecx = ctx->sctx_rcx;
1194 	vm_msr_result_t res;
1195 	uint64_t val = 0;
1196 
1197 	if (is_wrmsr) {
1198 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
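		/* The 64-bit WRMSR value arrives in EDX:EAX. */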
1199 		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1200 
1201 		if (vlapic_owned_msr(ecx)) {
1202 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1203 
1204 			res = vlapic_wrmsr(vlapic, ecx, val);
1205 		} else if (ecx == MSR_EFER) {
1206 			res = svm_write_efer(svm_sc, vcpu, val);
1207 		} else {
1208 			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1209 		}
1210 	} else {
1211 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1212 
1213 		if (vlapic_owned_msr(ecx)) {
1214 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1215 
1216 			res = vlapic_rdmsr(vlapic, ecx, &val);
1217 		} else {
1218 			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1219 		}
1220 	}
1221 
1222 	switch (res) {
1223 	case VMR_OK:
1224 		/* Store rdmsr result in the appropriate registers */
1225 		if (!is_wrmsr) {
1226 			state->rax = (uint32_t)val;
1227 			ctx->sctx_rdx = val >> 32;
1228 		}
1229 		return (1);
1230 	case VMR_GP:
1231 		vm_inject_gp(svm_sc->vm, vcpu);
1232 		return (1);
1233 	case VMR_UNHANLDED:
1234 		vmexit->exitcode = is_wrmsr ?
1235 		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1236 		vmexit->u.msr.code = ecx;
1237 		vmexit->u.msr.wval = val;
1238 		return (0);
1239 	default:
1240 		panic("unexpected msr result %u\n", res);
1241 	}
1242 }
1243 
1244 /*
1245  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1246  * that are due to instruction intercepts as well as MSR and IOIO intercepts
1247  * and exceptions caused by INT3, INTO and BOUND instructions.
1248  *
1249  * Return 1 if the nRIP is valid and 0 otherwise.
1250  */
1251 static int
1252 nrip_valid(uint64_t exitcode)
1253 {
1254 	switch (exitcode) {
1255 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1256 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1257 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1258 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1259 	case 0x43:		/* INT3 */
1260 	case 0x44:		/* INTO */
1261 	case 0x45:		/* BOUND */
1262 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1263 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1264 		return (1);
1265 	default:
1266 		return (0);
1267 	}
1268 }
1269 
1270 static int
1271 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1272 {
1273 	struct vmcb *vmcb;
1274 	struct vmcb_state *state;
1275 	struct vmcb_ctrl *ctrl;
1276 	struct svm_regctx *ctx;
1277 	uint64_t code, info1, info2;
1278 	int handled;
1279 
1280 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1281 	vmcb = svm_get_vmcb(svm_sc, vcpu);
1282 	state = &vmcb->state;
1283 	ctrl = &vmcb->ctrl;
1284 
1285 	handled = 0;
1286 	code = ctrl->exitcode;
1287 	info1 = ctrl->exitinfo1;
1288 	info2 = ctrl->exitinfo2;
1289 
1290 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1291 	vmexit->rip = state->rip;
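	/*
	 * When nRIP is valid it holds the address of the next instruction,
	 * so the difference from %rip is the intercepted instruction length.
	 */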
1292 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1293 
1294 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1295 
1296 	/*
1297 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1298 	 * in an inconsistent state and can trigger assertions that would
1299 	 * never happen otherwise.
1300 	 */
1301 	if (code == VMCB_EXIT_INVALID) {
1302 		vm_exit_svm(vmexit, code, info1, info2);
1303 		return (0);
1304 	}
1305 
1306 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1307 	    "injection valid bit is set %lx", __func__, ctrl->eventinj));
1308 
1309 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1310 	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1311 	    vmexit->inst_length, code, info1, info2));
1312 
1313 	svm_update_virqinfo(svm_sc, vcpu);
1314 	svm_save_exitintinfo(svm_sc, vcpu);
1315 
1316 	switch (code) {
1317 	case VMCB_EXIT_CR0_READ:
1318 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1319 			svm_handle_cr0_read(svm_sc, vcpu,
1320 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1321 			handled = 1;
1322 		} else {
1323 			/*
1324 			 * If SMSW is used to read the contents of %cr0, then
1325 			 * the VALID bit will not be set in `info1`, since the
1326 			 * handling is different from the mov-to-reg case.
1327 			 *
1328 			 * Punt to the instruction emulation to handle it.
1329 			 */
1330 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1331 		}
1332 		break;
1333 	case VMCB_EXIT_CR0_WRITE:
1334 	case VMCB_EXIT_CR0_SEL_WRITE:
1335 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1336 			svm_handle_cr0_write(svm_sc, vcpu,
1337 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1338 			handled = 1;
1339 		} else {
1340 			/*
1341 			 * Writes to %cr0 without VALID being set in `info1` are
1342 			 * initiated by the LMSW and CLTS instructions.  While
1343 			 * LMSW (like SMSW) sees little use in modern OSes and
1344 			 * bootloaders, CLTS is still used for handling FPU
1345 			 * state transitions.
1346 			 *
1347 			 * Punt to the instruction emulation to handle them.
1348 			 */
1349 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1350 		}
1351 		break;
1352 	case VMCB_EXIT_IRET:
1353 		/*
1354 		 * Restart execution at "iret" but with the intercept cleared.
1355 		 */
1356 		vmexit->inst_length = 0;
1357 		svm_clear_nmi_blocking(svm_sc, vcpu);
1358 		handled = 1;
1359 		break;
1360 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1361 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1362 		svm_disable_intr_window_exiting(svm_sc, vcpu);
1363 		handled = 1;
1364 		break;
1365 	case VMCB_EXIT_INTR:	/* external interrupt */
1366 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1367 		handled = 1;
1368 		break;
1369 	case VMCB_EXIT_NMI:
1370 	case VMCB_EXIT_SMI:
1371 	case VMCB_EXIT_INIT:
1372 		/*
1373 		 * For external NMI/SMI and physical INIT interrupts, simply
1374 		 * continue execution, as those host events will be handled by
1375 		 * the physical CPU.
1376 		 */
1377 		handled = 1;
1378 		break;
1379 	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1380 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1381 
1382 		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1383 		uint32_t errcode = 0;
1384 		bool reflect = true;
1385 		bool errcode_valid = false;
1386 
1387 		switch (idtvec) {
1388 		case IDT_MC:
1389 			/* The host will handle the MCE itself. */
1390 			reflect = false;
1391 			vmm_call_trap(T_MCE);
1392 			break;
1393 		case IDT_PF:
1394 			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1395 			    info2));
1396 			/* fallthru */
1397 		case IDT_NP:
1398 		case IDT_SS:
1399 		case IDT_GP:
1400 		case IDT_AC:
1401 		case IDT_TS:
1402 			errcode_valid = true;
1403 			errcode = info1;
1404 			break;
1405 
1406 		case IDT_DF:
1407 			errcode_valid = true;
1408 			break;
1409 
1410 		case IDT_BP:
1411 		case IDT_OF:
1412 		case IDT_BR:
1413 			/*
1414 			 * The 'nrip' field is populated for INT3, INTO and
1415 			 * BOUND exceptions and this also implies that
1416 			 * 'inst_length' is non-zero.
1417 			 *
1418 			 * Reset 'inst_length' to zero so the guest %rip at
1419 			 * event injection is identical to what it was when
1420 			 * the exception originally happened.
1421 			 */
1422 			vmexit->inst_length = 0;
1423 			/* fallthru */
1424 		default:
1425 			errcode_valid = false;
1426 			break;
1427 		}
1428 		VERIFY0(vmexit->inst_length);
1429 
1430 		if (reflect) {
1431 			/* Reflect the exception back into the guest */
1432 			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1433 			    errcode_valid, errcode, false));
1434 		}
1435 		handled = 1;
1436 		break;
1437 		}
1438 	case VMCB_EXIT_MSR:
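		/* EXITINFO1 is 1 for WRMSR and 0 for RDMSR. */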
1439 		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1440 		break;
1441 	case VMCB_EXIT_IO:
1442 		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1443 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1444 		break;
1445 	case VMCB_EXIT_SHUTDOWN:
1446 		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT);
1447 		handled = 1;
1448 		break;
1449 	case VMCB_EXIT_INVD:
1450 	case VMCB_EXIT_INVLPGA:
1451 		/* privileged invalidation instructions */
1452 		vm_inject_ud(svm_sc->vm, vcpu);
1453 		handled = 1;
1454 		break;
1455 	case VMCB_EXIT_VMRUN:
1456 	case VMCB_EXIT_VMLOAD:
1457 	case VMCB_EXIT_VMSAVE:
1458 	case VMCB_EXIT_STGI:
1459 	case VMCB_EXIT_CLGI:
1460 	case VMCB_EXIT_SKINIT:
1461 		/* privileged vmm instructions */
1462 		vm_inject_ud(svm_sc->vm, vcpu);
1463 		handled = 1;
1464 		break;
1465 	case VMCB_EXIT_VMMCALL:
1466 		/* No handlers make use of VMMCALL for now */
1467 		vm_inject_ud(svm_sc->vm, vcpu);
1468 		handled = 1;
1469 		break;
1470 	case VMCB_EXIT_CPUID:
1471 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1472 		handled = x86_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1473 		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1474 		break;
1475 	case VMCB_EXIT_HLT:
1476 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1477 		vmexit->exitcode = VM_EXITCODE_HLT;
1478 		vmexit->u.hlt.rflags = state->rflags;
1479 		break;
1480 	case VMCB_EXIT_PAUSE:
1481 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1482 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1483 		break;
1484 	case VMCB_EXIT_NPF:
1485 		/* EXITINFO2 contains the faulting guest physical address */
1486 		if (info1 & VMCB_NPF_INFO1_RSV) {
1487 			/* nested fault with reserved bits set */
1488 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1489 			vmexit->exitcode = VM_EXITCODE_PAGING;
1490 			vmexit->u.paging.gpa = info2;
1491 			vmexit->u.paging.fault_type = npf_fault_type(info1);
1492 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1493 		} else if (svm_npf_emul_fault(info1)) {
1494 			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1495 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1496 		}
1497 		break;
1498 	case VMCB_EXIT_MONITOR:
1499 		vmexit->exitcode = VM_EXITCODE_MONITOR;
1500 		break;
1501 	case VMCB_EXIT_MWAIT:
1502 		vmexit->exitcode = VM_EXITCODE_MWAIT;
1503 		break;
1504 	default:
1505 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1506 		break;
1507 	}
1508 
1509 	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1510 	    code);
1511 
1512 	if (handled) {
1513 		vmexit->rip += vmexit->inst_length;
1514 		vmexit->inst_length = 0;
1515 		state->rip = vmexit->rip;
1516 	} else {
1517 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1518 			/*
1519 			 * If this VM exit was not claimed by anybody then
1520 			 * treat it as a generic SVM exit.
1521 			 */
1522 			vm_exit_svm(vmexit, code, info1, info2);
1523 		} else {
1524 			/*
1525 			 * The exitcode and collateral have been populated.
1526 			 * The VM exit will be processed further in userland.
1527 			 */
1528 		}
1529 	}
1530 	return (handled);
1531 }
1532 
1533 /*
1534  * Inject exceptions, NMIs, and ExtINTs.
1535  *
1536  * The logic behind these is complicated and may involve mutex contention, so
1537  * the injection is performed without the protection of host CPU interrupts
1538  * being disabled.  This means a racing notification could be "lost",
1539  * necessitating a later call to svm_inject_recheck() to close that window
1540  * of opportunity.
1541  */
1542 static enum event_inject_state
1543 svm_inject_events(struct svm_softc *sc, int vcpu)
1544 {
1545 	struct vmcb_ctrl *ctrl;
1546 	struct vmcb_state *state;
1547 	struct svm_vcpu *vcpustate;
1548 	uint64_t intinfo;
1549 	enum event_inject_state ev_state;
1550 
1551 	state = svm_get_vmcb_state(sc, vcpu);
1552 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1553 	vcpustate = svm_get_vcpu(sc, vcpu);
1554 	ev_state = EIS_CAN_INJECT;
1555 
1556 	/* Clear any interrupt shadow if guest %rip has changed */
1557 	if (vcpustate->nextrip != state->rip) {
1558 		ctrl->intr_shadow = 0;
1559 	}
1560 
1561 	/*
1562 	 * An event is already pending for injection.  This can occur when the
1563 	 * vCPU exits prior to VM entry (like for an AST).
1564 	 */
1565 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1566 		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1567 	}
1568 
1569 	/*
1570 	 * Inject pending events or exceptions for this vcpu.
1571 	 *
1572 	 * An event might be pending because the previous #VMEXIT happened
1573 	 * during event delivery (i.e. ctrl->exitintinfo).
1574 	 *
1575 	 * An event might also be pending because an exception was injected
1576 	 * by the hypervisor (e.g. #PF during instruction emulation).
1577 	 */
1578 	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1579 		svm_inject_event(ctrl, intinfo);
1580 		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1581 		ev_state = EIS_EV_INJECTED;
1582 	}
1583 
1584 	/* NMI event has priority over interrupts. */
1585 	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1586 		if (ev_state == EIS_CAN_INJECT) {
1587 			/* Can't inject NMI if vcpu is in an intr_shadow. */
1588 			if (ctrl->intr_shadow) {
1589 				return (EIS_GI_BLOCK);
1590 			}
1591 
1592 			svm_inject_nmi(sc, vcpu);
1593 			ev_state = EIS_EV_INJECTED;
1594 		} else {
1595 			return (ev_state | EIS_REQ_EXIT);
1596 		}
1597 	}
1598 
1599 	if (vm_extint_pending(sc->vm, vcpu)) {
1600 		int vector;
1601 
1602 		if (ev_state != EIS_CAN_INJECT) {
1603 			return (ev_state | EIS_REQ_EXIT);
1604 		}
1605 
1606 		/*
1607 		 * If the guest has disabled interrupts or is in an interrupt
1608 		 * shadow then we cannot inject the pending interrupt.
1609 		 */
1610 		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1611 			return (EIS_GI_BLOCK);
1612 		}
1613 
1614 		/* Ask the legacy pic for a vector to inject */
1615 		vatpic_pending_intr(sc->vm, &vector);
1616 		KASSERT(vector >= 0 && vector <= 255,
1617 		    ("invalid vector %d from INTR", vector));
1618 
1619 		svm_inject_irq(sc, vcpu, vector);
1620 		vm_extint_clear(sc->vm, vcpu);
1621 		vatpic_intr_accepted(sc->vm, vector);
1622 		ev_state = EIS_EV_INJECTED;
1623 	}
1624 
1625 	return (ev_state);
1626 }
1627 
1628 /*
1629  * Synchronize vLAPIC state and inject any interrupts pending on it.
1630  *
1631  * This is done with host CPU interrupts disabled so notification IPIs will be
1632  * queued on the host APIC and recognized when entering SVM guest context.
1633  */
1634 static enum event_inject_state
1635 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1636     enum event_inject_state ev_state)
1637 {
1638 	struct vmcb_ctrl *ctrl;
1639 	struct vmcb_state *state;
1640 	int vector;
1641 	uint8_t v_tpr;
1642 
1643 	state = svm_get_vmcb_state(sc, vcpu);
1644 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1645 
1646 	/*
1647 	 * The guest can modify the TPR by writing to %cr8. In guest mode the
1648 	 * CPU reflects this write to V_TPR without hypervisor intervention.
1649 	 *
1650 	 * The guest can also modify the TPR by writing to it via the memory
1651 	 * mapped APIC page. In this case, the write will be emulated by the
1652 	 * hypervisor. For this reason V_TPR must be updated before every
1653 	 * VMRUN.
1654 	 */
1655 	v_tpr = vlapic_get_cr8(vlapic);
1656 	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1657 	if (ctrl->v_tpr != v_tpr) {
1658 		ctrl->v_tpr = v_tpr;
1659 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1660 	}
1661 
1662 	/* If an event cannot otherwise be injected, we are done for now */
1663 	if (ev_state != EIS_CAN_INJECT) {
1664 		return (ev_state);
1665 	}
1666 
1667 	if (!vlapic_pending_intr(vlapic, &vector)) {
1668 		return (EIS_CAN_INJECT);
1669 	}
1670 	KASSERT(vector >= 16 && vector <= 255,
1671 	    ("invalid vector %d from local APIC", vector));
1672 
1673 	/*
1674 	 * If the guest has disabled interrupts or is in an interrupt shadow
1675 	 * then we cannot inject the pending interrupt.
1676 	 */
1677 	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1678 		return (EIS_GI_BLOCK);
1679 	}
1680 
1681 	svm_inject_irq(sc, vcpu, vector);
1682 	vlapic_intr_accepted(vlapic, vector);
1683 	return (EIS_EV_INJECTED);
1684 }
1685 
1686 /*
1687  * Re-check for events to be injected.
1688  *
1689  * Once host CPU interrupts are disabled, check for the presence of any events
1690  * which require injection processing.  If an exit is required upon injection,
1691  * or once the guest becomes interruptable, that will be configured too.
1692  */
1693 static bool
1694 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1695     enum event_inject_state ev_state)
1696 {
1697 	struct vmcb_ctrl *ctrl;
1698 
1699 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1700 
1701 	if (ev_state == EIS_CAN_INJECT) {
1702 		/*
1703 		 * An active interrupt shadow would preclude us from injecting
1704 		 * any events picked up during a re-check.
1705 		 */
1706 		if (ctrl->intr_shadow != 0) {
1707 			return (false);
1708 		}
1709 
1710 		if (vm_nmi_pending(sc->vm, vcpu) &&
1711 		    !svm_nmi_blocked(sc, vcpu)) {
1712 			/* queued NMI not blocked by NMI-window-exiting */
1713 			return (true);
1714 		}
1715 		if (vm_extint_pending(sc->vm, vcpu)) {
1716 			/* queued ExtINT not blocked by existing injection */
1717 			return (true);
1718 		}
1719 	} else {
1720 		if ((ev_state & EIS_REQ_EXIT) != 0) {
1721 			/*
1722 			 * Use a self-IPI to force an immediate exit after
1723 			 * event injection has occurred.
1724 			 */
1725 			poke_cpu(CPU->cpu_id);
1726 		} else {
1727 			/*
1728 			 * If any event is being injected, an exit immediately
1729 			 * upon becoming interruptable again will allow pending
1730 			 * or newly queued events to be injected in a timely
1731 			 * manner.
1732 			 */
1733 			svm_enable_intr_window_exiting(sc, vcpu);
1734 		}
1735 	}
1736 	return (false);
1737 }
1738 
1739 
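/*
 * Refresh the vCPU ASID via the HMA layer, requesting whatever TLB flush is
 * required by an ASID change or a bump in the nested page table generation.
 */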
1740 static void
1741 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1742 {
1743 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1744 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1745 	uint8_t flush;
1746 
1747 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1748 	    vcpustate->nptgen != nptgen);
1749 
1750 	if (flush != VMCB_TLB_FLUSH_NOTHING) {
1751 		ctrl->asid = vcpustate->hma_asid.hsa_asid;
1752 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1753 	}
1754 	ctrl->tlb_ctrl = flush;
1755 	vcpustate->nptgen = nptgen;
1756 }
1757 
1758 static void
1759 flush_asid(struct svm_softc *sc, int vcpuid)
1760 {
1761 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1762 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1763 	uint8_t flush;
1764 
1765 	flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(),
1766 	    true);
1767 
1768 	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1769 	ctrl->asid = vcpustate->hma_asid.hsa_asid;
1770 	ctrl->tlb_ctrl = flush;
1771 	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1772 	/*
1773 	 * A potential future optimization: We could choose to update the nptgen
1774 	 * associated with the vCPU, since any pending nptgen change requiring a
1775 	 * flush will be satisfied by the one which has just now been queued.
1776 	 */
1777 }
1778 
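/*
 * CLGI and STGI clear and set the global interrupt flag (GIF), masking host
 * interrupt delivery around guest entry.
 */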
1779 static __inline void
1780 disable_gintr(void)
1781 {
1782 	__asm __volatile("clgi");
1783 }
1784 
1785 static __inline void
1786 enable_gintr(void)
1787 {
1788 	__asm __volatile("stgi");
1789 }
1790 
1791 static __inline void
1792 svm_dr_enter_guest(struct svm_regctx *gctx)
1793 {
1794 
1795 	/* Save host control debug registers. */
1796 	gctx->host_dr7 = rdr7();
1797 	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1798 
1799 	/*
1800 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1801 	 * exceptions in the host based on the guest DRx values.  The
1802 	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1803 	 * VMCB.
1804 	 */
1805 	load_dr7(0);
1806 	wrmsr(MSR_DEBUGCTLMSR, 0);
1807 
1808 	/* Save host debug registers. */
1809 	gctx->host_dr0 = rdr0();
1810 	gctx->host_dr1 = rdr1();
1811 	gctx->host_dr2 = rdr2();
1812 	gctx->host_dr3 = rdr3();
1813 	gctx->host_dr6 = rdr6();
1814 
1815 	/* Restore guest debug registers. */
1816 	load_dr0(gctx->sctx_dr0);
1817 	load_dr1(gctx->sctx_dr1);
1818 	load_dr2(gctx->sctx_dr2);
1819 	load_dr3(gctx->sctx_dr3);
1820 }
1821 
1822 static __inline void
1823 svm_dr_leave_guest(struct svm_regctx *gctx)
1824 {
1825 
1826 	/* Save guest debug registers. */
1827 	gctx->sctx_dr0 = rdr0();
1828 	gctx->sctx_dr1 = rdr1();
1829 	gctx->sctx_dr2 = rdr2();
1830 	gctx->sctx_dr3 = rdr3();
1831 
1832 	/*
1833 	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
1834 	 * last.
1835 	 */
1836 	load_dr0(gctx->host_dr0);
1837 	load_dr1(gctx->host_dr1);
1838 	load_dr2(gctx->host_dr2);
1839 	load_dr3(gctx->host_dr3);
1840 	load_dr6(gctx->host_dr6);
1841 	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1842 	load_dr7(gctx->host_dr7);
1843 }
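
/*
 * Editorial note (not part of the original source): the enter/leave pair
 * above wraps the actual guest entry so that guest DR0-3 are live only
 * while the guest runs and host breakpoints cannot fire on guest values:
 *
 *	svm_dr_enter_guest(gctx);	// save host DRx, load guest DRx
 *	svm_launch(vmcb_pa, gctx, get_pcpu());
 *	svm_dr_leave_guest(gctx);	// save guest DRx, restore host DRx
 */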
1844 
1845 static void
1846 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1847 {
1848 	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1849 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1850 
1851 	if (ctrl->tsc_offset != offset) {
1852 		ctrl->tsc_offset = offset;
1853 		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1854 	}
1855 }
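
/*
 * Editorial note (not part of the original source): when RDTSC is not
 * intercepted, the hardware applies the VMCB offset directly, so the
 * guest observes roughly:
 *
 *	guest_tsc = host_tsc + ctrl->tsc_offset
 *
 * which is why the offset from vcpu_tsc_offset() is refreshed before
 * every entry and the VMCB is marked dirty only when the value changes.
 */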
1856 
1857 
1858 /*
1859  * Start vcpu with specified RIP.
1860  */
1861 static int
1862 svm_vmrun(void *arg, int vcpu, uint64_t rip)
1863 {
1864 	struct svm_regctx *gctx;
1865 	struct svm_softc *svm_sc;
1866 	struct svm_vcpu *vcpustate;
1867 	struct vmcb_state *state;
1868 	struct vmcb_ctrl *ctrl;
1869 	struct vm_exit *vmexit;
1870 	struct vlapic *vlapic;
1871 	vm_client_t *vmc;
1872 	struct vm *vm;
1873 	uint64_t vmcb_pa;
1874 	int handled;
1875 	uint16_t ldt_sel;
1876 
1877 	svm_sc = arg;
1878 	vm = svm_sc->vm;
1879 
1880 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1881 	state = svm_get_vmcb_state(svm_sc, vcpu);
1882 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1883 	vmexit = vm_exitinfo(vm, vcpu);
1884 	vlapic = vm_lapic(vm, vcpu);
1885 	vmc = vm_get_vmclient(vm, vcpu);
1886 
1887 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1888 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1889 
1890 	if (vcpustate->lastcpu != curcpu) {
1891 		/*
1892 		 * Force new ASID allocation by invalidating the generation.
1893 		 */
1894 		vcpustate->hma_asid.hsa_gen = 0;
1895 
1896 		/*
1897 		 * Invalidate the VMCB state cache by marking all fields dirty.
1898 		 */
1899 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1900 
1901 		/*
1902 		 * XXX
1903 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1904 		 * we may return from this function without actually executing
1905 		 * the VMRUN instruction. This could happen if an AST or yield
1906 		 * condition is pending on the first time through the loop.
1907 		 *
1908 		 * This works for now but any new side-effects of vcpu
1909 		 * migration should take this case into account.
1910 		 */
1911 		vcpustate->lastcpu = curcpu;
1912 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1913 	}
1914 
1915 	svm_apply_tsc_adjust(svm_sc, vcpu);
1916 
1917 	svm_msr_guest_enter(svm_sc, vcpu);
1918 
1919 	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1920 	vcpustate->loaded = B_TRUE;
1921 
1922 	/* Update Guest RIP */
1923 	state->rip = rip;
1924 
1925 	do {
1926 		enum event_inject_state inject_state;
1927 		uint64_t nptgen;
1928 
1929 		/*
1930 		 * Initial event injection is complex and may involve mutex
1931 		 * contention, so it must be performed with global interrupts
1932 		 * still enabled.
1933 		 */
1934 		inject_state = svm_inject_events(svm_sc, vcpu);
1935 		handled = 0;
1936 
1937 		/*
1938 		 * Disable global interrupts to guarantee atomicity during
1939 		 * loading of guest state. This includes not only the state
1940 		 * loaded by the "vmrun" instruction but also software state
1941 		 * maintained by the hypervisor: suspended and rendezvous
1942 		 * state, NPT generation number, vlapic interrupts, etc.
1943 		 */
1944 		disable_gintr();
1945 
1946 		/*
1947 		 * Synchronizing and injecting vlapic state is lock-free and is
1948 		 * safe (and prudent) to perform with interrupts disabled.
1949 		 */
1950 		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
1951 		    inject_state);
1952 
1953 		/*
1954 		 * Check for vCPU bail-out conditions.  This must be done after
1955 		 * svm_inject_events() to detect a triple-fault condition.
1956 		 */
1957 		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
1958 			enable_gintr();
1959 			break;
1960 		}
1961 
1962 		if (vcpu_run_state_pending(vm, vcpu)) {
1963 			enable_gintr();
1964 			vm_exit_run_state(vm, vcpu, state->rip);
1965 			break;
1966 		}
1967 
1968 		/*
1969 		 * If subsequent activity queued events which require injection
1970 		 * handling, take another lap to handle them.
1971 		 */
1972 		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
1973 			enable_gintr();
1974 			handled = 1;
1975 			continue;
1976 		}
1977 
1978 		/*
1979 		 * #VMEXIT resumes the host with the guest LDTR, so
1980 		 * save the current LDT selector so it can be restored
1981 		 * after an exit.  The userspace hypervisor probably
1982 		 * doesn't use an LDT, but save and restore it to be
1983 		 * safe.
1984 		 */
1985 		ldt_sel = sldt();
1986 
1987 		/*
1988 		 * Check the vmspace and ASID generations to ensure that the
1989 		 * vcpu does not use stale TLB mappings.
1990 		 */
1991 		nptgen = vmc_table_enter(vmc);
1992 		check_asid(svm_sc, vcpu, curcpu, nptgen);
1993 
1994 		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
1995 		vcpustate->dirty = 0;
1996 
1997 		/* Launch Virtual Machine. */
1998 		vcpu_ustate_change(vm, vcpu, VU_RUN);
1999 		svm_dr_enter_guest(gctx);
2000 		svm_launch(vmcb_pa, gctx, get_pcpu());
2001 		svm_dr_leave_guest(gctx);
2002 		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2003 
2004 		/* Restore host LDTR. */
2005 		lldt(ldt_sel);
2006 
2007 		/* #VMEXIT disables interrupts so re-enable them here. */
2008 		enable_gintr();
2009 
2010 		vmc_table_exit(vmc);
2011 
2012 		/* Update 'nextrip' */
2013 		vcpustate->nextrip = state->rip;
2014 
2015 		/* Handle #VMEXIT and if required return to user space. */
2016 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
2017 	} while (handled);
2018 
2019 	svm_msr_guest_exit(svm_sc, vcpu);
2020 
2021 	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2022 	vcpustate->loaded = B_FALSE;
2023 
2024 	return (0);
2025 }
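
/*
 * Editorial outline (restating the loop above, not original source text):
 *
 *	1. svm_inject_events() with host interrupts still enabled, since it
 *	   may contend on locks.
 *	2. clgi; svm_inject_vlapic(); bail-out and run-state checks.
 *	3. svm_inject_recheck() for late arrivals; stash the LDT selector;
 *	   check_asid() against the current nptgen; apply vmcb_clean bits.
 *	4. svm_launch() (VMRUN), bracketed by the DR swap and ustate changes.
 *	5. stgi; restore LDTR; record nextrip; svm_vmexit() decides whether
 *	   another pass is taken.
 */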
2026 
2027 static void
2028 svm_vmcleanup(void *arg)
2029 {
2030 	struct svm_softc *sc = arg;
2031 
2032 	vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
2033 	vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
2034 	kmem_free(sc, sizeof (*sc));
2035 }
2036 
2037 static uint64_t *
2038 swctx_regptr(struct svm_regctx *regctx, int reg)
2039 {
2040 	switch (reg) {
2041 	case VM_REG_GUEST_RBX:
2042 		return (&regctx->sctx_rbx);
2043 	case VM_REG_GUEST_RCX:
2044 		return (&regctx->sctx_rcx);
2045 	case VM_REG_GUEST_RDX:
2046 		return (&regctx->sctx_rdx);
2047 	case VM_REG_GUEST_RDI:
2048 		return (&regctx->sctx_rdi);
2049 	case VM_REG_GUEST_RSI:
2050 		return (&regctx->sctx_rsi);
2051 	case VM_REG_GUEST_RBP:
2052 		return (&regctx->sctx_rbp);
2053 	case VM_REG_GUEST_R8:
2054 		return (&regctx->sctx_r8);
2055 	case VM_REG_GUEST_R9:
2056 		return (&regctx->sctx_r9);
2057 	case VM_REG_GUEST_R10:
2058 		return (&regctx->sctx_r10);
2059 	case VM_REG_GUEST_R11:
2060 		return (&regctx->sctx_r11);
2061 	case VM_REG_GUEST_R12:
2062 		return (&regctx->sctx_r12);
2063 	case VM_REG_GUEST_R13:
2064 		return (&regctx->sctx_r13);
2065 	case VM_REG_GUEST_R14:
2066 		return (&regctx->sctx_r14);
2067 	case VM_REG_GUEST_R15:
2068 		return (&regctx->sctx_r15);
2069 	case VM_REG_GUEST_DR0:
2070 		return (&regctx->sctx_dr0);
2071 	case VM_REG_GUEST_DR1:
2072 		return (&regctx->sctx_dr1);
2073 	case VM_REG_GUEST_DR2:
2074 		return (&regctx->sctx_dr2);
2075 	case VM_REG_GUEST_DR3:
2076 		return (&regctx->sctx_dr3);
2077 	default:
2078 		return (NULL);
2079 	}
2080 }
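
/*
 * Illustrative sketch (editorial, not part of the original source):
 * registers kept in the software context (the GPRs and DR0-3 above)
 * resolve through this table; everything else falls through to the VMCB
 * accessors, as in the getter/setter below:
 *
 *	uint64_t *regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
 *	if (regp != NULL) {
 *		*val = *regp;	// svm_setreg() instead does: *regp = val;
 *		return (0);
 *	}
 */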
2081 
2082 static int
2083 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2084 {
2085 	struct svm_softc *sc;
2086 	struct vmcb *vmcb;
2087 	uint64_t *regp;
2088 	uint64_t *fieldp;
2089 	struct vmcb_segment *seg;
2090 
2091 	sc = arg;
2092 	vmcb = svm_get_vmcb(sc, vcpu);
2093 
2094 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2095 	if (regp != NULL) {
2096 		*val = *regp;
2097 		return (0);
2098 	}
2099 
2100 	switch (ident) {
2101 	case VM_REG_GUEST_INTR_SHADOW:
2102 		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2103 		break;
2104 
2105 	case VM_REG_GUEST_CR0:
2106 		svm_get_cr0(sc, vcpu, val);
2107 		break;
2108 	case VM_REG_GUEST_CR2:
2109 	case VM_REG_GUEST_CR3:
2110 	case VM_REG_GUEST_CR4:
2111 	case VM_REG_GUEST_DR6:
2112 	case VM_REG_GUEST_DR7:
2113 	case VM_REG_GUEST_EFER:
2114 	case VM_REG_GUEST_RAX:
2115 	case VM_REG_GUEST_RFLAGS:
2116 	case VM_REG_GUEST_RIP:
2117 	case VM_REG_GUEST_RSP:
2118 		fieldp = vmcb_regptr(vmcb, ident, NULL);
2119 		*val = *fieldp;
2120 		break;
2121 
2122 	case VM_REG_GUEST_CS:
2123 	case VM_REG_GUEST_DS:
2124 	case VM_REG_GUEST_ES:
2125 	case VM_REG_GUEST_FS:
2126 	case VM_REG_GUEST_GS:
2127 	case VM_REG_GUEST_SS:
2128 	case VM_REG_GUEST_LDTR:
2129 	case VM_REG_GUEST_TR:
2130 		seg = vmcb_segptr(vmcb, ident);
2131 		*val = seg->selector;
2132 		break;
2133 
2134 	case VM_REG_GUEST_GDTR:
2135 	case VM_REG_GUEST_IDTR:
2136 		/* GDTR and IDTR don't have segment selectors */
2137 		return (EINVAL);
2138 
2139 	default:
2140 		return (EINVAL);
2141 	}
2142 
2143 	return (0);
2144 }
2145 
2146 static int
2147 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2148 {
2149 	struct svm_softc *sc;
2150 	struct vmcb *vmcb;
2151 	uint64_t *regp;
2152 	uint64_t *fieldp;
2153 	uint32_t dirty;
2154 	struct vmcb_segment *seg;
2155 
2156 	sc = arg;
2157 	vmcb = svm_get_vmcb(sc, vcpu);
2158 
2159 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2160 	if (regp != NULL) {
2161 		*regp = val;
2162 		return (0);
2163 	}
2164 
2165 	dirty = VMCB_CACHE_NONE;
2166 	switch (ident) {
2167 	case VM_REG_GUEST_INTR_SHADOW:
2168 		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2169 		break;
2170 
2171 	case VM_REG_GUEST_EFER:
2172 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2173 		/* EFER_SVM must always be set when the guest is executing */
2174 		*fieldp = val | EFER_SVM;
2175 		dirty |= VMCB_CACHE_CR;
2176 		break;
2177 
2178 	case VM_REG_GUEST_CR0:
2179 		svm_set_cr0(sc, vcpu, val, false);
2180 		break;
2181 	case VM_REG_GUEST_CR2:
2182 	case VM_REG_GUEST_CR3:
2183 	case VM_REG_GUEST_CR4:
2184 	case VM_REG_GUEST_DR6:
2185 	case VM_REG_GUEST_DR7:
2186 	case VM_REG_GUEST_RAX:
2187 	case VM_REG_GUEST_RFLAGS:
2188 	case VM_REG_GUEST_RIP:
2189 	case VM_REG_GUEST_RSP:
2190 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2191 		*fieldp = val;
2192 		break;
2193 
2194 	case VM_REG_GUEST_CS:
2195 	case VM_REG_GUEST_DS:
2196 	case VM_REG_GUEST_ES:
2197 	case VM_REG_GUEST_SS:
2198 	case VM_REG_GUEST_FS:
2199 	case VM_REG_GUEST_GS:
2200 	case VM_REG_GUEST_LDTR:
2201 	case VM_REG_GUEST_TR:
2202 		dirty |= VMCB_CACHE_SEG;
2203 		seg = vmcb_segptr(vmcb, ident);
2204 		seg->selector = (uint16_t)val;
2205 		break;
2206 
2207 	case VM_REG_GUEST_GDTR:
2208 	case VM_REG_GUEST_IDTR:
2209 		/* GDTR and IDTR don't have segment selectors */
2210 		return (EINVAL);
2211 
2212 	default:
2213 		return (EINVAL);
2214 	}
2215 
2216 	if (dirty != VMCB_CACHE_NONE) {
2217 		svm_set_dirty(sc, vcpu, dirty);
2218 	}
2219 
2220 	/*
2221 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2222 	 * vcpu's ASID. This needs to be treated differently depending on
2223 	 * whether 'running' is true/false.
2224 	 */
2225 
2226 	return (0);
2227 }
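
/*
 * Illustrative example (editorial, not part of the original source):
 * EFER is special-cased above because SVM requires EFER.SVME to remain
 * set while the guest executes, so the stored value is silently
 * augmented:
 *
 *	svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, EFER_LME | EFER_LMA);
 *	// the VMCB now holds EFER_LME | EFER_LMA | EFER_SVM
 */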
2228 
2229 static int
2230 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2231 {
2232 	struct vmcb *vmcb;
2233 	struct svm_softc *sc;
2234 	struct vmcb_segment *seg;
2235 
2236 	sc = arg;
2237 	vmcb = svm_get_vmcb(sc, vcpu);
2238 
2239 	switch (reg) {
2240 	case VM_REG_GUEST_CS:
2241 	case VM_REG_GUEST_DS:
2242 	case VM_REG_GUEST_ES:
2243 	case VM_REG_GUEST_SS:
2244 	case VM_REG_GUEST_FS:
2245 	case VM_REG_GUEST_GS:
2246 	case VM_REG_GUEST_LDTR:
2247 	case VM_REG_GUEST_TR:
2248 		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2249 		seg = vmcb_segptr(vmcb, reg);
2250 		/*
2251 		 * Map seg_desc access to VMCB attribute format.
2252 		 *
2253 		 * SVM uses the 'P' bit in the segment attributes to indicate a
2254 		 * NULL segment, so clear it if the segment is marked unusable.
2255 		 */
2256 		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2257 		if (SEG_DESC_UNUSABLE(desc->access)) {
2258 			seg->attrib &= ~0x80;
2259 		}
2260 		break;
2261 
2262 	case VM_REG_GUEST_GDTR:
2263 	case VM_REG_GUEST_IDTR:
2264 		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2265 		seg = vmcb_segptr(vmcb, reg);
2266 		break;
2267 
2268 	default:
2269 		return (EINVAL);
2270 	}
2271 
2272 	ASSERT(seg != NULL);
2273 	seg->base = desc->base;
2274 	seg->limit = desc->limit;
2275 
2276 	return (0);
2277 }
2278 
2279 static int
2280 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2281 {
2282 	struct vmcb *vmcb;
2283 	struct svm_softc *sc;
2284 	struct vmcb_segment *seg;
2285 
2286 	sc = arg;
2287 	vmcb = svm_get_vmcb(sc, vcpu);
2288 
2289 	switch (reg) {
2290 	case VM_REG_GUEST_DS:
2291 	case VM_REG_GUEST_ES:
2292 	case VM_REG_GUEST_FS:
2293 	case VM_REG_GUEST_GS:
2294 	case VM_REG_GUEST_SS:
2295 	case VM_REG_GUEST_LDTR:
2296 		seg = vmcb_segptr(vmcb, reg);
2297 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2298 		/*
2299 		 * VT-x uses bit 16 to indicate a segment that has been loaded
2300 		 * with a NULL selector (aka unusable). The 'desc->access'
2301 		 * field is interpreted in the VT-x format by the
2302 		 * processor-independent code.
2303 		 *
2304 		 * SVM uses the 'P' bit to convey the same information so
2305 		 * convert it into the VT-x format. For more details refer to
2306 		 * section "Segment State in the VMCB" in APMv2.
2307 		 */
2308 		if ((desc->access & 0x80) == 0) {
2309 			/* Unusable segment */
2310 			desc->access |= 0x10000;
2311 		}
2312 		break;
2313 
2314 	case VM_REG_GUEST_CS:
2315 	case VM_REG_GUEST_TR:
2316 		seg = vmcb_segptr(vmcb, reg);
2317 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2318 		break;
2319 
2320 	case VM_REG_GUEST_GDTR:
2321 	case VM_REG_GUEST_IDTR:
2322 		seg = vmcb_segptr(vmcb, reg);
2323 		/*
2324 		 * Since there are no access bits associated with the GDTR or
2325 		 * the IDTR, zero out the field to ensure it does not contain
2326 		 * garbage which might confuse the consumer.
2327 		 */
2328 		desc->access = 0;
2329 		break;
2330 
2331 	default:
2332 		return (EINVAL);
2333 	}
2334 
2335 	ASSERT(seg != NULL);
2336 	desc->base = seg->base;
2337 	desc->limit = seg->limit;
2338 	return (0);
2339 }
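
/*
 * Illustrative example (editorial, not part of the original source): the
 * low byte of the VMCB attribute mirrors the descriptor access byte, so
 * a NULL-loaded data segment round-trips roughly as:
 *
 *	seg->attrib == 0x13 (P bit 0x80 clear)
 *	    -> desc->access == 0x13 | 0x10000 ("unusable" in VT-x form)
 */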
2340 
2341 static int
2342 svm_setcap(void *arg, int vcpu, int type, int val)
2343 {
2344 	struct svm_softc *sc;
2345 	int error;
2346 
2347 	sc = arg;
2348 	error = 0;
2349 	switch (type) {
2350 	case VM_CAP_HALT_EXIT:
2351 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2352 		    VMCB_INTCPT_HLT, val);
2353 		break;
2354 	case VM_CAP_PAUSE_EXIT:
2355 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2356 		    VMCB_INTCPT_PAUSE, val);
2357 		break;
2358 	default:
2359 		error = ENOENT;
2360 		break;
2361 	}
2362 	return (error);
2363 }
2364 
2365 static int
2366 svm_getcap(void *arg, int vcpu, int type, int *retval)
2367 {
2368 	struct svm_softc *sc;
2369 	int error;
2370 
2371 	sc = arg;
2372 	error = 0;
2373 
2374 	switch (type) {
2375 	case VM_CAP_HALT_EXIT:
2376 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2377 		    VMCB_INTCPT_HLT);
2378 		break;
2379 	case VM_CAP_PAUSE_EXIT:
2380 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2381 		    VMCB_INTCPT_PAUSE);
2382 		break;
2383 	default:
2384 		error = ENOENT;
2385 		break;
2386 	}
2387 	return (error);
2388 }
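
/*
 * Illustrative sketch (editorial, not part of the original source): these
 * two hooks back the generic capability interface; enabling HLT exits for
 * a vCPU ultimately toggles the corresponding intercept bit, roughly:
 *
 *	vm_set_capability(vm, vcpu, VM_CAP_HALT_EXIT, 1);
 *	//  -> svm_setcap() -> svm_set_intercept(sc, vcpu,
 *	//         VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT, 1)
 */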
2389 
2390 static struct vlapic *
2391 svm_vlapic_init(void *arg, int vcpuid)
2392 {
2393 	struct svm_softc *svm_sc;
2394 	struct vlapic *vlapic;
2395 
2396 	svm_sc = arg;
2397 	vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
2398 	vlapic->vm = svm_sc->vm;
2399 	vlapic->vcpuid = vcpuid;
2400 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2401 
2402 	vlapic_init(vlapic);
2403 
2404 	return (vlapic);
2405 }
2406 
2407 static void
2408 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2409 {
2410 	vlapic_cleanup(vlapic);
2411 	kmem_free(vlapic, sizeof (struct vlapic));
2412 }
2413 
2414 static void
2415 svm_savectx(void *arg, int vcpu)
2416 {
2417 	struct svm_softc *sc = arg;
2418 
2419 	if (sc->vcpu[vcpu].loaded) {
2420 		svm_msr_guest_exit(sc, vcpu);
2421 	}
2422 }
2423 
2424 static void
2425 svm_restorectx(void *arg, int vcpu)
2426 {
2427 	struct svm_softc *sc = arg;
2428 
2429 	if (sc->vcpu[vcpu].loaded) {
2430 		svm_msr_guest_enter(sc, vcpu);
2431 	}
2432 }
2433 
2434 struct vmm_ops vmm_ops_amd = {
2435 	.init		= svm_init,
2436 	.cleanup	= svm_cleanup,
2437 	.resume		= svm_restore,
2438 
2439 	.vminit		= svm_vminit,
2440 	.vmrun		= svm_vmrun,
2441 	.vmcleanup	= svm_vmcleanup,
2442 	.vmgetreg	= svm_getreg,
2443 	.vmsetreg	= svm_setreg,
2444 	.vmgetdesc	= svm_getdesc,
2445 	.vmsetdesc	= svm_setdesc,
2446 	.vmgetcap	= svm_getcap,
2447 	.vmsetcap	= svm_setcap,
2448 	.vlapic_init	= svm_vlapic_init,
2449 	.vlapic_cleanup	= svm_vlapic_cleanup,
2450 
2451 	.vmsavectx	= svm_savectx,
2452 	.vmrestorectx	= svm_restorectx,
2453 };
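
/*
 * Editorial note (not part of the original source): this ops vector is
 * the AMD backend that the vendor-neutral vmm code dispatches through;
 * an Intel (VT-x) counterpart fills the same role on that hardware.  A
 * register read from common code reaches this file roughly as:
 *
 *	vm_get_register(vm, vcpu, VM_REG_GUEST_RIP, &val);
 *	//  -> vmm_ops_amd.vmgetreg == svm_getreg()
 */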
2454