xref: /freebsd/sys/amd64/vmm/amd/svm.c (revision 0e97acdf58fe27b09c4824a474b0344daf997c5f)
1 /*-
2  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/smp.h>
33 #include <sys/kernel.h>
34 #include <sys/malloc.h>
35 #include <sys/pcpu.h>
36 #include <sys/proc.h>
37 #include <sys/sysctl.h>
38 
39 #include <vm/vm.h>
40 #include <vm/pmap.h>
41 
42 #include <machine/cpufunc.h>
43 #include <machine/psl.h>
44 #include <machine/pmap.h>
45 #include <machine/md_var.h>
46 #include <machine/specialreg.h>
47 #include <machine/smp.h>
48 #include <machine/vmm.h>
49 #include <machine/vmm_instruction_emul.h>
50 
51 #include "vmm_lapic.h"
52 #include "vmm_stat.h"
53 #include "vmm_ktr.h"
54 #include "vmm_ioport.h"
55 #include "vatpic.h"
56 #include "vlapic.h"
57 #include "vlapic_priv.h"
58 
59 #include "x86.h"
60 #include "vmcb.h"
61 #include "svm.h"
62 #include "svm_softc.h"
63 #include "svm_msr.h"
64 #include "npt.h"
65 
66 SYSCTL_DECL(_hw_vmm);
67 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
68 
69 /*
70  * SVM CPUID function 0x8000_000A, edx bit decoding.
71  */
72 #define AMD_CPUID_SVM_NP		BIT(0)  /* Nested paging or RVI */
73 #define AMD_CPUID_SVM_LBR		BIT(1)  /* Last branch virtualization */
74 #define AMD_CPUID_SVM_SVML		BIT(2)  /* SVM lock */
75 #define AMD_CPUID_SVM_NRIP_SAVE		BIT(3)  /* Next RIP is saved */
76 #define AMD_CPUID_SVM_TSC_RATE		BIT(4)  /* TSC rate control. */
77 #define AMD_CPUID_SVM_VMCB_CLEAN	BIT(5)  /* VMCB state caching */
78 #define AMD_CPUID_SVM_FLUSH_BY_ASID	BIT(6)  /* Flush by ASID */
79 #define AMD_CPUID_SVM_DECODE_ASSIST	BIT(7)  /* Decode assist */
80 #define AMD_CPUID_SVM_PAUSE_INC		BIT(10) /* Pause intercept filter. */
81 #define AMD_CPUID_SVM_PAUSE_FTH		BIT(12) /* Pause filter threshold */
82 
83 #define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID 	|	\
84 				VMCB_CACHE_IOPM		|	\
85 				VMCB_CACHE_I		|	\
86 				VMCB_CACHE_TPR		|	\
87 				VMCB_CACHE_CR2		|	\
88 				VMCB_CACHE_CR		|	\
89 				VMCB_CACHE_DT		|	\
90 				VMCB_CACHE_SEG		|	\
91 				VMCB_CACHE_NP)
92 
93 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
94 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
95     0, NULL);
96 
97 static MALLOC_DEFINE(M_SVM, "svm", "svm");
98 static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
99 
100 /* Per-CPU context area. */
101 extern struct pcpu __pcpu[];
102 
103 static uint32_t svm_feature;	/* AMD SVM features. */
104 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RD, &svm_feature, 0,
105     "SVM features advertised by CPUID.8000000AH:EDX");
106 
107 static int disable_npf_assist;
108 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
109     &disable_npf_assist, 0, NULL);
110 
111 /* Maximum ASIDs supported by the processor */
112 static uint32_t nasid;
113 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RD, &nasid, 0,
114     "Number of ASIDs supported by this processor");
115 
116 /* Current ASID generation for each host cpu */
117 static struct asid asid[MAXCPU];
118 
119 /*
120  * SVM host state save area of size 4KB for each core.
121  */
122 static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
123 
124 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
125 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
126 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
127 
128 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
129 
130 static __inline int
131 flush_by_asid(void)
132 {
133 
134 	return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
135 }
136 
137 static __inline int
138 decode_assist(void)
139 {
140 
141 	return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
142 }
143 
144 static void
145 svm_disable(void *arg __unused)
146 {
147 	uint64_t efer;
148 
149 	efer = rdmsr(MSR_EFER);
150 	efer &= ~EFER_SVM;
151 	wrmsr(MSR_EFER, efer);
152 }
153 
154 /*
155  * Disable SVM on all CPUs.
156  */
157 static int
158 svm_cleanup(void)
159 {
160 
161 	smp_rendezvous(NULL, svm_disable, NULL, NULL);
162 	return (0);
163 }
164 
165 /*
166  * Verify that all the features required by bhyve are available.
167  */
168 static int
169 check_svm_features(void)
170 {
171 	u_int regs[4];
172 
173 	/* CPUID Fn8000_000A is for SVM */
174 	do_cpuid(0x8000000A, regs);
175 	svm_feature = regs[3];
176 
177 	nasid = regs[1];
178 	KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
179 
180 	/* bhyve requires the Nested Paging feature */
181 	if (!(svm_feature & AMD_CPUID_SVM_NP)) {
182 		printf("SVM: Nested Paging feature not available.\n");
183 		return (ENXIO);
184 	}
185 
186 	/* bhyve requires the NRIP Save feature */
187 	if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
188 		printf("SVM: NRIP Save feature not available.\n");
189 		return (ENXIO);
190 	}
191 
192 	return (0);
193 }
194 
195 static void
196 svm_enable(void *arg __unused)
197 {
198 	uint64_t efer;
199 
200 	efer = rdmsr(MSR_EFER);
201 	efer |= EFER_SVM;
202 	wrmsr(MSR_EFER, efer);
203 
204 	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
205 }
206 
207 /*
208  * Return 1 if SVM is enabled on this processor and 0 otherwise.
209  */
210 static int
211 svm_available(void)
212 {
213 	uint64_t msr;
214 
215 	/* Section 15.4 Enabling SVM from APM2. */
216 	if ((amd_feature2 & AMDID2_SVM) == 0) {
217 		printf("SVM: not available.\n");
218 		return (0);
219 	}
220 
221 	msr = rdmsr(MSR_VM_CR);
222 	if ((msr & VM_CR_SVMDIS) != 0) {
223 		printf("SVM: disabled by BIOS.\n");
224 		return (0);
225 	}
226 
227 	return (1);
228 }
229 
230 static int
231 svm_init(int ipinum)
232 {
233 	int error, cpu;
234 
235 	if (!svm_available())
236 		return (ENXIO);
237 
238 	error = check_svm_features();
239 	if (error)
240 		return (error);
241 
242 	vmcb_clean &= VMCB_CACHE_DEFAULT;
243 
244 	for (cpu = 0; cpu < MAXCPU; cpu++) {
245 		/*
246 		 * Initialize the host ASIDs to their "highest" valid values.
247 		 *
248 		 * The next ASID allocation will rollover both 'gen' and 'num'
249 		 * and start off the sequence at {1,1}.
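		 *
		 * (In check_asid() the first allocation then wraps 'num' from
		 * nasid - 1 back to 1 and 'gen' from ~0UL through 0 to 1.)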
250 		 */
251 		asid[cpu].gen = ~0UL;
252 		asid[cpu].num = nasid - 1;
253 	}
254 
255 	svm_msr_init();
256 	svm_npt_init(ipinum);
257 
258 	/* Enable SVM on all CPUs */
259 	smp_rendezvous(NULL, svm_enable, NULL, NULL);
260 
261 	return (0);
262 }
263 
264 static void
265 svm_restore(void)
266 {
267 
268 	svm_enable(NULL);
269 }
270 
271 /* Pentium compatible MSRs */
272 #define MSR_PENTIUM_START 	0
273 #define MSR_PENTIUM_END 	0x1FFF
274 /* AMD 6th generation and Intel compatible MSRs */
275 #define MSR_AMD6TH_START 	0xC0000000UL
276 #define MSR_AMD6TH_END 		0xC0001FFFUL
277 /* AMD 7th and 8th generation compatible MSRs */
278 #define MSR_AMD7TH_START 	0xC0010000UL
279 #define MSR_AMD7TH_END 		0xC0011FFFUL
280 
281 /*
282  * Get the index and bit position for an MSR in the permission bitmap.
283  * Two bits are used for each MSR: lower bit for read and higher bit for write.
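 *
 * For example, MSR_LSTAR (0xC0000082) falls in the AMD 6th generation range:
 * off = 0x82 and base = 0x2000, so *index = (0x2000 + 0x82) / 4 = 0x820 and
 * *bit = (0xC0000082 % 4) * 2 = 4. Clearing bit 4 of byte 0x820 then permits
 * guest reads of MSR_LSTAR and clearing bit 5 permits guest writes.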
284  */
285 static int
286 svm_msr_index(uint64_t msr, int *index, int *bit)
287 {
288 	uint32_t base, off;
289 
290 	*index = -1;
291 	*bit = (msr % 4) * 2;
292 	base = 0;
293 
294 	if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
295 		*index = msr / 4;
296 		return (0);
297 	}
298 
299 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
300 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
301 		off = (msr - MSR_AMD6TH_START);
302 		*index = (off + base) / 4;
303 		return (0);
304 	}
305 
306 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
307 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
308 		off = (msr - MSR_AMD7TH_START);
309 		*index = (off + base) / 4;
310 		return (0);
311 	}
312 
313 	return (EINVAL);
314 }
315 
316 /*
317  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
318  */
319 static void
320 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
321 {
322 	int index, bit, error;
323 
324 	error = svm_msr_index(msr, &index, &bit);
325 	KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
326 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
327 	    ("%s: invalid index %d for msr %#lx", __func__, index, msr));
328 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
329 	    "msr %#lx", __func__, bit, msr));
330 
331 	if (read)
332 		perm_bitmap[index] &= ~(1UL << bit);
333 
334 	if (write)
335 		perm_bitmap[index] &= ~(2UL << bit);
336 }
337 
338 static void
339 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
340 {
341 
342 	svm_msr_perm(perm_bitmap, msr, true, true);
343 }
344 
345 static void
346 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
347 {
348 
349 	svm_msr_perm(perm_bitmap, msr, true, false);
350 }
351 
352 static __inline int
353 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
354 {
355 	struct vmcb_ctrl *ctrl;
356 
357 	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
358 
359 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
360 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
361 }
362 
363 static __inline void
364 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
365     int enabled)
366 {
367 	struct vmcb_ctrl *ctrl;
368 	uint32_t oldval;
369 
370 	KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
371 
372 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
373 	oldval = ctrl->intercept[idx];
374 
375 	if (enabled)
376 		ctrl->intercept[idx] |= bitmask;
377 	else
378 		ctrl->intercept[idx] &= ~bitmask;
379 
380 	if (ctrl->intercept[idx] != oldval) {
381 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
382 		VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
383 		    "from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
384 	}
385 }
386 
387 static __inline void
388 svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
389 {
390 
391 	svm_set_intercept(sc, vcpu, off, bitmask, 0);
392 }
393 
394 static __inline void
395 svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
396 {
397 
398 	svm_set_intercept(sc, vcpu, off, bitmask, 1);
399 }
400 
401 static void
402 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
403     uint64_t msrpm_base_pa, uint64_t np_pml4)
404 {
405 	struct vmcb_ctrl *ctrl;
406 	struct vmcb_state *state;
407 	uint32_t mask;
408 	int n;
409 
410 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
411 	state = svm_get_vmcb_state(sc, vcpu);
412 
413 	ctrl->iopm_base_pa = iopm_base_pa;
414 	ctrl->msrpm_base_pa = msrpm_base_pa;
415 
416 	/* Enable nested paging */
417 	ctrl->np_enable = 1;
418 	ctrl->n_cr3 = np_pml4;
419 
420 	/*
421 	 * Intercept accesses to the control registers that are not shadowed
422 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
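	 *
	 * In this 32-bit intercept word the low 16 bits control reads of CR0
	 * through CR15 and the high 16 bits control writes, so the mask below
	 * covers both the read and the write intercept for CRn.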
423 	 */
424 	for (n = 0; n < 16; n++) {
425 		mask = (BIT(n) << 16) | BIT(n);
426 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
427 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
428 		else
429 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
430 	}
431 
432 	/* Intercept Machine Check exceptions. */
433 	svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
434 
435 	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
436 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
437 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
438 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
439 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
440 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
441 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
442 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
443 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
444 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
445 	    VMCB_INTCPT_FERR_FREEZE);
446 
447 	/*
448 	 * From section "Canonicalization and Consistency Checks" in APMv2,
449 	 * the VMRUN intercept bit must be set to pass the consistency check.
450 	 */
451 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
452 
453 	/*
454 	 * The ASID will be set to a non-zero value just before VMRUN.
455 	 */
456 	ctrl->asid = 0;
457 
458 	/*
459 	 * Section 15.21.1, Interrupt Masking in EFLAGS
460 	 * Section 15.21.2, Virtualizing APIC.TPR
461 	 *
462 	 * This must be set for %rflag and %cr8 isolation of guest and host.
463 	 */
464 	ctrl->v_intr_masking = 1;
465 
466 	/* Enable Last Branch Record aka LBR for debugging */
467 	ctrl->lbr_virt_en = 1;
468 	state->dbgctl = BIT(0);
469 
470 	/* EFER_SVM must always be set when the guest is executing */
471 	state->efer = EFER_SVM;
472 
473 	/* Set up the PAT to power-on state */
474 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
475 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
476 	    PAT_VALUE(2, PAT_UNCACHED)		|
477 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
478 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
479 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
480 	    PAT_VALUE(6, PAT_UNCACHED)		|
481 	    PAT_VALUE(7, PAT_UNCACHEABLE);
482 }
483 
484 /*
485  * Initialize a virtual machine.
486  */
487 static void *
488 svm_vminit(struct vm *vm, pmap_t pmap)
489 {
490 	struct svm_softc *svm_sc;
491 	struct svm_vcpu *vcpu;
492 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
493 	int i;
494 
495 	svm_sc = malloc(sizeof (struct svm_softc), M_SVM, M_WAITOK | M_ZERO);
496 	svm_sc->vm = vm;
497 	svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
498 
499 	/*
500 	 * Intercept read and write accesses to all MSRs.
501 	 */
502 	memset(svm_sc->msr_bitmap, 0xFF, sizeof(svm_sc->msr_bitmap));
503 
504 	/*
505 	 * Access to the following MSRs is redirected to the VMCB when the
506 	 * guest is executing. Therefore it is safe to allow the guest to
507 	 * read/write these MSRs directly without hypervisor involvement.
508 	 */
509 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
510 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
511 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
512 
513 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
514 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
515 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
516 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
517 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
518 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
519 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
520 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
521 
522 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
523 
524 	/*
525 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
526 	 */
527 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
528 
529 	/* Intercept access to all I/O ports. */
530 	memset(svm_sc->iopm_bitmap, 0xFF, sizeof(svm_sc->iopm_bitmap));
531 
532 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
533 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
534 	pml4_pa = svm_sc->nptp;
535 	for (i = 0; i < VM_MAXCPU; i++) {
536 		vcpu = svm_get_vcpu(svm_sc, i);
537 		vcpu->lastcpu = NOCPU;
538 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
539 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
540 		svm_msr_guest_init(svm_sc, i);
541 	}
542 	return (svm_sc);
543 }
544 
545 static int
546 svm_cpl(struct vmcb_state *state)
547 {
548 
549 	/*
550 	 * From APMv2:
551 	 *   "Retrieve the CPL from the CPL field in the VMCB, not
552 	 *    from any segment DPL"
553 	 */
554 	return (state->cpl);
555 }
556 
557 static enum vm_cpu_mode
558 svm_vcpu_mode(struct vmcb *vmcb)
559 {
560 	struct vmcb_segment seg;
561 	struct vmcb_state *state;
562 	int error;
563 
564 	state = &vmcb->state;
565 
566 	if (state->efer & EFER_LMA) {
567 		error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
568 		KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
569 		    error));
570 
571 		/*
572 		 * Section 4.8.1 of APM2: check whether the Code Segment has the
573 		 * Long attribute set in its descriptor.
574 		 */
575 		if (seg.attrib & VMCB_CS_ATTRIB_L)
576 			return (CPU_MODE_64BIT);
577 		else
578 			return (CPU_MODE_COMPATIBILITY);
579 	} else  if (state->cr0 & CR0_PE) {
580 		return (CPU_MODE_PROTECTED);
581 	} else {
582 		return (CPU_MODE_REAL);
583 	}
584 }
585 
586 static enum vm_paging_mode
587 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
588 {
589 
590 	if ((cr0 & CR0_PG) == 0)
591 		return (PAGING_MODE_FLAT);
592 	if ((cr4 & CR4_PAE) == 0)
593 		return (PAGING_MODE_32);
594 	if (efer & EFER_LME)
595 		return (PAGING_MODE_64);
596 	else
597 		return (PAGING_MODE_PAE);
598 }
599 
600 /*
601  * ins/outs utility routines
602  */
603 static uint64_t
604 svm_inout_str_index(struct svm_regctx *regs, int in)
605 {
606 	uint64_t val;
607 
608 	val = in ? regs->sctx_rdi : regs->sctx_rsi;
609 
610 	return (val);
611 }
612 
613 static uint64_t
614 svm_inout_str_count(struct svm_regctx *regs, int rep)
615 {
616 	uint64_t val;
617 
618 	val = rep ? regs->sctx_rcx : 1;
619 
620 	return (val);
621 }
622 
623 static void
624 svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
625     int in, struct vm_inout_str *vis)
626 {
627 	int error, s;
628 
629 	if (in) {
630 		vis->seg_name = VM_REG_GUEST_ES;
631 	} else {
632 		/* The segment field has standard encoding */
633 		s = (info1 >> 10) & 0x7;
634 		vis->seg_name = vm_segment_name(s);
635 	}
636 
637 	error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
638 	KASSERT(error == 0, ("%s: vmcb_getdesc error %d", __func__, error));
639 }
640 
641 static int
642 svm_inout_str_addrsize(uint64_t info1)
643 {
644 	uint32_t size;
645 
646 	size = (info1 >> 7) & 0x7;
647 	switch (size) {
648 	case 1:
649 		return (2);	/* 16 bit */
650 	case 2:
651 		return (4);	/* 32 bit */
652 	case 4:
653 		return (8);	/* 64 bit */
654 	default:
655 		panic("%s: invalid size encoding %d", __func__, size);
656 	}
657 }
658 
659 static void
660 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
661 {
662 	struct vmcb_state *state;
663 
664 	state = &vmcb->state;
665 	paging->cr3 = state->cr3;
666 	paging->cpl = svm_cpl(state);
667 	paging->cpu_mode = svm_vcpu_mode(vmcb);
668 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
669 	    state->efer);
670 }
671 
672 #define	UNHANDLED 0
673 
674 /*
675  * Handle guest I/O intercept.
676  */
677 static int
678 svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
679 {
680 	struct vmcb_ctrl *ctrl;
681 	struct vmcb_state *state;
682 	struct svm_regctx *regs;
683 	struct vm_inout_str *vis;
684 	uint64_t info1;
685 	int inout_string;
686 
687 	state = svm_get_vmcb_state(svm_sc, vcpu);
688 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
689 	regs  = svm_get_guest_regctx(svm_sc, vcpu);
690 
691 	info1 = ctrl->exitinfo1;
692 	inout_string = info1 & BIT(2) ? 1 : 0;
693 
694 	/*
695 	 * The effective segment number in EXITINFO1[12:10] is populated
696 	 * only if the processor has the DecodeAssist capability.
697 	 *
698 	 * XXX this is not specified explicitly in APMv2 but can be verified
699 	 * empirically.
700 	 */
701 	if (inout_string && !decode_assist())
702 		return (UNHANDLED);
703 
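	/*
	 * EXITINFO1 for an IOIO intercept encodes the access as decoded below:
	 * bit 0 is the direction (1 = in), bit 2 the string flag, bit 3 the
	 * REP prefix, bits 6:4 the operand size (1, 2 or 4 bytes), bits 9:7
	 * the address size and bits 31:16 the port number.
	 */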
704 	vmexit->exitcode 	= VM_EXITCODE_INOUT;
705 	vmexit->u.inout.in 	= (info1 & BIT(0)) ? 1 : 0;
706 	vmexit->u.inout.string 	= inout_string;
707 	vmexit->u.inout.rep 	= (info1 & BIT(3)) ? 1 : 0;
708 	vmexit->u.inout.bytes 	= (info1 >> 4) & 0x7;
709 	vmexit->u.inout.port 	= (uint16_t)(info1 >> 16);
710 	vmexit->u.inout.eax 	= (uint32_t)(state->rax);
711 
712 	if (inout_string) {
713 		vmexit->exitcode = VM_EXITCODE_INOUT_STR;
714 		vis = &vmexit->u.inout_str;
715 		svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
716 		vis->rflags = state->rflags;
717 		vis->cr0 = state->cr0;
718 		vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
719 		vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
720 		vis->addrsize = svm_inout_str_addrsize(info1);
721 		svm_inout_str_seginfo(svm_sc, vcpu, info1,
722 		    vmexit->u.inout.in, vis);
723 	}
724 
725 	return (UNHANDLED);
726 }
727 
728 static int
729 npf_fault_type(uint64_t exitinfo1)
730 {
731 
732 	if (exitinfo1 & VMCB_NPF_INFO1_W)
733 		return (VM_PROT_WRITE);
734 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
735 		return (VM_PROT_EXECUTE);
736 	else
737 		return (VM_PROT_READ);
738 }
739 
740 static bool
741 svm_npf_emul_fault(uint64_t exitinfo1)
742 {
743 
744 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
745 		return (false);
746 	}
747 
748 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
749 		return (false);
750 	}
751 
752 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
753 		return (false);
754 	}
755 
756 	return (true);
757 }
758 
759 static void
760 svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
761 {
762 	struct vm_guest_paging *paging;
763 	struct vmcb_segment seg;
764 	struct vmcb_ctrl *ctrl;
765 	char *inst_bytes;
766 	int error, inst_len;
767 
768 	ctrl = &vmcb->ctrl;
769 	paging = &vmexit->u.inst_emul.paging;
770 
771 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
772 	vmexit->u.inst_emul.gpa = gpa;
773 	vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
774 	svm_paging_info(vmcb, paging);
775 
776 	error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
777 	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
778 
779 	switch(paging->cpu_mode) {
780 	case CPU_MODE_PROTECTED:
781 	case CPU_MODE_COMPATIBILITY:
782 		/*
783 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
784 		 */
785 		vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
786 		    1 : 0;
787 		break;
788 	default:
789 		vmexit->u.inst_emul.cs_d = 0;
790 		break;
791 	}
792 
793 	/*
794 	 * Copy the instruction bytes into 'vie' if available.
795 	 */
796 	if (decode_assist() && !disable_npf_assist) {
797 		inst_len = ctrl->inst_len;
798 		inst_bytes = ctrl->inst_bytes;
799 	} else {
800 		inst_len = 0;
801 		inst_bytes = NULL;
802 	}
803 	vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
804 }
805 
806 #ifdef KTR
807 static const char *
808 intrtype_to_str(int intr_type)
809 {
810 	switch (intr_type) {
811 	case VMCB_EVENTINJ_TYPE_INTR:
812 		return ("hwintr");
813 	case VMCB_EVENTINJ_TYPE_NMI:
814 		return ("nmi");
815 	case VMCB_EVENTINJ_TYPE_INTn:
816 		return ("swintr");
817 	case VMCB_EVENTINJ_TYPE_EXCEPTION:
818 		return ("exception");
819 	default:
820 		panic("%s: unknown intr_type %d", __func__, intr_type);
821 	}
822 }
823 #endif
824 
825 /*
826  * Inject an event to vcpu as described in section 15.20, "Event injection".
827  */
828 static void
829 svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
830 		 uint32_t error, bool ec_valid)
831 {
832 	struct vmcb_ctrl *ctrl;
833 
834 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
835 
836 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
837 	    ("%s: event already pending %#lx", __func__, ctrl->eventinj));
838 
839 	KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
840 	    __func__, vector));
841 
842 	switch (intr_type) {
843 	case VMCB_EVENTINJ_TYPE_INTR:
844 	case VMCB_EVENTINJ_TYPE_NMI:
845 	case VMCB_EVENTINJ_TYPE_INTn:
846 		break;
847 	case VMCB_EVENTINJ_TYPE_EXCEPTION:
848 		if (vector >= 0 && vector <= 31 && vector != 2)
849 			break;
850 		/* FALLTHROUGH */
851 	default:
852 		panic("%s: invalid intr_type/vector: %d/%d", __func__,
853 		    intr_type, vector);
854 	}
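	/*
	 * Build the EVENTINJ field: bits 7:0 hold the vector, bits 10:8 the
	 * event type, bit 11 the error-code-valid flag, bits 63:32 the error
	 * code and bit 31 (VMCB_EVENTINJ_VALID) marks the injection as pending.
	 */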
855 	ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
856 	if (ec_valid) {
857 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
858 		ctrl->eventinj |= (uint64_t)error << 32;
859 		VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
860 		    intrtype_to_str(intr_type), vector, error);
861 	} else {
862 		VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
863 		    intrtype_to_str(intr_type), vector);
864 	}
865 }
866 
867 static void
868 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
869 {
870 	struct vm *vm;
871 	struct vlapic *vlapic;
872 	struct vmcb_ctrl *ctrl;
873 	int pending;
874 
875 	vm = sc->vm;
876 	vlapic = vm_lapic(vm, vcpu);
877 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
878 
879 	/* Update %cr8 in the emulated vlapic */
880 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
881 
882 	/*
883 	 * If V_IRQ indicates that the interrupt injection attempted on the
884 	 * last VMRUN was successful then update the vlapic accordingly.
885 	 */
886 	if (ctrl->v_intr_vector != 0) {
887 		pending = ctrl->v_irq;
888 		KASSERT(ctrl->v_intr_vector >= 16, ("%s: invalid "
889 		    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
890 		KASSERT(!ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
891 		VCPU_CTR2(vm, vcpu, "v_intr_vector %d %s", ctrl->v_intr_vector,
892 		    pending ? "pending" : "accepted");
893 		if (!pending)
894 			vlapic_intr_accepted(vlapic, ctrl->v_intr_vector);
895 	}
896 }
897 
898 static void
899 svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
900 {
901 	struct vmcb_ctrl *ctrl;
902 	uint64_t intinfo;
903 
904 	ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
905 	intinfo = ctrl->exitintinfo;
906 	if (!VMCB_EXITINTINFO_VALID(intinfo))
907 		return;
908 
909 	/*
910 	 * From APMv2, Section "Intercepts during IDT interrupt delivery"
911 	 *
912 	 * If a #VMEXIT happened during event delivery then record the event
913 	 * that was being delivered.
914 	 */
915 	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
916 		intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
917 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
918 	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
919 }
920 
921 static __inline int
922 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
923 {
924 
925 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
926 	    VMCB_INTCPT_VINTR));
927 }
928 
929 static __inline void
930 enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
931 {
932 	struct vmcb_ctrl *ctrl;
933 
934 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
935 
936 	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
937 		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
938 		KASSERT(vintr_intercept_enabled(sc, vcpu),
939 		    ("%s: vintr intercept should be enabled", __func__));
940 		return;
941 	}
942 
943 	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
944 	ctrl->v_irq = 1;
945 	ctrl->v_ign_tpr = 1;
946 	ctrl->v_intr_vector = 0;
947 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
948 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
949 }
950 
951 static __inline void
952 disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
953 {
954 	struct vmcb_ctrl *ctrl;
955 
956 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
957 
958 	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
959 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
960 		    ("%s: vintr intercept should be disabled", __func__));
961 		return;
962 	}
963 
964 #ifdef KTR
965 	if (ctrl->v_intr_vector == 0)
966 		VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
967 	else
968 		VCPU_CTR0(sc->vm, vcpu, "Clearing V_IRQ interrupt injection");
969 #endif
970 	ctrl->v_irq = 0;
971 	ctrl->v_intr_vector = 0;
972 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
973 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
974 }
975 
976 static int
977 svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
978 {
979 	struct vmcb_ctrl *ctrl;
980 	int oldval, newval;
981 
982 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
983 	oldval = ctrl->intr_shadow;
984 	newval = val ? 1 : 0;
985 	if (newval != oldval) {
986 		ctrl->intr_shadow = newval;
987 		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
988 	}
989 	return (0);
990 }
991 
992 static int
993 svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
994 {
995 	struct vmcb_ctrl *ctrl;
996 
997 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
998 	*val = ctrl->intr_shadow;
999 	return (0);
1000 }
1001 
1002 /*
1003  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1004  * executes an IRET. The IRET intercept is enabled when an NMI is injected
1005  * to track when the vcpu is done handling the NMI.
1006  */
1007 static int
1008 nmi_blocked(struct svm_softc *sc, int vcpu)
1009 {
1010 	int blocked;
1011 
1012 	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1013 	    VMCB_INTCPT_IRET);
1014 	return (blocked);
1015 }
1016 
1017 static void
1018 enable_nmi_blocking(struct svm_softc *sc, int vcpu)
1019 {
1020 
1021 	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
1022 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
1023 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1024 }
1025 
1026 static void
1027 clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1028 {
1029 	int error;
1030 
1031 	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1032 	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
1033 	/*
1034 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1035 	 * the "iret" when it runs next. However, it is possible to inject
1036 	 * another NMI into the vcpu before the "iret" has actually executed.
1037 	 *
1038 	 * For example, if the "iret" encounters a #NPF when accessing the stack
1039 	 * it will trap back into the hypervisor. If an NMI is pending for
1040 	 * the vcpu it will be injected into the guest.
1041 	 *
1042 	 * XXX this needs to be fixed
1043 	 */
1044 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1045 
1046 	/*
1047 	 * Set 'intr_shadow' to prevent an NMI from being injected on the
1048 	 * immediate VMRUN.
1049 	 */
1050 	error = svm_modify_intr_shadow(sc, vcpu, 1);
1051 	KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
1052 }
1053 
1054 static int
1055 emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
1056     bool *retu)
1057 {
1058 	int error;
1059 
1060 	if (lapic_msr(num))
1061 		error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
1062 	else if (num == MSR_EFER)
1063 		error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, val);
1064 	else
1065 		error = svm_wrmsr(sc, vcpu, num, val, retu);
1066 
1067 	return (error);
1068 }
1069 
1070 static int
1071 emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
1072 {
1073 	struct vmcb_state *state;
1074 	struct svm_regctx *ctx;
1075 	uint64_t result;
1076 	int error;
1077 
1078 	if (lapic_msr(num))
1079 		error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu);
1080 	else
1081 		error = svm_rdmsr(sc, vcpu, num, &result, retu);
1082 
1083 	if (error == 0) {
1084 		state = svm_get_vmcb_state(sc, vcpu);
1085 		ctx = svm_get_guest_regctx(sc, vcpu);
1086 		state->rax = result & 0xffffffff;
1087 		ctx->sctx_rdx = result >> 32;
1088 	}
1089 
1090 	return (error);
1091 }
1092 
1093 #ifdef KTR
1094 static const char *
1095 exit_reason_to_str(uint64_t reason)
1096 {
1097 	static char reasonbuf[32];
1098 
1099 	switch (reason) {
1100 	case VMCB_EXIT_INVALID:
1101 		return ("invalvmcb");
1102 	case VMCB_EXIT_SHUTDOWN:
1103 		return ("shutdown");
1104 	case VMCB_EXIT_NPF:
1105 		return ("nptfault");
1106 	case VMCB_EXIT_PAUSE:
1107 		return ("pause");
1108 	case VMCB_EXIT_HLT:
1109 		return ("hlt");
1110 	case VMCB_EXIT_CPUID:
1111 		return ("cpuid");
1112 	case VMCB_EXIT_IO:
1113 		return ("inout");
1114 	case VMCB_EXIT_MC:
1115 		return ("mchk");
1116 	case VMCB_EXIT_INTR:
1117 		return ("extintr");
1118 	case VMCB_EXIT_NMI:
1119 		return ("nmi");
1120 	case VMCB_EXIT_VINTR:
1121 		return ("vintr");
1122 	case VMCB_EXIT_MSR:
1123 		return ("msr");
1124 	case VMCB_EXIT_IRET:
1125 		return ("iret");
1126 	default:
1127 		snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
1128 		return (reasonbuf);
1129 	}
1130 }
1131 #endif	/* KTR */
1132 
1133 /*
1134  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1135  * that are due to instruction intercepts as well as MSR and IOIO intercepts
1136  * and exceptions caused by INT3, INTO and BOUND instructions.
1137  *
1138  * Return 1 if the nRIP is valid and 0 otherwise.
1139  */
1140 static int
1141 nrip_valid(uint64_t exitcode)
1142 {
1143 	switch (exitcode) {
1144 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1145 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1146 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1147 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1148 	case 0x43:		/* INT3 */
1149 	case 0x44:		/* INTO */
1150 	case 0x45:		/* BOUND */
1151 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1152 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1153 		return (1);
1154 	default:
1155 		return (0);
1156 	}
1157 }
1158 
1159 /*
1160  * Collateral for a generic SVM VM-exit.
1161  */
1162 static void
1163 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
1164 {
1165 
1166 	vme->exitcode = VM_EXITCODE_SVM;
1167 	vme->u.svm.exitcode = code;
1168 	vme->u.svm.exitinfo1 = info1;
1169 	vme->u.svm.exitinfo2 = info2;
1170 }
1171 
1172 static int
1173 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1174 {
1175 	struct vmcb *vmcb;
1176 	struct vmcb_state *state;
1177 	struct vmcb_ctrl *ctrl;
1178 	struct svm_regctx *ctx;
1179 	uint64_t code, info1, info2, val;
1180 	uint32_t eax, ecx, edx;
1181 	int handled;
1182 	bool retu;
1183 
1184 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1185 	vmcb = svm_get_vmcb(svm_sc, vcpu);
1186 	state = &vmcb->state;
1187 	ctrl = &vmcb->ctrl;
1188 
1189 	handled = 0;
1190 	code = ctrl->exitcode;
1191 	info1 = ctrl->exitinfo1;
1192 	info2 = ctrl->exitinfo2;
1193 
1194 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1195 	vmexit->rip = state->rip;
1196 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1197 
1198 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1199 
1200 	/*
1201 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1202 	 * in an inconsistent state and can trigger assertions that would
1203 	 * never happen otherwise.
1204 	 */
1205 	if (code == VMCB_EXIT_INVALID) {
1206 		vm_exit_svm(vmexit, code, info1, info2);
1207 		return (0);
1208 	}
1209 
1210 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1211 	    "injection valid bit is set %#lx", __func__, ctrl->eventinj));
1212 
1213 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1214 	    ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
1215 	    vmexit->inst_length, code, info1, info2));
1216 
1217 	svm_update_virqinfo(svm_sc, vcpu);
1218 	svm_save_intinfo(svm_sc, vcpu);
1219 
1220 	switch (code) {
1221 	case VMCB_EXIT_IRET:
1222 		/*
1223 		 * Restart execution at "iret" but with the intercept cleared.
1224 		 */
1225 		vmexit->inst_length = 0;
1226 		clear_nmi_blocking(svm_sc, vcpu);
1227 		handled = 1;
1228 		break;
1229 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1230 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1231 		handled = 1;
1232 		break;
1233 	case VMCB_EXIT_INTR:	/* external interrupt */
1234 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1235 		handled = 1;
1236 		break;
1237 	case VMCB_EXIT_NMI:	/* external NMI */
1238 		handled = 1;
1239 		break;
1240 	case VMCB_EXIT_MC:	/* machine check */
1241 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1242 		break;
1243 	case VMCB_EXIT_MSR:	/* MSR access. */
1244 		eax = state->rax;
1245 		ecx = ctx->sctx_rcx;
1246 		edx = ctx->sctx_rdx;
1247 		retu = false;
1248 
1249 		if (info1) {
1250 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1251 			val = (uint64_t)edx << 32 | eax;
1252 			VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
1253 			    ecx, val);
1254 			if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
1255 				vmexit->exitcode = VM_EXITCODE_WRMSR;
1256 				vmexit->u.msr.code = ecx;
1257 				vmexit->u.msr.wval = val;
1258 			} else if (!retu) {
1259 				handled = 1;
1260 			} else {
1261 				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1262 				    ("emulate_wrmsr retu with bogus exitcode"));
1263 			}
1264 		} else {
1265 			VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
1266 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1267 			if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
1268 				vmexit->exitcode = VM_EXITCODE_RDMSR;
1269 				vmexit->u.msr.code = ecx;
1270 			} else if (!retu) {
1271 				handled = 1;
1272 			} else {
1273 				KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1274 				    ("emulate_rdmsr retu with bogus exitcode"));
1275 			}
1276 		}
1277 		break;
1278 	case VMCB_EXIT_IO:
1279 		handled = svm_handle_io(svm_sc, vcpu, vmexit);
1280 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1281 		break;
1282 	case VMCB_EXIT_CPUID:
1283 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1284 		handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
1285 		    (uint32_t *)&state->rax,
1286 		    (uint32_t *)&ctx->sctx_rbx,
1287 		    (uint32_t *)&ctx->sctx_rcx,
1288 		    (uint32_t *)&ctx->sctx_rdx);
1289 		break;
1290 	case VMCB_EXIT_HLT:
1291 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1292 		vmexit->exitcode = VM_EXITCODE_HLT;
1293 		vmexit->u.hlt.rflags = state->rflags;
1294 		break;
1295 	case VMCB_EXIT_PAUSE:
1296 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1297 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1298 		break;
1299 	case VMCB_EXIT_NPF:
1300 		/* EXITINFO2 contains the faulting guest physical address */
1301 		if (info1 & VMCB_NPF_INFO1_RSV) {
1302 			VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
1303 			    "reserved bits set: info1(%#lx) info2(%#lx)",
1304 			    info1, info2);
1305 		} else if (vm_mem_allocated(svm_sc->vm, info2)) {
1306 			vmexit->exitcode = VM_EXITCODE_PAGING;
1307 			vmexit->u.paging.gpa = info2;
1308 			vmexit->u.paging.fault_type = npf_fault_type(info1);
1309 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1310 			VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
1311 			    "on gpa %#lx/%#lx at rip %#lx",
1312 			    info2, info1, state->rip);
1313 		} else if (svm_npf_emul_fault(info1)) {
1314 			svm_handle_inst_emul(vmcb, info2, vmexit);
1315 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
1316 			VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
1317 			    "for gpa %#lx/%#lx at rip %#lx",
1318 			    info2, info1, state->rip);
1319 		}
1320 		break;
1321 	default:
1322 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1323 		break;
1324 	}
1325 
1326 	VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
1327 	    handled ? "handled" : "unhandled", exit_reason_to_str(code),
1328 	    vmexit->rip, vmexit->inst_length);
1329 
1330 	if (handled) {
1331 		vmexit->rip += vmexit->inst_length;
1332 		vmexit->inst_length = 0;
1333 		state->rip = vmexit->rip;
1334 	} else {
1335 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1336 			/*
1337 			 * If this VM exit was not claimed by anybody then
1338 			 * treat it as a generic SVM exit.
1339 			 */
1340 			vm_exit_svm(vmexit, code, info1, info2);
1341 		} else {
1342 			/*
1343 			 * The exitcode and collateral have been populated.
1344 			 * The VM exit will be processed further in userland.
1345 			 */
1346 		}
1347 	}
1348 	return (handled);
1349 }
1350 
1351 static void
1352 svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
1353 {
1354 	uint64_t intinfo;
1355 
1356 	if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
1357 		return;
1358 
1359 	KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
1360 	    "valid: %#lx", __func__, intinfo));
1361 
1362 	svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
1363 		VMCB_EXITINTINFO_VECTOR(intinfo),
1364 		VMCB_EXITINTINFO_EC(intinfo),
1365 		VMCB_EXITINTINFO_EC_VALID(intinfo));
1366 	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1367 	VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
1368 }
1369 
1370 /*
1371  * Inject event to virtual cpu.
1372  */
1373 static void
1374 svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
1375 {
1376 	struct vmcb_ctrl *ctrl;
1377 	struct vmcb_state *state;
1378 	uint8_t v_tpr;
1379 	int vector, need_intr_window, pending_apic_vector;
1380 
1381 	state = svm_get_vmcb_state(sc, vcpu);
1382 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1383 
1384 	need_intr_window = 0;
1385 	pending_apic_vector = 0;
1386 
1387 	/*
1388 	 * Inject pending events or exceptions for this vcpu.
1389 	 *
1390 	 * An event might be pending because the previous #VMEXIT happened
1391 	 * during event delivery (i.e. ctrl->exitintinfo).
1392 	 *
1393 	 * An event might also be pending because an exception was injected
1394 	 * by the hypervisor (e.g. #PF during instruction emulation).
1395 	 */
1396 	svm_inj_intinfo(sc, vcpu);
1397 
1398 	/* NMI event has priority over interrupts. */
1399 	if (vm_nmi_pending(sc->vm, vcpu)) {
1400 		if (nmi_blocked(sc, vcpu)) {
1401 			/*
1402 			 * Can't inject another NMI if the guest has not
1403 			 * yet executed an "iret" after the last NMI.
1404 			 */
1405 			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
1406 			    "to NMI-blocking");
1407 		} else if (ctrl->intr_shadow) {
1408 			/*
1409 			 * Can't inject an NMI if the vcpu is in an intr_shadow.
1410 			 */
1411 			VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
1412 			    "interrupt shadow");
1413 			need_intr_window = 1;
1414 			goto done;
1415 		} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1416 			/*
1417 			 * If there is already an exception/interrupt pending
1418 			 * then defer the NMI until after that.
1419 			 */
1420 			VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
1421 			    "eventinj %#lx", ctrl->eventinj);
1422 
1423 			/*
1424 			 * Use self-IPI to trigger a VM-exit as soon as
1425 			 * possible after the event injection is completed.
1426 			 *
1427 			 * This works only if the external interrupt exiting
1428 			 * is at a lower priority than the event injection.
1429 			 *
1430 			 * Although not explicitly specified in APMv2 the
1431 			 * relative priorities were verified empirically.
1432 			 */
1433 			ipi_cpu(curcpu, IPI_AST);	/* XXX vmm_ipinum? */
1434 		} else {
1435 			vm_nmi_clear(sc->vm, vcpu);
1436 
1437 			/* Inject NMI, vector number is not used */
1438 			svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
1439 			    IDT_NMI, 0, false);
1440 
1441 			/* virtual NMI blocking is now in effect */
1442 			enable_nmi_blocking(sc, vcpu);
1443 
1444 			VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
1445 		}
1446 	}
1447 
1448 	if (!vm_extint_pending(sc->vm, vcpu)) {
1449 		/*
1450 		 * APIC interrupts are delivered using the V_IRQ offload.
1451 		 *
1452 		 * The primary benefit is that the hypervisor doesn't need to
1453 		 * deal with the various conditions that inhibit interrupts.
1454 		 * It also means that TPR changes via CR8 will be handled
1455 		 * without any hypervisor involvement.
1456 		 *
1457 		 * Note that the APIC vector must remain pending in the vIRR
1458 		 * until it is confirmed that it was delivered to the guest.
1459 		 * This can be confirmed based on the value of V_IRQ at the
1460 		 * next #VMEXIT (1 = pending, 0 = delivered).
1461 		 *
1462 		 * Also note that it is possible that another higher priority
1463 		 * vector can become pending before this vector is delivered
1464 		 * to the guest. This is alright because vcpu_notify_event()
1465 		 * will send an IPI and force the vcpu to trap back into the
1466 		 * hypervisor. The higher priority vector will be injected on
1467 		 * the next VMRUN.
1468 		 */
1469 		if (vlapic_pending_intr(vlapic, &vector)) {
1470 			KASSERT(vector >= 16 && vector <= 255,
1471 			    ("invalid vector %d from local APIC", vector));
1472 			pending_apic_vector = vector;
1473 		}
1474 		goto done;
1475 	}
1476 
1477 	/* Ask the legacy pic for a vector to inject */
1478 	vatpic_pending_intr(sc->vm, &vector);
1479 	KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR",
1480 	    vector));
1481 
1482 	/*
1483 	 * If the guest has disabled interrupts or is in an interrupt shadow
1484 	 * then we cannot inject the pending interrupt.
1485 	 */
1486 	if ((state->rflags & PSL_I) == 0) {
1487 		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
1488 		    "rflags %#lx", vector, state->rflags);
1489 		need_intr_window = 1;
1490 		goto done;
1491 	}
1492 
1493 	if (ctrl->intr_shadow) {
1494 		VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
1495 		    "interrupt shadow", vector);
1496 		need_intr_window = 1;
1497 		goto done;
1498 	}
1499 
1500 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1501 		VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
1502 		    "eventinj %#lx", vector, ctrl->eventinj);
1503 		need_intr_window = 1;
1504 		goto done;
1505 	}
1506 
1507 	/*
1508 	 * Legacy PIC interrupts are delivered via the event injection
1509 	 * mechanism.
1510 	 */
1511 	svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
1512 
1513 	vm_extint_clear(sc->vm, vcpu);
1514 	vatpic_intr_accepted(sc->vm, vector);
1515 
1516 	/*
1517 	 * Force a VM-exit as soon as the vcpu is ready to accept another
1518 	 * interrupt. This is done because the PIC might have another vector
1519 	 * that it wants to inject. Also, if the APIC has a pending interrupt
1520 	 * that was preempted by the ExtInt, this allows us to inject the
1521 	 * APIC vector as soon as possible.
1522 	 */
1523 	need_intr_window = 1;
1524 done:
1525 	/*
1526 	 * The guest can modify the TPR by writing to %CR8. In guest mode
1527 	 * the processor reflects this write to V_TPR without hypervisor
1528 	 * intervention.
1529 	 *
1530 	 * The guest can also modify the TPR by writing to it via the memory
1531 	 * mapped APIC page. In this case, the write will be emulated by the
1532 	 * hypervisor. For this reason V_TPR must be updated before every
1533 	 * VMRUN.
1534 	 */
1535 	v_tpr = vlapic_get_cr8(vlapic);
1536 	KASSERT(v_tpr >= 0 && v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
1537 	if (ctrl->v_tpr != v_tpr) {
1538 		VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
1539 		    ctrl->v_tpr, v_tpr);
1540 		ctrl->v_tpr = v_tpr;
1541 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1542 	}
1543 
1544 	if (pending_apic_vector) {
1545 		/*
1546 		 * If an APIC vector is being injected then interrupt window
1547 		 * exiting is not possible on this VMRUN.
1548 		 */
1549 		KASSERT(!need_intr_window, ("intr_window exiting impossible"));
1550 		VCPU_CTR1(sc->vm, vcpu, "Injecting vector %d using V_IRQ",
1551 		    pending_apic_vector);
1552 
1553 		ctrl->v_irq = 1;
1554 		ctrl->v_ign_tpr = 0;
1555 		ctrl->v_intr_vector = pending_apic_vector;
1556 		ctrl->v_intr_prio = pending_apic_vector >> 4;
1557 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1558 	} else if (need_intr_window) {
1559 		/*
1560 		 * We use V_IRQ in conjunction with the VINTR intercept to
1561 		 * trap into the hypervisor as soon as a virtual interrupt
1562 		 * can be delivered.
1563 		 *
1564 		 * Since injected events are not subject to intercept checks
1565 		 * we need to ensure that the V_IRQ is not actually going to
1566 		 * be delivered on VM entry. The KASSERT below enforces this.
1567 		 */
1568 		KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
1569 		    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
1570 		    ("Bogus intr_window_exiting: eventinj (%#lx), "
1571 		    "intr_shadow (%u), rflags (%#lx)",
1572 		    ctrl->eventinj, ctrl->intr_shadow, state->rflags));
1573 		enable_intr_window_exiting(sc, vcpu);
1574 	} else {
1575 		disable_intr_window_exiting(sc, vcpu);
1576 	}
1577 }
1578 
1579 static __inline void
1580 restore_host_tss(void)
1581 {
1582 	struct system_segment_descriptor *tss_sd;
1583 
1584 	/*
1585 	 * The TSS descriptor was in use prior to launching the guest so it
1586 	 * has been marked busy.
1587 	 *
1588 	 * 'ltr' requires the descriptor to be marked available so change the
1589 	 * type to "64-bit available TSS".
1590 	 */
1591 	tss_sd = PCPU_GET(tss);
1592 	tss_sd->sd_type = SDT_SYSTSS;
1593 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
1594 }
1595 
1596 static void
1597 check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
1598 {
1599 	struct svm_vcpu *vcpustate;
1600 	struct vmcb_ctrl *ctrl;
1601 	long eptgen;
1602 	bool alloc_asid;
1603 
1604 	KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
1605 	    "active on cpu %u", __func__, thiscpu));
1606 
1607 	vcpustate = svm_get_vcpu(sc, vcpuid);
1608 	ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1609 
1610 	/*
1611 	 * The TLB entries associated with the vcpu's ASID are not valid
1612 	 * if either of the following conditions is true:
1613 	 *
1614 	 * 1. The vcpu's ASID generation is different than the host cpu's
1615 	 *    ASID generation. This happens when the vcpu migrates to a new
1616 	 *    host cpu. It can also happen when the number of vcpus executing
1617 	 *    on a host cpu is greater than the number of ASIDs available.
1618 	 *
1619 	 * 2. The pmap generation number is different than the value cached in
1620 	 *    the 'vcpustate'. This happens when the host invalidates pages
1621 	 *    belonging to the guest.
1622 	 *
1623 	 *	asidgen		eptgen	      Action
1624 	 *	mismatch	mismatch
1625 	 *	   0		   0		(a)
1626 	 *	   0		   1		(b1) or (b2)
1627 	 *	   1		   0		(c)
1628 	 *	   1		   1		(d)
1629 	 *
1630 	 * (a) There is no mismatch in eptgen or ASID generation and therefore
1631 	 *     no further action is needed.
1632 	 *
1633 	 * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
1634 	 *      retained and the TLB entries associated with this ASID
1635 	 *      are flushed by VMRUN.
1636 	 *
1637 	 * (b2) If the cpu does not support FlushByAsid then a new ASID is
1638 	 *      allocated.
1639 	 *
1640 	 * (c) A new ASID is allocated.
1641 	 *
1642 	 * (d) A new ASID is allocated.
1643 	 */
1644 
1645 	alloc_asid = false;
1646 	eptgen = pmap->pm_eptgen;
1647 	ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
1648 
1649 	if (vcpustate->asid.gen != asid[thiscpu].gen) {
1650 		alloc_asid = true;	/* (c) and (d) */
1651 	} else if (vcpustate->eptgen != eptgen) {
1652 		if (flush_by_asid())
1653 			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;	/* (b1) */
1654 		else
1655 			alloc_asid = true;			/* (b2) */
1656 	} else {
1657 		/*
1658 		 * This is the common case (a).
1659 		 */
1660 		KASSERT(!alloc_asid, ("ASID allocation not necessary"));
1661 		KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
1662 		    ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
1663 	}
1664 
1665 	if (alloc_asid) {
1666 		if (++asid[thiscpu].num >= nasid) {
1667 			asid[thiscpu].num = 1;
1668 			if (++asid[thiscpu].gen == 0)
1669 				asid[thiscpu].gen = 1;
1670 			/*
1671 			 * If this cpu does not support "flush-by-asid"
1672 			 * then flush the entire TLB on a generation
1673 			 * bump. Subsequent ASID allocation in this
1674 			 * generation can be done without a TLB flush.
1675 			 */
1676 			if (!flush_by_asid())
1677 				ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
1678 		}
1679 		vcpustate->asid.gen = asid[thiscpu].gen;
1680 		vcpustate->asid.num = asid[thiscpu].num;
1681 
1682 		ctrl->asid = vcpustate->asid.num;
1683 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1684 		/*
1685 		 * If this cpu supports "flush-by-asid" then the TLB
1686 		 * was not flushed after the generation bump. The TLB
1687 		 * is flushed selectively after every new ASID allocation.
1688 		 */
1689 		if (flush_by_asid())
1690 			ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
1691 	}
1692 	vcpustate->eptgen = eptgen;
1693 
1694 	KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
1695 	KASSERT(ctrl->asid == vcpustate->asid.num,
1696 	    ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
1697 }
1698 
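/*
 * While the global interrupt flag (GIF) is clear, interrupts, NMIs and SMIs
 * are held pending. CLGI clears GIF before guest state is loaded and STGI
 * sets it again once host state has been restored after the #VMEXIT.
 */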
1699 static __inline void
1700 disable_gintr(void)
1701 {
1702 
1703 	__asm __volatile("clgi" : : :);
1704 }
1705 
1706 static __inline void
1707 enable_gintr(void)
1708 {
1709 
1710 	__asm __volatile("stgi" : : :);
1711 }
1712 
1713 /*
1714  * Start vcpu with specified RIP.
1715  */
1716 static int
1717 svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
1718 	void *rend_cookie, void *suspended_cookie)
1719 {
1720 	struct svm_regctx *gctx;
1721 	struct svm_softc *svm_sc;
1722 	struct svm_vcpu *vcpustate;
1723 	struct vmcb_state *state;
1724 	struct vmcb_ctrl *ctrl;
1725 	struct vm_exit *vmexit;
1726 	struct vlapic *vlapic;
1727 	struct vm *vm;
1728 	uint64_t vmcb_pa;
1729 	u_int thiscpu;
1730 	int handled;
1731 
1732 	svm_sc = arg;
1733 	vm = svm_sc->vm;
1734 
1735 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1736 	state = svm_get_vmcb_state(svm_sc, vcpu);
1737 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
1738 	vmexit = vm_exitinfo(vm, vcpu);
1739 	vlapic = vm_lapic(vm, vcpu);
1740 
1741 	/*
1742 	 * Stash 'curcpu' on the stack as 'thiscpu'.
1743 	 *
1744 	 * The per-cpu data area is not accessible until MSR_GSBASE is restored
1745 	 * after the #VMEXIT. Since VMRUN is executed inside a critical section
1746 	 * 'curcpu' and 'thiscpu' are guaranteed to be identical.
1747 	 */
1748 	thiscpu = curcpu;
1749 
1750 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1751 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1752 
1753 	if (vcpustate->lastcpu != thiscpu) {
1754 		/*
1755 		 * Force new ASID allocation by invalidating the generation.
1756 		 */
1757 		vcpustate->asid.gen = 0;
1758 
1759 		/*
1760 		 * Invalidate the VMCB state cache by marking all fields dirty.
1761 		 */
1762 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1763 
1764 		/*
1765 		 * XXX
1766 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1767 		 * we may return from this function without actually executing
1768 		 * the VMRUN instruction. This could happen if a rendezvous
1769 		 * or an AST is pending on the first time through the loop.
1770 		 *
1771 		 * This works for now but any new side-effects of vcpu
1772 		 * migration should take this case into account.
1773 		 */
1774 		vcpustate->lastcpu = thiscpu;
1775 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1776 	}
1777 
1778 	svm_msr_guest_enter(svm_sc, vcpu);
1779 
1780 	/* Update Guest RIP */
1781 	state->rip = rip;
1782 
1783 	do {
1784 		/*
1785 		 * Disable global interrupts to guarantee atomicity during
1786 		 * loading of guest state. This includes not only the state
1787 		 * loaded by the "vmrun" instruction but also software state
1788 		 * maintained by the hypervisor: suspended and rendezvous
1789 		 * state, NPT generation number, vlapic interrupts etc.
1790 		 */
1791 		disable_gintr();
1792 
1793 		if (vcpu_suspended(suspended_cookie)) {
1794 			enable_gintr();
1795 			vm_exit_suspended(vm, vcpu, state->rip);
1796 			break;
1797 		}
1798 
1799 		if (vcpu_rendezvous_pending(rend_cookie)) {
1800 			enable_gintr();
1801 			vm_exit_rendezvous(vm, vcpu, state->rip);
1802 			break;
1803 		}
1804 
1805 		/* We have been asked by the scheduler to give up the cpu. */
1806 		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1807 			enable_gintr();
1808 			vm_exit_astpending(vm, vcpu, state->rip);
1809 			break;
1810 		}
1811 
1812 		svm_inj_interrupts(svm_sc, vcpu, vlapic);
1813 
1814 		/* Activate the nested pmap on 'thiscpu' */
1815 		CPU_SET_ATOMIC_ACQ(thiscpu, &pmap->pm_active);
1816 
1817 		/*
1818 		 * Check the pmap generation and the ASID generation to
1819 		 * ensure that the vcpu does not use stale TLB mappings.
1820 		 */
1821 		check_asid(svm_sc, vcpu, pmap, thiscpu);
1822 
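		/*
		 * A set bit in vmcb_clean tells the processor that the
		 * corresponding VMCB state is unchanged since it was last
		 * loaded on this cpu, so clear the clean bits for any state
		 * dirtied since the previous VMRUN.
		 */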
1823 		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
1824 		vcpustate->dirty = 0;
1825 		VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
1826 
1827 		/* Launch Virtual Machine. */
1828 		VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
1829 		svm_launch(vmcb_pa, gctx);
1830 
1831 		CPU_CLR_ATOMIC(thiscpu, &pmap->pm_active);
1832 
1833 		/*
1834 		 * Restore MSR_GSBASE to point to the pcpu data area.
1835 		 *
1836 		 * Note that accesses done via PCPU_GET/PCPU_SET will work
1837 		 * only after MSR_GSBASE is restored.
1838 		 *
1839 		 * Also note that we don't bother restoring MSR_KGSBASE
1840 		 * since it is not used in the kernel and will be restored
1841 		 * when the VMRUN ioctl returns to userspace.
1842 		 */
1843 		wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[thiscpu]);
1844 		KASSERT(curcpu == thiscpu, ("thiscpu/curcpu (%u/%u) mismatch",
1845 		    thiscpu, curcpu));
1846 
1847 		/*
1848 		 * The host GDTR and IDTR are saved by VMRUN and restored
1849 		 * automatically on #VMEXIT. However, the host TSS needs
1850 		 * to be restored explicitly.
1851 		 */
1852 		restore_host_tss();
1853 
1854 		/* #VMEXIT disables interrupts so re-enable them here. */
1855 		enable_gintr();
1856 
1857 		/* Handle #VMEXIT and if required return to user space. */
1858 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
1859 	} while (handled);
1860 
1861 	svm_msr_guest_exit(svm_sc, vcpu);
1862 
1863 	return (0);
1864 }
1865 
1866 static void
1867 svm_vmcleanup(void *arg)
1868 {
1869 	struct svm_softc *sc = arg;
1870 
1871 	free(sc, M_SVM);
1872 }
1873 
1874 static register_t *
1875 swctx_regptr(struct svm_regctx *regctx, int reg)
1876 {
1877 
1878 	switch (reg) {
1879 	case VM_REG_GUEST_RBX:
1880 		return (&regctx->sctx_rbx);
1881 	case VM_REG_GUEST_RCX:
1882 		return (&regctx->sctx_rcx);
1883 	case VM_REG_GUEST_RDX:
1884 		return (&regctx->sctx_rdx);
1885 	case VM_REG_GUEST_RDI:
1886 		return (&regctx->sctx_rdi);
1887 	case VM_REG_GUEST_RSI:
1888 		return (&regctx->sctx_rsi);
1889 	case VM_REG_GUEST_RBP:
1890 		return (&regctx->sctx_rbp);
1891 	case VM_REG_GUEST_R8:
1892 		return (&regctx->sctx_r8);
1893 	case VM_REG_GUEST_R9:
1894 		return (&regctx->sctx_r9);
1895 	case VM_REG_GUEST_R10:
1896 		return (&regctx->sctx_r10);
1897 	case VM_REG_GUEST_R11:
1898 		return (&regctx->sctx_r11);
1899 	case VM_REG_GUEST_R12:
1900 		return (&regctx->sctx_r12);
1901 	case VM_REG_GUEST_R13:
1902 		return (&regctx->sctx_r13);
1903 	case VM_REG_GUEST_R14:
1904 		return (&regctx->sctx_r14);
1905 	case VM_REG_GUEST_R15:
1906 		return (&regctx->sctx_r15);
1907 	default:
1908 		return (NULL);
1909 	}
1910 }
1911 
1912 static int
1913 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
1914 {
1915 	struct svm_softc *svm_sc;
1916 	register_t *reg;
1917 
1918 	svm_sc = arg;
1919 
1920 	if (ident == VM_REG_GUEST_INTR_SHADOW) {
1921 		return (svm_get_intr_shadow(svm_sc, vcpu, val));
1922 	}
1923 
1924 	if (vmcb_read(svm_sc, vcpu, ident, val) == 0) {
1925 		return (0);
1926 	}
1927 
1928 	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
1929 
1930 	if (reg != NULL) {
1931 		*val = *reg;
1932 		return (0);
1933 	}
1934 
1935 	VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident);
1936 	return (EINVAL);
1937 }
1938 
1939 static int
1940 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
1941 {
1942 	struct svm_softc *svm_sc;
1943 	register_t *reg;
1944 
1945 	svm_sc = arg;
1946 
1947 	if (ident == VM_REG_GUEST_INTR_SHADOW) {
1948 		return (svm_modify_intr_shadow(svm_sc, vcpu, val));
1949 	}
1950 
1951 	if (vmcb_write(svm_sc, vcpu, ident, val) == 0) {
1952 		return (0);
1953 	}
1954 
1955 	reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
1956 
1957 	if (reg != NULL) {
1958 		*reg = val;
1959 		return (0);
1960 	}
1961 
1962 	/*
1963 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
1964 	 * vcpu's ASID. This needs to be treated differently depending on
1965 	 * whether 'running' is true/false.
1966 	 */
1967 
1968 	VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
1969 	return (EINVAL);
1970 }
1971 
1972 static int
1973 svm_setcap(void *arg, int vcpu, int type, int val)
1974 {
1975 	struct svm_softc *sc;
1976 	int error;
1977 
1978 	sc = arg;
1979 	error = 0;
1980 	switch (type) {
1981 	case VM_CAP_HALT_EXIT:
1982 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1983 		    VMCB_INTCPT_HLT, val);
1984 		break;
1985 	case VM_CAP_PAUSE_EXIT:
1986 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1987 		    VMCB_INTCPT_PAUSE, val);
1988 		break;
1989 	case VM_CAP_UNRESTRICTED_GUEST:
1990 		/* Unrestricted guest execution cannot be disabled in SVM */
1991 		if (val == 0)
1992 			error = EINVAL;
1993 		break;
1994 	default:
1995 		error = ENOENT;
1996 		break;
1997 	}
1998 	return (error);
1999 }
2000 
2001 static int
2002 svm_getcap(void *arg, int vcpu, int type, int *retval)
2003 {
2004 	struct svm_softc *sc;
2005 	int error;
2006 
2007 	sc = arg;
2008 	error = 0;
2009 
2010 	switch (type) {
2011 	case VM_CAP_HALT_EXIT:
2012 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2013 		    VMCB_INTCPT_HLT);
2014 		break;
2015 	case VM_CAP_PAUSE_EXIT:
2016 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2017 		    VMCB_INTCPT_PAUSE);
2018 		break;
2019 	case VM_CAP_UNRESTRICTED_GUEST:
2020 		*retval = 1;	/* unrestricted guest is always enabled */
2021 		break;
2022 	default:
2023 		error = ENOENT;
2024 		break;
2025 	}
2026 	return (error);
2027 }
2028 
2029 static struct vlapic *
2030 svm_vlapic_init(void *arg, int vcpuid)
2031 {
2032 	struct svm_softc *svm_sc;
2033 	struct vlapic *vlapic;
2034 
2035 	svm_sc = arg;
2036 	vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
2037 	vlapic->vm = svm_sc->vm;
2038 	vlapic->vcpuid = vcpuid;
2039 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2040 
2041 	vlapic_init(vlapic);
2042 
2043 	return (vlapic);
2044 }
2045 
2046 static void
2047 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2048 {
2049 
2050 	vlapic_cleanup(vlapic);
2051 	free(vlapic, M_SVM_VLAPIC);
2052 }
2053 
2054 struct vmm_ops vmm_ops_amd = {
2055 	svm_init,
2056 	svm_cleanup,
2057 	svm_restore,
2058 	svm_vminit,
2059 	svm_vmrun,
2060 	svm_vmcleanup,
2061 	svm_getreg,
2062 	svm_setreg,
2063 	vmcb_getdesc,
2064 	vmcb_setdesc,
2065 	svm_getcap,
2066 	svm_setcap,
2067 	svm_npt_alloc,
2068 	svm_npt_free,
2069 	svm_vlapic_init,
2070 	svm_vlapic_cleanup
2071 };
2072