xref: /illumos-gate/usr/src/uts/intel/io/vmm/amd/svm.c (revision 09ea9c53cd9ac02c506f68475d98e8f07b457ffc)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  *
39  * Copyright 2018 Joyent, Inc.
40  * Copyright 2023 Oxide Computer Company
41  */
42 
43 #include <sys/cdefs.h>
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/kernel.h>
48 #include <sys/kmem.h>
49 #include <sys/pcpu.h>
50 #include <sys/proc.h>
51 #include <sys/sysctl.h>
52 #include <sys/cpu.h>
53 
54 #include <sys/x86_archext.h>
55 #include <sys/archsystm.h>
56 #include <sys/trap.h>
57 
58 #include <machine/cpufunc.h>
59 #include <machine/psl.h>
60 #include <machine/md_var.h>
61 #include <machine/reg.h>
62 #include <machine/specialreg.h>
63 #include <machine/vmm.h>
64 #include <machine/vmm_dev.h>
65 #include <sys/vmm_instruction_emul.h>
66 #include <sys/vmm_vm.h>
67 #include <sys/vmm_kernel.h>
68 
69 #include "vmm_lapic.h"
70 #include "vmm_stat.h"
71 #include "vmm_ioport.h"
72 #include "vatpic.h"
73 #include "vlapic.h"
74 #include "vlapic_priv.h"
75 
76 #include "vmcb.h"
77 #include "svm.h"
78 #include "svm_softc.h"
79 #include "svm_msr.h"
80 
81 SYSCTL_DECL(_hw_vmm);
82 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
83     NULL);
84 
85 /*
86  * Guardrails for supported guest TSC frequencies.
87  *
88  * A minimum of 0.5 GHz, which should be sufficient for all recent AMD CPUs, and
89  * a maximum ratio of (15 * host frequency), which is sufficient to prevent
90  * overflowing frequency calculations and give plenty of headroom for future CPU
91  * frequency increases.
92  */
93 #define	AMD_TSC_MIN_FREQ	500000000
94 #define	AMD_TSC_MAX_FREQ_RATIO	15
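/*
 * Added worked example (editor's illustration, not from the original source):
 * on a host whose TSC runs at 2.0 GHz, these guardrails would permit guest
 * TSC frequencies from AMD_TSC_MIN_FREQ (0.5 GHz) up to
 * AMD_TSC_MAX_FREQ_RATIO * 2.0 GHz = 30 GHz.
 */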
95 
96 /* SVM features advertised by CPUID.8000000AH:EDX */
97 static uint32_t svm_feature = 0;
98 
99 static int disable_npf_assist;
100 
101 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
102 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
103 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
104 
105 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
106 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
107 static void flush_asid(struct svm_softc *sc, int vcpuid);
108 
109 static __inline bool
110 has_flush_by_asid(void)
111 {
112 	return ((svm_feature & CPUID_AMD_EDX_FLUSH_ASID) != 0);
113 }
114 
115 static __inline bool
116 has_lbr_virt(void)
117 {
118 	return ((svm_feature & CPUID_AMD_EDX_LBR_VIRT) != 0);
119 }
120 
121 static __inline bool
122 has_decode_assist(void)
123 {
124 	return ((svm_feature & CPUID_AMD_EDX_DECODE_ASSISTS) != 0);
125 }
126 
127 static __inline bool
128 has_tsc_freq_ctl(void)
129 {
130 	return ((svm_feature & CPUID_AMD_EDX_TSC_RATE_MSR) != 0);
131 }
132 
133 static int
134 svm_cleanup(void)
135 {
136 	/* This is taken care of by the hma registration */
137 	return (0);
138 }
139 
140 static int
141 svm_init(void)
142 {
143 	/* Grab a (bhyve) local copy of the SVM feature bits */
144 	struct cpuid_regs regs = {
145 		.cp_eax = 0x8000000a,
146 	};
147 	(void) cpuid_insn(NULL, &regs);
148 	svm_feature = regs.cp_edx;
149 
150 	/*
151 	 * HMA should have already checked for these features, which we refuse to
152 	 * operate without, but there is no harm in making sure.
153 	 */
154 	const uint32_t demand_bits =
155 	    (CPUID_AMD_EDX_NESTED_PAGING | CPUID_AMD_EDX_NRIPS);
156 	VERIFY((svm_feature & demand_bits) == demand_bits);
157 
158 	return (0);
159 }
160 
161 static void
162 svm_restore(void)
163 {
164 	/* No-op on illumos */
165 }
166 
167 /* Pentium compatible MSRs */
168 #define	MSR_PENTIUM_START	0
169 #define	MSR_PENTIUM_END		0x1FFF
170 /* AMD 6th generation and Intel compatible MSRs */
171 #define	MSR_AMD6TH_START	0xC0000000UL
172 #define	MSR_AMD6TH_END		0xC0001FFFUL
173 /* AMD 7th and 8th generation compatible MSRs */
174 #define	MSR_AMD7TH_START	0xC0010000UL
175 #define	MSR_AMD7TH_END		0xC0011FFFUL
176 
177 /*
178  * Get the index and bit position for an MSR in the permission bitmap.
179  * Two bits are used for each MSR: the lower for read and the higher for write.
180  */
181 static int
182 svm_msr_index(uint64_t msr, int *index, int *bit)
183 {
184 	uint32_t base, off;
185 
186 	*index = -1;
187 	*bit = (msr % 4) * 2;
188 	base = 0;
189 
190 	if (msr <= MSR_PENTIUM_END) {
191 		*index = msr / 4;
192 		return (0);
193 	}
194 
195 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
196 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
197 		off = (msr - MSR_AMD6TH_START);
198 		*index = (off + base) / 4;
199 		return (0);
200 	}
201 
202 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
203 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
204 		off = (msr - MSR_AMD7TH_START);
205 		*index = (off + base) / 4;
206 		return (0);
207 	}
208 
209 	return (EINVAL);
210 }
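/*
 * Added worked example (editor's illustration, not from the original source):
 * for MSR_EFER (0xC0000080), the Pentium range contributes a base of 0x2000
 * MSR slots and off = 0x80, so *index = (0x80 + 0x2000) / 4 = 0x820 and
 * *bit = (0xC0000080 % 4) * 2 = 0.  Per svm_msr_perm() below, the read
 * permission is bit 0 of perm_bitmap[0x820] and the write permission is bit 1.
 */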
211 
212 /*
213  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
214  */
215 static void
216 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
217 {
218 	int index, bit, error;
219 
220 	error = svm_msr_index(msr, &index, &bit);
221 	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
222 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
223 	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
224 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
225 	    "msr %lx", __func__, bit, msr));
226 
227 	if (read)
228 		perm_bitmap[index] &= ~(1UL << bit);
229 
230 	if (write)
231 		perm_bitmap[index] &= ~(2UL << bit);
232 }
233 
234 static void
235 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
236 {
237 
238 	svm_msr_perm(perm_bitmap, msr, true, true);
239 }
240 
241 static void
242 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
243 {
244 
245 	svm_msr_perm(perm_bitmap, msr, true, false);
246 }
247 
248 int
249 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
250 {
251 	struct vmcb_ctrl *ctrl;
252 
253 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
254 
255 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
256 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
257 }
258 
259 void
260 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
261     int enabled)
262 {
263 	struct vmcb_ctrl *ctrl;
264 	uint32_t oldval;
265 
266 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
267 
268 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
269 	oldval = ctrl->intercept[idx];
270 
271 	if (enabled)
272 		ctrl->intercept[idx] |= bitmask;
273 	else
274 		ctrl->intercept[idx] &= ~bitmask;
275 
276 	if (ctrl->intercept[idx] != oldval) {
277 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
278 	}
279 }
280 
281 static void
282 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
283     uint64_t msrpm_base_pa, uint64_t np_pml4)
284 {
285 	struct vmcb_ctrl *ctrl;
286 	struct vmcb_state *state;
287 	uint32_t mask;
288 	int n;
289 
290 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
291 	state = svm_get_vmcb_state(sc, vcpu);
292 
293 	ctrl->iopm_base_pa = iopm_base_pa;
294 	ctrl->msrpm_base_pa = msrpm_base_pa;
295 
296 	/* Enable nested paging */
297 	ctrl->np_ctrl = NP_ENABLE;
298 	ctrl->n_cr3 = np_pml4;
299 
300 	/*
301 	 * Intercept accesses to the control registers that are not shadowed
302 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
303 	 */
304 	for (n = 0; n < 16; n++) {
305 		mask = (BIT(n) << 16) | BIT(n);
306 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
307 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
308 		else
309 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
310 	}
311 
312 	/*
313 	 * Selectively intercept writes to %cr0.  This triggers on operations
314 	 * which would change bits other than TS or MP.
315 	 */
316 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
317 	    VMCB_INTCPT_CR0_WRITE);
318 
319 	/*
320 	 * Intercept everything when tracing guest exceptions; otherwise
321 	 * just intercept the machine check exception.
322 	 */
323 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
324 		for (n = 0; n < 32; n++) {
325 			/*
326 			 * Skip unimplemented vectors in the exception bitmap.
327 			 */
328 			if (n == 2 || n == 9) {
329 				continue;
330 			}
331 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
332 		}
333 	} else {
334 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
335 	}
336 
337 	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
338 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
339 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
340 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
341 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
342 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
343 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
344 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
345 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_RDPMC);
346 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
347 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
348 	    VMCB_INTCPT_FERR_FREEZE);
349 
350 	/* Enable exit-on-hlt by default */
351 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);
352 
353 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
354 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
355 
356 	/* Intercept privileged invalidation instructions. */
357 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
358 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
359 
360 	/*
361 	 * Intercept all virtualization-related instructions.
362 	 *
363 	 * From section "Canonicalization and Consistency Checks" in APMv2,
364 	 * the VMRUN intercept bit must be set to pass the consistency check.
365 	 */
366 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
367 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
368 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
369 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
370 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
371 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
372 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
373 	if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
374 		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
375 		    VMCB_INTCPT_WBINVD);
376 	}
377 
378 	/*
379 	 * The ASID will be set to a non-zero value just before VMRUN.
380 	 */
381 	ctrl->asid = 0;
382 
383 	/*
384 	 * Section 15.21.1, Interrupt Masking in EFLAGS
385 	 * Section 15.21.2, Virtualizing APIC.TPR
386 	 *
387 	 * This must be set for %rflags and %cr8 isolation of guest and host.
388 	 */
389 	ctrl->v_intr_ctrl |= V_INTR_MASKING;
390 
391 	/* Enable Last Branch Record aka LBR-virt (if available) */
392 	if (has_lbr_virt()) {
393 		ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
394 	}
395 
396 	/* EFER_SVM must always be set when the guest is executing */
397 	state->efer = EFER_SVM;
398 
399 	/* Set up the PAT to power-on state */
400 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
401 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
402 	    PAT_VALUE(2, PAT_UNCACHED)		|
403 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
404 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
405 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
406 	    PAT_VALUE(6, PAT_UNCACHED)		|
407 	    PAT_VALUE(7, PAT_UNCACHEABLE);
408 
409 	/* Set up DR6/7 to power-on state */
410 	state->dr6 = DBREG_DR6_RESERVED1;
411 	state->dr7 = DBREG_DR7_RESERVED1;
412 }
413 
414 /*
415  * Initialize a virtual machine.
416  */
417 static void *
418 svm_vminit(struct vm *vm)
419 {
420 	struct svm_softc *svm_sc;
421 	struct svm_vcpu *vcpu;
422 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
423 	int i;
424 	uint16_t maxcpus;
425 
426 	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
427 	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0);
428 
429 	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
430 	if (svm_sc->msr_bitmap == NULL)
431 		panic("contigmalloc of SVM MSR bitmap failed");
432 	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
433 	if (svm_sc->iopm_bitmap == NULL)
434 		panic("contigmalloc of SVM IO bitmap failed");
435 
436 	svm_sc->vm = vm;
437 	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
438 
439 	/*
440 	 * Intercept read and write accesses to all MSRs.
441 	 */
442 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
443 
444 	/*
445 	 * Access to the following MSRs is redirected to the VMCB when the
446 	 * guest is executing. Therefore it is safe to allow the guest to
447 	 * read/write these MSRs directly without hypervisor involvement.
448 	 */
449 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
450 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
451 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
452 
453 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
454 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
455 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
456 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
457 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
458 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
459 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
460 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
461 
462 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
463 
464 	/*
465 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
466 	 */
467 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
468 
469 	/* Intercept access to all I/O ports. */
470 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
471 
472 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
473 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
474 	pml4_pa = svm_sc->nptp;
475 	maxcpus = vm_get_maxcpus(svm_sc->vm);
476 	for (i = 0; i < maxcpus; i++) {
477 		vcpu = svm_get_vcpu(svm_sc, i);
478 		vcpu->nextrip = ~0;
479 		vcpu->lastcpu = NOCPU;
480 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
481 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
482 		svm_msr_guest_init(svm_sc, i);
483 	}
484 
485 	svm_pmu_init(svm_sc);
486 
487 	return (svm_sc);
488 }
489 
490 /*
491  * Collateral for a generic SVM VM-exit.
492  */
493 static void
494 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
495 {
496 
497 	vme->exitcode = VM_EXITCODE_SVM;
498 	vme->u.svm.exitcode = code;
499 	vme->u.svm.exitinfo1 = info1;
500 	vme->u.svm.exitinfo2 = info2;
501 }
502 
503 static enum vm_cpu_mode
504 svm_vcpu_mode(struct vmcb *vmcb)
505 {
506 	struct vmcb_state *state;
507 
508 	state = &vmcb->state;
509 
510 	if (state->efer & EFER_LMA) {
511 		struct vmcb_segment *seg;
512 
513 		 * Per section 4.8.1 of APM2, check if the Code Segment has the
514 		 * Long attribute set in its descriptor.
515 		 * Long attribute set in descriptor.
516 		 */
517 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
518 		if (seg->attrib & VMCB_CS_ATTRIB_L)
519 			return (CPU_MODE_64BIT);
520 		else
521 			return (CPU_MODE_COMPATIBILITY);
522 	} else  if (state->cr0 & CR0_PE) {
523 		return (CPU_MODE_PROTECTED);
524 	} else {
525 		return (CPU_MODE_REAL);
526 	}
527 }
528 
529 static enum vm_paging_mode
530 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
531 {
532 
533 	if ((cr0 & CR0_PG) == 0)
534 		return (PAGING_MODE_FLAT);
535 	if ((cr4 & CR4_PAE) == 0)
536 		return (PAGING_MODE_32);
537 	if (efer & EFER_LME)
538 		return (PAGING_MODE_64);
539 	else
540 		return (PAGING_MODE_PAE);
541 }
542 
543 static void
544 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
545 {
546 	struct vmcb_state *state;
547 
548 	state = &vmcb->state;
549 	paging->cr3 = state->cr3;
550 	paging->cpl = state->cpl;
551 	paging->cpu_mode = svm_vcpu_mode(vmcb);
552 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
553 	    state->efer);
554 }
555 
556 #define	UNHANDLED 0
557 
558 /*
559  * Handle guest I/O intercept.
560  */
561 static int
562 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
563 {
564 	struct vmcb_ctrl *ctrl;
565 	struct vmcb_state *state;
566 	struct vm_inout *inout;
567 	struct vie *vie;
568 	uint64_t info1;
569 	struct vm_guest_paging paging;
570 
571 	state = svm_get_vmcb_state(svm_sc, vcpu);
572 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
573 	inout = &vmexit->u.inout;
574 	info1 = ctrl->exitinfo1;
575 
576 	inout->bytes = (info1 >> 4) & 0x7;
577 	inout->flags = 0;
578 	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
579 	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
580 	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
581 	inout->port = (uint16_t)(info1 >> 16);
582 	inout->eax = (uint32_t)(state->rax);
583 
584 	/*
585 	 * We'll always need paging and vie info, even if we bail out early
586 	 * due to missing DecodeAssist.
587 	 */
588 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
589 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
590 
591 	if ((inout->flags & INOUT_STR) != 0) {
592 		/*
593 		 * The effective segment number in EXITINFO1[12:10] is populated
594 		 * only if the processor has the DecodeAssist capability.
595 		 *
596 		 * This is not specified explicitly in APMv2 but can be verified
597 		 * empirically.
598 		 */
599 		if (!has_decode_assist()) {
600 			/*
601 			 * Without decoding assistance, push the task of
602 			 * emulating the ins/outs onto userspace.
603 			 */
604 			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
605 			bzero(&vmexit->u.inst_emul,
606 			    sizeof (vmexit->u.inst_emul));
607 			vie_init_other(vie, &paging);
608 			return (UNHANDLED);
609 		}
610 
611 		/*
612 		 * Bits 7-9 encode the address size of ins/outs operations where
613 		 * the 1/2/4 values correspond to 16/32/64 bit sizes.
614 		 */
615 		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
616 		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
617 		    inout->addrsize == 8);
618 
619 		if (inout->flags & INOUT_IN) {
620 			/*
621 			 * For INS instructions, %es (encoded as 0) is the
622 			 * implied segment for the operation.
623 			 */
624 			inout->segment = 0;
625 		} else {
626 			/*
627 			 * Bits 10-12 encode the segment for OUTS.
628 			 * This value follows the standard x86 segment order.
629 			 */
630 			inout->segment = (info1 >> 10) & 0x7;
631 		}
632 	}
633 
634 	vmexit->exitcode = VM_EXITCODE_INOUT;
635 	vie_init_inout(vie, inout, vmexit->inst_length, &paging);
636 
637 	/* The in/out emulation will handle advancing %rip */
638 	vmexit->inst_length = 0;
639 
640 	return (UNHANDLED);
641 }
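/*
 * Added example decode (editor's illustration; the value is hypothetical):
 * an EXITINFO1 of 0x03f80cac would be parsed above as an OUTS with
 * INOUT_STR and INOUT_REP set (bit 0 clear, bits 2 and 3 set), bytes = 2,
 * a 16-bit address size (bits 9:7 == 1), segment %ds (bits 12:10 == 3),
 * and port 0x3f8 taken from bits 31:16.
 */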
642 
643 static int
644 npf_fault_type(uint64_t exitinfo1)
645 {
646 
647 	if (exitinfo1 & VMCB_NPF_INFO1_W)
648 		return (PROT_WRITE);
649 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
650 		return (PROT_EXEC);
651 	else
652 		return (PROT_READ);
653 }
654 
655 static bool
656 svm_npf_emul_fault(uint64_t exitinfo1)
657 {
658 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
659 		return (false);
660 	}
661 
662 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
663 		return (false);
664 	}
665 
666 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
667 		return (false);
668 	}
669 
670 	return (true);
671 }
672 
673 static void
674 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
675     uint64_t gpa)
676 {
677 	struct vmcb_ctrl *ctrl;
678 	struct vmcb *vmcb;
679 	struct vie *vie;
680 	struct vm_guest_paging paging;
681 	struct vmcb_segment *seg;
682 	char *inst_bytes = NULL;
683 	uint8_t inst_len = 0;
684 
685 	vmcb = svm_get_vmcb(svm_sc, vcpu);
686 	ctrl = &vmcb->ctrl;
687 
688 	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
689 	vmexit->u.mmio_emul.gpa = gpa;
690 	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
691 	svm_paging_info(vmcb, &paging);
692 
693 	switch (paging.cpu_mode) {
694 	case CPU_MODE_REAL:
695 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
696 		vmexit->u.mmio_emul.cs_base = seg->base;
697 		vmexit->u.mmio_emul.cs_d = 0;
698 		break;
699 	case CPU_MODE_PROTECTED:
700 	case CPU_MODE_COMPATIBILITY:
701 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
702 		vmexit->u.mmio_emul.cs_base = seg->base;
703 
704 		/*
705 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
706 		 */
707 		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
708 		    1 : 0;
709 		break;
710 	default:
711 		vmexit->u.mmio_emul.cs_base = 0;
712 		vmexit->u.mmio_emul.cs_d = 0;
713 		break;
714 	}
715 
716 	/*
717 	 * Copy the instruction bytes into 'vie' if available.
718 	 */
719 	if (has_decode_assist() && !disable_npf_assist) {
720 		inst_len = ctrl->inst_len;
721 		inst_bytes = (char *)ctrl->inst_bytes;
722 	}
723 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
724 	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
725 }
726 
727 /*
728  * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
729  * which is live in the guest.  They are visible via the shadow instead.
730  */
731 #define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)
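/*
 * Added sketch (editor's illustration, not from the original source): if the
 * guest writes a value such as (0x80050033 | CR0_CD), svm_set_cr0() below
 * keeps the full value in sctx_cr0_shadow, loads only (val & SVM_CR0_MASK)
 * -- i.e. without CD -- into the VMCB, and enables the unconditional %cr0
 * read/write intercepts so the guest still observes the CD bit it set.
 */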
732 
733 static void
734 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
735 {
736 	struct vmcb_state *state;
737 	struct svm_regctx *regctx;
738 	uint64_t masked, old, diff;
739 
740 	state = svm_get_vmcb_state(svm_sc, vcpu);
741 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
742 
743 	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
744 	diff = old ^ val;
745 
746 	/* No further work needed if register contents remain the same */
747 	if (diff == 0) {
748 		return;
749 	}
750 
751 	/* Flush the TLB if the paging or write-protect bits are changing */
752 	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
753 		flush_asid(svm_sc, vcpu);
754 	}
755 
756 	/*
757 	 * If the change in %cr0 is due to a guest action (via interception)
758 	 * then other CPU state updates may be required.
759 	 */
760 	if (guest_write) {
761 		if ((diff & CR0_PG) != 0) {
762 			uint64_t efer = state->efer;
763 
764 			/* Keep the long-mode state in EFER in sync */
765 			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
766 				state->efer |= EFER_LMA;
767 			}
768 			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
769 				state->efer &= ~EFER_LMA;
770 			}
771 		}
772 	}
773 
774 	masked = val & SVM_CR0_MASK;
775 	regctx->sctx_cr0_shadow = val;
776 	state->cr0 = masked;
777 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
778 
779 	if ((masked ^ val) != 0) {
780 		/*
781 		 * The guest has set bits in %cr0 which we are masking out and
782 		 * exposing via shadow.
783 		 *
784 		 * We must intercept %cr0 reads in order to make the shadowed
785 		 * view available to the guest.
786 		 *
787 		 * Writes to %cr0 must also be intercepted (unconditionally,
788 		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
789 		 * if/when the guest clears those shadowed bits.
790 		 */
791 		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
792 		    BIT(0) | BIT(16));
793 	} else {
794 		/*
795 		 * When no bits remain in %cr0 which require shadowing, the
796 		 * unconditional intercept of reads/writes to %cr0 can be
797 		 * disabled.
798 		 *
799 		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
800 		 * in place so we can be notified of operations which change
801 		 * bits other than TS or MP.
802 		 */
803 		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
804 		    BIT(0) | BIT(16));
805 	}
806 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
807 }
808 
809 static void
810 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
811 {
812 	struct vmcb *vmcb;
813 	struct svm_regctx *regctx;
814 
815 	vmcb = svm_get_vmcb(svm_sc, vcpu);
816 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
817 
818 	/*
819 	 * Include the %cr0 bits which exist only in the shadow along with those
820 	 * in the running vCPU state.
821 	 */
822 	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
823 }
824 
825 static void
826 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
827 {
828 	uint64_t val;
829 	int err __maybe_unused;
830 
831 	svm_get_cr0(svm_sc, vcpu, &val);
832 	err = svm_setreg(svm_sc, vcpu, reg, val);
833 	ASSERT(err == 0);
834 }
835 
836 static void
837 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
838 {
839 	struct vmcb_state *state;
840 	uint64_t val;
841 	int err __maybe_unused;
842 
843 	state = svm_get_vmcb_state(svm_sc, vcpu);
844 
845 	err = svm_getreg(svm_sc, vcpu, reg, &val);
846 	ASSERT(err == 0);
847 
848 	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
849 		/* NW without CD is nonsensical */
850 		vm_inject_gp(svm_sc->vm, vcpu);
851 		return;
852 	}
853 	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
854 		/* PG requires PE */
855 		vm_inject_gp(svm_sc->vm, vcpu);
856 		return;
857 	}
858 	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
859 		/* When enabling paging, PAE must be enabled if LME is. */
860 		if ((state->efer & EFER_LME) != 0 &&
861 		    (state->cr4 & CR4_PAE) == 0) {
862 			vm_inject_gp(svm_sc->vm, vcpu);
863 			return;
864 		}
865 	}
866 
867 	svm_set_cr0(svm_sc, vcpu, val, true);
868 }
869 
870 static void
871 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
872 {
873 	struct vie *vie;
874 	struct vm_guest_paging paging;
875 
876 	/* Let the instruction emulation (hopefully in-kernel) handle it */
877 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
878 	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
879 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
880 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
881 	vie_init_other(vie, &paging);
882 
883 	/* The instruction emulation will handle advancing %rip */
884 	vmexit->inst_length = 0;
885 }
886 
887 static void
888 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
889 {
890 	struct vm *vm;
891 	struct vlapic *vlapic;
892 	struct vmcb_ctrl *ctrl;
893 
894 	vm = sc->vm;
895 	vlapic = vm_lapic(vm, vcpu);
896 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
897 
898 	/* Update %cr8 in the emulated vlapic */
899 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
900 
901 	/* Virtual interrupt injection is not used. */
902 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
903 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
904 }
905 
906 CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
907 CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
908 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
909 CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
910 CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
911 CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);
912 
913 /*
914  * Store SVM-specific event injection info for later handling.  This depends on
915  * the bhyve-internal event definitions matching those in the VMCB, as ensured
916  * by the above CTASSERTs.
917  */
918 static void
919 svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
920 {
921 	ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
922 
923 	/*
924 	 * If stashing a pending NMI injection, ensure that it bears the
925 	 * correct vector which exit_intinfo expects.
926 	 */
927 	if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
928 		intinfo &= ~VM_INTINFO_MASK_VECTOR;
929 		intinfo |= IDT_NMI;
930 	}
931 
932 	VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
933 }
934 
935 static void
936 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
937 {
938 	struct vmcb_ctrl *ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
939 	uint64_t intinfo = ctrl->exitintinfo;
940 
941 	if (VMCB_EXITINTINFO_VALID(intinfo)) {
942 		/*
943 		 * If a #VMEXIT happened during event delivery then record the
944 		 * event that was being delivered.
945 		 */
946 		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
947 
948 		svm_stash_intinfo(svm_sc, vcpu, intinfo);
949 	}
950 }
951 
952 static __inline int
953 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
954 {
955 
956 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
957 	    VMCB_INTCPT_VINTR));
958 }
959 
960 static void
961 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
962 {
963 	struct vmcb_ctrl *ctrl;
964 	struct vmcb_state *state;
965 
966 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
967 	state = svm_get_vmcb_state(sc, vcpu);
968 
969 	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
970 		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
971 		    ("%s: invalid v_ign_tpr", __func__));
972 		KASSERT(vintr_intercept_enabled(sc, vcpu),
973 		    ("%s: vintr intercept should be enabled", __func__));
974 		return;
975 	}
976 
977 	/*
978 	 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
979 	 * hypervisor as soon as a virtual interrupt can be delivered.
980 	 *
981 	 * Since injected events are not subject to intercept checks we need to
982 	 * ensure that the V_IRQ is not actually going to be delivered on VM
983 	 * entry.
984 	 */
985 	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
986 	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
987 
988 	ctrl->v_irq |= V_IRQ;
989 	ctrl->v_intr_prio |= V_IGN_TPR;
990 	ctrl->v_intr_vector = 0;
991 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
992 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
993 }
994 
995 static void
996 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
997 {
998 	struct vmcb_ctrl *ctrl;
999 
1000 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1001 
1002 	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
1003 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
1004 		    ("%s: vintr intercept should be disabled", __func__));
1005 		return;
1006 	}
1007 
1008 	ctrl->v_irq &= ~V_IRQ;
1009 	ctrl->v_intr_vector = 0;
1010 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1011 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1012 }
1013 
1014 /*
1015  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1016  * executes an IRET. The IRET intercept is enabled when an NMI is injected
1017  * to track when the vcpu is done handling the NMI.
1018  */
1019 static int
1020 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1021 {
1022 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1023 	    VMCB_INTCPT_IRET));
1024 }
1025 
1026 static void
1027 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1028 {
1029 	struct vmcb_ctrl *ctrl;
1030 
1031 	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1032 	/*
1033 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1034 	 * the "iret" when it runs next. However, it is possible to inject
1035 	 * another NMI into the vcpu before the "iret" has actually executed.
1036 	 *
1037 	 * For example, if the "iret" encounters a #NPF when accessing the stack,
1038 	 * it will trap back into the hypervisor. If an NMI is pending for
1039 	 * the vcpu, it will be injected into the guest.
1040 	 *
1041 	 * XXX this needs to be fixed
1042 	 */
1043 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1044 
1045 	/*
1046 	 * Set an interrupt shadow to prevent an NMI from being immediately
1047 	 * injected on the next VMRUN.
1048 	 */
1049 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1050 	ctrl->intr_shadow = 1;
1051 }
1052 
1053 static void
1054 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1055 {
1056 	ASSERT(VM_INTINFO_PENDING(info));
1057 
1058 	uint8_t vector = VM_INTINFO_VECTOR(info);
1059 	uint32_t type = VM_INTINFO_TYPE(info);
1060 
1061 	/*
1062 	 * Correct behavior depends on bhyve intinfo event types lining up with
1063 	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
1064 	 * above svm_save_exitintinfo() ensure it.
1065 	 */
1066 	switch (type) {
1067 	case VM_INTINFO_NMI:
1068 		/* Ensure vector for injected event matches its type (NMI) */
1069 		vector = IDT_NMI;
1070 		break;
1071 	case VM_INTINFO_HWINTR:
1072 	case VM_INTINFO_SWINTR:
1073 		break;
1074 	case VM_INTINFO_HWEXCP:
1075 		if (vector == IDT_NMI) {
1076 			/*
1077 			 * NMIs are expected to be injected with
1078 			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1079 			 * with the NMI vector.
1080 			 */
1081 			type = VM_INTINFO_NMI;
1082 		}
1083 		VERIFY(vector < 32);
1084 		break;
1085 	default:
1086 		/*
1087 		 * Since there is no strong validation of injected event types
1088 		 * at this point, fall back to software interrupt for those we
1089 		 * do not recognize.
1090 		 */
1091 		type = VM_INTINFO_SWINTR;
1092 		break;
1093 	}
1094 
1095 	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1096 	if (VM_INTINFO_HAS_ERRCODE(info)) {
1097 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1098 		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1099 	}
1100 }
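/*
 * Added example encoding (editor's illustration, assuming the usual VMCB
 * EVENTINJ layout: vector in bits 7:0, type in bits 10:8, error-code-valid
 * in bit 11, valid in bit 31, error code in bits 63:32): injecting a #GP
 * (vector 13) with an error code of 0 yields eventinj == 0x80000b0d.
 */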
1101 
1102 static void
1103 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1104 {
1105 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1106 
1107 	ASSERT(!svm_nmi_blocked(sc, vcpu));
1108 
1109 	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1110 	vm_nmi_clear(sc->vm, vcpu);
1111 
1112 	/*
1113 	 * Virtual NMI blocking is now in effect.
1114 	 *
1115 	 * Not only does this block a subsequent NMI injection from taking
1116 	 * place, it also configures an intercept on the IRET so we can track
1117 	 * when the next injection can take place.
1118 	 */
1119 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1120 }
1121 
1122 static void
1123 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1124 {
1125 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1126 
1127 	ASSERT(vector >= 0 && vector <= 255);
1128 
1129 	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1130 }
1131 
1132 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
1133 
1134 static vm_msr_result_t
1135 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1136 {
1137 	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1138 	uint64_t lma;
1139 	int error;
1140 
1141 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
1142 
1143 	if (newval & EFER_MBZ_BITS) {
1144 		return (VMR_GP);
1145 	}
1146 
1147 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1148 	const uint64_t changed = state->efer ^ newval;
1149 	if (changed & EFER_LME) {
1150 		if (state->cr0 & CR0_PG) {
1151 			return (VMR_GP);
1152 		}
1153 	}
1154 
1155 	/* EFER.LMA = EFER.LME & CR0.PG */
1156 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1157 		lma = EFER_LMA;
1158 	} else {
1159 		lma = 0;
1160 	}
1161 	if ((newval & EFER_LMA) != lma) {
1162 		return (VMR_GP);
1163 	}
1164 
1165 	if ((newval & EFER_NXE) != 0 &&
1166 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1167 		return (VMR_GP);
1168 	}
1169 	if ((newval & EFER_FFXSR) != 0 &&
1170 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1171 		return (VMR_GP);
1172 	}
1173 	if ((newval & EFER_TCE) != 0 &&
1174 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1175 		return (VMR_GP);
1176 	}
1177 
1178 	/*
1179 	 * Until bhyve has proper support for long-mode segment limits, just
1180 	 * toss a #GP at the guest if they attempt to use it.
1181 	 */
1182 	if (newval & EFER_LMSLE) {
1183 		return (VMR_GP);
1184 	}
1185 
1186 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1187 	VERIFY0(error);
1188 	return (VMR_OK);
1189 }
1190 
1191 static int
1192 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1193     bool is_wrmsr)
1194 {
1195 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1196 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1197 	const uint32_t ecx = ctx->sctx_rcx;
1198 	vm_msr_result_t res;
1199 	uint64_t val = 0;
1200 
1201 	if (is_wrmsr) {
1202 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1203 		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1204 
1205 		if (vlapic_owned_msr(ecx)) {
1206 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1207 
1208 			res = vlapic_wrmsr(vlapic, ecx, val);
1209 		} else if (ecx == MSR_EFER) {
1210 			res = svm_write_efer(svm_sc, vcpu, val);
1211 		} else if (svm_pmu_owned_msr(ecx)) {
1212 			res = svm_pmu_wrmsr(svm_sc, vcpu, ecx, val);
1213 		} else {
1214 			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1215 		}
1216 	} else {
1217 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1218 
1219 		if (vlapic_owned_msr(ecx)) {
1220 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1221 
1222 			res = vlapic_rdmsr(vlapic, ecx, &val);
1223 		} else if (svm_pmu_owned_msr(ecx)) {
1224 			res = svm_pmu_rdmsr(svm_sc, vcpu, ecx, &val);
1225 		} else {
1226 			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1227 		}
1228 	}
1229 
1230 	switch (res) {
1231 	case VMR_OK:
1232 		/* Store rdmsr result in the appropriate registers */
1233 		if (!is_wrmsr) {
1234 			state->rax = (uint32_t)val;
1235 			ctx->sctx_rdx = val >> 32;
1236 		}
1237 		return (1);
1238 	case VMR_GP:
1239 		vm_inject_gp(svm_sc->vm, vcpu);
1240 		return (1);
1241 	case VMR_UNHANLDED:
1242 		vmexit->exitcode = is_wrmsr ?
1243 		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1244 		vmexit->u.msr.code = ecx;
1245 		vmexit->u.msr.wval = val;
1246 		return (0);
1247 	default:
1248 		panic("unexpected msr result %u\n", res);
1249 	}
1250 }
1251 
1252 static void
1253 svm_handle_rdpmc(struct svm_softc *svm_sc, int vcpu)
1254 {
1255 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1256 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1257 	const uint32_t ecx = ctx->sctx_rcx;
1258 	uint64_t val = 0;
1259 
1260 	if (svm_pmu_rdpmc(svm_sc, vcpu, ecx, &val)) {
1261 		state->rax = (uint32_t)val;
1262 		ctx->sctx_rdx = val >> 32;
1263 	} else {
1264 		vm_inject_gp(svm_sc->vm, vcpu);
1265 	}
1266 }
1267 
1268 /*
1269  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1270  * that are due to instruction intercepts as well as MSR and IOIO intercepts
1271  * and exceptions caused by INT3, INTO and BOUND instructions.
1272  *
1273  * Return 1 if the nRIP is valid and 0 otherwise.
1274  */
1275 static int
1276 nrip_valid(uint64_t exitcode)
1277 {
1278 	switch (exitcode) {
1279 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1280 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1281 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1282 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1283 	case 0x43:		/* INT3 */
1284 	case 0x44:		/* INTO */
1285 	case 0x45:		/* BOUND */
1286 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1287 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1288 		return (1);
1289 	default:
1290 		return (0);
1291 	}
1292 }
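/*
 * Added note (editor's illustration): a CPUID intercept (exitcode 0x72) falls
 * in the 0x65 ... 0x7C range above, so svm_vmexit() can derive inst_length
 * from ctrl->nrip, while a nested page fault (VMCB_EXIT_NPF) does not and
 * leaves inst_length at 0.
 */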
1293 
1294 static int
1295 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1296 {
1297 	struct vmcb *vmcb;
1298 	struct vmcb_state *state;
1299 	struct vmcb_ctrl *ctrl;
1300 	struct svm_regctx *ctx;
1301 	uint64_t code, info1, info2;
1302 	int handled;
1303 
1304 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1305 	vmcb = svm_get_vmcb(svm_sc, vcpu);
1306 	state = &vmcb->state;
1307 	ctrl = &vmcb->ctrl;
1308 
1309 	handled = 0;
1310 	code = ctrl->exitcode;
1311 	info1 = ctrl->exitinfo1;
1312 	info2 = ctrl->exitinfo2;
1313 
1314 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1315 	vmexit->rip = state->rip;
1316 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1317 
1318 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1319 
1320 	/*
1321 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1322 	 * in an inconsistent state and can trigger assertions that would
1323 	 * never happen otherwise.
1324 	 */
1325 	if (code == VMCB_EXIT_INVALID) {
1326 		vm_exit_svm(vmexit, code, info1, info2);
1327 		return (0);
1328 	}
1329 
1330 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1331 	    "injection valid bit is set %lx", __func__, ctrl->eventinj));
1332 
1333 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1334 	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1335 	    vmexit->inst_length, code, info1, info2));
1336 
1337 	svm_update_virqinfo(svm_sc, vcpu);
1338 	svm_save_exitintinfo(svm_sc, vcpu);
1339 
1340 	switch (code) {
1341 	case VMCB_EXIT_CR0_READ:
1342 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1343 			svm_handle_cr0_read(svm_sc, vcpu,
1344 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1345 			handled = 1;
1346 		} else {
1347 			/*
1348 			 * If SMSW is used to read the contents of %cr0, then
1349 			 * the VALID bit will not be set in `info1`, since the
1350 			 * handling is different from the mov-to-reg case.
1351 			 *
1352 			 * Punt to the instruction emulation to handle it.
1353 			 */
1354 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1355 		}
1356 		break;
1357 	case VMCB_EXIT_CR0_WRITE:
1358 	case VMCB_EXIT_CR0_SEL_WRITE:
1359 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1360 			svm_handle_cr0_write(svm_sc, vcpu,
1361 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1362 			handled = 1;
1363 		} else {
1364 			/*
1365 			 * Writes to %cr0 without VALID being set in `info1` are
1366 			 * initiated by the LMSW and CLTS instructions.  While
1367 			 * LMSW (like SMSW) sees little use in modern OSes and
1368 			 * bootloaders, CLTS is still used for handling FPU
1369 			 * state transitions.
1370 			 *
1371 			 * Punt to the instruction emulation to handle them.
1372 			 */
1373 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1374 		}
1375 		break;
1376 	case VMCB_EXIT_IRET:
1377 		/*
1378 		 * Restart execution at "iret" but with the intercept cleared.
1379 		 */
1380 		vmexit->inst_length = 0;
1381 		svm_clear_nmi_blocking(svm_sc, vcpu);
1382 		handled = 1;
1383 		break;
1384 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1385 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1386 		svm_disable_intr_window_exiting(svm_sc, vcpu);
1387 		handled = 1;
1388 		break;
1389 	case VMCB_EXIT_INTR:	/* external interrupt */
1390 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1391 		handled = 1;
1392 		break;
1393 	case VMCB_EXIT_NMI:
1394 	case VMCB_EXIT_SMI:
1395 	case VMCB_EXIT_INIT:
1396 		/*
1397 		 * For external NMI/SMI and physical INIT interrupts, simply
1398 		 * continue execution, as those host events will be handled by
1399 		 * the physical CPU.
1400 		 */
1401 		handled = 1;
1402 		break;
1403 	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1404 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1405 
1406 		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1407 		uint32_t errcode = 0;
1408 		bool reflect = true;
1409 		bool errcode_valid = false;
1410 
1411 		switch (idtvec) {
1412 		case IDT_MC:
1413 			/* The host will handle the MCE itself. */
1414 			reflect = false;
1415 			vmm_call_trap(T_MCE);
1416 			break;
1417 		case IDT_PF:
1418 			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1419 			    info2));
1420 			/* fallthru */
1421 		case IDT_NP:
1422 		case IDT_SS:
1423 		case IDT_GP:
1424 		case IDT_AC:
1425 		case IDT_TS:
1426 			errcode_valid = true;
1427 			errcode = info1;
1428 			break;
1429 
1430 		case IDT_DF:
1431 			errcode_valid = true;
1432 			break;
1433 
1434 		case IDT_BP:
1435 		case IDT_OF:
1436 		case IDT_BR:
1437 			/*
1438 			 * The 'nrip' field is populated for INT3, INTO and
1439 			 * BOUND exceptions and this also implies that
1440 			 * 'inst_length' is non-zero.
1441 			 *
1442 			 * Reset 'inst_length' to zero so the guest %rip at
1443 			 * event injection is identical to what it was when
1444 			 * the exception originally happened.
1445 			 */
1446 			vmexit->inst_length = 0;
1447 			/* fallthru */
1448 		default:
1449 			errcode_valid = false;
1450 			break;
1451 		}
1452 		VERIFY0(vmexit->inst_length);
1453 
1454 		if (reflect) {
1455 			/* Reflect the exception back into the guest */
1456 			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1457 			    errcode_valid, errcode, false));
1458 		}
1459 		handled = 1;
1460 		break;
1461 		}
1462 	case VMCB_EXIT_MSR:
1463 		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1464 		break;
1465 	case VMCB_EXIT_RDPMC:
1466 		svm_handle_rdpmc(svm_sc, vcpu);
1467 		handled = 1;
1468 		break;
1469 	case VMCB_EXIT_IO:
1470 		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1471 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1472 		break;
1473 	case VMCB_EXIT_SHUTDOWN:
1474 		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT, vcpu);
1475 		handled = 1;
1476 		break;
1477 	case VMCB_EXIT_INVLPGA:
1478 		/* privileged invalidation instructions */
1479 		vm_inject_ud(svm_sc->vm, vcpu);
1480 		handled = 1;
1481 		break;
1482 	case VMCB_EXIT_VMRUN:
1483 	case VMCB_EXIT_VMLOAD:
1484 	case VMCB_EXIT_VMSAVE:
1485 	case VMCB_EXIT_STGI:
1486 	case VMCB_EXIT_CLGI:
1487 	case VMCB_EXIT_SKINIT:
1488 		/* privileged vmm instructions */
1489 		vm_inject_ud(svm_sc->vm, vcpu);
1490 		handled = 1;
1491 		break;
1492 	case VMCB_EXIT_INVD:
1493 	case VMCB_EXIT_WBINVD:
1494 		/* ignore exit */
1495 		handled = 1;
1496 		break;
1497 	case VMCB_EXIT_VMMCALL:
1498 		/* No handlers make use of VMMCALL for now */
1499 		vm_inject_ud(svm_sc->vm, vcpu);
1500 		handled = 1;
1501 		break;
1502 	case VMCB_EXIT_CPUID:
1503 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1504 		vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1505 		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1506 		handled = 1;
1507 		break;
1508 	case VMCB_EXIT_HLT:
1509 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1510 		vmexit->exitcode = VM_EXITCODE_HLT;
1511 		vmexit->u.hlt.rflags = state->rflags;
1512 		break;
1513 	case VMCB_EXIT_PAUSE:
1514 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1515 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1516 		break;
1517 	case VMCB_EXIT_NPF:
1518 		/* EXITINFO2 contains the faulting guest physical address */
1519 		if (info1 & VMCB_NPF_INFO1_RSV) {
1520 			/* nested fault with reserved bits set */
1521 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1522 			vmexit->exitcode = VM_EXITCODE_PAGING;
1523 			vmexit->u.paging.gpa = info2;
1524 			vmexit->u.paging.fault_type = npf_fault_type(info1);
1525 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1526 		} else if (svm_npf_emul_fault(info1)) {
1527 			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1528 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1529 		}
1530 		break;
1531 	case VMCB_EXIT_MONITOR:
1532 		vmexit->exitcode = VM_EXITCODE_MONITOR;
1533 		break;
1534 	case VMCB_EXIT_MWAIT:
1535 		vmexit->exitcode = VM_EXITCODE_MWAIT;
1536 		break;
1537 	default:
1538 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1539 		break;
1540 	}
1541 
1542 	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1543 	    code);
1544 
1545 	if (handled) {
1546 		vmexit->rip += vmexit->inst_length;
1547 		vmexit->inst_length = 0;
1548 		state->rip = vmexit->rip;
1549 	} else {
1550 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1551 			/*
1552 			 * If this VM exit was not claimed by anybody then
1553 			 * treat it as a generic SVM exit.
1554 			 */
1555 			vm_exit_svm(vmexit, code, info1, info2);
1556 		} else {
1557 			/*
1558 			 * The exitcode and collateral have been populated.
1559 			 * The VM exit will be processed further in userland.
1560 			 */
1561 		}
1562 	}
1563 	return (handled);
1564 }
1565 
1566 /*
1567  * Inject exceptions, NMIs, and ExtINTs.
1568  *
1569  * The logic behind these is complicated and may involve mutex contention, so
1570  * the injection is performed without the protection of host CPU interrupts
1571  * being disabled.  This means a racing notification could be "lost",
1572  * necessitating a later call to svm_inject_recheck() to close that window
1573  * of opportunity.
1574  */
1575 static enum event_inject_state
1576 svm_inject_events(struct svm_softc *sc, int vcpu)
1577 {
1578 	struct vmcb_ctrl *ctrl;
1579 	struct vmcb_state *state;
1580 	struct svm_vcpu *vcpustate;
1581 	uint64_t intinfo;
1582 	enum event_inject_state ev_state;
1583 
1584 	state = svm_get_vmcb_state(sc, vcpu);
1585 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1586 	vcpustate = svm_get_vcpu(sc, vcpu);
1587 	ev_state = EIS_CAN_INJECT;
1588 
1589 	/* Clear any interrupt shadow if guest %rip has changed */
1590 	if (vcpustate->nextrip != state->rip) {
1591 		ctrl->intr_shadow = 0;
1592 	}
1593 
1594 	/*
1595 	 * An event is already pending for injection.  This can occur when the
1596 	 * vCPU exits prior to VM entry (like for an AST).
1597 	 */
1598 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1599 		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1600 	}
1601 
1602 	/*
1603 	 * Inject pending events or exceptions for this vcpu.
1604 	 *
1605 	 * An event might be pending because the previous #VMEXIT happened
1606 	 * during event delivery (i.e. ctrl->exitintinfo).
1607 	 *
1608 	 * An event might also be pending because an exception was injected
1609 	 * by the hypervisor (e.g. #PF during instruction emulation).
1610 	 */
1611 	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1612 		svm_inject_event(ctrl, intinfo);
1613 		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1614 		ev_state = EIS_EV_INJECTED;
1615 	}
1616 
1617 	/* NMI event has priority over interrupts. */
1618 	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1619 		if (ev_state == EIS_CAN_INJECT) {
1620 			/* Can't inject NMI if vcpu is in an intr_shadow. */
1621 			if (ctrl->intr_shadow) {
1622 				return (EIS_GI_BLOCK);
1623 			}
1624 
1625 			svm_inject_nmi(sc, vcpu);
1626 			ev_state = EIS_EV_INJECTED;
1627 		} else {
1628 			return (ev_state | EIS_REQ_EXIT);
1629 		}
1630 	}
1631 
1632 	if (vm_extint_pending(sc->vm, vcpu)) {
1633 		int vector;
1634 
1635 		if (ev_state != EIS_CAN_INJECT) {
1636 			return (ev_state | EIS_REQ_EXIT);
1637 		}
1638 
1639 		/*
1640 		 * If the guest has disabled interrupts or is in an interrupt
1641 		 * shadow then we cannot inject the pending interrupt.
1642 		 */
1643 		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1644 			return (EIS_GI_BLOCK);
1645 		}
1646 
1647 		/* Ask the legacy pic for a vector to inject */
1648 		vatpic_pending_intr(sc->vm, &vector);
1649 		KASSERT(vector >= 0 && vector <= 255,
1650 		    ("invalid vector %d from INTR", vector));
1651 
1652 		svm_inject_irq(sc, vcpu, vector);
1653 		vm_extint_clear(sc->vm, vcpu);
1654 		vatpic_intr_accepted(sc->vm, vector);
1655 		ev_state = EIS_EV_INJECTED;
1656 	}
1657 
1658 	return (ev_state);
1659 }
1660 
1661 /*
1662  * Synchronize vLAPIC state and inject any interrupts pending on it.
1663  *
1664  * This is done with host CPU interrupts disabled so notification IPIs will be
1665  * queued on the host APIC and recognized when entering SVM guest context.
1666  */
1667 static enum event_inject_state
1668 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1669     enum event_inject_state ev_state)
1670 {
1671 	struct vmcb_ctrl *ctrl;
1672 	struct vmcb_state *state;
1673 	int vector;
1674 	uint8_t v_tpr;
1675 
1676 	state = svm_get_vmcb_state(sc, vcpu);
1677 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1678 
1679 	/*
1680 	 * The guest can modify the TPR by writing to %cr8. In guest mode the
1681 	 * CPU reflects this write to V_TPR without hypervisor intervention.
1682 	 *
1683 	 * The guest can also modify the TPR by writing to it via the memory
1684 	 * mapped APIC page. In this case, the write will be emulated by the
1685 	 * hypervisor. For this reason V_TPR must be updated before every
1686 	 * VMRUN.
1687 	 */
1688 	v_tpr = vlapic_get_cr8(vlapic);
1689 	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1690 	if (ctrl->v_tpr != v_tpr) {
1691 		ctrl->v_tpr = v_tpr;
1692 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1693 	}
1694 
1695 	/* If an event cannot otherwise be injected, we are done for now */
1696 	if (ev_state != EIS_CAN_INJECT) {
1697 		return (ev_state);
1698 	}
1699 
1700 	if (!vlapic_pending_intr(vlapic, &vector)) {
1701 		return (EIS_CAN_INJECT);
1702 	}
1703 	KASSERT(vector >= 16 && vector <= 255,
1704 	    ("invalid vector %d from local APIC", vector));
1705 
1706 	/*
1707 	 * If the guest has disabled interrupts or is in an interrupt shadow
1708 	 * then we cannot inject the pending interrupt.
1709 	 */
1710 	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1711 		return (EIS_GI_BLOCK);
1712 	}
1713 
1714 	svm_inject_irq(sc, vcpu, vector);
1715 	vlapic_intr_accepted(vlapic, vector);
1716 	return (EIS_EV_INJECTED);
1717 }
1718 
1719 /*
1720  * Re-check for events to be injected.
1721  *
1722  * Once host CPU interrupts are disabled, check for the presence of any events
1723  * which require injection processing.  If an exit is required upon injection,
1724  * or once the guest becomes interruptible, that will be configured too.
1725  */
1726 static bool
1727 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1728     enum event_inject_state ev_state)
1729 {
1730 	struct vmcb_ctrl *ctrl;
1731 
1732 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1733 
1734 	if (ev_state == EIS_CAN_INJECT) {
1735 		/*
1736 		 * An active interrupt shadow would preclude us from injecting
1737 		 * any events picked up during a re-check.
1738 		 */
1739 		if (ctrl->intr_shadow != 0) {
1740 			return (false);
1741 		}
1742 
1743 		if (vm_nmi_pending(sc->vm, vcpu) &&
1744 		    !svm_nmi_blocked(sc, vcpu)) {
1745 			/* queued NMI not blocked by NMI-window-exiting */
1746 			return (true);
1747 		}
1748 		if (vm_extint_pending(sc->vm, vcpu)) {
1749 			/* queued ExtINT not blocked by existing injection */
1750 			return (true);
1751 		}
1752 	} else {
1753 		if ((ev_state & EIS_REQ_EXIT) != 0) {
1754 			/*
1755 			 * Use a self-IPI to force an immediate exit after
1756 			 * event injection has occurred.
1757 			 */
1758 			poke_cpu(CPU->cpu_id);
1759 		} else {
1760 			/*
1761 			 * If any event is being injected, an exit immediately
1762 			 * upon becoming interruptable again will allow pending
1763 			 * or newly queued events to be injected in a timely
1764 			 * manner.
1765 			 */
1766 			svm_enable_intr_window_exiting(sc, vcpu);
1767 		}
1768 	}
1769 	return (false);
1770 }
1771 
1772 
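/*
 * Refresh the ASID for a vCPU prior to VMRUN on the current host CPU.  The
 * HMA ASID logic is asked for a TLB flush whenever the nested page table
 * generation (nptgen) has changed, and the VMCB ASID field is marked dirty
 * whenever a flush (and thus a potentially new ASID) is required.
 */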
1773 static void
1774 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1775 {
1776 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1777 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1778 	uint8_t flush;
1779 
1780 	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1781 	    vcpustate->nptgen != nptgen);
1782 
1783 	if (flush != VMCB_TLB_FLUSH_NOTHING) {
1784 		ctrl->asid = vcpustate->hma_asid.hsa_asid;
1785 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1786 	}
1787 	ctrl->tlb_ctrl = flush;
1788 	vcpustate->nptgen = nptgen;
1789 }
1790 
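/*
 * Unconditionally request an ASID update (and thus a TLB flush) for a vCPU,
 * independent of any nested page table generation change.
 */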
1791 static void
1792 flush_asid(struct svm_softc *sc, int vcpuid)
1793 {
1794 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1795 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1796 	uint8_t flush;
1797 
1798 	/* HMA ASID updates are expected to be done with interrupts disabled */
1799 	const ulong_t iflag = intr_clear();
1800 	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1801 	    true);
1802 	intr_restore(iflag);
1803 
1804 	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1805 	ctrl->asid = vcpustate->hma_asid.hsa_asid;
1806 	ctrl->tlb_ctrl = flush;
1807 	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1808 	/*
1809 	 * A potential future optimization: We could choose to update the nptgen
1810 	 * associated with the vCPU, since any pending nptgen change requiring a
1811 	 * flush will be satisfied by the one which has just now been queued.
1812 	 */
1813 }
1814 
1815 static __inline void
1816 svm_dr_enter_guest(struct svm_regctx *gctx)
1817 {
1818 
1819 	/* Save host control debug registers. */
1820 	gctx->host_dr7 = rdr7();
1821 	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1822 
1823 	/*
1824 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1825 	 * exceptions in the host based on the guest DRx values.  The
1826 	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1827 	 * VMCB.
1828 	 */
1829 	load_dr7(0);
1830 	wrmsr(MSR_DEBUGCTLMSR, 0);
1831 
1832 	/* Save host debug registers. */
1833 	gctx->host_dr0 = rdr0();
1834 	gctx->host_dr1 = rdr1();
1835 	gctx->host_dr2 = rdr2();
1836 	gctx->host_dr3 = rdr3();
1837 	gctx->host_dr6 = rdr6();
1838 
1839 	/* Restore guest debug registers. */
1840 	load_dr0(gctx->sctx_dr0);
1841 	load_dr1(gctx->sctx_dr1);
1842 	load_dr2(gctx->sctx_dr2);
1843 	load_dr3(gctx->sctx_dr3);
1844 }
1845 
1846 static __inline void
1847 svm_dr_leave_guest(struct svm_regctx *gctx)
1848 {
1849 
1850 	/* Save guest debug registers. */
1851 	gctx->sctx_dr0 = rdr0();
1852 	gctx->sctx_dr1 = rdr1();
1853 	gctx->sctx_dr2 = rdr2();
1854 	gctx->sctx_dr3 = rdr3();
1855 
1856 	/*
1857 	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
1858 	 * last.
1859 	 */
1860 	load_dr0(gctx->host_dr0);
1861 	load_dr1(gctx->host_dr1);
1862 	load_dr2(gctx->host_dr2);
1863 	load_dr3(gctx->host_dr3);
1864 	load_dr6(gctx->host_dr6);
1865 	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1866 	load_dr7(gctx->host_dr7);
1867 }
1868 
1869 /*
1870  * Apply the TSC offset for a vCPU, including physical CPU and per-vCPU offsets.
1871  */
1872 static void
1873 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1874 {
1875 	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1876 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1877 
1878 	if (ctrl->tsc_offset != offset) {
1879 		ctrl->tsc_offset = offset;
1880 		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1881 	}
1882 }
1883 
1884 /*
1885  * Start vcpu with specified RIP.
1886  */
1887 static int
1888 svm_vmrun(void *arg, int vcpu, uint64_t rip)
1889 {
1890 	struct svm_regctx *gctx;
1891 	struct svm_softc *svm_sc;
1892 	struct svm_vcpu *vcpustate;
1893 	struct vmcb_state *state;
1894 	struct vm_exit *vmexit;
1895 	struct vlapic *vlapic;
1896 	vm_client_t *vmc;
1897 	struct vm *vm;
1898 	uint64_t vmcb_pa;
1899 	int handled;
1900 	uint16_t ldt_sel;
1901 
1902 	svm_sc = arg;
1903 	vm = svm_sc->vm;
1904 
1905 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1906 	state = svm_get_vmcb_state(svm_sc, vcpu);
1907 	vmexit = vm_exitinfo(vm, vcpu);
1908 	vlapic = vm_lapic(vm, vcpu);
1909 	vmc = vm_get_vmclient(vm, vcpu);
1910 
1911 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1912 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1913 
1914 	if (vcpustate->lastcpu != curcpu) {
1915 		/*
1916 		 * Force new ASID allocation by invalidating the generation.
1917 		 */
1918 		vcpustate->hma_asid.hsa_gen = 0;
1919 
1920 		/*
1921 		 * Invalidate the VMCB state cache by marking all fields dirty.
1922 		 */
1923 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1924 
1925 		/*
1926 		 * XXX
1927 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1928 		 * we may return from this function without actually executing
1929 		 * the VMRUN instruction. This could happen if an AST or yield
1930 		 * condition is pending on the first time through the loop.
1931 		 *
1932 		 * This works for now but any new side-effects of vcpu
1933 		 * migration should take this case into account.
1934 		 */
1935 		vcpustate->lastcpu = curcpu;
1936 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1937 	}
1938 
1939 	svm_apply_tsc_adjust(svm_sc, vcpu);
1940 
1941 	svm_msr_guest_enter(svm_sc, vcpu);
1942 
1943 	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1944 	vcpustate->loaded = B_TRUE;
1945 
1946 	/* Update Guest RIP */
1947 	state->rip = rip;
1948 
1949 	do {
1950 		enum event_inject_state inject_state;
1951 		uint64_t nptgen;
1952 
1953 		/*
1954 		 * Initial event injection is complex and may involve mutex
1955 		 * contention, so it must be performed with global interrupts
1956 		 * still enabled.
1957 		 */
1958 		inject_state = svm_inject_events(svm_sc, vcpu);
1959 		handled = 0;
1960 
1961 		/*
1962 		 * Disable interrupts while loading VM state and performing
1963 		 * event injection.
1964 		 */
1965 		const ulong_t iflag = intr_clear();
1966 
1967 		/*
1968 		 * Synchronizing and injecting vlapic state is lock-free and is
1969 		 * safe (and prudent) to perform with interrupts disabled.
1970 		 */
1971 		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
1972 		    inject_state);
1973 
1974 		/*
1975 		 * Check for vCPU bail-out conditions.  This must be done after
1976 		 * svm_inject_events() to detect a triple-fault condition.
1977 		 */
1978 		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
1979 			intr_restore(iflag);
1980 			break;
1981 		}
1982 
1983 		if (vcpu_run_state_pending(vm, vcpu)) {
1984 			intr_restore(iflag);
1985 			vm_exit_run_state(vm, vcpu, state->rip);
1986 			break;
1987 		}
1988 
1989 		/*
1990 		 * If subsequent activity queued events which require injection
1991 		 * handling, take another lap to handle them.
1992 		 */
1993 		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
1994 			intr_restore(iflag);
1995 			handled = 1;
1996 			continue;
1997 		}
1998 
1999 		/*
2000 		 * #VMEXIT resumes the host with the guest LDTR, so
2001 		 * save the current LDT selector so it can be restored
2002 		 * after an exit.  The userspace hypervisor probably
2003 		 * doesn't use an LDT, but save and restore it to be
2004 		 * safe.
2005 		 */
2006 		ldt_sel = sldt();
2007 
2008 		/*
2009 		 * Check the vmspace and ASID generations to ensure that the
2010 		 * vcpu does not use stale TLB mappings.
2011 		 */
2012 		nptgen = vmc_table_enter(vmc);
2013 		check_asid(svm_sc, vcpu, curcpu, nptgen);
2014 
2015 		svm_pmu_enter(svm_sc, vcpu);
2016 		vcpu_ustate_change(vm, vcpu, VU_RUN);
2017 		svm_dr_enter_guest(gctx);
2018 		svm_apply_dirty(svm_sc, vcpu);
2019 
2020 		/*
2021 		 * Perform VMRUN to enter guest context.
2022 		 *
2023 		 * This is done with the protection of clearing the GIF
2024 		 * (global interrupt flag) as required by SVM.
2025 		 */
2026 		hma_svm_gif_disable();
2027 		svm_launch(vmcb_pa, gctx, get_pcpu());
2028 		hma_svm_gif_enable();
2029 
2030 		svm_dr_leave_guest(gctx);
2031 		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2032 		svm_pmu_exit(svm_sc, vcpu);
2033 
2034 		/* Restore host LDTR. */
2035 		lldt(ldt_sel);
2036 
2037 		/*
2038 		 * Re-enable interrupts now that necessary CPU state has been
2039 		 * restored.  Subsequent logic may need to block.
2040 		 */
2041 		intr_restore(iflag);
2042 
2043 		vmc_table_exit(vmc);
2044 
2045 		/* Update 'nextrip' */
2046 		vcpustate->nextrip = state->rip;
2047 
2048 		/* Handle #VMEXIT and if required return to user space. */
2049 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
2050 	} while (handled);
2051 
2052 	svm_msr_guest_exit(svm_sc, vcpu);
2053 
2054 	ASSERT(interrupts_enabled());
2055 	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2056 	vcpustate->loaded = B_FALSE;
2057 
2058 	return (0);
2059 }
2060 
2061 static void
2062 svm_vmcleanup(void *arg)
2063 {
2064 	struct svm_softc *sc = arg;
2065 
2066 	vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
2067 	vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
2068 	kmem_free(sc, sizeof (*sc));
2069 }
2070 
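/*
 * Map a VM_REG_GUEST_* identifier to its backing field in the software
 * register context.  Only registers which are not held in the VMCB (the
 * general-purpose registers other than %rax/%rsp, plus DR0-DR3) live here;
 * NULL is returned for everything else so callers fall back to the VMCB
 * accessors.
 */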
2071 static uint64_t *
2072 swctx_regptr(struct svm_regctx *regctx, int reg)
2073 {
2074 	switch (reg) {
2075 	case VM_REG_GUEST_RBX:
2076 		return (&regctx->sctx_rbx);
2077 	case VM_REG_GUEST_RCX:
2078 		return (&regctx->sctx_rcx);
2079 	case VM_REG_GUEST_RDX:
2080 		return (&regctx->sctx_rdx);
2081 	case VM_REG_GUEST_RDI:
2082 		return (&regctx->sctx_rdi);
2083 	case VM_REG_GUEST_RSI:
2084 		return (&regctx->sctx_rsi);
2085 	case VM_REG_GUEST_RBP:
2086 		return (&regctx->sctx_rbp);
2087 	case VM_REG_GUEST_R8:
2088 		return (&regctx->sctx_r8);
2089 	case VM_REG_GUEST_R9:
2090 		return (&regctx->sctx_r9);
2091 	case VM_REG_GUEST_R10:
2092 		return (&regctx->sctx_r10);
2093 	case VM_REG_GUEST_R11:
2094 		return (&regctx->sctx_r11);
2095 	case VM_REG_GUEST_R12:
2096 		return (&regctx->sctx_r12);
2097 	case VM_REG_GUEST_R13:
2098 		return (&regctx->sctx_r13);
2099 	case VM_REG_GUEST_R14:
2100 		return (&regctx->sctx_r14);
2101 	case VM_REG_GUEST_R15:
2102 		return (&regctx->sctx_r15);
2103 	case VM_REG_GUEST_DR0:
2104 		return (&regctx->sctx_dr0);
2105 	case VM_REG_GUEST_DR1:
2106 		return (&regctx->sctx_dr1);
2107 	case VM_REG_GUEST_DR2:
2108 		return (&regctx->sctx_dr2);
2109 	case VM_REG_GUEST_DR3:
2110 		return (&regctx->sctx_dr3);
2111 	default:
2112 		return (NULL);
2113 	}
2114 }
2115 
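/*
 * Fetch a guest register value, consulting the software register context
 * first and falling back to the relevant VMCB field or segment selector.
 */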
2116 static int
2117 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2118 {
2119 	struct svm_softc *sc;
2120 	struct vmcb *vmcb;
2121 	uint64_t *regp;
2122 	uint64_t *fieldp;
2123 	struct vmcb_segment *seg;
2124 
2125 	sc = arg;
2126 	vmcb = svm_get_vmcb(sc, vcpu);
2127 
2128 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2129 	if (regp != NULL) {
2130 		*val = *regp;
2131 		return (0);
2132 	}
2133 
2134 	switch (ident) {
2135 	case VM_REG_GUEST_INTR_SHADOW:
2136 		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2137 		break;
2138 
2139 	case VM_REG_GUEST_CR0:
2140 		svm_get_cr0(sc, vcpu, val);
2141 		break;
2142 	case VM_REG_GUEST_CR2:
2143 	case VM_REG_GUEST_CR3:
2144 	case VM_REG_GUEST_CR4:
2145 	case VM_REG_GUEST_DR6:
2146 	case VM_REG_GUEST_DR7:
2147 	case VM_REG_GUEST_EFER:
2148 	case VM_REG_GUEST_RAX:
2149 	case VM_REG_GUEST_RFLAGS:
2150 	case VM_REG_GUEST_RIP:
2151 	case VM_REG_GUEST_RSP:
2152 		fieldp = vmcb_regptr(vmcb, ident, NULL);
2153 		*val = *fieldp;
2154 		break;
2155 
2156 	case VM_REG_GUEST_CS:
2157 	case VM_REG_GUEST_DS:
2158 	case VM_REG_GUEST_ES:
2159 	case VM_REG_GUEST_FS:
2160 	case VM_REG_GUEST_GS:
2161 	case VM_REG_GUEST_SS:
2162 	case VM_REG_GUEST_LDTR:
2163 	case VM_REG_GUEST_TR:
2164 		seg = vmcb_segptr(vmcb, ident);
2165 		*val = seg->selector;
2166 		break;
2167 
2168 	case VM_REG_GUEST_GDTR:
2169 	case VM_REG_GUEST_IDTR:
2170 		/* GDTR and IDTR don't have segment selectors */
2171 		return (EINVAL);
2172 
2173 	case VM_REG_GUEST_PDPTE0:
2174 	case VM_REG_GUEST_PDPTE1:
2175 	case VM_REG_GUEST_PDPTE2:
2176 	case VM_REG_GUEST_PDPTE3:
2177 		/*
2178 		 * Unlike VMX, where the PDPTEs are explicitly cached as part of
2179 		 * several well-defined events related to paging (such as
2180 		 * loading %cr3), SVM walks the PDPEs (AMD's term for the PDPTE) as part of
2181 		 * nested paging lookups.  This makes these registers
2182 		 * effectively irrelevant on SVM.
2183 		 *
2184 		 * Rather than tossing an error, emit zeroed values so casual
2185 		 * consumers do not need to be as careful about that difference.
2186 		 */
2187 		*val = 0;
2188 		break;
2189 
2190 	default:
2191 		return (EINVAL);
2192 	}
2193 
2194 	return (0);
2195 }
2196 
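/*
 * Store a guest register value.  Writes which land in VMCB state also mark
 * the corresponding VMCB cache section dirty so the CPU reloads it on the
 * next VMRUN.
 */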
2197 static int
2198 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2199 {
2200 	struct svm_softc *sc;
2201 	struct vmcb *vmcb;
2202 	uint64_t *regp;
2203 	uint64_t *fieldp;
2204 	uint32_t dirty;
2205 	struct vmcb_segment *seg;
2206 
2207 	sc = arg;
2208 	vmcb = svm_get_vmcb(sc, vcpu);
2209 
2210 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2211 	if (regp != NULL) {
2212 		*regp = val;
2213 		return (0);
2214 	}
2215 
2216 	dirty = VMCB_CACHE_NONE;
2217 	switch (ident) {
2218 	case VM_REG_GUEST_INTR_SHADOW:
2219 		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2220 		break;
2221 
2222 	case VM_REG_GUEST_EFER:
2223 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2224 		/* EFER_SVM must always be set when the guest is executing */
2225 		*fieldp = val | EFER_SVM;
2226 		dirty |= VMCB_CACHE_CR;
2227 		break;
2228 
2229 	case VM_REG_GUEST_CR0:
2230 		svm_set_cr0(sc, vcpu, val, false);
2231 		break;
2232 	case VM_REG_GUEST_CR2:
2233 	case VM_REG_GUEST_CR3:
2234 	case VM_REG_GUEST_CR4:
2235 	case VM_REG_GUEST_DR6:
2236 	case VM_REG_GUEST_DR7:
2237 	case VM_REG_GUEST_RAX:
2238 	case VM_REG_GUEST_RFLAGS:
2239 	case VM_REG_GUEST_RIP:
2240 	case VM_REG_GUEST_RSP:
2241 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2242 		*fieldp = val;
2243 		break;
2244 
2245 	case VM_REG_GUEST_CS:
2246 	case VM_REG_GUEST_DS:
2247 	case VM_REG_GUEST_ES:
2248 	case VM_REG_GUEST_SS:
2249 	case VM_REG_GUEST_FS:
2250 	case VM_REG_GUEST_GS:
2251 	case VM_REG_GUEST_LDTR:
2252 	case VM_REG_GUEST_TR:
2253 		dirty |= VMCB_CACHE_SEG;
2254 		seg = vmcb_segptr(vmcb, ident);
2255 		seg->selector = (uint16_t)val;
2256 		break;
2257 
2258 	case VM_REG_GUEST_GDTR:
2259 	case VM_REG_GUEST_IDTR:
2260 		/* GDTR and IDTR don't have segment selectors */
2261 		return (EINVAL);
2262 
2263 	case VM_REG_GUEST_PDPTE0:
2264 	case VM_REG_GUEST_PDPTE1:
2265 	case VM_REG_GUEST_PDPTE2:
2266 	case VM_REG_GUEST_PDPTE3:
2267 		/*
2268 		 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can
2269 		 * ignore attempts to set them.  See handler in svm_getreg() for
2270 		 * more details.
2271 		 */
2272 		break;
2273 
2274 	default:
2275 		return (EINVAL);
2276 	}
2277 
2278 	if (dirty != VMCB_CACHE_NONE) {
2279 		svm_set_dirty(sc, vcpu, dirty);
2280 	}
2281 
2282 	/*
2283 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2284 	 * vcpu's ASID. This needs to be treated differently depending on
2285 	 * whether 'running' is true/false.
2286 	 */
2287 
2288 	return (0);
2289 }
2290 
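/*
 * Update a guest segment (or descriptor table) register from the
 * processor-independent seg_desc format, translating the access bits into
 * VMCB attribute form.
 */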
2291 static int
2292 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2293 {
2294 	struct vmcb *vmcb;
2295 	struct svm_softc *sc;
2296 	struct vmcb_segment *seg;
2297 
2298 	sc = arg;
2299 	vmcb = svm_get_vmcb(sc, vcpu);
2300 
2301 	switch (reg) {
2302 	case VM_REG_GUEST_CS:
2303 	case VM_REG_GUEST_DS:
2304 	case VM_REG_GUEST_ES:
2305 	case VM_REG_GUEST_SS:
2306 	case VM_REG_GUEST_FS:
2307 	case VM_REG_GUEST_GS:
2308 	case VM_REG_GUEST_LDTR:
2309 	case VM_REG_GUEST_TR:
2310 		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2311 		seg = vmcb_segptr(vmcb, reg);
2312 		/*
2313 		 * Map seg_desc access to VMCB attribute format.
2314 		 *
2315 		 * SVM uses the 'P' bit in the segment attributes to indicate a
2316 		 * NULL segment so clear it if the segment is marked unusable.
2317 		 */
2318 		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2319 		if (SEG_DESC_UNUSABLE(desc->access)) {
2320 			seg->attrib &= ~0x80;
2321 		}
2322 		/*
2323 		 * Keep CPL synced with the DPL specified for %ss.
2324 		 *
2325 		 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD
2326 		 * (unlike Intel), but accepts such a possible deviation for
2327 		 * what is otherwise unreasonable behavior for a guest OS, since
2328 		 * they do the same synchronization.
2329 		 */
2330 		if (reg == VM_REG_GUEST_SS) {
2331 			vmcb->state.cpl = SEG_DESC_DPL(desc->access);
2332 		}
2333 		break;
2334 
2335 	case VM_REG_GUEST_GDTR:
2336 	case VM_REG_GUEST_IDTR:
2337 		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2338 		seg = vmcb_segptr(vmcb, reg);
2339 		break;
2340 
2341 	default:
2342 		return (EINVAL);
2343 	}
2344 
2345 	ASSERT(seg != NULL);
2346 	seg->base = desc->base;
2347 	seg->limit = desc->limit;
2348 
2349 	return (0);
2350 }
2351 
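/*
 * Read a guest segment (or descriptor table) register back out into the
 * processor-independent seg_desc format.
 */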
2352 static int
2353 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2354 {
2355 	struct vmcb *vmcb;
2356 	struct svm_softc *sc;
2357 	struct vmcb_segment *seg;
2358 
2359 	sc = arg;
2360 	vmcb = svm_get_vmcb(sc, vcpu);
2361 
2362 	switch (reg) {
2363 	case VM_REG_GUEST_DS:
2364 	case VM_REG_GUEST_ES:
2365 	case VM_REG_GUEST_FS:
2366 	case VM_REG_GUEST_GS:
2367 	case VM_REG_GUEST_SS:
2368 	case VM_REG_GUEST_LDTR:
2369 		seg = vmcb_segptr(vmcb, reg);
2370 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2371 		/*
2372 		 * VT-x uses bit 16 to indicate a segment that has been loaded
2373 		 * with a NULL selector (aka unusable). The 'desc->access'
2374 		 * field is interpreted in the VT-x format by the
2375 		 * processor-independent code.
2376 		 *
2377 		 * SVM uses the 'P' bit to convey the same information so
2378 		 * convert it into the VT-x format. For more details refer to
2379 		 * section "Segment State in the VMCB" in APMv2.
2380 		 */
2381 		if ((desc->access & 0x80) == 0) {
2382 			/* Unusable segment */
2383 			desc->access |= 0x10000;
2384 		}
2385 
2386 		/*
2387 		 * Just as CPL (in the VMCB) is kept synced to SS when the
2388 		 * segment is written, so too shall the segment sync from CPL
2389 		 * when it is read.
2390 		 */
2391 		if (reg == VM_REG_GUEST_SS) {
2392 			desc->access &=
2393 			    ~(SEG_DESC_DPL_MASK << SEG_DESC_DPL_SHIFT);
2394 			desc->access |=
2395 			    (vmcb->state.cpl & SEG_DESC_DPL_MASK) <<
2396 			    SEG_DESC_DPL_SHIFT;
2397 		}
2398 		break;
2399 
2400 	case VM_REG_GUEST_CS:
2401 	case VM_REG_GUEST_TR:
2402 		seg = vmcb_segptr(vmcb, reg);
2403 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2404 		break;
2405 
2406 	case VM_REG_GUEST_GDTR:
2407 	case VM_REG_GUEST_IDTR:
2408 		seg = vmcb_segptr(vmcb, reg);
2409 		/*
2410 		 * Since there are no access bits associated with the GDTR or
2411 		 * the IDTR, zero out the field to ensure it does not contain
2412 		 * garbage which might confuse the consumer.
2413 		 */
2414 		desc->access = 0;
2415 		break;
2416 
2417 	default:
2418 		return (EINVAL);
2419 	}
2420 
2421 	ASSERT(seg != NULL);
2422 	desc->base = seg->base;
2423 	desc->limit = seg->limit;
2424 	return (0);
2425 }
2426 
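/*
 * Read an MSR whose value is stored directly in the VMCB save area.  MSRs
 * which are not backed by VMCB state are rejected with EINVAL.
 */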
2427 static int
2428 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
2429 {
2430 	struct svm_softc *sc = arg;
2431 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2432 	const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL);
2433 
2434 	if (msrp != NULL) {
2435 		*valp = *msrp;
2436 		return (0);
2437 	}
2438 
2439 	return (EINVAL);
2440 }
2441 
2442 static int
2443 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val)
2444 {
2445 	struct svm_softc *sc = arg;
2446 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2447 
2448 	uint32_t dirty = 0;
2449 	uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty);
2450 	if (msrp == NULL) {
2451 		return (EINVAL);
2452 	}
2453 	switch (msr) {
2454 	case MSR_EFER:
2455 		/*
2456 		 * For now, just clone the logic from
2457 		 * svm_setreg():
2458 		 *
2459 		 * EFER_SVM must always be set when the guest is
2460 		 * executing
2461 		 */
2462 		*msrp = val | EFER_SVM;
2463 		break;
2464 	/* TODO: other necessary MSR masking */
2465 	default:
2466 		*msrp = val;
2467 		break;
2468 	}
2469 	if (dirty != 0) {
2470 		svm_set_dirty(sc, vcpu, dirty);
2471 	}
2472 	return (0);
2473 
2474 }
2475 
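/*
 * The HLT-exit and PAUSE-exit capabilities map directly onto VMCB intercept
 * bits, so setting or querying them amounts to toggling or reading the
 * corresponding intercept.
 */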
2476 static int
2477 svm_setcap(void *arg, int vcpu, int type, int val)
2478 {
2479 	struct svm_softc *sc;
2480 	int error;
2481 
2482 	sc = arg;
2483 	error = 0;
2484 	switch (type) {
2485 	case VM_CAP_HALT_EXIT:
2486 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2487 		    VMCB_INTCPT_HLT, val);
2488 		break;
2489 	case VM_CAP_PAUSE_EXIT:
2490 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2491 		    VMCB_INTCPT_PAUSE, val);
2492 		break;
2493 	default:
2494 		error = ENOENT;
2495 		break;
2496 	}
2497 	return (error);
2498 }
2499 
2500 static int
2501 svm_getcap(void *arg, int vcpu, int type, int *retval)
2502 {
2503 	struct svm_softc *sc;
2504 	int error;
2505 
2506 	sc = arg;
2507 	error = 0;
2508 
2509 	switch (type) {
2510 	case VM_CAP_HALT_EXIT:
2511 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2512 		    VMCB_INTCPT_HLT);
2513 		break;
2514 	case VM_CAP_PAUSE_EXIT:
2515 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2516 		    VMCB_INTCPT_PAUSE);
2517 		break;
2518 	default:
2519 		error = ENOENT;
2520 		break;
2521 	}
2522 	return (error);
2523 }
2524 
2525 static struct vlapic *
2526 svm_vlapic_init(void *arg, int vcpuid)
2527 {
2528 	struct svm_softc *svm_sc;
2529 	struct vlapic *vlapic;
2530 
2531 	svm_sc = arg;
2532 	vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
2533 	vlapic->vm = svm_sc->vm;
2534 	vlapic->vcpuid = vcpuid;
2535 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2536 
2537 	vlapic_init(vlapic);
2538 
2539 	return (vlapic);
2540 }
2541 
2542 static void
2543 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2544 {
2545 	vlapic_cleanup(vlapic);
2546 	kmem_free(vlapic, sizeof (struct vlapic));
2547 }
2548 
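/*
 * Prepare a vCPU for a pause: any event still pending injection is deferred
 * into exit_intinfo, and the exiting conditions used to deliver it are torn
 * down until the vCPU is made to run again.
 */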
2549 static void
2550 svm_pause(void *arg, int vcpu)
2551 {
2552 	struct svm_softc *sc = arg;
2553 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
2554 
2555 	/*
2556 	 * If an event is pending injection in the VMCB, stash it in
2557 	 * exit_intinfo as if it were deferred by an exit from guest context.
2558 	 */
2559 	const uint64_t intinfo = ctrl->eventinj;
2560 	if ((intinfo & VMCB_EVENTINJ_VALID) != 0) {
2561 		svm_stash_intinfo(sc, vcpu, intinfo);
2562 		ctrl->eventinj = 0;
2563 	}
2564 
2565 	/*
2566 	 * Now that no event is pending injection, interrupt-window exiting and
2567 	 * NMI-blocking can be disabled.  If/when this vCPU is made to run
2568 	 * again, those conditions will be reinstated when the now-queued events
2569 	 * are re-injected.
2570 	 */
2571 	svm_disable_intr_window_exiting(sc, vcpu);
2572 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
2573 }
2574 
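/*
 * Context-switch hooks: while a vCPU is loaded, guest MSR state must be
 * unloaded when its host thread is taken off-CPU and reloaded when it
 * returns.
 */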
2575 static void
2576 svm_savectx(void *arg, int vcpu)
2577 {
2578 	struct svm_softc *sc = arg;
2579 
2580 	/* We should _never_ go off-CPU with the GIF disabled */
2581 	ASSERT(!hma_svm_gif_is_disabled());
2582 
2583 	if (sc->vcpu[vcpu].loaded) {
2584 		svm_msr_guest_exit(sc, vcpu);
2585 	}
2586 }
2587 
2588 static void
2589 svm_restorectx(void *arg, int vcpu)
2590 {
2591 	struct svm_softc *sc = arg;
2592 
2593 	if (sc->vcpu[vcpu].loaded) {
2594 		svm_msr_guest_enter(sc, vcpu);
2595 	}
2596 }
2597 
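/*
 * Determine whether guest TSC frequency scaling is needed (and supported)
 * and, if so, report the fixed-point frequency multiplier back to the caller.
 *
 * As an illustration (assuming vmm_calc_freq_multiplier() yields
 * guest_hz / host_hz scaled by 2^AMD_TSCM_FRAC_SIZE): a 1.5 GHz guest on a
 * 3.0 GHz host would produce a multiplier of 0.5 in that fixed-point format.
 */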
2598 static freqratio_res_t
2599 svm_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult)
2600 {
2601 	/*
2602 	 * Check whether scaling is needed at all before potentially erroring
2603 	 * out for other reasons.
2604 	 */
2605 	if (guest_hz == host_hz) {
2606 		return (FR_SCALING_NOT_NEEDED);
2607 	}
2608 
2609 	/*
2610 	 * Confirm that scaling is available.
2611 	 */
2612 	if (!has_tsc_freq_ctl()) {
2613 		return (FR_SCALING_NOT_SUPPORTED);
2614 	}
2615 
2616 	/*
2617 	 * Verify the guest_hz is within the supported range.
2618 	 */
2619 	if ((guest_hz < AMD_TSC_MIN_FREQ) ||
2620 	    (guest_hz >= (host_hz * AMD_TSC_MAX_FREQ_RATIO))) {
2621 		return (FR_OUT_OF_RANGE);
2622 	}
2623 
2624 	/* Calculate the multiplier. */
2625 	uint64_t m = vmm_calc_freq_multiplier(guest_hz, host_hz,
2626 	    AMD_TSCM_FRAC_SIZE);
2627 	*mult = m;
2628 
2629 	return (FR_VALID);
2630 }
2631 
2632 struct vmm_ops vmm_ops_amd = {
2633 	.init		= svm_init,
2634 	.cleanup	= svm_cleanup,
2635 	.resume		= svm_restore,
2636 
2637 	.vminit		= svm_vminit,
2638 	.vmrun		= svm_vmrun,
2639 	.vmcleanup	= svm_vmcleanup,
2640 	.vmgetreg	= svm_getreg,
2641 	.vmsetreg	= svm_setreg,
2642 	.vmgetdesc	= svm_getdesc,
2643 	.vmsetdesc	= svm_setdesc,
2644 	.vmgetcap	= svm_getcap,
2645 	.vmsetcap	= svm_setcap,
2646 	.vlapic_init	= svm_vlapic_init,
2647 	.vlapic_cleanup	= svm_vlapic_cleanup,
2648 	.vmpause	= svm_pause,
2649 
2650 	.vmsavectx	= svm_savectx,
2651 	.vmrestorectx	= svm_restorectx,
2652 
2653 	.vmgetmsr	= svm_get_msr,
2654 	.vmsetmsr	= svm_set_msr,
2655 
2656 	.vmfreqratio	= svm_freq_ratio,
2657 	.fr_intsize	= AMD_TSCM_INT_SIZE,
2658 	.fr_fracsize	= AMD_TSCM_FRAC_SIZE,
2659 };
2660