xref: /illumos-gate/usr/src/uts/intel/io/vmm/amd/svm.c (revision fdad6fbf87b201fdb96a704fc41fa8be1e4efbc8)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * This file and its contents are supplied under the terms of the
31  * Common Development and Distribution License ("CDDL"), version 1.0.
32  * You may only use this file in accordance with the terms of version
33  * 1.0 of the CDDL.
34  *
35  * A full copy of the text of the CDDL should have accompanied this
36  * source.  A copy of the CDDL is also available via the Internet at
37  * http://www.illumos.org/license/CDDL.
38  */
39 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
40 
41 /*
42  * Copyright 2018 Joyent, Inc.
43  * Copyright 2023 Oxide Computer Company
44  */
45 
46 #include <sys/cdefs.h>
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kernel.h>
51 #include <sys/kmem.h>
52 #include <sys/pcpu.h>
53 #include <sys/proc.h>
54 #include <sys/sysctl.h>
55 #include <sys/cpu.h>
56 
57 #include <sys/x86_archext.h>
58 #include <sys/archsystm.h>
59 #include <sys/trap.h>
60 
61 #include <machine/cpufunc.h>
62 #include <machine/psl.h>
63 #include <machine/md_var.h>
64 #include <machine/reg.h>
65 #include <machine/specialreg.h>
66 #include <machine/vmm.h>
67 #include <machine/vmm_dev.h>
68 #include <sys/vmm_instruction_emul.h>
69 #include <sys/vmm_vm.h>
70 #include <sys/vmm_kernel.h>
71 
72 #include "vmm_lapic.h"
73 #include "vmm_stat.h"
74 #include "vmm_ioport.h"
75 #include "vatpic.h"
76 #include "vlapic.h"
77 #include "vlapic_priv.h"
78 
79 #include "vmcb.h"
80 #include "svm.h"
81 #include "svm_softc.h"
82 #include "svm_msr.h"
83 
84 SYSCTL_DECL(_hw_vmm);
85 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
86     NULL);
87 
88 /*
89  * Guardrails for supported guest TSC frequencies.
90  *
91  * A minimum of 0.5 GHz, which should be sufficient for all recent AMD CPUs, and
92  * a maximum ratio of (15 * host frequency), which is sufficient to prevent
93  * overflowing frequency calculations and give plenty of headroom for future CPU
94  * frequency increases.
95  */
96 #define	AMD_TSC_MIN_FREQ	500000000
97 #define	AMD_TSC_MAX_FREQ_RATIO	15
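/*
 * Illustrative arithmetic (hypothetical host figures, not from this source):
 * with a 2.0 GHz host TSC, the guardrails above would accept guest TSC
 * frequencies in the range [500 MHz, 30 GHz], the upper bound being
 * host frequency * AMD_TSC_MAX_FREQ_RATIO.
 */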
98 
99 /* SVM features advertised by CPUID.8000000AH:EDX */
100 static uint32_t svm_feature = 0;
101 
102 static int disable_npf_assist;
103 
104 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
105 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
106 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
107 
108 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
109 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
110 static void flush_asid(struct svm_softc *sc, int vcpuid);
111 
112 static __inline bool
113 has_flush_by_asid(void)
114 {
115 	return ((svm_feature & CPUID_AMD_EDX_FLUSH_ASID) != 0);
116 }
117 
118 static __inline bool
119 has_lbr_virt(void)
120 {
121 	return ((svm_feature & CPUID_AMD_EDX_LBR_VIRT) != 0);
122 }
123 
124 static __inline bool
125 has_decode_assist(void)
126 {
127 	return ((svm_feature & CPUID_AMD_EDX_DECODE_ASSISTS) != 0);
128 }
129 
130 static __inline bool
131 has_tsc_freq_ctl(void)
132 {
133 	return ((svm_feature & CPUID_AMD_EDX_TSC_RATE_MSR) != 0);
134 }
135 
136 static int
137 svm_cleanup(void)
138 {
139 	/* This is taken care of by the hma registration */
140 	return (0);
141 }
142 
143 static int
144 svm_init(void)
145 {
146 	/* Grab a (bhyve) local copy of the SVM feature bits */
147 	struct cpuid_regs regs = {
148 		.cp_eax = 0x8000000a,
149 	};
150 	(void) cpuid_insn(NULL, &regs);
151 	svm_feature = regs.cp_edx;
152 
153 	/*
154 	 * HMA should have already checked for these features, which we refuse to
155 	 * operate without, but there is no harm in making sure.
156 	 */
157 	const uint32_t demand_bits =
158 	    (CPUID_AMD_EDX_NESTED_PAGING | CPUID_AMD_EDX_NRIPS);
159 	VERIFY((svm_feature & demand_bits) == demand_bits);
160 
161 	return (0);
162 }
163 
164 static void
165 svm_restore(void)
166 {
167 	/* No-op on illumos */
168 }
169 
170 /* Pentium compatible MSRs */
171 #define	MSR_PENTIUM_START	0
172 #define	MSR_PENTIUM_END		0x1FFF
173 /* AMD 6th generation and Intel compatible MSRs */
174 #define	MSR_AMD6TH_START	0xC0000000UL
175 #define	MSR_AMD6TH_END		0xC0001FFFUL
176 /* AMD 7th and 8th generation compatible MSRs */
177 #define	MSR_AMD7TH_START	0xC0010000UL
178 #define	MSR_AMD7TH_END		0xC0011FFFUL
179 
180 /*
181  * Get the index and bit position for an MSR in the permission bitmap.
182  * Two bits are used per MSR: the lower bit for read and the higher bit for write.
183  */
184 static int
185 svm_msr_index(uint64_t msr, int *index, int *bit)
186 {
187 	uint32_t base, off;
188 
189 	*index = -1;
190 	*bit = (msr % 4) * 2;
191 	base = 0;
192 
193 	if (msr <= MSR_PENTIUM_END) {
194 		*index = msr / 4;
195 		return (0);
196 	}
197 
198 	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
199 	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
200 		off = (msr - MSR_AMD6TH_START);
201 		*index = (off + base) / 4;
202 		return (0);
203 	}
204 
205 	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
206 	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
207 		off = (msr - MSR_AMD7TH_START);
208 		*index = (off + base) / 4;
209 		return (0);
210 	}
211 
212 	return (EINVAL);
213 }
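/*
 * Worked example of the mapping above (values derived from this function, not
 * quoted from the APM): MSR 0xC0000082 (LSTAR) falls in the AMD 6th generation
 * range, so off = 0x82 with base = 0x2000, giving
 * *index = (0x2000 + 0x82) / 4 = 0x820 and *bit = (0x82 % 4) * 2 = 4.
 * Bit 4 of perm_bitmap[0x820] then gates reads and bit 5 gates writes.
 */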
214 
215 /*
216  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
217  */
218 static void
219 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
220 {
221 	int index, bit, error;
222 
223 	error = svm_msr_index(msr, &index, &bit);
224 	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
225 	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
226 	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
227 	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
228 	    "msr %lx", __func__, bit, msr));
229 
230 	if (read)
231 		perm_bitmap[index] &= ~(1UL << bit);
232 
233 	if (write)
234 		perm_bitmap[index] &= ~(2UL << bit);
235 }
236 
237 static void
238 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
239 {
240 
241 	svm_msr_perm(perm_bitmap, msr, true, true);
242 }
243 
244 static void
245 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
246 {
247 
248 	svm_msr_perm(perm_bitmap, msr, true, false);
249 }
250 
251 int
252 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
253 {
254 	struct vmcb_ctrl *ctrl;
255 
256 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
257 
258 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
259 	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
260 }
261 
262 void
263 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
264     int enabled)
265 {
266 	struct vmcb_ctrl *ctrl;
267 	uint32_t oldval;
268 
269 	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
270 
271 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
272 	oldval = ctrl->intercept[idx];
273 
274 	if (enabled)
275 		ctrl->intercept[idx] |= bitmask;
276 	else
277 		ctrl->intercept[idx] &= ~bitmask;
278 
279 	if (ctrl->intercept[idx] != oldval) {
280 		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
281 	}
282 }
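/*
 * The svm_enable_intercept()/svm_disable_intercept() calls used throughout
 * this file are assumed to be thin wrappers (see svm.h) equivalent to:
 *
 *	svm_set_intercept(sc, vcpu, idx, bitmask, 1);	(enable)
 *	svm_set_intercept(sc, vcpu, idx, bitmask, 0);	(disable)
 */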
283 
284 static void
285 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
286     uint64_t msrpm_base_pa, uint64_t np_pml4)
287 {
288 	struct vmcb_ctrl *ctrl;
289 	struct vmcb_state *state;
290 	uint32_t mask;
291 	int n;
292 
293 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
294 	state = svm_get_vmcb_state(sc, vcpu);
295 
296 	ctrl->iopm_base_pa = iopm_base_pa;
297 	ctrl->msrpm_base_pa = msrpm_base_pa;
298 
299 	/* Enable nested paging */
300 	ctrl->np_ctrl = NP_ENABLE;
301 	ctrl->n_cr3 = np_pml4;
302 
303 	/*
304 	 * Intercept accesses to the control registers that are not shadowed
305 	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
306 	 */
307 	for (n = 0; n < 16; n++) {
308 		mask = (BIT(n) << 16) | BIT(n);
309 		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
310 			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
311 		else
312 			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
313 	}
314 
315 	/*
316 	 * Selectively intercept writes to %cr0.  This triggers on operations
317 	 * which would change bits other than TS or MP.
318 	 */
319 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
320 	    VMCB_INTCPT_CR0_WRITE);
321 
322 	/*
323 	 * Intercept everything when tracing guest exceptions; otherwise just
324 	 * intercept the machine check exception.
325 	 */
326 	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
327 		for (n = 0; n < 32; n++) {
328 			/*
329 			 * Skip unimplemented vectors in the exception bitmap.
330 			 */
331 			if (n == 2 || n == 9) {
332 				continue;
333 			}
334 			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
335 		}
336 	} else {
337 		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
338 	}
339 
340 	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
341 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
342 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
343 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
344 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
345 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
346 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
347 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
348 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_RDPMC);
349 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
350 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
351 	    VMCB_INTCPT_FERR_FREEZE);
352 
353 	/* Enable exit-on-hlt by default */
354 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);
355 
356 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
357 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
358 
359 	/* Intercept privileged invalidation instructions. */
360 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
361 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
362 
363 	/*
364 	 * Intercept all virtualization-related instructions.
365 	 *
366 	 * From section "Canonicalization and Consistency Checks" in APMv2,
367 	 * the VMRUN intercept bit must be set to pass the consistency check.
368 	 */
369 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
370 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
371 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
372 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
373 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
374 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
375 	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
376 	if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
377 		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
378 		    VMCB_INTCPT_WBINVD);
379 	}
380 
381 	/*
382 	 * The ASID will be set to a non-zero value just before VMRUN.
383 	 */
384 	ctrl->asid = 0;
385 
386 	/*
387 	 * Section 15.21.1, Interrupt Masking in EFLAGS
388 	 * Section 15.21.2, Virtualizing APIC.TPR
389 	 *
390 	 * This must be set for %rflags and %cr8 isolation of guest and host.
391 	 */
392 	ctrl->v_intr_ctrl |= V_INTR_MASKING;
393 
394 	/* Enable Last Branch Record aka LBR-virt (if available) */
395 	if (has_lbr_virt()) {
396 		ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
397 	}
398 
399 	/* EFER_SVM must always be set when the guest is executing */
400 	state->efer = EFER_SVM;
401 
402 	/* Set up the PAT to power-on state */
403 	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)	|
404 	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
405 	    PAT_VALUE(2, PAT_UNCACHED)		|
406 	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
407 	    PAT_VALUE(4, PAT_WRITE_BACK)	|
408 	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
409 	    PAT_VALUE(6, PAT_UNCACHED)		|
410 	    PAT_VALUE(7, PAT_UNCACHEABLE);
411 
412 	/* Set up DR6/7 to power-on state */
413 	state->dr6 = DBREG_DR6_RESERVED1;
414 	state->dr7 = DBREG_DR7_RESERVED1;
415 }
416 
417 /*
418  * Initialize a virtual machine.
419  */
420 static void *
421 svm_vminit(struct vm *vm)
422 {
423 	struct svm_softc *svm_sc;
424 	struct svm_vcpu *vcpu;
425 	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
426 	int i;
427 	uint16_t maxcpus;
428 
429 	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
430 	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK),  ==,  0);
431 
432 	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
433 	if (svm_sc->msr_bitmap == NULL)
434 		panic("contigmalloc of SVM MSR bitmap failed");
435 	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
436 	if (svm_sc->iopm_bitmap == NULL)
437 		panic("contigmalloc of SVM IO bitmap failed");
438 
439 	svm_sc->vm = vm;
440 	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
441 
442 	/*
443 	 * Intercept read and write accesses to all MSRs.
444 	 */
445 	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
446 
447 	/*
448 	 * Access to the following MSRs is redirected to the VMCB when the
449 	 * guest is executing. Therefore it is safe to allow the guest to
450 	 * read/write these MSRs directly without hypervisor involvement.
451 	 */
452 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
453 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
454 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
455 
456 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
457 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
458 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
459 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
460 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
461 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
462 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
463 	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
464 
465 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
466 
467 	/*
468 	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
469 	 */
470 	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
471 
472 	/* Intercept access to all I/O ports. */
473 	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
474 
475 	iopm_pa = vtophys(svm_sc->iopm_bitmap);
476 	msrpm_pa = vtophys(svm_sc->msr_bitmap);
477 	pml4_pa = svm_sc->nptp;
478 	maxcpus = vm_get_maxcpus(svm_sc->vm);
479 	for (i = 0; i < maxcpus; i++) {
480 		vcpu = svm_get_vcpu(svm_sc, i);
481 		vcpu->nextrip = ~0;
482 		vcpu->lastcpu = NOCPU;
483 		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
484 		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
485 		svm_msr_guest_init(svm_sc, i);
486 	}
487 
488 	svm_pmu_init(svm_sc);
489 
490 	return (svm_sc);
491 }
492 
493 /*
494  * Collateral for a generic SVM VM-exit.
495  */
496 static void
497 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
498 {
499 
500 	vme->exitcode = VM_EXITCODE_SVM;
501 	vme->u.svm.exitcode = code;
502 	vme->u.svm.exitinfo1 = info1;
503 	vme->u.svm.exitinfo2 = info2;
504 }
505 
506 static enum vm_cpu_mode
507 svm_vcpu_mode(struct vmcb *vmcb)
508 {
509 	struct vmcb_state *state;
510 
511 	state = &vmcb->state;
512 
513 	if (state->efer & EFER_LMA) {
514 		struct vmcb_segment *seg;
515 
516 		/*
517 		 * Per section 4.8.1 of APM2, check if the Code Segment has the
518 		 * Long (L) attribute set in its descriptor.
519 		 */
520 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
521 		if (seg->attrib & VMCB_CS_ATTRIB_L)
522 			return (CPU_MODE_64BIT);
523 		else
524 			return (CPU_MODE_COMPATIBILITY);
525 	} else  if (state->cr0 & CR0_PE) {
526 		return (CPU_MODE_PROTECTED);
527 	} else {
528 		return (CPU_MODE_REAL);
529 	}
530 }
531 
532 static enum vm_paging_mode
533 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
534 {
535 
536 	if ((cr0 & CR0_PG) == 0)
537 		return (PAGING_MODE_FLAT);
538 	if ((cr4 & CR4_PAE) == 0)
539 		return (PAGING_MODE_32);
540 	if (efer & EFER_LME)
541 		return (PAGING_MODE_64);
542 	else
543 		return (PAGING_MODE_PAE);
544 }
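/*
 * Summary of the checks above (restating the code, not the APM):
 *
 *	CR0.PG	CR4.PAE	EFER.LME	paging mode
 *	  0	   -	    -		PAGING_MODE_FLAT
 *	  1	   0	    -		PAGING_MODE_32
 *	  1	   1	    1		PAGING_MODE_64
 *	  1	   1	    0		PAGING_MODE_PAE
 */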
545 
546 static void
547 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
548 {
549 	struct vmcb_state *state;
550 
551 	state = &vmcb->state;
552 	paging->cr3 = state->cr3;
553 	paging->cpl = state->cpl;
554 	paging->cpu_mode = svm_vcpu_mode(vmcb);
555 	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
556 	    state->efer);
557 }
558 
559 #define	UNHANDLED 0
560 
561 /*
562  * Handle guest I/O intercept.
563  */
564 static int
565 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
566 {
567 	struct vmcb_ctrl *ctrl;
568 	struct vmcb_state *state;
569 	struct vm_inout *inout;
570 	struct vie *vie;
571 	uint64_t info1;
572 	struct vm_guest_paging paging;
573 
574 	state = svm_get_vmcb_state(svm_sc, vcpu);
575 	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
576 	inout = &vmexit->u.inout;
577 	info1 = ctrl->exitinfo1;
578 
579 	inout->bytes = (info1 >> 4) & 0x7;
580 	inout->flags = 0;
581 	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
582 	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
583 	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
584 	inout->port = (uint16_t)(info1 >> 16);
585 	inout->eax = (uint32_t)(state->rax);
586 
587 	/*
588 	 * We'll always need paging and vie info, even if we bail out early
589 	 * due to missing DecodeAssist.
590 	 */
591 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
592 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
593 
594 	if ((inout->flags & INOUT_STR) != 0) {
595 		/*
596 		 * The effective segment number in EXITINFO1[12:10] is populated
597 		 * only if the processor has the DecodeAssist capability.
598 		 *
599 		 * This is not specified explicitly in APMv2 but can be verified
600 		 * empirically.
601 		 */
602 		if (!has_decode_assist()) {
603 			/*
604 			 * Without decoding assistance, punt the task of
605 			 * emulating the ins/outs to userspace.
606 			 */
607 			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
608 			bzero(&vmexit->u.inst_emul,
609 			    sizeof (vmexit->u.inst_emul));
610 			vie_init_other(vie, &paging);
611 			return (UNHANDLED);
612 		}
613 
614 		/*
615 		 * Bits 7-9 encode the address size of ins/outs operations where
616 		 * the 1/2/4 values correspond to 16/32/64 bit sizes.
617 		 */
618 		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
619 		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
620 		    inout->addrsize == 8);
621 
622 		if (inout->flags & INOUT_IN) {
623 			/*
624 			 * For INS instructions, %es (encoded as 0) is the
625 			 * implied segment for the operation.
626 			 */
627 			inout->segment = 0;
628 		} else {
629 			/*
630 			 * Bits 10-12 encode the segment for OUTS.
631 			 * This value follows the standard x86 segment order.
632 			 */
633 			inout->segment = (info1 >> 10) & 0x7;
634 		}
635 	}
636 
637 	vmexit->exitcode = VM_EXITCODE_INOUT;
638 	vie_init_inout(vie, inout, vmexit->inst_length, &paging);
639 
640 	/* The in/out emulation will handle advancing %rip */
641 	vmexit->inst_length = 0;
642 
643 	return (UNHANDLED);
644 }
645 
646 static int
647 npf_fault_type(uint64_t exitinfo1)
648 {
649 
650 	if (exitinfo1 & VMCB_NPF_INFO1_W)
651 		return (PROT_WRITE);
652 	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
653 		return (PROT_EXEC);
654 	else
655 		return (PROT_READ);
656 }
657 
658 static bool
659 svm_npf_emul_fault(uint64_t exitinfo1)
660 {
661 	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
662 		return (false);
663 	}
664 
665 	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
666 		return (false);
667 	}
668 
669 	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
670 		return (false);
671 	}
672 
673 	return (true);
674 }
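/*
 * In other words (a restatement of the checks above): a nested page fault is
 * only a candidate for MMIO emulation when it is not an instruction fetch,
 * did not occur during the guest page-table walk, and reports a valid GPA.
 */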
675 
676 static void
677 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
678     uint64_t gpa)
679 {
680 	struct vmcb_ctrl *ctrl;
681 	struct vmcb *vmcb;
682 	struct vie *vie;
683 	struct vm_guest_paging paging;
684 	struct vmcb_segment *seg;
685 	char *inst_bytes = NULL;
686 	uint8_t inst_len = 0;
687 
688 	vmcb = svm_get_vmcb(svm_sc, vcpu);
689 	ctrl = &vmcb->ctrl;
690 
691 	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
692 	vmexit->u.mmio_emul.gpa = gpa;
693 	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
694 	svm_paging_info(vmcb, &paging);
695 
696 	switch (paging.cpu_mode) {
697 	case CPU_MODE_REAL:
698 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
699 		vmexit->u.mmio_emul.cs_base = seg->base;
700 		vmexit->u.mmio_emul.cs_d = 0;
701 		break;
702 	case CPU_MODE_PROTECTED:
703 	case CPU_MODE_COMPATIBILITY:
704 		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
705 		vmexit->u.mmio_emul.cs_base = seg->base;
706 
707 		/*
708 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
709 		 */
710 		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
711 		    1 : 0;
712 		break;
713 	default:
714 		vmexit->u.mmio_emul.cs_base = 0;
715 		vmexit->u.mmio_emul.cs_d = 0;
716 		break;
717 	}
718 
719 	/*
720 	 * Copy the instruction bytes into 'vie' if available.
721 	 */
722 	if (has_decode_assist() && !disable_npf_assist) {
723 		inst_len = ctrl->inst_len;
724 		inst_bytes = (char *)ctrl->inst_bytes;
725 	}
726 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
727 	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
728 }
729 
730 /*
731  * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
732  * which is live in the guest.  They are visible via the shadow instead.
733  */
734 #define	SVM_CR0_MASK	~(CR0_CD | CR0_NW | 0xffffffff00000000)
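/*
 * For example (derived from svm_set_cr0() below, not a quoted requirement):
 * a guest write that sets CR0_CD is stored in full in sctx_cr0_shadow, while
 * the live VMCB cr0 has CD masked off; %cr0 reads are then intercepted so the
 * guest still observes CD as set.
 */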
735 
736 static void
737 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
738 {
739 	struct vmcb_state *state;
740 	struct svm_regctx *regctx;
741 	uint64_t masked, old, diff;
742 
743 	state = svm_get_vmcb_state(svm_sc, vcpu);
744 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
745 
746 	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
747 	diff = old ^ val;
748 
749 	/* No further work needed if register contents remain the same */
750 	if (diff == 0) {
751 		return;
752 	}
753 
754 	/* Flush the TLB if the paging or write-protect bits are changing */
755 	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
756 		flush_asid(svm_sc, vcpu);
757 	}
758 
759 	/*
760 	 * If the change in %cr0 is due to a guest action (via interception)
761 	 * then other CPU state updates may be required.
762 	 */
763 	if (guest_write) {
764 		if ((diff & CR0_PG) != 0) {
765 			uint64_t efer = state->efer;
766 
767 			/* Keep the long-mode state in EFER in sync */
768 			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
769 				state->efer |= EFER_LMA;
770 			}
771 			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
772 				state->efer &= ~EFER_LMA;
773 			}
774 		}
775 	}
776 
777 	masked = val & SVM_CR0_MASK;
778 	regctx->sctx_cr0_shadow = val;
779 	state->cr0 = masked;
780 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
781 
782 	if ((masked ^ val) != 0) {
783 		/*
784 		 * The guest has set bits in %cr0 which we are masking out and
785 		 * exposing via shadow.
786 		 *
787 		 * We must intercept %cr0 reads in order to make the shadowed
788 		 * view available to the guest.
789 		 *
790 		 * Writes to %cr0 must also be intercepted (unconditionally,
791 		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
792 		 * if/when the guest clears those shadowed bits.
793 		 */
794 		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
795 		    BIT(0) | BIT(16));
796 	} else {
797 		/*
798 		 * When no bits remain in %cr0 which require shadowing, the
799 		 * unconditional intercept of reads/writes to %cr0 can be
800 		 * disabled.
801 		 *
802 		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
803 		 * in place so we can be notified of operations which change
804 		 * bits other than TS or MP.
805 		 */
806 		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
807 		    BIT(0) | BIT(16));
808 	}
809 	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
810 }
811 
812 static void
813 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
814 {
815 	struct vmcb *vmcb;
816 	struct svm_regctx *regctx;
817 
818 	vmcb = svm_get_vmcb(svm_sc, vcpu);
819 	regctx = svm_get_guest_regctx(svm_sc, vcpu);
820 
821 	/*
822 	 * Include the %cr0 bits which exist only in the shadow along with those
823 	 * in the running vCPU state.
824 	 */
825 	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
826 }
827 
828 static void
829 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
830 {
831 	uint64_t val;
832 	int err __maybe_unused;
833 
834 	svm_get_cr0(svm_sc, vcpu, &val);
835 	err = svm_setreg(svm_sc, vcpu, reg, val);
836 	ASSERT(err == 0);
837 }
838 
839 static void
840 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
841 {
842 	struct vmcb_state *state;
843 	uint64_t val;
844 	int err __maybe_unused;
845 
846 	state = svm_get_vmcb_state(svm_sc, vcpu);
847 
848 	err = svm_getreg(svm_sc, vcpu, reg, &val);
849 	ASSERT(err == 0);
850 
851 	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
852 		/* NW without CD is nonsensical */
853 		vm_inject_gp(svm_sc->vm, vcpu);
854 		return;
855 	}
856 	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
857 		/* PG requires PE */
858 		vm_inject_gp(svm_sc->vm, vcpu);
859 		return;
860 	}
861 	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
862 		/* When enabling paging, PAE must be enabled if LME is. */
863 		if ((state->efer & EFER_LME) != 0 &&
864 		    (state->cr4 & CR4_PAE) == 0) {
865 			vm_inject_gp(svm_sc->vm, vcpu);
866 			return;
867 		}
868 	}
869 
870 	svm_set_cr0(svm_sc, vcpu, val, true);
871 }
872 
873 static void
874 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
875 {
876 	struct vie *vie;
877 	struct vm_guest_paging paging;
878 
879 	/* Let the instruction emulation (hopefully in-kernel) handle it */
880 	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
881 	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
882 	vie = vm_vie_ctx(svm_sc->vm, vcpu);
883 	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
884 	vie_init_other(vie, &paging);
885 
886 	/* The instruction emulation will handle advancing %rip */
887 	vmexit->inst_length = 0;
888 }
889 
890 static void
891 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
892 {
893 	struct vm *vm;
894 	struct vlapic *vlapic;
895 	struct vmcb_ctrl *ctrl;
896 
897 	vm = sc->vm;
898 	vlapic = vm_lapic(vm, vcpu);
899 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
900 
901 	/* Update %cr8 in the emulated vlapic */
902 	vlapic_set_cr8(vlapic, ctrl->v_tpr);
903 
904 	/* Virtual interrupt injection is not used. */
905 	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
906 	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
907 }
908 
909 CTASSERT(VMCB_EVENTINJ_TYPE_INTR	== VM_INTINFO_HWINTR);
910 CTASSERT(VMCB_EVENTINJ_TYPE_NMI		== VM_INTINFO_NMI);
911 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION	== VM_INTINFO_HWEXCP);
912 CTASSERT(VMCB_EVENTINJ_TYPE_INTn	== VM_INTINFO_SWINTR);
913 CTASSERT(VMCB_EVENTINJ_EC_VALID		== VM_INTINFO_DEL_ERRCODE);
914 CTASSERT(VMCB_EVENTINJ_VALID		== VM_INTINFO_VALID);
915 
916 /*
917  * Store SVM-specific event injection info for later handling.  This depends on
918  * the bhyve-internal event definitions matching those in the VMCB, as ensured
919  * by the above CTASSERTs.
920  */
921 static void
922 svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
923 {
924 	ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
925 
926 	/*
927 	 * If stashing an NMI pending injection, ensure that it bears the
928 	 * If stashing a pending NMI injection, ensure that it bears the
929 	 * correct vector, which exit_intinfo expects.
930 	if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
931 		intinfo &= ~VM_INTINFO_MASK_VECTOR;
932 		intinfo |= IDT_NMI;
933 	}
934 
935 	VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
936 }
937 
938 static void
939 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
940 {
941 	struct vmcb_ctrl *ctrl  = svm_get_vmcb_ctrl(svm_sc, vcpu);
942 	uint64_t intinfo = ctrl->exitintinfo;
943 
944 	if (VMCB_EXITINTINFO_VALID(intinfo)) {
945 		/*
946 		 * If a #VMEXIT happened during event delivery then record the
947 		 * event that was being delivered.
948 		 */
949 		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
950 
951 		svm_stash_intinfo(svm_sc, vcpu, intinfo);
952 	}
953 }
954 
955 static __inline int
956 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
957 {
958 
959 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
960 	    VMCB_INTCPT_VINTR));
961 }
962 
963 static void
964 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
965 {
966 	struct vmcb_ctrl *ctrl;
967 	struct vmcb_state *state;
968 
969 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
970 	state = svm_get_vmcb_state(sc, vcpu);
971 
972 	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
973 		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
974 		    ("%s: invalid v_ign_tpr", __func__));
975 		KASSERT(vintr_intercept_enabled(sc, vcpu),
976 		    ("%s: vintr intercept should be enabled", __func__));
977 		return;
978 	}
979 
980 	/*
981 	 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
982 	 * hypervisor as soon as a virtual interrupt can be delivered.
983 	 *
984 	 * Since injected events are not subject to intercept checks we need to
985 	 * ensure that the V_IRQ is not actually going to be delivered on VM
986 	 * entry.
987 	 */
988 	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
989 	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
990 
991 	ctrl->v_irq |= V_IRQ;
992 	ctrl->v_intr_prio |= V_IGN_TPR;
993 	ctrl->v_intr_vector = 0;
994 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
995 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
996 }
997 
998 static void
999 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
1000 {
1001 	struct vmcb_ctrl *ctrl;
1002 
1003 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1004 
1005 	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
1006 		KASSERT(!vintr_intercept_enabled(sc, vcpu),
1007 		    ("%s: vintr intercept should be disabled", __func__));
1008 		return;
1009 	}
1010 
1011 	ctrl->v_irq &= ~V_IRQ;
1012 	ctrl->v_intr_vector = 0;
1013 	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1014 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1015 }
1016 
1017 /*
1018  * Once an NMI is injected it blocks delivery of further NMIs until the handler
1019  * executes an IRET. The IRET intercept is enabled when an NMI is injected
1020  * to track when the vcpu is done handling the NMI.
1021  */
1022 static int
1023 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1024 {
1025 	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1026 	    VMCB_INTCPT_IRET));
1027 }
1028 
1029 static void
1030 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1031 {
1032 	struct vmcb_ctrl *ctrl;
1033 
1034 	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1035 	/*
1036 	 * When the IRET intercept is cleared the vcpu will attempt to execute
1037 	 * the "iret" when it runs next. However, it is possible to inject
1038 	 * another NMI into the vcpu before the "iret" has actually executed.
1039 	 *
1040 	 * For example, if the "iret" encounters a #NPF when accessing the stack
1041 	 * it will trap back into the hypervisor. If an NMI is pending for
1042 	 * the vcpu it will be injected into the guest.
1043 	 *
1044 	 * XXX this needs to be fixed
1045 	 */
1046 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1047 
1048 	/*
1049 	 * Set an interrupt shadow to prevent an NMI from being immediately
1050 	 * injected on the next VMRUN.
1051 	 */
1052 	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1053 	ctrl->intr_shadow = 1;
1054 }
1055 
1056 static void
1057 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1058 {
1059 	ASSERT(VM_INTINFO_PENDING(info));
1060 
1061 	uint8_t vector = VM_INTINFO_VECTOR(info);
1062 	uint32_t type = VM_INTINFO_TYPE(info);
1063 
1064 	/*
1065 	 * Correct behavior depends on bhyve intinfo event types lining up with
1066 	 * those defined by AMD for event injection in the VMCB.  The CTASSERTs
1067 	 * above svm_save_exitintinfo() ensure it.
1068 	 */
1069 	switch (type) {
1070 	case VM_INTINFO_NMI:
1071 		/* Ensure vector for injected event matches its type (NMI) */
1072 		vector = IDT_NMI;
1073 		break;
1074 	case VM_INTINFO_HWINTR:
1075 	case VM_INTINFO_SWINTR:
1076 		break;
1077 	case VM_INTINFO_HWEXCP:
1078 		if (vector == IDT_NMI) {
1079 			/*
1080 			 * NMIs are expected to be injected with
1081 			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1082 			 * with the NMI vector.
1083 			 */
1084 			type = VM_INTINFO_NMI;
1085 		}
1086 		VERIFY(vector < 32);
1087 		break;
1088 	default:
1089 		/*
1090 		 * Since there is no strong validation for injected event types
1091 		 * at this point, fall back to software interrupt for those we
1092 		 * do not recognize.
1093 		 */
1094 		type = VM_INTINFO_SWINTR;
1095 		break;
1096 	}
1097 
1098 	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1099 	if (VM_INTINFO_HAS_ERRCODE(info)) {
1100 		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1101 		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1102 	}
1103 }
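/*
 * Example encoding (following the logic above and the CTASSERTs earlier in
 * this file): injecting #GP (vector 13) with an error code of 0 yields
 * eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_EXCEPTION | 13 |
 * VMCB_EVENTINJ_EC_VALID, with the error code placed in bits 63:32.
 */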
1104 
1105 static void
1106 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1107 {
1108 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1109 
1110 	ASSERT(!svm_nmi_blocked(sc, vcpu));
1111 
1112 	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1113 	vm_nmi_clear(sc->vm, vcpu);
1114 
1115 	/*
1116 	 * Virtual NMI blocking is now in effect.
1117 	 *
1118 	 * Not only does this block a subsequent NMI injection from taking
1119 	 * place, it also configures an intercept on the IRET so we can track
1120 	 * when the next injection can take place.
1121 	 */
1122 	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1123 }
1124 
1125 static void
1126 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1127 {
1128 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1129 
1130 	ASSERT(vector >= 0 && vector <= 255);
1131 
1132 	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1133 }
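/*
 * Note (an assumption based on the VMCB event-injection layout): the external
 * interrupt type is the all-zeroes encoding, which is why only
 * VMCB_EVENTINJ_VALID and the vector need to be OR'd together above.
 */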
1134 
1135 #define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL
1136 
1137 static vm_msr_result_t
1138 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1139 {
1140 	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1141 	uint64_t lma;
1142 	int error;
1143 
1144 	newval &= ~0xFE;		/* clear the Read-As-Zero (RAZ) bits */
1145 
1146 	if (newval & EFER_MBZ_BITS) {
1147 		return (VMR_GP);
1148 	}
1149 
1150 	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1151 	const uint64_t changed = state->efer ^ newval;
1152 	if (changed & EFER_LME) {
1153 		if (state->cr0 & CR0_PG) {
1154 			return (VMR_GP);
1155 		}
1156 	}
1157 
1158 	/* EFER.LMA = EFER.LME & CR0.PG */
1159 	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1160 		lma = EFER_LMA;
1161 	} else {
1162 		lma = 0;
1163 	}
1164 	if ((newval & EFER_LMA) != lma) {
1165 		return (VMR_GP);
1166 	}
1167 
1168 	if ((newval & EFER_NXE) != 0 &&
1169 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1170 		return (VMR_GP);
1171 	}
1172 	if ((newval & EFER_FFXSR) != 0 &&
1173 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1174 		return (VMR_GP);
1175 	}
1176 	if ((newval & EFER_TCE) != 0 &&
1177 	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1178 		return (VMR_GP);
1179 	}
1180 
1181 	/*
1182 	 * Until bhyve has proper support for long-mode segment limits, just
1183 	 * toss a #GP at the guest if they attempt to use it.
1184 	 */
1185 	if (newval & EFER_LMSLE) {
1186 		return (VMR_GP);
1187 	}
1188 
1189 	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1190 	VERIFY0(error);
1191 	return (VMR_OK);
1192 }
1193 
1194 static int
1195 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1196     bool is_wrmsr)
1197 {
1198 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1199 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1200 	const uint32_t ecx = ctx->sctx_rcx;
1201 	vm_msr_result_t res;
1202 	uint64_t val = 0;
1203 
1204 	if (is_wrmsr) {
1205 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1206 		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1207 
1208 		if (vlapic_owned_msr(ecx)) {
1209 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1210 
1211 			res = vlapic_wrmsr(vlapic, ecx, val);
1212 		} else if (ecx == MSR_EFER) {
1213 			res = svm_write_efer(svm_sc, vcpu, val);
1214 		} else if (svm_pmu_owned_msr(ecx)) {
1215 			res = svm_pmu_wrmsr(svm_sc, vcpu, ecx, val);
1216 		} else {
1217 			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1218 		}
1219 	} else {
1220 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1221 
1222 		if (vlapic_owned_msr(ecx)) {
1223 			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1224 
1225 			res = vlapic_rdmsr(vlapic, ecx, &val);
1226 		} else if (svm_pmu_owned_msr(ecx)) {
1227 			res = svm_pmu_rdmsr(svm_sc, vcpu, ecx, &val);
1228 		} else {
1229 			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1230 		}
1231 	}
1232 
1233 	switch (res) {
1234 	case VMR_OK:
1235 		/* Store rdmsr result in the appropriate registers */
1236 		if (!is_wrmsr) {
1237 			state->rax = (uint32_t)val;
1238 			ctx->sctx_rdx = val >> 32;
1239 		}
1240 		return (1);
1241 	case VMR_GP:
1242 		vm_inject_gp(svm_sc->vm, vcpu);
1243 		return (1);
1244 	case VMR_UNHANLDED:
1245 		vmexit->exitcode = is_wrmsr ?
1246 		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1247 		vmexit->u.msr.code = ecx;
1248 		vmexit->u.msr.wval = val;
1249 		return (0);
1250 	default:
1251 		panic("unexpected msr result %u\n", res);
1252 	}
1253 }
1254 
1255 static void
1256 svm_handle_rdpmc(struct svm_softc *svm_sc, int vcpu)
1257 {
1258 	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1259 	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1260 	const uint32_t ecx = ctx->sctx_rcx;
1261 	uint64_t val = 0;
1262 
1263 	if (svm_pmu_rdpmc(svm_sc, vcpu, ecx, &val)) {
1264 		state->rax = (uint32_t)val;
1265 		ctx->sctx_rdx = val >> 32;
1266 	} else {
1267 		vm_inject_gp(svm_sc->vm, vcpu);
1268 	}
1269 }
1270 
1271 /*
1272  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1273  * that are due to instruction intercepts as well as MSR and IOIO intercepts
1274  * and exceptions caused by INT3, INTO and BOUND instructions.
1275  *
1276  * Return 1 if the nRIP is valid and 0 otherwise.
1277  */
1278 static int
1279 nrip_valid(uint64_t exitcode)
1280 {
1281 	switch (exitcode) {
1282 	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
1283 	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
1284 	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
1285 	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
1286 	case 0x43:		/* INT3 */
1287 	case 0x44:		/* INTO */
1288 	case 0x45:		/* BOUND */
1289 	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1290 	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1291 		return (1);
1292 	default:
1293 		return (0);
1294 	}
1295 }
1296 
1297 static int
1298 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1299 {
1300 	struct vmcb *vmcb;
1301 	struct vmcb_state *state;
1302 	struct vmcb_ctrl *ctrl;
1303 	struct svm_regctx *ctx;
1304 	uint64_t code, info1, info2;
1305 	int handled;
1306 
1307 	ctx = svm_get_guest_regctx(svm_sc, vcpu);
1308 	vmcb = svm_get_vmcb(svm_sc, vcpu);
1309 	state = &vmcb->state;
1310 	ctrl = &vmcb->ctrl;
1311 
1312 	handled = 0;
1313 	code = ctrl->exitcode;
1314 	info1 = ctrl->exitinfo1;
1315 	info2 = ctrl->exitinfo2;
1316 
1317 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1318 	vmexit->rip = state->rip;
1319 	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1320 
1321 	vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1322 
1323 	/*
1324 	 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1325 	 * in an inconsistent state and can trigger assertions that would
1326 	 * never happen otherwise.
1327 	 */
1328 	if (code == VMCB_EXIT_INVALID) {
1329 		vm_exit_svm(vmexit, code, info1, info2);
1330 		return (0);
1331 	}
1332 
1333 	KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1334 	    "injection valid bit is set %lx", __func__, ctrl->eventinj));
1335 
1336 	KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1337 	    ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1338 	    vmexit->inst_length, code, info1, info2));
1339 
1340 	svm_update_virqinfo(svm_sc, vcpu);
1341 	svm_save_exitintinfo(svm_sc, vcpu);
1342 
1343 	switch (code) {
1344 	case VMCB_EXIT_CR0_READ:
1345 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1346 			svm_handle_cr0_read(svm_sc, vcpu,
1347 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1348 			handled = 1;
1349 		} else {
1350 			/*
1351 			 * If SMSW is used to read the contents of %cr0, then
1352 			 * the VALID bit will not be set in `info1`, since the
1353 			 * handling is different from the mov-to-reg case.
1354 			 *
1355 			 * Punt to the instruction emulation to handle it.
1356 			 */
1357 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1358 		}
1359 		break;
1360 	case VMCB_EXIT_CR0_WRITE:
1361 	case VMCB_EXIT_CR0_SEL_WRITE:
1362 		if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1363 			svm_handle_cr0_write(svm_sc, vcpu,
1364 			    vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1365 			handled = 1;
1366 		} else {
1367 			/*
1368 			 * Writes to %cr0 without VALID being set in `info1` are
1369 			 * initiated by the LMSW and CLTS instructions.  While
1370 			 * LMSW (like SMSW) sees little use in modern OSes and
1371 			 * bootloaders, CLTS is still used for handling FPU
1372 			 * state transitions.
1373 			 *
1374 			 * Punt to the instruction emulation to handle them.
1375 			 */
1376 			svm_inst_emul_other(svm_sc, vcpu, vmexit);
1377 		}
1378 		break;
1379 	case VMCB_EXIT_IRET:
1380 		/*
1381 		 * Restart execution at "iret" but with the intercept cleared.
1382 		 */
1383 		vmexit->inst_length = 0;
1384 		svm_clear_nmi_blocking(svm_sc, vcpu);
1385 		handled = 1;
1386 		break;
1387 	case VMCB_EXIT_VINTR:	/* interrupt window exiting */
1388 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1389 		svm_disable_intr_window_exiting(svm_sc, vcpu);
1390 		handled = 1;
1391 		break;
1392 	case VMCB_EXIT_INTR:	/* external interrupt */
1393 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1394 		handled = 1;
1395 		break;
1396 	case VMCB_EXIT_NMI:
1397 	case VMCB_EXIT_SMI:
1398 	case VMCB_EXIT_INIT:
1399 		/*
1400 		 * For external NMI/SMI and physical INIT interrupts, simply
1401 		 * continue execution, as those host events will be handled by
1402 		 * the physical CPU.
1403 		 */
1404 		handled = 1;
1405 		break;
1406 	case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1407 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1408 
1409 		const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1410 		uint32_t errcode = 0;
1411 		bool reflect = true;
1412 		bool errcode_valid = false;
1413 
1414 		switch (idtvec) {
1415 		case IDT_MC:
1416 			/* The host will handle the MCE itself. */
1417 			reflect = false;
1418 			vmm_call_trap(T_MCE);
1419 			break;
1420 		case IDT_PF:
1421 			VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1422 			    info2));
1423 			/* fallthru */
1424 		case IDT_NP:
1425 		case IDT_SS:
1426 		case IDT_GP:
1427 		case IDT_AC:
1428 		case IDT_TS:
1429 			errcode_valid = true;
1430 			errcode = info1;
1431 			break;
1432 
1433 		case IDT_DF:
1434 			errcode_valid = true;
1435 			break;
1436 
1437 		case IDT_BP:
1438 		case IDT_OF:
1439 		case IDT_BR:
1440 			/*
1441 			 * The 'nrip' field is populated for INT3, INTO and
1442 			 * BOUND exceptions and this also implies that
1443 			 * 'inst_length' is non-zero.
1444 			 *
1445 			 * Reset 'inst_length' to zero so the guest %rip at
1446 			 * event injection is identical to what it was when
1447 			 * the exception originally happened.
1448 			 */
1449 			vmexit->inst_length = 0;
1450 			/* fallthru */
1451 		default:
1452 			errcode_valid = false;
1453 			break;
1454 		}
1455 		VERIFY0(vmexit->inst_length);
1456 
1457 		if (reflect) {
1458 			/* Reflect the exception back into the guest */
1459 			VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1460 			    errcode_valid, errcode, false));
1461 		}
1462 		handled = 1;
1463 		break;
1464 		}
1465 	case VMCB_EXIT_MSR:
1466 		handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1467 		break;
1468 	case VMCB_EXIT_RDPMC:
1469 		svm_handle_rdpmc(svm_sc, vcpu);
1470 		handled = 1;
1471 		break;
1472 	case VMCB_EXIT_IO:
1473 		handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1474 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1475 		break;
1476 	case VMCB_EXIT_SHUTDOWN:
1477 		(void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT, vcpu);
1478 		handled = 1;
1479 		break;
1480 	case VMCB_EXIT_INVLPGA:
1481 		/* privileged invalidation instructions */
1482 		vm_inject_ud(svm_sc->vm, vcpu);
1483 		handled = 1;
1484 		break;
1485 	case VMCB_EXIT_VMRUN:
1486 	case VMCB_EXIT_VMLOAD:
1487 	case VMCB_EXIT_VMSAVE:
1488 	case VMCB_EXIT_STGI:
1489 	case VMCB_EXIT_CLGI:
1490 	case VMCB_EXIT_SKINIT:
1491 		/* privileged vmm instructions */
1492 		vm_inject_ud(svm_sc->vm, vcpu);
1493 		handled = 1;
1494 		break;
1495 	case VMCB_EXIT_INVD:
1496 	case VMCB_EXIT_WBINVD:
1497 		/* ignore exit */
1498 		handled = 1;
1499 		break;
1500 	case VMCB_EXIT_VMMCALL:
1501 		/* No handlers make use of VMMCALL for now */
1502 		vm_inject_ud(svm_sc->vm, vcpu);
1503 		handled = 1;
1504 		break;
1505 	case VMCB_EXIT_CPUID:
1506 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1507 		vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1508 		    &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1509 		handled = 1;
1510 		break;
1511 	case VMCB_EXIT_HLT:
1512 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1513 		vmexit->exitcode = VM_EXITCODE_HLT;
1514 		vmexit->u.hlt.rflags = state->rflags;
1515 		break;
1516 	case VMCB_EXIT_PAUSE:
1517 		vmexit->exitcode = VM_EXITCODE_PAUSE;
1518 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1519 		break;
1520 	case VMCB_EXIT_NPF:
1521 		/* EXITINFO2 contains the faulting guest physical address */
1522 		if (info1 & VMCB_NPF_INFO1_RSV) {
1523 			/* nested fault with reserved bits set */
1524 		} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1525 			vmexit->exitcode = VM_EXITCODE_PAGING;
1526 			vmexit->u.paging.gpa = info2;
1527 			vmexit->u.paging.fault_type = npf_fault_type(info1);
1528 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1529 		} else if (svm_npf_emul_fault(info1)) {
1530 			svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1531 			vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1532 		}
1533 		break;
1534 	case VMCB_EXIT_MONITOR:
1535 		vmexit->exitcode = VM_EXITCODE_MONITOR;
1536 		break;
1537 	case VMCB_EXIT_MWAIT:
1538 		vmexit->exitcode = VM_EXITCODE_MWAIT;
1539 		break;
1540 	default:
1541 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1542 		break;
1543 	}
1544 
1545 	DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1546 	    code);
1547 
1548 	if (handled) {
1549 		vmexit->rip += vmexit->inst_length;
1550 		vmexit->inst_length = 0;
1551 		state->rip = vmexit->rip;
1552 	} else {
1553 		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1554 			/*
1555 			 * If this VM exit was not claimed by anybody then
1556 			 * treat it as a generic SVM exit.
1557 			 */
1558 			vm_exit_svm(vmexit, code, info1, info2);
1559 		} else {
1560 			/*
1561 			 * The exitcode and collateral have been populated.
1562 			 * The VM exit will be processed further in userland.
1563 			 */
1564 		}
1565 	}
1566 	return (handled);
1567 }
1568 
1569 /*
1570  * Inject exceptions, NMIs, and ExtINTs.
1571  *
1572  * The logic behind these is complicated and may involve mutex contention, so
1573  * the injection is performed without the protection of host CPU interrupts
1574  * being disabled.  This means a racing notification could be "lost",
1575  * necessitating a later call to svm_inject_recheck() to close that window
1576  * of opportunity.
1577  */
1578 static enum event_inject_state
1579 svm_inject_events(struct svm_softc *sc, int vcpu)
1580 {
1581 	struct vmcb_ctrl *ctrl;
1582 	struct vmcb_state *state;
1583 	struct svm_vcpu *vcpustate;
1584 	uint64_t intinfo;
1585 	enum event_inject_state ev_state;
1586 
1587 	state = svm_get_vmcb_state(sc, vcpu);
1588 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1589 	vcpustate = svm_get_vcpu(sc, vcpu);
1590 	ev_state = EIS_CAN_INJECT;
1591 
1592 	/* Clear any interrupt shadow if guest %rip has changed */
1593 	if (vcpustate->nextrip != state->rip) {
1594 		ctrl->intr_shadow = 0;
1595 	}
1596 
1597 	/*
1598 	 * An event is already pending for injection.  This can occur when the
1599 	 * vCPU exits prior to VM entry (like for an AST).
1600 	 */
1601 	if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1602 		return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1603 	}
1604 
1605 	/*
1606 	 * Inject pending events or exceptions for this vcpu.
1607 	 *
1608 	 * An event might be pending because the previous #VMEXIT happened
1609 	 * during event delivery (i.e. ctrl->exitintinfo).
1610 	 *
1611 	 * An event might also be pending because an exception was injected
1612 	 * by the hypervisor (e.g. #PF during instruction emulation).
1613 	 */
1614 	if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1615 		svm_inject_event(ctrl, intinfo);
1616 		vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1617 		ev_state = EIS_EV_INJECTED;
1618 	}
1619 
1620 	/* NMI event has priority over interrupts. */
1621 	if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1622 		if (ev_state == EIS_CAN_INJECT) {
1623 			/* Can't inject NMI if vcpu is in an intr_shadow. */
1624 			if (ctrl->intr_shadow) {
1625 				return (EIS_GI_BLOCK);
1626 			}
1627 
1628 			svm_inject_nmi(sc, vcpu);
1629 			ev_state = EIS_EV_INJECTED;
1630 		} else {
1631 			return (ev_state | EIS_REQ_EXIT);
1632 		}
1633 	}
1634 
1635 	if (vm_extint_pending(sc->vm, vcpu)) {
1636 		int vector;
1637 
1638 		if (ev_state != EIS_CAN_INJECT) {
1639 			return (ev_state | EIS_REQ_EXIT);
1640 		}
1641 
1642 		/*
1643 		 * If the guest has disabled interrupts or is in an interrupt
1644 		 * shadow then we cannot inject the pending interrupt.
1645 		 */
1646 		if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1647 			return (EIS_GI_BLOCK);
1648 		}
1649 
1650 		/* Ask the legacy pic for a vector to inject */
1651 		vatpic_pending_intr(sc->vm, &vector);
1652 		KASSERT(vector >= 0 && vector <= 255,
1653 		    ("invalid vector %d from INTR", vector));
1654 
1655 		svm_inject_irq(sc, vcpu, vector);
1656 		vm_extint_clear(sc->vm, vcpu);
1657 		vatpic_intr_accepted(sc->vm, vector);
1658 		ev_state = EIS_EV_INJECTED;
1659 	}
1660 
1661 	return (ev_state);
1662 }
1663 
1664 /*
1665  * Synchronize vLAPIC state and inject any interrupts pending on it.
1666  *
1667  * This is done with host CPU interrupts disabled so notification IPIs will be
1668  * queued on the host APIC and recognized when entering SVM guest context.
1669  */
1670 static enum event_inject_state
1671 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1672     enum event_inject_state ev_state)
1673 {
1674 	struct vmcb_ctrl *ctrl;
1675 	struct vmcb_state *state;
1676 	int vector;
1677 	uint8_t v_tpr;
1678 
1679 	state = svm_get_vmcb_state(sc, vcpu);
1680 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1681 
1682 	/*
1683 	 * The guest can modify the TPR by writing to %cr8. In guest mode the
1684 	 * CPU reflects this write to V_TPR without hypervisor intervention.
1685 	 *
1686 	 * The guest can also modify the TPR by writing to it via the memory
1687 	 * mapped APIC page. In this case, the write will be emulated by the
1688 	 * hypervisor. For this reason V_TPR must be updated before every
1689 	 * VMRUN.
1690 	 */
1691 	v_tpr = vlapic_get_cr8(vlapic);
1692 	KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1693 	if (ctrl->v_tpr != v_tpr) {
1694 		ctrl->v_tpr = v_tpr;
1695 		svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1696 	}
1697 
1698 	/* If an event cannot otherwise be injected, we are done for now */
1699 	if (ev_state != EIS_CAN_INJECT) {
1700 		return (ev_state);
1701 	}
1702 
1703 	if (!vlapic_pending_intr(vlapic, &vector)) {
1704 		return (EIS_CAN_INJECT);
1705 	}
1706 	KASSERT(vector >= 16 && vector <= 255,
1707 	    ("invalid vector %d from local APIC", vector));
1708 
1709 	/*
1710 	 * If the guest has disabled interrupts or is in an interrupt shadow
1711 	 * then we cannot inject the pending interrupt.
1712 	 */
1713 	if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1714 		return (EIS_GI_BLOCK);
1715 	}
1716 
1717 	svm_inject_irq(sc, vcpu, vector);
1718 	vlapic_intr_accepted(vlapic, vector);
1719 	return (EIS_EV_INJECTED);
1720 }
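
/*
 * Note on the TPR handling above: %cr8 architecturally exposes only the
 * high nibble of the 8-bit APIC TPR (TPR[7:4]), which is why
 * vlapic_get_cr8() is expected to return a value in the 0-15 range and
 * why V_TPR is confined to the same 4 bits.  A minimal sketch of the
 * relationship, assuming apic_tpr is the full 8-bit register:
 *
 *	v_tpr = apic_tpr >> 4;		(the %cr8 view of the priority)
 *	apic_tpr = v_tpr << 4;		(and back, low nibble cleared)
 */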
1721 
1722 /*
1723  * Re-check for events to be injected.
1724  *
1725  * Once host CPU interrupts are disabled, check for the presence of any events
1726  * which require injection processing.  If an exit is required upon injection,
1727  * or once the guest becomes interruptable, that will be configured too.
1728  */
1729 static bool
1730 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1731     enum event_inject_state ev_state)
1732 {
1733 	struct vmcb_ctrl *ctrl;
1734 
1735 	ctrl  = svm_get_vmcb_ctrl(sc, vcpu);
1736 
1737 	if (ev_state == EIS_CAN_INJECT) {
1738 		/*
1739 		 * An active interrupt shadow would preclude us from injecting
1740 		 * any events picked up during a re-check.
1741 		 */
1742 		if (ctrl->intr_shadow != 0) {
1743 			return (false);
1744 		}
1745 
1746 		if (vm_nmi_pending(sc->vm, vcpu) &&
1747 		    !svm_nmi_blocked(sc, vcpu)) {
1748 			/* queued NMI not blocked by NMI-window-exiting */
1749 			return (true);
1750 		}
1751 		if (vm_extint_pending(sc->vm, vcpu)) {
1752 			/* queued ExtINT not blocked by existing injection */
1753 			return (true);
1754 		}
1755 	} else {
1756 		if ((ev_state & EIS_REQ_EXIT) != 0) {
1757 			/*
1758 			 * Use a self-IPI to force an immediate exit after
1759 			 * event injection has occurred.
1760 			 */
1761 			poke_cpu(CPU->cpu_id);
1762 		} else {
1763 			/*
1764 			 * If any event is being injected, an exit immediately
1765 			 * upon becoming interruptible again will allow pending
1766 			 * or newly queued events to be injected in a timely
1767 			 * manner.
1768 			 */
1769 			svm_enable_intr_window_exiting(sc, vcpu);
1770 		}
1771 	}
1772 	return (false);
1773 }
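
/*
 * Usage sketch, as seen in svm_vmrun(): a 'true' result means new events
 * arrived while interrupts were disabled, so the caller restores
 * interrupts and takes another lap through injection processing instead
 * of entering the guest:
 *
 *	if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
 *		intr_restore(iflag);
 *		handled = 1;
 *		continue;
 *	}
 */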
1774 
1775 
1776 static void
1777 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1778 {
1779 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1780 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1781 	uint8_t flush;
1782 
1783 	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1784 	    vcpustate->nptgen != nptgen);
1785 
1786 	if (flush != VMCB_TLB_FLUSH_NOTHING) {
1787 		ctrl->asid = vcpustate->hma_asid.hsa_asid;
1788 		svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1789 	}
1790 	ctrl->tlb_ctrl = flush;
1791 	vcpustate->nptgen = nptgen;
1792 }
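
/*
 * The ASID bookkeeping itself lives in hma_svm_asid_update(); as consumed
 * here it is assumed to hand back one of the VMCB_TLB_FLUSH_* directives.
 * Anything other than VMCB_TLB_FLUSH_NOTHING means the vCPU has been given
 * a (potentially new) ASID that must be written back to the VMCB and
 * marked dirty, while the directive itself is passed through tlb_ctrl so
 * the CPU performs the requested flush on the next VMRUN.
 */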
1793 
1794 static void
1795 flush_asid(struct svm_softc *sc, int vcpuid)
1796 {
1797 	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1798 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1799 	uint8_t flush;
1800 
1801 	/* HMA ASID updates are expected to be done with interrupts disabled */
1802 	const ulong_t iflag = intr_clear();
1803 	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1804 	    true);
1805 	intr_restore(iflag);
1806 
1807 	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1808 	ctrl->asid = vcpustate->hma_asid.hsa_asid;
1809 	ctrl->tlb_ctrl = flush;
1810 	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1811 	/*
1812 	 * A potential future optimization: We could choose to update the nptgen
1813 	 * associated with the vCPU, since any pending nptgen change requiring a
1814 	 * flush will be satisfied by the one which has just now been queued.
1815 	 */
1816 }
1817 
1818 static __inline void
1819 svm_dr_enter_guest(struct svm_regctx *gctx)
1820 {
1821 
1822 	/* Save host control debug registers. */
1823 	gctx->host_dr7 = rdr7();
1824 	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1825 
1826 	/*
1827 	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1828 	 * exceptions in the host based on the guest DRx values.  The
1829 	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1830 	 * VMCB.
1831 	 */
1832 	load_dr7(0);
1833 	wrmsr(MSR_DEBUGCTLMSR, 0);
1834 
1835 	/* Save host debug registers. */
1836 	gctx->host_dr0 = rdr0();
1837 	gctx->host_dr1 = rdr1();
1838 	gctx->host_dr2 = rdr2();
1839 	gctx->host_dr3 = rdr3();
1840 	gctx->host_dr6 = rdr6();
1841 
1842 	/* Restore guest debug registers. */
1843 	load_dr0(gctx->sctx_dr0);
1844 	load_dr1(gctx->sctx_dr1);
1845 	load_dr2(gctx->sctx_dr2);
1846 	load_dr3(gctx->sctx_dr3);
1847 }
1848 
1849 static __inline void
1850 svm_dr_leave_guest(struct svm_regctx *gctx)
1851 {
1852 
1853 	/* Save guest debug registers. */
1854 	gctx->sctx_dr0 = rdr0();
1855 	gctx->sctx_dr1 = rdr1();
1856 	gctx->sctx_dr2 = rdr2();
1857 	gctx->sctx_dr3 = rdr3();
1858 
1859 	/*
1860 	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
1861 	 * last.
1862 	 */
1863 	load_dr0(gctx->host_dr0);
1864 	load_dr1(gctx->host_dr1);
1865 	load_dr2(gctx->host_dr2);
1866 	load_dr3(gctx->host_dr3);
1867 	load_dr6(gctx->host_dr6);
1868 	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1869 	load_dr7(gctx->host_dr7);
1870 }
1871 
1872 /*
1873  * Apply the TSC offset for a vCPU, including physical CPU and per-vCPU offsets.
1874  */
1875 static void
1876 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1877 {
1878 	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1879 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1880 
1881 	if (ctrl->tsc_offset != offset) {
1882 		ctrl->tsc_offset = offset;
1883 		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1884 	}
1885 }
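
/*
 * For reference, with plain offsetting (no frequency scaling) the guest
 * observes roughly:
 *
 *	guest_tsc = host_tsc + ctrl->tsc_offset
 *
 * vcpu_tsc_offset() is assumed to fold the per-VM base offset and any
 * per-vCPU adjustment into that single value, which is why the VMCB copy
 * only needs to be rewritten (and marked dirty) when it changes.
 */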
1886 
1887 /*
1888  * Start vcpu with specified RIP.
1889  */
1890 static int
1891 svm_vmrun(void *arg, int vcpu, uint64_t rip)
1892 {
1893 	struct svm_regctx *gctx;
1894 	struct svm_softc *svm_sc;
1895 	struct svm_vcpu *vcpustate;
1896 	struct vmcb_state *state;
1897 	struct vm_exit *vmexit;
1898 	struct vlapic *vlapic;
1899 	vm_client_t *vmc;
1900 	struct vm *vm;
1901 	uint64_t vmcb_pa;
1902 	int handled;
1903 	uint16_t ldt_sel;
1904 
1905 	svm_sc = arg;
1906 	vm = svm_sc->vm;
1907 
1908 	vcpustate = svm_get_vcpu(svm_sc, vcpu);
1909 	state = svm_get_vmcb_state(svm_sc, vcpu);
1910 	vmexit = vm_exitinfo(vm, vcpu);
1911 	vlapic = vm_lapic(vm, vcpu);
1912 	vmc = vm_get_vmclient(vm, vcpu);
1913 
1914 	gctx = svm_get_guest_regctx(svm_sc, vcpu);
1915 	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1916 
1917 	if (vcpustate->lastcpu != curcpu) {
1918 		/*
1919 		 * Force new ASID allocation by invalidating the generation.
1920 		 */
1921 		vcpustate->hma_asid.hsa_gen = 0;
1922 
1923 		/*
1924 		 * Invalidate the VMCB state cache by marking all fields dirty.
1925 		 */
1926 		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1927 
1928 		/*
1929 		 * XXX
1930 		 * Setting 'vcpustate->lastcpu' here is a bit premature because
1931 		 * we may return from this function without actually executing
1932 		 * the VMRUN instruction. This could happen if an AST or yield
1933 		 * condition is pending on the first time through the loop.
1934 		 *
1935 		 * This works for now but any new side-effects of vcpu
1936 		 * migration should take this case into account.
1937 		 */
1938 		vcpustate->lastcpu = curcpu;
1939 		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1940 	}
1941 
1942 	svm_apply_tsc_adjust(svm_sc, vcpu);
1943 
1944 	svm_msr_guest_enter(svm_sc, vcpu);
1945 
1946 	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1947 	vcpustate->loaded = B_TRUE;
1948 
1949 	/* Update Guest RIP */
1950 	state->rip = rip;
1951 
1952 	do {
1953 		enum event_inject_state inject_state;
1954 		uint64_t nptgen;
1955 
1956 		/*
1957 		 * Initial event injection is complex and may involve mutex
1958 		 * contention, so it must be performed with global interrupts
1959 		 * still enabled.
1960 		 */
1961 		inject_state = svm_inject_events(svm_sc, vcpu);
1962 		handled = 0;
1963 
1964 		/*
1965 		 * Disable interrupts while loading VM state and performing
1966 		 * event injection.
1967 		 */
1968 		const ulong_t iflag = intr_clear();
1969 
1970 		/*
1971 		 * Synchronizing and injecting vlapic state is lock-free and is
1972 		 * safe (and prudent) to perform with interrupts disabled.
1973 		 */
1974 		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
1975 		    inject_state);
1976 
1977 		/*
1978 		 * Check for vCPU bail-out conditions.  This must be done after
1979 		 * svm_inject_events() to detect a triple-fault condition.
1980 		 */
1981 		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
1982 			intr_restore(iflag);
1983 			break;
1984 		}
1985 
1986 		if (vcpu_run_state_pending(vm, vcpu)) {
1987 			intr_restore(iflag);
1988 			vm_exit_run_state(vm, vcpu, state->rip);
1989 			break;
1990 		}
1991 
1992 		/*
1993 		 * If subsequent activity queued events which require injection
1994 		 * handling, take another lap to handle them.
1995 		 */
1996 		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
1997 			intr_restore(iflag);
1998 			handled = 1;
1999 			continue;
2000 		}
2001 
2002 		/*
2003 		 * #VMEXIT resumes the host with the guest LDTR, so
2004 		 * save the current LDT selector so it can be restored
2005 		 * after an exit.  The userspace hypervisor probably
2006 		 * doesn't use a LDT, but save and restore it to be
2007 		 * safe.
2008 		 */
2009 		ldt_sel = sldt();
2010 
2011 		/*
2012 		 * Check the vmspace and ASID generations to ensure that the
2013 		 * vcpu does not use stale TLB mappings.
2014 		 */
2015 		nptgen = vmc_table_enter(vmc);
2016 		check_asid(svm_sc, vcpu, curcpu, nptgen);
2017 
2018 		svm_pmu_enter(svm_sc, vcpu);
2019 		vcpu_ustate_change(vm, vcpu, VU_RUN);
2020 		svm_dr_enter_guest(gctx);
2021 		svm_apply_dirty(svm_sc, vcpu);
2022 
2023 		/*
2024 		 * Perform VMRUN to enter guest context.
2025 		 *
2026 		 * This is done with the protection of clearing the GIF
2027 		 * (global interrupt flag) as required by SVM.
2028 		 */
2029 		hma_svm_gif_disable();
2030 		svm_launch(vmcb_pa, gctx, get_pcpu());
2031 		hma_svm_gif_enable();
2032 
2033 		svm_dr_leave_guest(gctx);
2034 		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2035 		svm_pmu_exit(svm_sc, vcpu);
2036 
2037 		/* Restore host LDTR. */
2038 		lldt(ldt_sel);
2039 
2040 		/*
2041 		 * Re-enable interrupts now that necessary CPU state has been
2042 		 * restored.  Subsequent logic may need to block.
2043 		 */
2044 		intr_restore(iflag);
2045 
2046 		vmc_table_exit(vmc);
2047 
2048 		/* Update 'nextrip' */
2049 		vcpustate->nextrip = state->rip;
2050 
2051 		/* Handle #VMEXIT and if required return to user space. */
2052 		handled = svm_vmexit(svm_sc, vcpu, vmexit);
2053 	} while (handled);
2054 
2055 	svm_msr_guest_exit(svm_sc, vcpu);
2056 
2057 	ASSERT(interrupts_enabled());
2058 	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2059 	vcpustate->loaded = B_FALSE;
2060 
2061 	return (0);
2062 }
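
/*
 * Condensed shape of the run loop above, for orientation (several steps,
 * such as DR and PMU handling, are omitted):
 *
 *	do {
 *		svm_inject_events();		interrupts still enabled
 *		intr_clear();
 *		svm_inject_vlapic();
 *		bail-out / run-state checks;	may break out of the loop
 *		svm_inject_recheck();		may restart the loop
 *		sldt(); check_asid();
 *		GIF off; svm_launch(); GIF on;
 *		lldt(); intr_restore();
 *		handled = svm_vmexit();
 *	} while (handled);
 *
 * Everything between intr_clear() and intr_restore() runs with host
 * interrupts disabled, so notification IPIs remain pending on the local
 * APIC until the CPU has entered (or left) guest context.
 */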
2063 
2064 static void
2065 svm_vmcleanup(void *arg)
2066 {
2067 	struct svm_softc *sc = arg;
2068 
2069 	vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
2070 	vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
2071 	kmem_free(sc, sizeof (*sc));
2072 }
2073 
2074 static uint64_t *
2075 swctx_regptr(struct svm_regctx *regctx, int reg)
2076 {
2077 	switch (reg) {
2078 	case VM_REG_GUEST_RBX:
2079 		return (&regctx->sctx_rbx);
2080 	case VM_REG_GUEST_RCX:
2081 		return (&regctx->sctx_rcx);
2082 	case VM_REG_GUEST_RDX:
2083 		return (&regctx->sctx_rdx);
2084 	case VM_REG_GUEST_RDI:
2085 		return (&regctx->sctx_rdi);
2086 	case VM_REG_GUEST_RSI:
2087 		return (&regctx->sctx_rsi);
2088 	case VM_REG_GUEST_RBP:
2089 		return (&regctx->sctx_rbp);
2090 	case VM_REG_GUEST_R8:
2091 		return (&regctx->sctx_r8);
2092 	case VM_REG_GUEST_R9:
2093 		return (&regctx->sctx_r9);
2094 	case VM_REG_GUEST_R10:
2095 		return (&regctx->sctx_r10);
2096 	case VM_REG_GUEST_R11:
2097 		return (&regctx->sctx_r11);
2098 	case VM_REG_GUEST_R12:
2099 		return (&regctx->sctx_r12);
2100 	case VM_REG_GUEST_R13:
2101 		return (&regctx->sctx_r13);
2102 	case VM_REG_GUEST_R14:
2103 		return (&regctx->sctx_r14);
2104 	case VM_REG_GUEST_R15:
2105 		return (&regctx->sctx_r15);
2106 	case VM_REG_GUEST_DR0:
2107 		return (&regctx->sctx_dr0);
2108 	case VM_REG_GUEST_DR1:
2109 		return (&regctx->sctx_dr1);
2110 	case VM_REG_GUEST_DR2:
2111 		return (&regctx->sctx_dr2);
2112 	case VM_REG_GUEST_DR3:
2113 		return (&regctx->sctx_dr3);
2114 	default:
2115 		return (NULL);
2116 	}
2117 }
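
/*
 * Only the registers enumerated above are kept in the software-maintained
 * svm_regctx, which is the context handed to svm_launch() around guest
 * entry.  RAX, RSP, RIP, RFLAGS, the control registers, DR6/DR7, EFER,
 * and all segment state live in the VMCB instead, which is why
 * svm_getreg() and svm_setreg() fall back to vmcb_regptr() or
 * vmcb_segptr() whenever swctx_regptr() returns NULL.
 */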
2118 
2119 static int
2120 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2121 {
2122 	struct svm_softc *sc;
2123 	struct vmcb *vmcb;
2124 	uint64_t *regp;
2125 	uint64_t *fieldp;
2126 	struct vmcb_segment *seg;
2127 
2128 	sc = arg;
2129 	vmcb = svm_get_vmcb(sc, vcpu);
2130 
2131 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2132 	if (regp != NULL) {
2133 		*val = *regp;
2134 		return (0);
2135 	}
2136 
2137 	switch (ident) {
2138 	case VM_REG_GUEST_INTR_SHADOW:
2139 		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2140 		break;
2141 
2142 	case VM_REG_GUEST_CR0:
2143 		svm_get_cr0(sc, vcpu, val);
2144 		break;
2145 	case VM_REG_GUEST_CR2:
2146 	case VM_REG_GUEST_CR3:
2147 	case VM_REG_GUEST_CR4:
2148 	case VM_REG_GUEST_DR6:
2149 	case VM_REG_GUEST_DR7:
2150 	case VM_REG_GUEST_EFER:
2151 	case VM_REG_GUEST_RAX:
2152 	case VM_REG_GUEST_RFLAGS:
2153 	case VM_REG_GUEST_RIP:
2154 	case VM_REG_GUEST_RSP:
2155 		fieldp = vmcb_regptr(vmcb, ident, NULL);
2156 		*val = *fieldp;
2157 		break;
2158 
2159 	case VM_REG_GUEST_CS:
2160 	case VM_REG_GUEST_DS:
2161 	case VM_REG_GUEST_ES:
2162 	case VM_REG_GUEST_FS:
2163 	case VM_REG_GUEST_GS:
2164 	case VM_REG_GUEST_SS:
2165 	case VM_REG_GUEST_LDTR:
2166 	case VM_REG_GUEST_TR:
2167 		seg = vmcb_segptr(vmcb, ident);
2168 		*val = seg->selector;
2169 		break;
2170 
2171 	case VM_REG_GUEST_GDTR:
2172 	case VM_REG_GUEST_IDTR:
2173 		/* GDTR and IDTR don't have segment selectors */
2174 		return (EINVAL);
2175 
2176 	case VM_REG_GUEST_PDPTE0:
2177 	case VM_REG_GUEST_PDPTE1:
2178 	case VM_REG_GUEST_PDPTE2:
2179 	case VM_REG_GUEST_PDPTE3:
2180 		/*
2181 		 * Unlike VMX, where the PDPTEs are explicitly cached as part of
2182 		 * several well-defined events related to paging (such as
2183 		 * loading %cr3), SVM walks the PDPEs (their PDPTE) as part of
2184 		 * nested paging lookups.  This makes these registers
2185 		 * effectively irrelevant on SVM.
2186 		 *
2187 		 * Rather than tossing an error, emit zeroed values so casual
2188 		 * consumers do not need to be as careful about that difference.
2189 		 */
2190 		*val = 0;
2191 		break;
2192 
2193 	default:
2194 		return (EINVAL);
2195 	}
2196 
2197 	return (0);
2198 }
2199 
2200 static int
2201 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2202 {
2203 	struct svm_softc *sc;
2204 	struct vmcb *vmcb;
2205 	uint64_t *regp;
2206 	uint64_t *fieldp;
2207 	uint32_t dirty;
2208 	struct vmcb_segment *seg;
2209 
2210 	sc = arg;
2211 	vmcb = svm_get_vmcb(sc, vcpu);
2212 
2213 	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2214 	if (regp != NULL) {
2215 		*regp = val;
2216 		return (0);
2217 	}
2218 
2219 	dirty = VMCB_CACHE_NONE;
2220 	switch (ident) {
2221 	case VM_REG_GUEST_INTR_SHADOW:
2222 		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2223 		break;
2224 
2225 	case VM_REG_GUEST_EFER:
2226 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2227 		/* EFER_SVM must always be set when the guest is executing */
2228 		*fieldp = val | EFER_SVM;
2229 		dirty |= VMCB_CACHE_CR;
2230 		break;
2231 
2232 	case VM_REG_GUEST_CR0:
2233 		svm_set_cr0(sc, vcpu, val, false);
2234 		break;
2235 	case VM_REG_GUEST_CR2:
2236 	case VM_REG_GUEST_CR3:
2237 	case VM_REG_GUEST_CR4:
2238 	case VM_REG_GUEST_DR6:
2239 	case VM_REG_GUEST_DR7:
2240 	case VM_REG_GUEST_RAX:
2241 	case VM_REG_GUEST_RFLAGS:
2242 	case VM_REG_GUEST_RIP:
2243 	case VM_REG_GUEST_RSP:
2244 		fieldp = vmcb_regptr(vmcb, ident, &dirty);
2245 		*fieldp = val;
2246 		break;
2247 
2248 	case VM_REG_GUEST_CS:
2249 	case VM_REG_GUEST_DS:
2250 	case VM_REG_GUEST_ES:
2251 	case VM_REG_GUEST_SS:
2252 	case VM_REG_GUEST_FS:
2253 	case VM_REG_GUEST_GS:
2254 	case VM_REG_GUEST_LDTR:
2255 	case VM_REG_GUEST_TR:
2256 		dirty |= VMCB_CACHE_SEG;
2257 		seg = vmcb_segptr(vmcb, ident);
2258 		seg->selector = (uint16_t)val;
2259 		break;
2260 
2261 	case VM_REG_GUEST_GDTR:
2262 	case VM_REG_GUEST_IDTR:
2263 		/* GDTR and IDTR don't have segment selectors */
2264 		return (EINVAL);
2265 
2266 	case VM_REG_GUEST_PDPTE0:
2267 	case VM_REG_GUEST_PDPTE1:
2268 	case VM_REG_GUEST_PDPTE2:
2269 	case VM_REG_GUEST_PDPTE3:
2270 		/*
2271 		 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can
2272 		 * ignore attempts to set them.  See handler in svm_getreg() for
2273 		 * more details.
2274 		 */
2275 		break;
2276 
2277 	default:
2278 		return (EINVAL);
2279 	}
2280 
2281 	if (dirty != VMCB_CACHE_NONE) {
2282 		svm_set_dirty(sc, vcpu, dirty);
2283 	}
2284 
2285 	/*
2286 	 * XXX deal with CR3 and invalidate TLB entries tagged with the
2287 	 * vcpu's ASID. This needs to be treated differently depending on
2288 	 * whether 'running' is true/false.
2289 	 */
2290 
2291 	return (0);
2292 }
2293 
2294 static int
2295 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2296 {
2297 	struct vmcb *vmcb;
2298 	struct svm_softc *sc;
2299 	struct vmcb_segment *seg;
2300 
2301 	sc = arg;
2302 	vmcb = svm_get_vmcb(sc, vcpu);
2303 
2304 	switch (reg) {
2305 	case VM_REG_GUEST_CS:
2306 	case VM_REG_GUEST_DS:
2307 	case VM_REG_GUEST_ES:
2308 	case VM_REG_GUEST_SS:
2309 	case VM_REG_GUEST_FS:
2310 	case VM_REG_GUEST_GS:
2311 	case VM_REG_GUEST_LDTR:
2312 	case VM_REG_GUEST_TR:
2313 		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2314 		seg = vmcb_segptr(vmcb, reg);
2315 		/*
2316 		 * Map seg_desc access to VMCB attribute format.
2317 		 *
2318 		 * SVM uses the 'P' bit in the segment attributes to indicate a
2319 		 * NULL segment so clear it if the segment is marked unusable.
2320 		 */
2321 		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2322 		if (SEG_DESC_UNUSABLE(desc->access)) {
2323 			seg->attrib &= ~0x80;
2324 		}
2325 		/*
2326 		 * Keep CPL synced with the DPL specified for %ss.
2327 		 *
2328 		 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD
2329 		 * (unlike Intel), but accepts such a possible deviation for
2330 		 * what is otherwise unreasonable behavior for a guest OS, since
2331 		 * they do the same synchronization.
2332 		 */
2333 		if (reg == VM_REG_GUEST_SS) {
2334 			vmcb->state.cpl = SEG_DESC_DPL(desc->access);
2335 		}
2336 		break;
2337 
2338 	case VM_REG_GUEST_GDTR:
2339 	case VM_REG_GUEST_IDTR:
2340 		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2341 		seg = vmcb_segptr(vmcb, reg);
2342 		break;
2343 
2344 	default:
2345 		return (EINVAL);
2346 	}
2347 
2348 	ASSERT(seg != NULL);
2349 	seg->base = desc->base;
2350 	seg->limit = desc->limit;
2351 
2352 	return (0);
2353 }
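
/*
 * For reference, the VMCB stores segment attributes in AMD's packed
 * 12-bit form: bits 0-7 carry type/S/DPL/P and bits 8-11 carry
 * AVL/L/DB/G.  That is why clearing bit 0x80 (P) above is enough to mark
 * a segment unusable, and why VMCB_ACCESS2ATTR()/VMCB_ATTR2ACCESS() are
 * needed to translate to and from the VT-x style 'access' encoding used
 * by the processor-independent code.
 */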
2354 
2355 static int
2356 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2357 {
2358 	struct vmcb *vmcb;
2359 	struct svm_softc *sc;
2360 	struct vmcb_segment *seg;
2361 
2362 	sc = arg;
2363 	vmcb = svm_get_vmcb(sc, vcpu);
2364 
2365 	switch (reg) {
2366 	case VM_REG_GUEST_DS:
2367 	case VM_REG_GUEST_ES:
2368 	case VM_REG_GUEST_FS:
2369 	case VM_REG_GUEST_GS:
2370 	case VM_REG_GUEST_SS:
2371 	case VM_REG_GUEST_LDTR:
2372 		seg = vmcb_segptr(vmcb, reg);
2373 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2374 		/*
2375 		 * VT-x uses bit 16 to indicate a segment that has been loaded
2376 		 * with a NULL selector (aka unusable). The 'desc->access'
2377 		 * field is interpreted in the VT-x format by the
2378 		 * processor-independent code.
2379 		 *
2380 		 * SVM uses the 'P' bit to convey the same information so
2381 		 * convert it into the VT-x format. For more details refer to
2382 		 * section "Segment State in the VMCB" in APMv2.
2383 		 */
2384 		if ((desc->access & 0x80) == 0) {
2385 			/* Unusable segment */
2386 			desc->access |= 0x10000;
2387 		}
2388 
2389 		/*
2390 		 * Just as CPL (in the VMCB) is kept synced to SS when the
2391 		 * segment is written, so too shall the segment sync from CPL
2392 		 * when it is read.
2393 		 */
2394 		if (reg == VM_REG_GUEST_SS) {
2395 			desc->access &=
2396 			    ~(SEG_DESC_DPL_MASK << SEG_DESC_DPL_SHIFT);
2397 			desc->access |=
2398 			    (vmcb->state.cpl & SEG_DESC_DPL_MASK) <<
2399 			    SEG_DESC_DPL_SHIFT;
2400 		}
2401 		break;
2402 
2403 	case VM_REG_GUEST_CS:
2404 	case VM_REG_GUEST_TR:
2405 		seg = vmcb_segptr(vmcb, reg);
2406 		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2407 		break;
2408 
2409 	case VM_REG_GUEST_GDTR:
2410 	case VM_REG_GUEST_IDTR:
2411 		seg = vmcb_segptr(vmcb, reg);
2412 		/*
2413 		 * Since there are no access bits associated with the GDTR or
2414 		 * the IDTR, zero out the field to ensure it does not contain
2415 		 * garbage which might confuse the consumer.
2416 		 */
2417 		desc->access = 0;
2418 		break;
2419 
2420 	default:
2421 		return (EINVAL);
2422 	}
2423 
2424 	ASSERT(seg != NULL);
2425 	desc->base = seg->base;
2426 	desc->limit = seg->limit;
2427 	return (0);
2428 }
2429 
2430 static int
2431 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
2432 {
2433 	struct svm_softc *sc = arg;
2434 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2435 	const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL);
2436 
2437 	if (msrp != NULL) {
2438 		*valp = *msrp;
2439 		return (0);
2440 	}
2441 
2442 	return (EINVAL);
2443 }
2444 
2445 static int
2446 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val)
2447 {
2448 	struct svm_softc *sc = arg;
2449 	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2450 
2451 	uint32_t dirty = 0;
2452 	uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty);
2453 	if (msrp == NULL) {
2454 		return (EINVAL);
2455 	}
2456 	switch (msr) {
2457 	case MSR_EFER:
2458 		/*
2459 		 * For now, just clone the logic from
2460 		 * svm_setreg():
2461 		 *
2462 		 * EFER_SVM must always be set when the guest is
2463 		 * executing
2464 		 */
2465 		*msrp = val | EFER_SVM;
2466 		break;
2467 	/* TODO: other necessary MSR masking */
2468 	default:
2469 		*msrp = val;
2470 		break;
2471 	}
2472 	if (dirty != 0) {
2473 		svm_set_dirty(sc, vcpu, dirty);
2474 	}
2475 	return (0);
2476 
2477 }
2478 
2479 static int
2480 svm_setcap(void *arg, int vcpu, int type, int val)
2481 {
2482 	struct svm_softc *sc;
2483 	int error;
2484 
2485 	sc = arg;
2486 	error = 0;
2487 	switch (type) {
2488 	case VM_CAP_HALT_EXIT:
2489 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2490 		    VMCB_INTCPT_HLT, val);
2491 		break;
2492 	case VM_CAP_PAUSE_EXIT:
2493 		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2494 		    VMCB_INTCPT_PAUSE, val);
2495 		break;
2496 	default:
2497 		error = ENOENT;
2498 		break;
2499 	}
2500 	return (error);
2501 }
2502 
2503 static int
2504 svm_getcap(void *arg, int vcpu, int type, int *retval)
2505 {
2506 	struct svm_softc *sc;
2507 	int error;
2508 
2509 	sc = arg;
2510 	error = 0;
2511 
2512 	switch (type) {
2513 	case VM_CAP_HALT_EXIT:
2514 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2515 		    VMCB_INTCPT_HLT);
2516 		break;
2517 	case VM_CAP_PAUSE_EXIT:
2518 		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2519 		    VMCB_INTCPT_PAUSE);
2520 		break;
2521 	default:
2522 		error = ENOENT;
2523 		break;
2524 	}
2525 	return (error);
2526 }
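
/*
 * Both capabilities map directly onto VMCB intercept bits.  A minimal
 * (hypothetical) usage sketch:
 *
 *	error = svm_setcap(sc, vcpu, VM_CAP_HALT_EXIT, 1);
 *
 * sets VMCB_INTCPT_HLT in the CTRL1 intercept vector, so a guest HLT
 * produces a #VMEXIT instead of halting the CPU inside guest context.
 */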
2527 
2528 static struct vlapic *
2529 svm_vlapic_init(void *arg, int vcpuid)
2530 {
2531 	struct svm_softc *svm_sc;
2532 	struct vlapic *vlapic;
2533 
2534 	svm_sc = arg;
2535 	vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
2536 	vlapic->vm = svm_sc->vm;
2537 	vlapic->vcpuid = vcpuid;
2538 	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2539 
2540 	vlapic_init(vlapic);
2541 
2542 	return (vlapic);
2543 }
2544 
2545 static void
2546 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2547 {
2548 	vlapic_cleanup(vlapic);
2549 	kmem_free(vlapic, sizeof (struct vlapic));
2550 }
2551 
2552 static void
2553 svm_pause(void *arg, int vcpu)
2554 {
2555 	struct svm_softc *sc = arg;
2556 	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
2557 
2558 	/*
2559 	 * If an event is pending injection in the VMCB, stash it in
2560 	 * exit_intinfo as if it were deferred by an exit from guest context.
2561 	 */
2562 	const uint64_t intinfo = ctrl->eventinj;
2563 	if ((intinfo & VMCB_EVENTINJ_VALID) != 0) {
2564 		svm_stash_intinfo(sc, vcpu, intinfo);
2565 		ctrl->eventinj = 0;
2566 	}
2567 
2568 	/*
2569 	 * Now that no event is pending injection, interrupt-window exiting and
2570 	 * NMI-blocking can be disabled.  If/when this vCPU is made to run
2571 	 * again, those conditions will be reinstated when the now-queued events
2572 	 * are re-injected.
2573 	 */
2574 	svm_disable_intr_window_exiting(sc, vcpu);
2575 	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
2576 }
2577 
2578 static void
2579 svm_savectx(void *arg, int vcpu)
2580 {
2581 	struct svm_softc *sc = arg;
2582 
2583 	/* We should _never_ go off-CPU with the GIF disabled */
2584 	ASSERT(!hma_svm_gif_is_disabled());
2585 
2586 	if (sc->vcpu[vcpu].loaded) {
2587 		svm_msr_guest_exit(sc, vcpu);
2588 	}
2589 }
2590 
2591 static void
2592 svm_restorectx(void *arg, int vcpu)
2593 {
2594 	struct svm_softc *sc = arg;
2595 
2596 	if (sc->vcpu[vcpu].loaded) {
2597 		svm_msr_guest_enter(sc, vcpu);
2598 	}
2599 }
2600 
2601 static freqratio_res_t
2602 svm_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult)
2603 {
2604 	/*
2605 	 * Check whether scaling is needed at all before potentially erroring
2606 	 * out for other reasons.
2607 	 */
2608 	if (guest_hz == host_hz) {
2609 		return (FR_SCALING_NOT_NEEDED);
2610 	}
2611 
2612 	/*
2613 	 * Confirm that scaling is available.
2614 	 */
2615 	if (!has_tsc_freq_ctl()) {
2616 		return (FR_SCALING_NOT_SUPPORTED);
2617 	}
2618 
2619 	/*
2620 	 * Verify the guest_hz is within the supported range.
2621 	 */
2622 	if ((guest_hz < AMD_TSC_MIN_FREQ) ||
2623 	    (guest_hz >= (host_hz * AMD_TSC_MAX_FREQ_RATIO))) {
2624 		return (FR_OUT_OF_RANGE);
2625 	}
2626 
2627 	/* Calculate the multiplier. */
2628 	uint64_t m = vmm_calc_freq_multiplier(guest_hz, host_hz,
2629 	    AMD_TSCM_FRAC_SIZE);
2630 	*mult = m;
2631 
2632 	return (FR_VALID);
2633 }
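
/*
 * Worked example (numbers are illustrative): a 2.0 GHz guest on a 2.5 GHz
 * host needs scaling and is within range, and the resulting multiplier is
 * a fixed-point encoding of guest_hz / host_hz with AMD_TSCM_FRAC_SIZE
 * fractional bits, conceptually:
 *
 *	mult ~= (guest_hz << AMD_TSCM_FRAC_SIZE) / host_hz    (i.e. 0.8)
 *
 * The exact arithmetic and rounding live in vmm_calc_freq_multiplier(),
 * which is not shown here.
 */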
2634 
2635 struct vmm_ops vmm_ops_amd = {
2636 	.init		= svm_init,
2637 	.cleanup	= svm_cleanup,
2638 	.resume		= svm_restore,
2639 
2640 	.vminit		= svm_vminit,
2641 	.vmrun		= svm_vmrun,
2642 	.vmcleanup	= svm_vmcleanup,
2643 	.vmgetreg	= svm_getreg,
2644 	.vmsetreg	= svm_setreg,
2645 	.vmgetdesc	= svm_getdesc,
2646 	.vmsetdesc	= svm_setdesc,
2647 	.vmgetcap	= svm_getcap,
2648 	.vmsetcap	= svm_setcap,
2649 	.vlapic_init	= svm_vlapic_init,
2650 	.vlapic_cleanup	= svm_vlapic_cleanup,
2651 	.vmpause	= svm_pause,
2652 
2653 	.vmsavectx	= svm_savectx,
2654 	.vmrestorectx	= svm_restorectx,
2655 
2656 	.vmgetmsr	= svm_get_msr,
2657 	.vmsetmsr	= svm_set_msr,
2658 
2659 	.vmfreqratio	= svm_freq_ratio,
2660 	.fr_intsize	= AMD_TSCM_INT_SIZE,
2661 	.fr_fracsize	= AMD_TSCM_FRAC_SIZE,
2662 };
2663