/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2018 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>

#include <sys/x86_archext.h>
#include <sys/trap.h>

#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/md_var.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_kernel.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_ioport.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"

#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
#include "svm_msr.h"

SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    NULL);

#define	VMCB_CACHE_DEFAULT	(VMCB_CACHE_ASID |	\
	VMCB_CACHE_IOPM |	\
	VMCB_CACHE_I |		\
	VMCB_CACHE_TPR |	\
	VMCB_CACHE_CR2 |	\
	VMCB_CACHE_CR |		\
	VMCB_CACHE_DR |		\
	VMCB_CACHE_DT |		\
	VMCB_CACHE_SEG |	\
	VMCB_CACHE_NP)
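
/*
 * A note on the above (editorial, describing the VMCB "clean" mechanism):
 * each flag corresponds to a VMCB clean bit; a set bit tells the processor
 * that the associated guest state block is unchanged since the last VMRUN on
 * this core and may be reused from its cached copy.  svm_set_dirty() marks
 * blocks the hypervisor has modified so their clean bits are cleared before
 * the next entry.
 */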

/*
 * Guardrails for supported guest TSC frequencies.
 *
 * A minimum of 0.5 GHz, which should be sufficient for all recent AMD CPUs,
 * and a maximum of 15 times the host frequency, which is sufficient to
 * prevent overflowing frequency calculations and gives plenty of headroom for
 * future CPU frequency increases.
 */
#define	AMD_TSC_MIN_FREQ	500000000
#define	AMD_TSC_MAX_FREQ_RATIO	15
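
/*
 * For example (illustrative arithmetic only): on a 2.0 GHz host, these
 * guardrails admit guest TSC frequencies from 500 MHz up to 30 GHz.
 */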

static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;

/* SVM features advertised by CPUID.8000000AH:EDX */
static uint32_t svm_feature = 0;

static int disable_npf_assist;

static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");

static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
static void flush_asid(struct svm_softc *sc, int vcpuid);

static __inline bool
has_flush_by_asid(void)
{
	return ((svm_feature & CPUID_AMD_EDX_FLUSH_ASID) != 0);
}

static __inline bool
has_lbr_virt(void)
{
	return ((svm_feature & CPUID_AMD_EDX_LBR_VIRT) != 0);
}

static __inline bool
has_decode_assist(void)
{
	return ((svm_feature & CPUID_AMD_EDX_DECODE_ASSISTS) != 0);
}

static __inline bool
has_tsc_freq_ctl(void)
{
	return ((svm_feature & CPUID_AMD_EDX_TSC_RATE_MSR) != 0);
}

static int
svm_cleanup(void)
{
	/* This is taken care of by the hma registration */
	return (0);
}

static int
svm_init(void)
{
	/* Grab a (bhyve) local copy of the SVM feature bits */
	struct cpuid_regs regs = {
		.cp_eax = 0x8000000a,
	};
	(void) cpuid_insn(NULL, &regs);
	svm_feature = regs.cp_edx;

	/*
	 * HMA should have already checked for these features, which we refuse
	 * to operate without, but there is no harm in making sure.
	 */
	const uint32_t demand_bits =
	    (CPUID_AMD_EDX_NESTED_PAGING | CPUID_AMD_EDX_NRIPS);
	VERIFY((svm_feature & demand_bits) == demand_bits);

	/* Clear any unexpected bits (set manually) from vmcb_clean */
	vmcb_clean &= VMCB_CACHE_DEFAULT;

	return (0);
}

static void
svm_restore(void)
{
	/* No-op on illumos */
}

/* Pentium compatible MSRs */
#define	MSR_PENTIUM_START	0
#define	MSR_PENTIUM_END		0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define	MSR_AMD6TH_START	0xC0000000UL
#define	MSR_AMD6TH_END		0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define	MSR_AMD7TH_START	0xC0010000UL
#define	MSR_AMD7TH_END		0xC0011FFFUL

/*
 * Get the index and bit position for an MSR in the permission bitmap.
 * Two bits are used for each MSR: the lower bit for read and the higher bit
 * for write.
 */
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
	uint32_t base, off;

	*index = -1;
	*bit = (msr % 4) * 2;
	base = 0;

	if (msr <= MSR_PENTIUM_END) {
		*index = msr / 4;
		return (0);
	}

	base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
	if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
		off = (msr - MSR_AMD6TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
	if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
		off = (msr - MSR_AMD7TH_START);
		*index = (off + base) / 4;
		return (0);
	}

	return (EINVAL);
}
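
/*
 * Worked example (illustrative only): MSR_LSTAR (0xC0000082) falls in the AMD
 * 6th-generation range, so off = 0x82 and base = 0x2000 (the 0x2000 Pentium
 * MSRs precede it).  That yields index = (0x82 + 0x2000) / 4 = 0x820 and
 * bit = (0x82 % 4) * 2 = 4: read permission is bit 4 and write permission is
 * bit 5 of perm_bitmap[0x820].
 */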

/*
 * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
 */
static void
svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
{
	int index, bit, error;

	error = svm_msr_index(msr, &index, &bit);
	KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
	KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
	    ("%s: invalid index %d for msr %lx", __func__, index, msr));
	KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
	    "msr %lx", __func__, bit, msr));

	if (read)
		perm_bitmap[index] &= ~(1UL << bit);

	if (write)
		perm_bitmap[index] &= ~(2UL << bit);
}

static void
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	svm_msr_perm(perm_bitmap, msr, true, true);
}

static void
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{
	svm_msr_perm(perm_bitmap, msr, true, false);
}

static __inline int
svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	return (ctrl->intercept[idx] & bitmask ? 1 : 0);
}

static __inline void
svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
    int enabled)
{
	struct vmcb_ctrl *ctrl;
	uint32_t oldval;

	KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	oldval = ctrl->intercept[idx];

	if (enabled)
		ctrl->intercept[idx] |= bitmask;
	else
		ctrl->intercept[idx] &= ~bitmask;

	if (ctrl->intercept[idx] != oldval) {
		svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
	}
}

static __inline void
svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
	svm_set_intercept(sc, vcpu, off, bitmask, 0);
}

static __inline void
svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
	svm_set_intercept(sc, vcpu, off, bitmask, 1);
}

static void
vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
    uint64_t msrpm_base_pa, uint64_t np_pml4)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	uint32_t mask;
	int n;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	ctrl->iopm_base_pa = iopm_base_pa;
	ctrl->msrpm_base_pa = msrpm_base_pa;

	/* Enable nested paging */
	ctrl->np_ctrl = NP_ENABLE;
	ctrl->n_cr3 = np_pml4;

	/*
	 * Intercept accesses to the control registers that are not shadowed
	 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
	 */
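	/*
	 * In the CR intercept vector, the low 16 bits control reads of
	 * %cr0-%cr15 and the high 16 bits control the corresponding writes,
	 * hence the paired read/write mask built below.
	 */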
	for (n = 0; n < 16; n++) {
		mask = (BIT(n) << 16) | BIT(n);
		if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
			svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
		else
			svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
	}

	/*
	 * Selectively intercept writes to %cr0.  This triggers on operations
	 * which would change bits other than TS or MP.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_CR0_WRITE);

	/*
	 * Intercept everything when tracing guest exceptions; otherwise just
	 * intercept the machine check exception.
	 */
	if (vcpu_trace_exceptions(sc->vm, vcpu)) {
		for (n = 0; n < 32; n++) {
			/*
			 * Skip unimplemented vectors in the exception bitmap.
			 */
			if (n == 2 || n == 9) {
				continue;
			}
			svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
		}
	} else {
		svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
	}

	/* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_FERR_FREEZE);

	/* Enable exit-on-hlt by default */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);

	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);

	/* Intercept privileged invalidation instructions. */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);

	/*
	 * Intercept all virtualization-related instructions.
	 *
	 * From section "Canonicalization and Consistency Checks" in APMv2
	 * the VMRUN intercept bit must be set to pass the consistency check.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
	if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
		svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
		    VMCB_INTCPT_WBINVD);
	}

	/*
	 * The ASID will be set to a non-zero value just before VMRUN.
	 */
	ctrl->asid = 0;

	/*
	 * Section 15.21.1, Interrupt Masking in EFLAGS
	 * Section 15.21.2, Virtualizing APIC.TPR
	 *
	 * This must be set for %rflags and %cr8 isolation of guest and host.
	 */
	ctrl->v_intr_ctrl |= V_INTR_MASKING;

	/* Enable Last Branch Record aka LBR-virt (if available) */
	if (has_lbr_virt()) {
		ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
	}

	/* EFER_SVM must always be set when the guest is executing */
	state->efer = EFER_SVM;

	/* Set up the PAT to power-on state */
	state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	/* Set up DR6/7 to power-on state */
	state->dr6 = DBREG_DR6_RESERVED1;
	state->dr7 = DBREG_DR7_RESERVED1;
}

/*
 * Initialize a virtual machine.
 */
static void *
svm_vminit(struct vm *vm)
{
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpu;
	vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
	int i;
	uint16_t maxcpus;

	svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
	VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0);

	svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
	if (svm_sc->msr_bitmap == NULL)
		panic("contigmalloc of SVM MSR bitmap failed");
	svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
	if (svm_sc->iopm_bitmap == NULL)
		panic("contigmalloc of SVM IO bitmap failed");

	svm_sc->vm = vm;
	svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));

	/*
	 * Intercept read and write accesses to all MSRs.
	 */
	memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);

	/*
	 * Access to the following MSRs is redirected to the VMCB when the
	 * guest is executing.  Therefore it is safe to allow the guest to
	 * read/write these MSRs directly without hypervisor involvement.
	 */
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);

	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
	svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);

	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);

	/*
	 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
	 */
	svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);

	/* Intercept access to all I/O ports. */
	memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);

	iopm_pa = vtophys(svm_sc->iopm_bitmap);
	msrpm_pa = vtophys(svm_sc->msr_bitmap);
	pml4_pa = svm_sc->nptp;
	maxcpus = vm_get_maxcpus(svm_sc->vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = svm_get_vcpu(svm_sc, i);
		vcpu->nextrip = ~0;
		vcpu->lastcpu = NOCPU;
		vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
		vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
		svm_msr_guest_init(svm_sc, i);
	}
	return (svm_sc);
}

/*
 * Collateral for a generic SVM VM-exit.
 */
static void
vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
{
	vme->exitcode = VM_EXITCODE_SVM;
	vme->u.svm.exitcode = code;
	vme->u.svm.exitinfo1 = info1;
	vme->u.svm.exitinfo2 = info2;
}

static enum vm_cpu_mode
svm_vcpu_mode(struct vmcb *vmcb)
{
	struct vmcb_state *state;

	state = &vmcb->state;

	if (state->efer & EFER_LMA) {
		struct vmcb_segment *seg;

		/*
		 * Section 4.8.1 of APM2: check if the Code Segment has the
		 * Long attribute set in its descriptor.
		 */
		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
		if (seg->attrib & VMCB_CS_ATTRIB_L)
			return (CPU_MODE_64BIT);
		else
			return (CPU_MODE_COMPATIBILITY);
	} else if (state->cr0 & CR0_PE) {
		return (CPU_MODE_PROTECTED);
	} else {
		return (CPU_MODE_REAL);
	}
}

static enum vm_paging_mode
svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
{
	if ((cr0 & CR0_PG) == 0)
		return (PAGING_MODE_FLAT);
	if ((cr4 & CR4_PAE) == 0)
		return (PAGING_MODE_32);
	if (efer & EFER_LME)
		return (PAGING_MODE_64);
	else
		return (PAGING_MODE_PAE);
}

static void
svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
	struct vmcb_state *state;

	state = &vmcb->state;
	paging->cr3 = state->cr3;
	paging->cpl = state->cpl;
	paging->cpu_mode = svm_vcpu_mode(vmcb);
	paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
	    state->efer);
}

#define	UNHANDLED 0

/*
 * Handle guest I/O intercept.
 */
static int
svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;
	struct vm_inout *inout;
	struct vie *vie;
	uint64_t info1;
	struct vm_guest_paging paging;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	inout = &vmexit->u.inout;
	info1 = ctrl->exitinfo1;
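
	/*
	 * IOIO EXITINFO1 layout, per APMv2: bit 0 is the direction (1 for
	 * IN), bit 2 the string flag, bit 3 the REP prefix, bits 6:4 a
	 * one-hot operand size (1/2/4 bytes), bits 9:7 a one-hot address
	 * size, bits 12:10 the effective segment (with decode assist), and
	 * bits 31:16 the port number.
	 */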
	inout->bytes = (info1 >> 4) & 0x7;
	inout->flags = 0;
	inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
	inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
	inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
	inout->port = (uint16_t)(info1 >> 16);
	inout->eax = (uint32_t)(state->rax);

	if ((inout->flags & INOUT_STR) != 0) {
		/*
		 * The effective segment number in EXITINFO1[12:10] is
		 * populated only if the processor has the DecodeAssist
		 * capability.
		 *
		 * This is not specified explicitly in APMv2 but can be
		 * verified empirically.
		 */
		if (!has_decode_assist()) {
			/*
			 * Without decoding assistance, punt the task of
			 * emulating the ins/outs to userspace.
			 */
			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
			bzero(&vmexit->u.inst_emul,
			    sizeof (vmexit->u.inst_emul));
			return (UNHANDLED);
		}

		/*
		 * Bits 7-9 encode the address size of ins/outs operations
		 * where the 1/2/4 values correspond to 16/32/64 bit sizes.
		 */
		inout->addrsize = 2 * ((info1 >> 7) & 0x7);
		VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
		    inout->addrsize == 8);

		if (inout->flags & INOUT_IN) {
			/*
			 * For INS instructions, %es (encoded as 0) is the
			 * implied segment for the operation.
			 */
			inout->segment = 0;
		} else {
			/*
			 * Bits 10-12 encode the segment for OUTS.
			 * This value follows the standard x86 segment order.
			 */
			inout->segment = (info1 >> 10) & 0x7;
		}
	}

	vmexit->exitcode = VM_EXITCODE_INOUT;
	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
	vie = vm_vie_ctx(svm_sc->vm, vcpu);
	vie_init_inout(vie, inout, vmexit->inst_length, &paging);

	/* The in/out emulation will handle advancing %rip */
	vmexit->inst_length = 0;

	return (UNHANDLED);
}

static int
npf_fault_type(uint64_t exitinfo1)
{
	if (exitinfo1 & VMCB_NPF_INFO1_W)
		return (PROT_WRITE);
	else if (exitinfo1 & VMCB_NPF_INFO1_ID)
		return (PROT_EXEC);
	else
		return (PROT_READ);
}

static bool
svm_npf_emul_fault(uint64_t exitinfo1)
{
	if (exitinfo1 & VMCB_NPF_INFO1_ID) {
		return (false);
	}

	if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
		return (false);
	}

	if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
		return (false);
	}

	return (true);
}

static void
svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
    uint64_t gpa)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb *vmcb;
	struct vie *vie;
	struct vm_guest_paging paging;
	struct vmcb_segment *seg;
	char *inst_bytes = NULL;
	uint8_t inst_len = 0;

	vmcb = svm_get_vmcb(svm_sc, vcpu);
	ctrl = &vmcb->ctrl;

	vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
	vmexit->u.mmio_emul.gpa = gpa;
	vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
	svm_paging_info(vmcb, &paging);

	switch (paging.cpu_mode) {
	case CPU_MODE_REAL:
		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
		vmexit->u.mmio_emul.cs_base = seg->base;
		vmexit->u.mmio_emul.cs_d = 0;
		break;
	case CPU_MODE_PROTECTED:
	case CPU_MODE_COMPATIBILITY:
		seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
		vmexit->u.mmio_emul.cs_base = seg->base;

		/*
		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
		 */
		vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
		    1 : 0;
		break;
	default:
		vmexit->u.mmio_emul.cs_base = 0;
		vmexit->u.mmio_emul.cs_d = 0;
		break;
	}

	/*
	 * Copy the instruction bytes into 'vie' if available.
	 */
	if (has_decode_assist() && !disable_npf_assist) {
		inst_len = ctrl->inst_len;
		inst_bytes = (char *)ctrl->inst_bytes;
	}
	vie = vm_vie_ctx(svm_sc->vm, vcpu);
	vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
}

/*
 * Do not allow CD, NW, or invalid high bits to be asserted in the value of
 * cr0 which is live in the guest.  They are visible via the shadow instead.
 */
#define	SVM_CR0_MASK ~(CR0_CD | CR0_NW | 0xffffffff00000000)
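
/*
 * For example, if the guest sets CR0_CD, the bit is retained in
 * sctx_cr0_shadow (and reported back on reads via svm_get_cr0()) while being
 * masked out of the live state->cr0, so guest caching remains enabled.
 */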

static void
svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
{
	struct vmcb_state *state;
	struct svm_regctx *regctx;
	uint64_t masked, old, diff;

	state = svm_get_vmcb_state(svm_sc, vcpu);
	regctx = svm_get_guest_regctx(svm_sc, vcpu);

	old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
	diff = old ^ val;

	/* No further work needed if register contents remain the same */
	if (diff == 0) {
		return;
	}

	/* Flush the TLB if the paging or write-protect bits are changing */
	if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
		flush_asid(svm_sc, vcpu);
	}

	/*
	 * If the change in %cr0 is due to a guest action (via interception)
	 * then other CPU state updates may be required.
	 */
	if (guest_write) {
		if ((diff & CR0_PG) != 0) {
			uint64_t efer = state->efer;

			/* Keep the long-mode state in EFER in sync */
			if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
				state->efer |= EFER_LMA;
			}
			if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
				state->efer &= ~EFER_LMA;
			}
		}
	}

	masked = val & SVM_CR0_MASK;
	regctx->sctx_cr0_shadow = val;
	state->cr0 = masked;
	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);

	if ((masked ^ val) != 0) {
		/*
		 * The guest has set bits in %cr0 which we are masking out and
		 * exposing via shadow.
		 *
		 * We must intercept %cr0 reads in order to make the shadowed
		 * view available to the guest.
		 *
		 * Writes to %cr0 must also be intercepted (unconditionally,
		 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
		 * if/when the guest clears those shadowed bits.
		 */
		svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
		    BIT(0) | BIT(16));
	} else {
		/*
		 * When no bits remain in %cr0 which require shadowing, the
		 * unconditional intercept of reads/writes to %cr0 can be
		 * disabled.
		 *
		 * The selective write intercept (VMCB_INTCPT_CR0_WRITE)
		 * remains in place so we can be notified of operations which
		 * change bits other than TS or MP.
		 */
		svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
		    BIT(0) | BIT(16));
	}
	svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
}

static void
svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
{
	struct vmcb *vmcb;
	struct svm_regctx *regctx;

	vmcb = svm_get_vmcb(svm_sc, vcpu);
	regctx = svm_get_guest_regctx(svm_sc, vcpu);

	/*
	 * Include the %cr0 bits which exist only in the shadow along with
	 * those in the running vCPU state.
	 */
	*val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
}

static void
svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
{
	uint64_t val;
	int err __maybe_unused;

	svm_get_cr0(svm_sc, vcpu, &val);
	err = svm_setreg(svm_sc, vcpu, reg, val);
	ASSERT(err == 0);
}

static void
svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
{
	struct vmcb_state *state;
	uint64_t val;
	int err __maybe_unused;

	state = svm_get_vmcb_state(svm_sc, vcpu);

	err = svm_getreg(svm_sc, vcpu, reg, &val);
	ASSERT(err == 0);

	if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
		/* NW without CD is nonsensical */
		vm_inject_gp(svm_sc->vm, vcpu);
		return;
	}
	if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
		/* PG requires PE */
		vm_inject_gp(svm_sc->vm, vcpu);
		return;
	}
	if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
		/* When enabling paging, PAE must be enabled if LME is. */
		if ((state->efer & EFER_LME) != 0 &&
		    (state->cr4 & CR4_PAE) == 0) {
			vm_inject_gp(svm_sc->vm, vcpu);
			return;
		}
	}

	svm_set_cr0(svm_sc, vcpu, val, true);
}

static void
svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vie *vie;
	struct vm_guest_paging paging;

	/* Let the instruction emulation (hopefully in-kernel) handle it */
	vmexit->exitcode = VM_EXITCODE_INST_EMUL;
	bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
	vie = vm_vie_ctx(svm_sc->vm, vcpu);
	svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
	vie_init_other(vie, &paging);

	/* The instruction emulation will handle advancing %rip */
	vmexit->inst_length = 0;
}

static void
svm_update_virqinfo(struct svm_softc *sc, int vcpu)
{
	struct vm *vm;
	struct vlapic *vlapic;
	struct vmcb_ctrl *ctrl;

	vm = sc->vm;
	vlapic = vm_lapic(vm, vcpu);
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	/* Update %cr8 in the emulated vlapic */
	vlapic_set_cr8(vlapic, ctrl->v_tpr);

	/* Virtual interrupt injection is not used. */
	KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
	    "v_intr_vector %d", __func__, ctrl->v_intr_vector));
}

CTASSERT(VMCB_EVENTINJ_TYPE_INTR == VM_INTINFO_HWINTR);
CTASSERT(VMCB_EVENTINJ_TYPE_NMI == VM_INTINFO_NMI);
CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION == VM_INTINFO_HWEXCP);
CTASSERT(VMCB_EVENTINJ_TYPE_INTn == VM_INTINFO_SWINTR);
CTASSERT(VMCB_EVENTINJ_EC_VALID == VM_INTINFO_DEL_ERRCODE);
CTASSERT(VMCB_EVENTINJ_VALID == VM_INTINFO_VALID);

/*
 * Store SVM-specific event injection info for later handling.  This depends
 * on the bhyve-internal event definitions matching those in the VMCB, as
 * ensured by the above CTASSERTs.
 */
static void
svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
{
	ASSERT(VMCB_EXITINTINFO_VALID(intinfo));

	/*
	 * If stashing an NMI pending injection, ensure that it bears the
	 * correct vector which exit_intinfo expects.
	 */
	if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
		intinfo &= ~VM_INTINFO_MASK_VECTOR;
		intinfo |= IDT_NMI;
	}

	VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
}

static void
svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
{
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	uint64_t intinfo = ctrl->exitintinfo;

	if (VMCB_EXITINTINFO_VALID(intinfo)) {
		/*
		 * If a #VMEXIT happened during event delivery then record the
		 * event that was being delivered.
		 */
		vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);

		svm_stash_intinfo(svm_sc, vcpu, intinfo);
	}
}

static __inline int
vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
{
	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_VINTR));
}

static void
svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;
	struct vmcb_state *state;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	state = svm_get_vmcb_state(sc, vcpu);

	if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
		KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
		    ("%s: invalid v_ign_tpr", __func__));
		KASSERT(vintr_intercept_enabled(sc, vcpu),
		    ("%s: vintr intercept should be enabled", __func__));
		return;
	}

	/*
	 * We use V_IRQ in conjunction with the VINTR intercept to trap into
	 * the hypervisor as soon as a virtual interrupt can be delivered.
	 *
	 * Since injected events are not subject to intercept checks we need
	 * to ensure that the V_IRQ is not actually going to be delivered on
	 * VM entry.
	 */
	VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
	    (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);

	ctrl->v_irq |= V_IRQ;
	ctrl->v_intr_prio |= V_IGN_TPR;
	ctrl->v_intr_vector = 0;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}

static void
svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
		KASSERT(!vintr_intercept_enabled(sc, vcpu),
		    ("%s: vintr intercept should be disabled", __func__));
		return;
	}

	ctrl->v_irq &= ~V_IRQ;
	ctrl->v_intr_vector = 0;
	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}

/*
 * Once an NMI is injected it blocks delivery of further NMIs until the
 * handler executes an IRET.  The IRET intercept is enabled when an NMI is
 * injected to track when the vcpu is done handling the NMI.
 */
static int
svm_nmi_blocked(struct svm_softc *sc, int vcpu)
{
	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
	    VMCB_INTCPT_IRET));
}

static void
svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl;

	KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
	/*
	 * When the IRET intercept is cleared the vcpu will attempt to execute
	 * the "iret" when it runs next.  However, it is possible to inject
	 * another NMI into the vcpu before the "iret" has actually executed.
	 *
	 * For example, if the "iret" encounters a #NPF when accessing the
	 * stack it will trap back into the hypervisor.  If an NMI is pending
	 * for the vcpu it will be injected into the guest.
	 *
	 * XXX this needs to be fixed
	 */
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);

	/*
	 * Set an interrupt shadow to prevent an NMI from being immediately
	 * injected on the next VMRUN.
	 */
	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
	ctrl->intr_shadow = 1;
}

static void
svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
{
	ASSERT(VM_INTINFO_PENDING(info));

	uint8_t vector = VM_INTINFO_VECTOR(info);
	uint32_t type = VM_INTINFO_TYPE(info);

	/*
	 * Correct behavior depends on bhyve intinfo event types lining up
	 * with those defined by AMD for event injection in the VMCB.  The
	 * CTASSERTs above svm_save_exitintinfo() ensure it.
	 */
	switch (type) {
	case VM_INTINFO_NMI:
		/* Ensure vector for injected event matches its type (NMI) */
		vector = IDT_NMI;
		break;
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
		break;
	case VM_INTINFO_HWEXCP:
		if (vector == IDT_NMI) {
			/*
			 * NMIs are expected to be injected with
			 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
			 * with the NMI vector.
			 */
			type = VM_INTINFO_NMI;
		}
		VERIFY(vector < 32);
		break;
	default:
		/*
		 * Since there is no strong validation for injected event
		 * types at this point, fall back to software interrupt for
		 * those we do not recognize.
		 */
		type = VM_INTINFO_SWINTR;
		break;
	}

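	/*
	 * EVENTINJ field layout, per APMv2: bits 7:0 hold the vector,
	 * bits 10:8 the event type, bit 11 the error-code-valid flag,
	 * bit 31 the valid flag, and bits 63:32 the error code.
	 */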
	ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
	if (VM_INTINFO_HAS_ERRCODE(info)) {
		ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
		ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
	}
}

static void
svm_inject_nmi(struct svm_softc *sc, int vcpu)
{
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	ASSERT(!svm_nmi_blocked(sc, vcpu));

	ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
	vm_nmi_clear(sc->vm, vcpu);

	/*
	 * Virtual NMI blocking is now in effect.
	 *
	 * Not only does this block a subsequent NMI injection from taking
	 * place, it also configures an intercept on the IRET so we can track
	 * when the next injection can take place.
	 */
	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
}

static void
svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
{
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	ASSERT(vector >= 0 && vector <= 255);

	ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
}

#define	EFER_MBZ_BITS	0xFFFFFFFFFFFF0200UL

static vm_msr_result_t
svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
{
	struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
	uint64_t lma;
	int error;

	newval &= ~0xFE;	/* clear the Read-As-Zero (RAZ) bits */

	if (newval & EFER_MBZ_BITS) {
		return (VMR_GP);
	}

	/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
	const uint64_t changed = state->efer ^ newval;
	if (changed & EFER_LME) {
		if (state->cr0 & CR0_PG) {
			return (VMR_GP);
		}
	}

	/* EFER.LMA = EFER.LME & CR0.PG */
	if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
		lma = EFER_LMA;
	} else {
		lma = 0;
	}
	if ((newval & EFER_LMA) != lma) {
		return (VMR_GP);
	}

	if ((newval & EFER_NXE) != 0 &&
	    !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
		return (VMR_GP);
	}
	if ((newval & EFER_FFXSR) != 0 &&
	    !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
		return (VMR_GP);
	}
	if ((newval & EFER_TCE) != 0 &&
	    !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
		return (VMR_GP);
	}

	/*
	 * Until bhyve has proper support for long-mode segment limits, just
	 * toss a #GP at the guest if they attempt to use it.
	 */
	if (newval & EFER_LMSLE) {
		return (VMR_GP);
	}

	error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
	VERIFY0(error);
	return (VMR_OK);
}

static int
svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
    bool is_wrmsr)
{
	struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
	struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
	const uint32_t ecx = ctx->sctx_rcx;
	vm_msr_result_t res;
	uint64_t val = 0;

	if (is_wrmsr) {
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
		val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;

		if (vlapic_owned_msr(ecx)) {
			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);

			res = vlapic_wrmsr(vlapic, ecx, val);
		} else if (ecx == MSR_EFER) {
			res = svm_write_efer(svm_sc, vcpu, val);
		} else {
			res = svm_wrmsr(svm_sc, vcpu, ecx, val);
		}
	} else {
		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);

		if (vlapic_owned_msr(ecx)) {
			struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);

			res = vlapic_rdmsr(vlapic, ecx, &val);
		} else {
			res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
		}
	}

	switch (res) {
	case VMR_OK:
		/* Store rdmsr result in the appropriate registers */
		if (!is_wrmsr) {
			state->rax = (uint32_t)val;
			ctx->sctx_rdx = val >> 32;
		}
		return (1);
	case VMR_GP:
		vm_inject_gp(svm_sc->vm, vcpu);
		return (1);
	case VMR_UNHANLDED:
		vmexit->exitcode = is_wrmsr ?
		    VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
		vmexit->u.msr.code = ecx;
		vmexit->u.msr.wval = val;
		return (0);
	default:
		panic("unexpected msr result %u\n", res);
	}
}

/*
 * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
 * that are due to instruction intercepts, as well as MSR and IOIO intercepts
 * and exceptions caused by INT3, INTO and BOUND instructions.
 *
 * Return 1 if the nRIP is valid and 0 otherwise.
 */
static int
nrip_valid(uint64_t exitcode)
{
	switch (exitcode) {
	case 0x00 ... 0x0F:	/* read of CR0 through CR15 */
	case 0x10 ... 0x1F:	/* write of CR0 through CR15 */
	case 0x20 ... 0x2F:	/* read of DR0 through DR15 */
	case 0x30 ... 0x3F:	/* write of DR0 through DR15 */
	case 0x43:		/* INT3 */
	case 0x44:		/* INTO */
	case 0x45:		/* BOUND */
	case 0x65 ... 0x7C:	/* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
	case 0x80 ... 0x8D:	/* VMEXIT_VMRUN ... VMEXIT_XSETBV */
		return (1);
	default:
		return (0);
	}
}

static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
	struct vmcb *vmcb;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct svm_regctx *ctx;
	uint64_t code, info1, info2;
	int handled;

	ctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb = svm_get_vmcb(svm_sc, vcpu);
	state = &vmcb->state;
	ctrl = &vmcb->ctrl;

	handled = 0;
	code = ctrl->exitcode;
	info1 = ctrl->exitinfo1;
	info2 = ctrl->exitinfo2;

	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmexit->rip = state->rip;
	vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
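
	/*
	 * When nRIP is valid it holds the address of the next sequential
	 * instruction, so the difference above is the length of the
	 * intercepted instruction (e.g. 2 bytes for a CPUID exit).
	 */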
1315
1316 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1317
1318 /*
1319 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1320 * in an inconsistent state and can trigger assertions that would
1321 * never happen otherwise.
1322 */
1323 if (code == VMCB_EXIT_INVALID) {
1324 vm_exit_svm(vmexit, code, info1, info2);
1325 return (0);
1326 }
1327
1328 KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1329 "injection valid bit is set %lx", __func__, ctrl->eventinj));
1330
1331 KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1332 ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1333 vmexit->inst_length, code, info1, info2));
1334
1335 svm_update_virqinfo(svm_sc, vcpu);
1336 svm_save_exitintinfo(svm_sc, vcpu);
1337
1338 switch (code) {
1339 case VMCB_EXIT_CR0_READ:
1340 if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1341 svm_handle_cr0_read(svm_sc, vcpu,
1342 vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1343 handled = 1;
1344 } else {
1345 /*
1346 * If SMSW is used to read the contents of %cr0, then
1347 * the VALID bit will not be set in `info1`, since the
1348 * handling is different from the mov-to-reg case.
1349 *
1350 * Punt to the instruction emulation to handle it.
1351 */
1352 svm_inst_emul_other(svm_sc, vcpu, vmexit);
1353 }
1354 break;
1355 case VMCB_EXIT_CR0_WRITE:
1356 case VMCB_EXIT_CR0_SEL_WRITE:
1357 if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1358 svm_handle_cr0_write(svm_sc, vcpu,
1359 vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1360 handled = 1;
1361 } else {
1362 /*
1363 * Writes to %cr0 without VALID being set in `info1` are
1364 * initiated by the LMSW and CLTS instructions. While
1365 * LMSW (like SMSW) sees little use in modern OSes and
1366 * bootloaders, CLTS is still used for handling FPU
1367 * state transitions.
1368 *
1369 * Punt to the instruction emulation to handle them.
1370 */
1371 svm_inst_emul_other(svm_sc, vcpu, vmexit);
1372 }
1373 break;
1374 case VMCB_EXIT_IRET:
1375 /*
1376 * Restart execution at "iret" but with the intercept cleared.
1377 */
1378 vmexit->inst_length = 0;
1379 svm_clear_nmi_blocking(svm_sc, vcpu);
1380 handled = 1;
1381 break;
1382 case VMCB_EXIT_VINTR: /* interrupt window exiting */
1383 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1384 svm_disable_intr_window_exiting(svm_sc, vcpu);
1385 handled = 1;
1386 break;
1387 case VMCB_EXIT_INTR: /* external interrupt */
1388 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1389 handled = 1;
1390 break;
1391 case VMCB_EXIT_NMI:
1392 case VMCB_EXIT_SMI:
1393 case VMCB_EXIT_INIT:
1394 /*
1395 * For external NMI/SMI and physical INIT interrupts, simply
1396 * continue execution, as those host events will be handled by
1397 * the physical CPU.
1398 */
1399 handled = 1;
1400 break;
1401 case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1402 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1403
1404 const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1405 uint32_t errcode = 0;
1406 bool reflect = true;
1407 bool errcode_valid = false;
1408
1409 switch (idtvec) {
1410 case IDT_MC:
1411 /* The host will handle the MCE itself. */
1412 reflect = false;
1413 vmm_call_trap(T_MCE);
1414 break;
1415 case IDT_PF:
1416 VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1417 info2));
1418 /* fallthru */
1419 case IDT_NP:
1420 case IDT_SS:
1421 case IDT_GP:
1422 case IDT_AC:
1423 case IDT_TS:
1424 errcode_valid = true;
1425 errcode = info1;
1426 break;
1427
1428 case IDT_DF:
1429 errcode_valid = true;
1430 break;
1431
1432 case IDT_BP:
1433 case IDT_OF:
1434 case IDT_BR:
1435 /*
1436 * The 'nrip' field is populated for INT3, INTO and
1437 * BOUND exceptions and this also implies that
1438 * 'inst_length' is non-zero.
1439 *
1440 * Reset 'inst_length' to zero so the guest %rip at
1441 * event injection is identical to what it was when
1442 * the exception originally happened.
1443 */
1444 vmexit->inst_length = 0;
1445 /* fallthru */
1446 default:
1447 errcode_valid = false;
1448 break;
1449 }
1450 VERIFY0(vmexit->inst_length);
1451
1452 if (reflect) {
1453 /* Reflect the exception back into the guest */
1454 VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1455 errcode_valid, errcode, false));
1456 }
1457 handled = 1;
1458 break;
1459 }
1460 case VMCB_EXIT_MSR:
1461 handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1462 break;
1463 case VMCB_EXIT_IO:
1464 handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1465 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1466 break;
1467 case VMCB_EXIT_SHUTDOWN:
1468 (void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT, vcpu);
1469 handled = 1;
1470 break;
1471 case VMCB_EXIT_INVLPGA:
1472 /* privileged invalidation instructions */
1473 vm_inject_ud(svm_sc->vm, vcpu);
1474 handled = 1;
1475 break;
1476 case VMCB_EXIT_VMRUN:
1477 case VMCB_EXIT_VMLOAD:
1478 case VMCB_EXIT_VMSAVE:
1479 case VMCB_EXIT_STGI:
1480 case VMCB_EXIT_CLGI:
1481 case VMCB_EXIT_SKINIT:
1482 /* privileged vmm instructions */
1483 vm_inject_ud(svm_sc->vm, vcpu);
1484 handled = 1;
1485 break;
1486 case VMCB_EXIT_INVD:
1487 case VMCB_EXIT_WBINVD:
1488 /* ignore exit */
1489 handled = 1;
1490 break;
1491 case VMCB_EXIT_VMMCALL:
1492 /* No handlers make use of VMMCALL for now */
1493 vm_inject_ud(svm_sc->vm, vcpu);
1494 handled = 1;
1495 break;
1496 case VMCB_EXIT_CPUID:
1497 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1498 vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1499 &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1500 handled = 1;
1501 break;
1502 case VMCB_EXIT_HLT:
1503 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1504 vmexit->exitcode = VM_EXITCODE_HLT;
1505 vmexit->u.hlt.rflags = state->rflags;
1506 break;
1507 case VMCB_EXIT_PAUSE:
1508 vmexit->exitcode = VM_EXITCODE_PAUSE;
1509 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1510 break;
1511 case VMCB_EXIT_NPF:
1512 /* EXITINFO2 contains the faulting guest physical address */
1513 if (info1 & VMCB_NPF_INFO1_RSV) {
1514 /* nested fault with reserved bits set */
1515 } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1516 vmexit->exitcode = VM_EXITCODE_PAGING;
1517 vmexit->u.paging.gpa = info2;
1518 vmexit->u.paging.fault_type = npf_fault_type(info1);
1519 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1520 } else if (svm_npf_emul_fault(info1)) {
1521 svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1522 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1523 }
1524 break;
1525 case VMCB_EXIT_MONITOR:
1526 vmexit->exitcode = VM_EXITCODE_MONITOR;
1527 break;
1528 case VMCB_EXIT_MWAIT:
1529 vmexit->exitcode = VM_EXITCODE_MWAIT;
1530 break;
1531 default:
1532 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1533 break;
1534 }
1535
1536 DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1537 code);
1538
1539 if (handled) {
1540 vmexit->rip += vmexit->inst_length;
1541 vmexit->inst_length = 0;
1542 state->rip = vmexit->rip;
1543 } else {
1544 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1545 /*
1546 * If this VM exit was not claimed by anybody then
1547 * treat it as a generic SVM exit.
1548 */
1549 vm_exit_svm(vmexit, code, info1, info2);
1550 } else {
1551 /*
1552 * The exitcode and collateral have been populated.
1553 * The VM exit will be processed further in userland.
1554 */
1555 }
1556 }
1557 return (handled);
1558 }
1559
1560 /*
1561 * Inject exceptions, NMIs, and ExtINTs.
1562 *
1563 * The logic behind these are complicated and may involve mutex contention, so
1564 * the injection is performed without the protection of host CPU interrupts
1565 * being disabled. This means a racing notification could be "lost",
1566 * necessitating a later call to svm_inject_recheck() to close that window
1567 * of opportunity.
1568 */
1569 static enum event_inject_state
svm_inject_events(struct svm_softc * sc,int vcpu)1570 svm_inject_events(struct svm_softc *sc, int vcpu)
1571 {
1572 struct vmcb_ctrl *ctrl;
1573 struct vmcb_state *state;
1574 struct svm_vcpu *vcpustate;
1575 uint64_t intinfo;
1576 enum event_inject_state ev_state;
1577
1578 state = svm_get_vmcb_state(sc, vcpu);
1579 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1580 vcpustate = svm_get_vcpu(sc, vcpu);
1581 ev_state = EIS_CAN_INJECT;
1582
1583 /* Clear any interrupt shadow if guest %rip has changed */
1584 if (vcpustate->nextrip != state->rip) {
1585 ctrl->intr_shadow = 0;
1586 }
1587
1588 /*
1589 * An event is already pending for injection. This can occur when the
1590 * vCPU exits prior to VM entry (like for an AST).
1591 */
1592 if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1593 return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1594 }
1595
1596 /*
1597 * Inject pending events or exceptions for this vcpu.
1598 *
1599 * An event might be pending because the previous #VMEXIT happened
1600 * during event delivery (i.e. ctrl->exitintinfo).
1601 *
1602 * An event might also be pending because an exception was injected
1603 * by the hypervisor (e.g. #PF during instruction emulation).
1604 */
1605 if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1606 svm_inject_event(ctrl, intinfo);
1607 vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1608 ev_state = EIS_EV_INJECTED;
1609 }
1610
1611 /* NMI event has priority over interrupts. */
1612 if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1613 if (ev_state == EIS_CAN_INJECT) {
1614 /* Can't inject NMI if vcpu is in an intr_shadow. */
1615 if (ctrl->intr_shadow) {
1616 return (EIS_GI_BLOCK);
1617 }
1618
1619 svm_inject_nmi(sc, vcpu);
1620 ev_state = EIS_EV_INJECTED;
1621 } else {
1622 return (ev_state | EIS_REQ_EXIT);
1623 }
1624 }
1625
1626 if (vm_extint_pending(sc->vm, vcpu)) {
1627 int vector;
1628
1629 if (ev_state != EIS_CAN_INJECT) {
1630 return (ev_state | EIS_REQ_EXIT);
1631 }
1632
1633 /*
1634 * If the guest has disabled interrupts or is in an interrupt
1635 * shadow then we cannot inject the pending interrupt.
1636 */
1637 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1638 return (EIS_GI_BLOCK);
1639 }
1640
1641 /* Ask the legacy pic for a vector to inject */
1642 vatpic_pending_intr(sc->vm, &vector);
1643 KASSERT(vector >= 0 && vector <= 255,
1644 ("invalid vector %d from INTR", vector));
1645
1646 svm_inject_irq(sc, vcpu, vector);
1647 vm_extint_clear(sc->vm, vcpu);
1648 vatpic_intr_accepted(sc->vm, vector);
1649 ev_state = EIS_EV_INJECTED;
1650 }
1651
1652 return (ev_state);
1653 }
1654
1655 /*
1656 * Synchronize vLAPIC state and inject any interrupts pending on it.
1657 *
1658 * This is done with host CPU interrupts disabled so notification IPIs will be
1659 * queued on the host APIC and recognized when entering SVM guest context.
1660 */
1661 static enum event_inject_state
svm_inject_vlapic(struct svm_softc * sc,int vcpu,struct vlapic * vlapic,enum event_inject_state ev_state)1662 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1663 enum event_inject_state ev_state)
1664 {
1665 struct vmcb_ctrl *ctrl;
1666 struct vmcb_state *state;
1667 int vector;
1668 uint8_t v_tpr;
1669
1670 state = svm_get_vmcb_state(sc, vcpu);
1671 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1672
1673 /*
1674 * The guest can modify the TPR by writing to %cr8. In guest mode the
1675 * CPU reflects this write to V_TPR without hypervisor intervention.
1676 *
1677 * The guest can also modify the TPR by writing to it via the memory
1678 * mapped APIC page. In this case, the write will be emulated by the
1679 * hypervisor. For this reason V_TPR must be updated before every
1680 * VMRUN.
1681 */
1682 v_tpr = vlapic_get_cr8(vlapic);
1683 KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1684 if (ctrl->v_tpr != v_tpr) {
1685 ctrl->v_tpr = v_tpr;
1686 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1687 }
1688
1689 /* If an event cannot otherwise be injected, we are done for now */
1690 if (ev_state != EIS_CAN_INJECT) {
1691 return (ev_state);
1692 }
1693
1694 if (!vlapic_pending_intr(vlapic, &vector)) {
1695 return (EIS_CAN_INJECT);
1696 }
1697 KASSERT(vector >= 16 && vector <= 255,
1698 ("invalid vector %d from local APIC", vector));
1699
1700 /*
1701 * If the guest has disabled interrupts or is in an interrupt shadow
1702 * then we cannot inject the pending interrupt.
1703 */
1704 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1705 return (EIS_GI_BLOCK);
1706 }
1707
1708 svm_inject_irq(sc, vcpu, vector);
1709 vlapic_intr_accepted(vlapic, vector);
1710 return (EIS_EV_INJECTED);
1711 }
1712
1713 /*
1714 * Re-check for events to be injected.
1715 *
1716 * Once host CPU interrupts are disabled, check for the presence of any events
1717 * which require injection processing. If an exit is required upon injection,
1718 * or once the guest becomes interruptable, that will be configured too.
1719 */
1720 static bool
svm_inject_recheck(struct svm_softc * sc,int vcpu,enum event_inject_state ev_state)1721 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1722 enum event_inject_state ev_state)
1723 {
1724 struct vmcb_ctrl *ctrl;
1725
1726 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1727
1728 if (ev_state == EIS_CAN_INJECT) {
1729 /*
1730 * An active interrupt shadow would preclude us from injecting
1731 * any events picked up during a re-check.
1732 */
1733 if (ctrl->intr_shadow != 0) {
1734 return (false);
1735 }
1736
1737 if (vm_nmi_pending(sc->vm, vcpu) &&
1738 !svm_nmi_blocked(sc, vcpu)) {
1739 /* queued NMI not blocked by NMI-window-exiting */
1740 return (true);
1741 }
1742 if (vm_extint_pending(sc->vm, vcpu)) {
1743 /* queued ExtINT not blocked by existing injection */
1744 return (true);
1745 }
1746 } else {
1747 if ((ev_state & EIS_REQ_EXIT) != 0) {
1748 /*
1749 * Use a self-IPI to force an immediate exit after
1750 * event injection has occurred.
1751 */
1752 poke_cpu(CPU->cpu_id);
1753 } else {
1754 /*
1755 * If any event is being injected, an exit immediately
1756 * upon becoming interruptable again will allow pending
1757 * or newly queued events to be injected in a timely
1758 * manner.
1759 */
1760 svm_enable_intr_window_exiting(sc, vcpu);
1761 }
1762 }
1763 return (false);
1764 }
1765
1766
1767 static void
check_asid(struct svm_softc * sc,int vcpuid,uint_t thiscpu,uint64_t nptgen)1768 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1769 {
1770 struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1771 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1772 uint8_t flush;
1773
1774 flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1775 vcpustate->nptgen != nptgen);
1776
1777 if (flush != VMCB_TLB_FLUSH_NOTHING) {
1778 ctrl->asid = vcpustate->hma_asid.hsa_asid;
1779 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1780 }
1781 ctrl->tlb_ctrl = flush;
1782 vcpustate->nptgen = nptgen;
1783 }
1784
static void
flush_asid(struct svm_softc *sc, int vcpuid)
{
	struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
	uint8_t flush;

	flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
	    true);

	ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
	ctrl->asid = vcpustate->hma_asid.hsa_asid;
	ctrl->tlb_ctrl = flush;
	svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
	/*
	 * A potential future optimization: We could choose to update the
	 * nptgen associated with the vCPU, since any pending nptgen change
	 * requiring a flush will be satisfied by the one which has just now
	 * been queued.
	 */
}

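/*
 * The CLGI and STGI instructions toggle the Global Interrupt Flag (GIF),
 * blocking or permitting interrupts (and other events) on the host CPU
 * around the VMRUN critical section.
 */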
static __inline void
disable_gintr(void)
{
	__asm __volatile("clgi");
}

static __inline void
enable_gintr(void)
{
	__asm __volatile("stgi");
}

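/*
 * Swap host debug register state out for the guest values prior to entry.
 * Guest DR6, DR7, and DEBUGCTL are handled via the VMCB, so only DR0-DR3
 * need to be loaded by hand.
 */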
static __inline void
svm_dr_enter_guest(struct svm_regctx *gctx)
{

	/* Save host control debug registers. */
	gctx->host_dr7 = rdr7();
	gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);

	/*
	 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
	 * exceptions in the host based on the guest DRx values.  The
	 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
	 * VMCB.
	 */
	load_dr7(0);
	wrmsr(MSR_DEBUGCTLMSR, 0);

	/* Save host debug registers. */
	gctx->host_dr0 = rdr0();
	gctx->host_dr1 = rdr1();
	gctx->host_dr2 = rdr2();
	gctx->host_dr3 = rdr3();
	gctx->host_dr6 = rdr6();

	/* Restore guest debug registers. */
	load_dr0(gctx->sctx_dr0);
	load_dr1(gctx->sctx_dr1);
	load_dr2(gctx->sctx_dr2);
	load_dr3(gctx->sctx_dr3);
}

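/*
 * Inverse of svm_dr_enter_guest(): stash the guest DR0-DR3 values and
 * restore the host debug register state.
 */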
static __inline void
svm_dr_leave_guest(struct svm_regctx *gctx)
{

	/* Save guest debug registers. */
	gctx->sctx_dr0 = rdr0();
	gctx->sctx_dr1 = rdr1();
	gctx->sctx_dr2 = rdr2();
	gctx->sctx_dr3 = rdr3();

	/*
	 * Restore host debug registers.  Restore DR7 and DEBUGCTL
	 * last.
	 */
	load_dr0(gctx->host_dr0);
	load_dr1(gctx->host_dr1);
	load_dr2(gctx->host_dr2);
	load_dr3(gctx->host_dr3);
	load_dr6(gctx->host_dr6);
	wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
	load_dr7(gctx->host_dr7);
}

/*
 * Apply the TSC offset for a vCPU, including physical CPU and per-vCPU
 * offsets.
 */
static void
svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
{
	const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);

	if (ctrl->tsc_offset != offset) {
		ctrl->tsc_offset = offset;
		svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
	}
}

/*
 * Start vcpu with specified RIP.
 */
static int
svm_vmrun(void *arg, int vcpu, uint64_t rip)
{
	struct svm_regctx *gctx;
	struct svm_softc *svm_sc;
	struct svm_vcpu *vcpustate;
	struct vmcb_state *state;
	struct vmcb_ctrl *ctrl;
	struct vm_exit *vmexit;
	struct vlapic *vlapic;
	vm_client_t *vmc;
	struct vm *vm;
	uint64_t vmcb_pa;
	int handled;
	uint16_t ldt_sel;

	svm_sc = arg;
	vm = svm_sc->vm;

	vcpustate = svm_get_vcpu(svm_sc, vcpu);
	state = svm_get_vmcb_state(svm_sc, vcpu);
	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
	vmexit = vm_exitinfo(vm, vcpu);
	vlapic = vm_lapic(vm, vcpu);
	vmc = vm_get_vmclient(vm, vcpu);

	gctx = svm_get_guest_regctx(svm_sc, vcpu);
	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;

	if (vcpustate->lastcpu != curcpu) {
		/*
		 * Force new ASID allocation by invalidating the generation.
		 */
		vcpustate->hma_asid.hsa_gen = 0;

		/*
		 * Invalidate the VMCB state cache by marking all fields dirty.
		 */
		svm_set_dirty(svm_sc, vcpu, 0xffffffff);

		/*
		 * XXX
		 * Setting 'vcpustate->lastcpu' here is a bit premature because
		 * we may return from this function without actually executing
		 * the VMRUN instruction.  This could happen if an AST or yield
		 * condition is pending on the first time through the loop.
		 *
		 * This works for now but any new side-effects of vcpu
		 * migration should take this case into account.
		 */
		vcpustate->lastcpu = curcpu;
		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
	}

	svm_apply_tsc_adjust(svm_sc, vcpu);

	svm_msr_guest_enter(svm_sc, vcpu);

	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
	vcpustate->loaded = B_TRUE;

	/* Update Guest RIP */
	state->rip = rip;

	do {
		enum event_inject_state inject_state;
		uint64_t nptgen;

		/*
		 * Initial event injection is complex and may involve mutex
		 * contention, so it must be performed with global interrupts
		 * still enabled.
		 */
		inject_state = svm_inject_events(svm_sc, vcpu);
		handled = 0;

		/*
		 * Disable global interrupts to guarantee atomicity during
		 * loading of guest state.  This includes not only the state
		 * loaded by the "vmrun" instruction but also software state
		 * maintained by the hypervisor: suspended and rendezvous
		 * state, NPT generation number, vlapic interrupts etc.
		 */
		disable_gintr();

		/*
		 * Synchronizing and injecting vlapic state is lock-free and is
		 * safe (and prudent) to perform with interrupts disabled.
		 */
		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
		    inject_state);

		/*
		 * Check for vCPU bail-out conditions.  This must be done after
		 * svm_inject_events() to detect a triple-fault condition.
		 */
		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
			enable_gintr();
			break;
		}

		if (vcpu_run_state_pending(vm, vcpu)) {
			enable_gintr();
			vm_exit_run_state(vm, vcpu, state->rip);
			break;
		}

		/*
		 * If subsequent activity queued events which require injection
		 * handling, take another lap to handle them.
		 */
		if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
			enable_gintr();
			handled = 1;
			continue;
		}

		/*
		 * #VMEXIT resumes the host with the guest LDTR, so
		 * save the current LDT selector so it can be restored
		 * after an exit.  The userspace hypervisor probably
		 * doesn't use an LDT, but save and restore it to be
		 * safe.
		 */
		ldt_sel = sldt();

		/*
		 * Check the vmspace and ASID generations to ensure that the
		 * vcpu does not use stale TLB mappings.
		 */
		nptgen = vmc_table_enter(vmc);
		check_asid(svm_sc, vcpu, curcpu, nptgen);

		ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
		vcpustate->dirty = 0;

		/* Launch Virtual Machine. */
		vcpu_ustate_change(vm, vcpu, VU_RUN);
		svm_dr_enter_guest(gctx);
		svm_launch(vmcb_pa, gctx, get_pcpu());
		svm_dr_leave_guest(gctx);
		vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);

		/* Restore host LDTR. */
		lldt(ldt_sel);

		/* #VMEXIT disables interrupts so re-enable them here. */
		enable_gintr();

		vmc_table_exit(vmc);

		/* Update 'nextrip' */
		vcpustate->nextrip = state->rip;

		/* Handle #VMEXIT and if required return to user space. */
		handled = svm_vmexit(svm_sc, vcpu, vmexit);
	} while (handled);

	svm_msr_guest_exit(svm_sc, vcpu);

	VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
	vcpustate->loaded = B_FALSE;

	return (0);
}

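/*
 * Tear down per-VM state: the I/O port and MSR intercept bitmaps, along with
 * the softc itself.
 */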
static void
svm_vmcleanup(void *arg)
{
	struct svm_softc *sc = arg;

	vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
	vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
	kmem_free(sc, sizeof (*sc));
}

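/*
 * Map a VM_REG_GUEST_* identifier to its backing field in the software
 * register context, used for those registers which are not held in the VMCB.
 * Returns NULL for registers which are not part of the regctx.
 */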
static uint64_t *
swctx_regptr(struct svm_regctx *regctx, int reg)
{
	switch (reg) {
	case VM_REG_GUEST_RBX:
		return (&regctx->sctx_rbx);
	case VM_REG_GUEST_RCX:
		return (&regctx->sctx_rcx);
	case VM_REG_GUEST_RDX:
		return (&regctx->sctx_rdx);
	case VM_REG_GUEST_RDI:
		return (&regctx->sctx_rdi);
	case VM_REG_GUEST_RSI:
		return (&regctx->sctx_rsi);
	case VM_REG_GUEST_RBP:
		return (&regctx->sctx_rbp);
	case VM_REG_GUEST_R8:
		return (&regctx->sctx_r8);
	case VM_REG_GUEST_R9:
		return (&regctx->sctx_r9);
	case VM_REG_GUEST_R10:
		return (&regctx->sctx_r10);
	case VM_REG_GUEST_R11:
		return (&regctx->sctx_r11);
	case VM_REG_GUEST_R12:
		return (&regctx->sctx_r12);
	case VM_REG_GUEST_R13:
		return (&regctx->sctx_r13);
	case VM_REG_GUEST_R14:
		return (&regctx->sctx_r14);
	case VM_REG_GUEST_R15:
		return (&regctx->sctx_r15);
	case VM_REG_GUEST_DR0:
		return (&regctx->sctx_dr0);
	case VM_REG_GUEST_DR1:
		return (&regctx->sctx_dr1);
	case VM_REG_GUEST_DR2:
		return (&regctx->sctx_dr2);
	case VM_REG_GUEST_DR3:
		return (&regctx->sctx_dr3);
	default:
		return (NULL);
	}
}

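/*
 * Fetch a guest register, sourcing the value from the software register
 * context or the VMCB as appropriate for the register in question.
 */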
static int
svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
{
	struct svm_softc *sc;
	struct vmcb *vmcb;
	uint64_t *regp;
	uint64_t *fieldp;
	struct vmcb_segment *seg;

	sc = arg;
	vmcb = svm_get_vmcb(sc, vcpu);

	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
	if (regp != NULL) {
		*val = *regp;
		return (0);
	}

	switch (ident) {
	case VM_REG_GUEST_INTR_SHADOW:
		*val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
		break;

	case VM_REG_GUEST_CR0:
		svm_get_cr0(sc, vcpu, val);
		break;
	case VM_REG_GUEST_CR2:
	case VM_REG_GUEST_CR3:
	case VM_REG_GUEST_CR4:
	case VM_REG_GUEST_DR6:
	case VM_REG_GUEST_DR7:
	case VM_REG_GUEST_EFER:
	case VM_REG_GUEST_RAX:
	case VM_REG_GUEST_RFLAGS:
	case VM_REG_GUEST_RIP:
	case VM_REG_GUEST_RSP:
		fieldp = vmcb_regptr(vmcb, ident, NULL);
		*val = *fieldp;
		break;

	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_LDTR:
	case VM_REG_GUEST_TR:
		seg = vmcb_segptr(vmcb, ident);
		*val = seg->selector;
		break;

	case VM_REG_GUEST_GDTR:
	case VM_REG_GUEST_IDTR:
		/* GDTR and IDTR don't have segment selectors */
		return (EINVAL);

	case VM_REG_GUEST_PDPTE0:
	case VM_REG_GUEST_PDPTE1:
	case VM_REG_GUEST_PDPTE2:
	case VM_REG_GUEST_PDPTE3:
		/*
		 * Unlike VMX, where the PDPTEs are explicitly cached as part
		 * of several well-defined events related to paging (such as
		 * loading %cr3), SVM walks the PDPEs (AMD's name for the
		 * PDPTEs) as part of nested paging lookups.  This makes these
		 * registers effectively irrelevant on SVM.
		 *
		 * Rather than tossing an error, emit zeroed values so casual
		 * consumers do not need to be as careful about that
		 * difference.
		 */
		*val = 0;
		break;

	default:
		return (EINVAL);
	}

	return (0);
}

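/*
 * Write a guest register, directing the value to the software register
 * context or the VMCB, and marking the relevant VMCB cache sections dirty
 * where necessary.
 */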
static int
svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
{
	struct svm_softc *sc;
	struct vmcb *vmcb;
	uint64_t *regp;
	uint64_t *fieldp;
	uint32_t dirty;
	struct vmcb_segment *seg;

	sc = arg;
	vmcb = svm_get_vmcb(sc, vcpu);

	regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
	if (regp != NULL) {
		*regp = val;
		return (0);
	}

	dirty = VMCB_CACHE_NONE;
	switch (ident) {
	case VM_REG_GUEST_INTR_SHADOW:
		vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
		break;

	case VM_REG_GUEST_EFER:
		fieldp = vmcb_regptr(vmcb, ident, &dirty);
		/* EFER_SVM must always be set when the guest is executing */
		*fieldp = val | EFER_SVM;
		dirty |= VMCB_CACHE_CR;
		break;

	case VM_REG_GUEST_CR0:
		svm_set_cr0(sc, vcpu, val, false);
		break;
	case VM_REG_GUEST_CR2:
	case VM_REG_GUEST_CR3:
	case VM_REG_GUEST_CR4:
	case VM_REG_GUEST_DR6:
	case VM_REG_GUEST_DR7:
	case VM_REG_GUEST_RAX:
	case VM_REG_GUEST_RFLAGS:
	case VM_REG_GUEST_RIP:
	case VM_REG_GUEST_RSP:
		fieldp = vmcb_regptr(vmcb, ident, &dirty);
		*fieldp = val;
		break;

	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_LDTR:
	case VM_REG_GUEST_TR:
		dirty |= VMCB_CACHE_SEG;
		seg = vmcb_segptr(vmcb, ident);
		seg->selector = (uint16_t)val;
		break;

	case VM_REG_GUEST_GDTR:
	case VM_REG_GUEST_IDTR:
		/* GDTR and IDTR don't have segment selectors */
		return (EINVAL);

	case VM_REG_GUEST_PDPTE0:
	case VM_REG_GUEST_PDPTE1:
	case VM_REG_GUEST_PDPTE2:
	case VM_REG_GUEST_PDPTE3:
		/*
		 * PDPEs (AMD's PDPTE) are not cached under SVM, so we can
		 * ignore attempts to set them.  See handler in svm_getreg()
		 * for more details.
		 */
		break;

	default:
		return (EINVAL);
	}

	if (dirty != VMCB_CACHE_NONE) {
		svm_set_dirty(sc, vcpu, dirty);
	}

	/*
	 * XXX deal with CR3 and invalidate TLB entries tagged with the
	 * vcpu's ASID.  This needs to be treated differently depending on
	 * whether 'running' is true/false.
	 */

	return (0);
}

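/*
 * Set the base/limit/attributes of a guest segment descriptor, translating
 * from the processor-independent seg_desc format into the VMCB layout.
 */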
static int
svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
{
	struct vmcb *vmcb;
	struct svm_softc *sc;
	struct vmcb_segment *seg;

	sc = arg;
	vmcb = svm_get_vmcb(sc, vcpu);

	switch (reg) {
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_LDTR:
	case VM_REG_GUEST_TR:
		svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
		seg = vmcb_segptr(vmcb, reg);
		/*
		 * Map seg_desc access to VMCB attribute format.
		 *
		 * SVM uses the 'P' bit in the segment attributes to indicate a
		 * NULL segment so clear it if the segment is marked unusable.
		 */
		seg->attrib = VMCB_ACCESS2ATTR(desc->access);
		if (SEG_DESC_UNUSABLE(desc->access)) {
			seg->attrib &= ~0x80;
		}
		/*
		 * Keep CPL synced with the DPL specified for %ss.
		 *
		 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD
		 * (unlike Intel), but accepts that possible deviation, since
		 * such behavior is otherwise unreasonable for a guest OS, and
		 * it performs the same synchronization.
		 */
		if (reg == VM_REG_GUEST_SS) {
			vmcb->state.cpl = SEG_DESC_DPL(desc->access);
		}
		break;

	case VM_REG_GUEST_GDTR:
	case VM_REG_GUEST_IDTR:
		svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
		seg = vmcb_segptr(vmcb, reg);
		break;

	default:
		return (EINVAL);
	}

	ASSERT(seg != NULL);
	seg->base = desc->base;
	seg->limit = desc->limit;

	return (0);
}

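/*
 * Read back a guest segment descriptor, translating the VMCB attribute
 * format into the VT-x style access bits expected by the
 * processor-independent code.
 */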
static int
svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
	struct vmcb *vmcb;
	struct svm_softc *sc;
	struct vmcb_segment *seg;

	sc = arg;
	vmcb = svm_get_vmcb(sc, vcpu);

	switch (reg) {
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_LDTR:
		seg = vmcb_segptr(vmcb, reg);
		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
		/*
		 * VT-x uses bit 16 to indicate a segment that has been loaded
		 * with a NULL selector (aka unusable).  The 'desc->access'
		 * field is interpreted in the VT-x format by the
		 * processor-independent code.
		 *
		 * SVM uses the 'P' bit to convey the same information so
		 * convert it into the VT-x format.  For more details refer to
		 * section "Segment State in the VMCB" in APMv2.
		 */
		if ((desc->access & 0x80) == 0) {
			/* Unusable segment */
			desc->access |= 0x10000;
		}

		/*
		 * Just as CPL (in the VMCB) is kept synced to SS when the
		 * segment is written, so too shall the segment sync from CPL
		 * when it is read.
		 */
		if (reg == VM_REG_GUEST_SS) {
			desc->access &=
			    ~(SEG_DESC_DPL_MASK << SEG_DESC_DPL_SHIFT);
			desc->access |=
			    (vmcb->state.cpl & SEG_DESC_DPL_MASK) <<
			    SEG_DESC_DPL_SHIFT;
		}
		break;

	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_TR:
		seg = vmcb_segptr(vmcb, reg);
		desc->access = VMCB_ATTR2ACCESS(seg->attrib);
		break;

	case VM_REG_GUEST_GDTR:
	case VM_REG_GUEST_IDTR:
		seg = vmcb_segptr(vmcb, reg);
		/*
		 * Since there are no access bits associated with the GDTR or
		 * the IDTR, zero out the field to ensure it does not contain
		 * garbage which might confuse the consumer.
		 */
		desc->access = 0;
		break;

	default:
		return (EINVAL);
	}

	ASSERT(seg != NULL);
	desc->base = seg->base;
	desc->limit = seg->limit;
	return (0);
}

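/*
 * Fetch the value of an MSR which is held in the VMCB, returning EINVAL for
 * any MSR which is not kept there.
 */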
static int
svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
{
	struct svm_softc *sc = arg;
	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
	const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL);

	if (msrp != NULL) {
		*valp = *msrp;
		return (0);
	}

	return (EINVAL);
}

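/*
 * Store the value of a VMCB-resident MSR, applying any required masking
 * (such as keeping EFER_SVM set) and dirtying the appropriate VMCB cache
 * section.
 */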
static int
svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val)
{
	struct svm_softc *sc = arg;
	struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);

	uint32_t dirty = 0;
	uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty);
	if (msrp == NULL) {
		return (EINVAL);
	}
	switch (msr) {
	case MSR_EFER:
		/*
		 * For now, just clone the logic from svm_setreg():
		 *
		 * EFER_SVM must always be set when the guest is executing.
		 */
		*msrp = val | EFER_SVM;
		break;
	/* TODO: other necessary MSR masking */
	default:
		*msrp = val;
		break;
	}
	if (dirty != 0) {
		svm_set_dirty(sc, vcpu, dirty);
	}
	return (0);
}

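/*
 * The optional capabilities are implemented in terms of VMCB intercepts:
 * setting or querying one simply toggles or reads the corresponding
 * intercept bit.
 */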
static int
svm_setcap(void *arg, int vcpu, int type, int val)
{
	struct svm_softc *sc;
	int error;

	sc = arg;
	error = 0;
	switch (type) {
	case VM_CAP_HALT_EXIT:
		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_HLT, val);
		break;
	case VM_CAP_PAUSE_EXIT:
		svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_PAUSE, val);
		break;
	default:
		error = ENOENT;
		break;
	}
	return (error);
}

static int
svm_getcap(void *arg, int vcpu, int type, int *retval)
{
	struct svm_softc *sc;
	int error;

	sc = arg;
	error = 0;

	switch (type) {
	case VM_CAP_HALT_EXIT:
		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_HLT);
		break;
	case VM_CAP_PAUSE_EXIT:
		*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
		    VMCB_INTCPT_PAUSE);
		break;
	default:
		error = ENOENT;
		break;
	}
	return (error);
}

static struct vlapic *
svm_vlapic_init(void *arg, int vcpuid)
{
	struct svm_softc *svm_sc;
	struct vlapic *vlapic;

	svm_sc = arg;
	vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
	vlapic->vm = svm_sc->vm;
	vlapic->vcpuid = vcpuid;
	vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];

	vlapic_init(vlapic);

	return (vlapic);
}

static void
svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
{
	vlapic_cleanup(vlapic);
	kmem_free(vlapic, sizeof (struct vlapic));
}

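/*
 * Prepare vCPU state for the instance being paused: any event pending direct
 * injection in the VMCB is pulled back into software state so that it can be
 * safely saved and later restored.
 */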
static void
svm_pause(void *arg, int vcpu)
{
	struct svm_softc *sc = arg;
	struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);

	/*
	 * If an event is pending injection in the VMCB, stash it in
	 * exit_intinfo as if it were deferred by an exit from guest context.
	 */
	const uint64_t intinfo = ctrl->eventinj;
	if ((intinfo & VMCB_EVENTINJ_VALID) != 0) {
		svm_stash_intinfo(sc, vcpu, intinfo);
		ctrl->eventinj = 0;
	}

	/*
	 * Now that no event is pending injection, interrupt-window exiting and
	 * NMI-blocking can be disabled.  If/when this vCPU is made to run
	 * again, those conditions will be reinstated when the now-queued
	 * events are re-injected.
	 */
	svm_disable_intr_window_exiting(sc, vcpu);
	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
}

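/*
 * Save and restore guest MSR state around context switches of the vCPU,
 * taking action only if the vCPU in question is presently loaded.
 */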
static void
svm_savectx(void *arg, int vcpu)
{
	struct svm_softc *sc = arg;

	if (sc->vcpu[vcpu].loaded) {
		svm_msr_guest_exit(sc, vcpu);
	}
}

static void
svm_restorectx(void *arg, int vcpu)
{
	struct svm_softc *sc = arg;

	if (sc->vcpu[vcpu].loaded) {
		svm_msr_guest_enter(sc, vcpu);
	}
}

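/*
 * Determine whether (and with what multiplier) guest TSC frequency scaling is
 * required for a given guest/host frequency pair.  The multiplier is
 * expressed in the fixed-point format used for AMD TSC scaling, bearing
 * AMD_TSCM_FRAC_SIZE fractional bits.
 */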
static freqratio_res_t
svm_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult)
{
	/*
	 * Check whether scaling is needed at all before potentially erroring
	 * out for other reasons.
	 */
	if (guest_hz == host_hz) {
		return (FR_SCALING_NOT_NEEDED);
	}

	/*
	 * Confirm that scaling is available.
	 */
	if (!has_tsc_freq_ctl()) {
		return (FR_SCALING_NOT_SUPPORTED);
	}

	/*
	 * Verify the guest_hz is within the supported range.
	 */
	if ((guest_hz < AMD_TSC_MIN_FREQ) ||
	    (guest_hz >= (host_hz * AMD_TSC_MAX_FREQ_RATIO))) {
		return (FR_OUT_OF_RANGE);
	}

	/* Calculate the multiplier. */
	uint64_t m = vmm_calc_freq_multiplier(guest_hz, host_hz,
	    AMD_TSCM_FRAC_SIZE);
	*mult = m;

	return (FR_VALID);
}

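/*
 * The entry points through which the machine-independent vmm code drives
 * AMD SVM virtualization.
 */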
struct vmm_ops vmm_ops_amd = {
	.init		= svm_init,
	.cleanup	= svm_cleanup,
	.resume		= svm_restore,

	.vminit		= svm_vminit,
	.vmrun		= svm_vmrun,
	.vmcleanup	= svm_vmcleanup,
	.vmgetreg	= svm_getreg,
	.vmsetreg	= svm_setreg,
	.vmgetdesc	= svm_getdesc,
	.vmsetdesc	= svm_setdesc,
	.vmgetcap	= svm_getcap,
	.vmsetcap	= svm_setcap,
	.vlapic_init	= svm_vlapic_init,
	.vlapic_cleanup	= svm_vlapic_cleanup,
	.vmpause	= svm_pause,

	.vmsavectx	= svm_savectx,
	.vmrestorectx	= svm_restorectx,

	.vmgetmsr	= svm_get_msr,
	.vmsetmsr	= svm_set_msr,

	.vmfreqratio	= svm_freq_ratio,
	.fr_intsize	= AMD_TSCM_INT_SIZE,
	.fr_fracsize	= AMD_TSCM_FRAC_SIZE,
};