1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
12 * disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * This file and its contents are supplied under the terms of the
31 * Common Development and Distribution License ("CDDL"), version 1.0.
32 * You may only use this file in accordance with the terms of version
33 * 1.0 of the CDDL.
34 *
35 * A full copy of the text of the CDDL should have accompanied this
36 * source. A copy of the CDDL is also available via the Internet at
37 * http://www.illumos.org/license/CDDL.
38 */
39 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
40
41 /*
42 * Copyright 2018 Joyent, Inc.
43 * Copyright 2023 Oxide Computer Company
44 */
45
46 #include <sys/cdefs.h>
47
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kernel.h>
51 #include <sys/kmem.h>
52 #include <sys/pcpu.h>
53 #include <sys/proc.h>
54 #include <sys/sysctl.h>
55 #include <sys/cpu.h>
56
57 #include <sys/x86_archext.h>
58 #include <sys/archsystm.h>
59 #include <sys/trap.h>
60
61 #include <machine/cpufunc.h>
62 #include <machine/psl.h>
63 #include <machine/md_var.h>
64 #include <machine/reg.h>
65 #include <machine/specialreg.h>
66 #include <machine/vmm.h>
67 #include <machine/vmm_dev.h>
68 #include <sys/vmm_instruction_emul.h>
69 #include <sys/vmm_vm.h>
70 #include <sys/vmm_kernel.h>
71
72 #include "vmm_lapic.h"
73 #include "vmm_stat.h"
74 #include "vmm_ioport.h"
75 #include "vatpic.h"
76 #include "vlapic.h"
77 #include "vlapic_priv.h"
78
79 #include "vmcb.h"
80 #include "svm.h"
81 #include "svm_softc.h"
82 #include "svm_msr.h"
83
84 SYSCTL_DECL(_hw_vmm);
85 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
86 NULL);
87
88 /*
89 * Guardrails for supported guest TSC frequencies.
90 *
91 * A minimum of 0.5 GHz, which should be sufficient for all recent AMD CPUs,
92 * and a maximum of 15 times the host frequency, which is sufficient to
93 * prevent overflowing frequency calculations and gives plenty of headroom
94 * for future CPU frequency increases.
95 */
96 #define AMD_TSC_MIN_FREQ 500000000
97 #define AMD_TSC_MAX_FREQ_RATIO 15
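/*
 * Illustrative example (hypothetical host clock): with a 3.0 GHz host TSC,
 * these guardrails accept requested guest TSC frequencies between 0.5 GHz
 * and 45 GHz (15 * 3.0 GHz).
 */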
98
99 /* SVM features advertised by CPUID.8000000AH:EDX */
100 static uint32_t svm_feature = 0;
101
102 static int disable_npf_assist;
103
104 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
105 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
106 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
107
108 static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
109 static int svm_getreg(void *arg, int vcpu, int ident, uint64_t *val);
110 static void flush_asid(struct svm_softc *sc, int vcpuid);
111
112 static __inline bool
113 has_flush_by_asid(void)
114 {
115 return ((svm_feature & CPUID_AMD_EDX_FLUSH_ASID) != 0);
116 }
117
118 static __inline bool
119 has_lbr_virt(void)
120 {
121 return ((svm_feature & CPUID_AMD_EDX_LBR_VIRT) != 0);
122 }
123
124 static __inline bool
125 has_decode_assist(void)
126 {
127 return ((svm_feature & CPUID_AMD_EDX_DECODE_ASSISTS) != 0);
128 }
129
130 static __inline bool
131 has_tsc_freq_ctl(void)
132 {
133 return ((svm_feature & CPUID_AMD_EDX_TSC_RATE_MSR) != 0);
134 }
135
136 static int
137 svm_init(void)
138 {
139 /* Grab a (bhyve) local copy of the SVM feature bits */
140 struct cpuid_regs regs = {
141 .cp_eax = 0x8000000a,
142 };
143 (void) cpuid_insn(NULL, &regs);
144 svm_feature = regs.cp_edx;
145
146 /*
147 * HMA should have already checked for these features, which we refuse to
148 * operate without, but there is no harm in making sure.
149 */
150 const uint32_t demand_bits =
151 (CPUID_AMD_EDX_NESTED_PAGING | CPUID_AMD_EDX_NRIPS);
152 VERIFY((svm_feature & demand_bits) == demand_bits);
153
154 return (0);
155 }
156
157 static void
158 svm_restore(void)
159 {
160 /* No-op on illumos */
161 }
162
163 /* Pentium compatible MSRs */
164 #define MSR_PENTIUM_START 0
165 #define MSR_PENTIUM_END 0x1FFF
166 /* AMD 6th generation and Intel compatible MSRs */
167 #define MSR_AMD6TH_START 0xC0000000UL
168 #define MSR_AMD6TH_END 0xC0001FFFUL
169 /* AMD 7th and 8th generation compatible MSRs */
170 #define MSR_AMD7TH_START 0xC0010000UL
171 #define MSR_AMD7TH_END 0xC0011FFFUL
172
173 /*
174 * Get the index and bit position for an MSR in the permission bitmap.
175 * Two bits are used for each MSR: lower bit for read and higher bit for write.
176 */
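/*
 * Worked example (illustrative): MSR_LSTAR (0xC0000082) falls in the AMD
 * 6th-generation range, so off = 0x82, index = (0x82 + 0x2000) / 4 = 0x820,
 * and bit = (0xC0000082 % 4) * 2 = 4; bit 4 of byte 0x820 gates reads and
 * bit 5 gates writes.
 */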
177 static int
178 svm_msr_index(uint64_t msr, int *index, int *bit)
179 {
180 uint32_t base, off;
181
182 *index = -1;
183 *bit = (msr % 4) * 2;
184 base = 0;
185
186 if (msr <= MSR_PENTIUM_END) {
187 *index = msr / 4;
188 return (0);
189 }
190
191 base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
192 if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
193 off = (msr - MSR_AMD6TH_START);
194 *index = (off + base) / 4;
195 return (0);
196 }
197
198 base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
199 if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
200 off = (msr - MSR_AMD7TH_START);
201 *index = (off + base) / 4;
202 return (0);
203 }
204
205 return (EINVAL);
206 }
207
208 /*
209 * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
210 */
211 static void
212 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
213 {
214 int index, bit, error;
215
216 error = svm_msr_index(msr, &index, &bit);
217 KASSERT(error == 0, ("%s: invalid msr %lx", __func__, msr));
218 KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
219 ("%s: invalid index %d for msr %lx", __func__, index, msr));
220 KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
221 "msr %lx", __func__, bit, msr));
222
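/*
 * A cleared bit in the permission bitmap means "no intercept": bit 'bit'
 * gates reads of the MSR and bit 'bit + 1' gates writes.
 */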
223 if (read)
224 perm_bitmap[index] &= ~(1UL << bit);
225
226 if (write)
227 perm_bitmap[index] &= ~(2UL << bit);
228 }
229
230 static void
231 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
232 {
233
234 svm_msr_perm(perm_bitmap, msr, true, true);
235 }
236
237 static void
238 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
239 {
240
241 svm_msr_perm(perm_bitmap, msr, true, false);
242 }
243
244 int
245 svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
246 {
247 struct vmcb_ctrl *ctrl;
248
249 KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
250
251 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
252 return (ctrl->intercept[idx] & bitmask ? 1 : 0);
253 }
254
255 void
256 svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
257 int enabled)
258 {
259 struct vmcb_ctrl *ctrl;
260 uint32_t oldval;
261
262 KASSERT(idx >= 0 && idx < 5, ("invalid intercept index %d", idx));
263
264 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
265 oldval = ctrl->intercept[idx];
266
267 if (enabled)
268 ctrl->intercept[idx] |= bitmask;
269 else
270 ctrl->intercept[idx] &= ~bitmask;
271
272 if (ctrl->intercept[idx] != oldval) {
273 svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
274 }
275 }
276
277 static void
278 vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
279 uint64_t msrpm_base_pa, uint64_t np_pml4)
280 {
281 struct vmcb_ctrl *ctrl;
282 struct vmcb_state *state;
283 uint32_t mask;
284 int n;
285
286 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
287 state = svm_get_vmcb_state(sc, vcpu);
288
289 ctrl->iopm_base_pa = iopm_base_pa;
290 ctrl->msrpm_base_pa = msrpm_base_pa;
291
292 /* Enable nested paging */
293 ctrl->np_ctrl = NP_ENABLE;
294 ctrl->n_cr3 = np_pml4;
295
296 /*
297 * Intercept accesses to the control registers that are not shadowed
298 * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
299 */
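/*
 * In the CR intercept word, bits 0-15 enable read intercepts for CR0-CR15
 * and bits 16-31 enable the corresponding write intercepts, so the mask
 * below covers both directions for CRn.
 */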
300 for (n = 0; n < 16; n++) {
301 mask = (BIT(n) << 16) | BIT(n);
302 if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
303 svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
304 else
305 svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
306 }
307
308 /*
309 * Selectively intercept writes to %cr0. This triggers on operations
310 * which would change bits other than TS or MP.
311 */
312 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
313 VMCB_INTCPT_CR0_WRITE);
314
315 /*
316 * Intercept everything when tracing guest exceptions, otherwise
317 * just intercept the machine check exception.
318 */
319 if (vcpu_trace_exceptions(sc->vm, vcpu)) {
320 for (n = 0; n < 32; n++) {
321 /*
322 * Skip unimplemented vectors in the exception bitmap.
323 */
324 if (n == 2 || n == 9) {
325 continue;
326 }
327 svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
328 }
329 } else {
330 svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
331 }
332
333 /* Intercept various events (e.g. I/O, MSR, and CPUID accesses) */
334 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
335 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
336 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
337 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
338 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
339 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
340 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
341 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_RDPMC);
342 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
343 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
344 VMCB_INTCPT_FERR_FREEZE);
345
346 /* Enable exit-on-hlt by default */
347 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_HLT);
348
349 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
350 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
351
352 /* Intercept privileged invalidation instructions. */
353 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
354 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
355
356 /*
357 * Intercept all virtualization-related instructions.
358 *
359 * From section "Canonicalization and Consistency Checks" in APMv2,
360 * the VMRUN intercept bit must be set to pass the consistency check.
361 */
362 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
363 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL);
364 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
365 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
366 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
367 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
368 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
369 if (vcpu_trap_wbinvd(sc->vm, vcpu) != 0) {
370 svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT,
371 VMCB_INTCPT_WBINVD);
372 }
373
374 /*
375 * The ASID will be set to a non-zero value just before VMRUN.
376 */
377 ctrl->asid = 0;
378
379 /*
380 * Section 15.21.1, Interrupt Masking in EFLAGS
381 * Section 15.21.2, Virtualizing APIC.TPR
382 *
383 * This must be set for %rflag and %cr8 isolation of guest and host.
384 */
385 ctrl->v_intr_ctrl |= V_INTR_MASKING;
386
387 /* Enable Last Branch Record aka LBR-virt (if available) */
388 if (has_lbr_virt()) {
389 ctrl->misc_ctrl |= LBR_VIRT_ENABLE;
390 }
391
392 /* EFER_SVM must always be set when the guest is executing */
393 state->efer = EFER_SVM;
394
395 /* Set up the PAT to power-on state */
396 state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) |
397 PAT_VALUE(1, PAT_WRITE_THROUGH) |
398 PAT_VALUE(2, PAT_UNCACHED) |
399 PAT_VALUE(3, PAT_UNCACHEABLE) |
400 PAT_VALUE(4, PAT_WRITE_BACK) |
401 PAT_VALUE(5, PAT_WRITE_THROUGH) |
402 PAT_VALUE(6, PAT_UNCACHED) |
403 PAT_VALUE(7, PAT_UNCACHEABLE);
404
405 /* Set up DR6/7 to power-on state */
406 state->dr6 = DBREG_DR6_RESERVED1;
407 state->dr7 = DBREG_DR7_RESERVED1;
408 }
409
410 /*
411 * Initialize a virtual machine.
412 */
413 static void *
414 svm_vminit(struct vm *vm)
415 {
416 struct svm_softc *svm_sc;
417 struct svm_vcpu *vcpu;
418 vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
419 int i;
420 uint16_t maxcpus;
421
422 svm_sc = kmem_zalloc(sizeof (*svm_sc), KM_SLEEP);
423 VERIFY3U(((uintptr_t)svm_sc & PAGE_MASK), ==, 0);
424
425 svm_sc->msr_bitmap = vmm_contig_alloc(SVM_MSR_BITMAP_SIZE);
426 if (svm_sc->msr_bitmap == NULL)
427 panic("contigmalloc of SVM MSR bitmap failed");
428 svm_sc->iopm_bitmap = vmm_contig_alloc(SVM_IO_BITMAP_SIZE);
429 if (svm_sc->iopm_bitmap == NULL)
430 panic("contigmalloc of SVM IO bitmap failed");
431
432 svm_sc->vm = vm;
433 svm_sc->nptp = vmspace_table_root(vm_get_vmspace(vm));
434
435 /*
436 * Intercept read and write accesses to all MSRs.
437 */
438 memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
439
440 /*
441 * Access to the following MSRs is redirected to the VMCB when the
442 * guest is executing. Therefore it is safe to allow the guest to
443 * read/write these MSRs directly without hypervisor involvement.
444 */
445 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
446 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
447 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
448
449 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
450 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
451 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
452 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
453 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
454 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
455 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
456 svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
457
458 svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
459
460 /*
461 * Intercept writes to make sure that the EFER_SVM bit is not cleared.
462 */
463 svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
464
465 /* Intercept access to all I/O ports. */
466 memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
467
468 iopm_pa = vtophys(svm_sc->iopm_bitmap);
469 msrpm_pa = vtophys(svm_sc->msr_bitmap);
470 pml4_pa = svm_sc->nptp;
471 maxcpus = vm_get_maxcpus(svm_sc->vm);
472 for (i = 0; i < maxcpus; i++) {
473 vcpu = svm_get_vcpu(svm_sc, i);
474 vcpu->nextrip = ~0;
475 vcpu->lastcpu = NOCPU;
476 vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
477 vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
478 svm_msr_guest_init(svm_sc, i);
479 }
480
481 svm_pmu_init(svm_sc);
482
483 return (svm_sc);
484 }
485
486 /*
487 * Collateral for a generic SVM VM-exit.
488 */
489 static void
490 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
491 {
492
493 vme->exitcode = VM_EXITCODE_SVM;
494 vme->u.svm.exitcode = code;
495 vme->u.svm.exitinfo1 = info1;
496 vme->u.svm.exitinfo2 = info2;
497 }
498
499 static enum vm_cpu_mode
500 svm_vcpu_mode(struct vmcb *vmcb)
501 {
502 struct vmcb_state *state;
503
504 state = &vmcb->state;
505
506 if (state->efer & EFER_LMA) {
507 struct vmcb_segment *seg;
508
509 /*
510 * Section 4.8.1 for APM2, check if Code Segment has
511 * Long attribute set in descriptor.
512 */
513 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
514 if (seg->attrib & VMCB_CS_ATTRIB_L)
515 return (CPU_MODE_64BIT);
516 else
517 return (CPU_MODE_COMPATIBILITY);
518 } else if (state->cr0 & CR0_PE) {
519 return (CPU_MODE_PROTECTED);
520 } else {
521 return (CPU_MODE_REAL);
522 }
523 }
524
525 static enum vm_paging_mode
526 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
527 {
528
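/*
 * Decision summary: PG clear => flat (no paging); PG set with PAE clear =>
 * legacy 32-bit paging; PG and PAE set => PAE, unless EFER.LME is also set,
 * in which case the guest is using long-mode (4-level) paging.
 */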
529 if ((cr0 & CR0_PG) == 0)
530 return (PAGING_MODE_FLAT);
531 if ((cr4 & CR4_PAE) == 0)
532 return (PAGING_MODE_32);
533 if (efer & EFER_LME)
534 return (PAGING_MODE_64);
535 else
536 return (PAGING_MODE_PAE);
537 }
538
539 static void
540 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
541 {
542 struct vmcb_state *state;
543
544 state = &vmcb->state;
545 paging->cr3 = state->cr3;
546 paging->cpl = state->cpl;
547 paging->cpu_mode = svm_vcpu_mode(vmcb);
548 paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
549 state->efer);
550 }
551
552 #define UNHANDLED 0
553
554 /*
555 * Handle guest I/O intercept.
556 */
557 static int
558 svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
559 {
560 struct vmcb_ctrl *ctrl;
561 struct vmcb_state *state;
562 struct vm_inout *inout;
563 struct vie *vie;
564 uint64_t info1;
565 struct vm_guest_paging paging;
566
567 state = svm_get_vmcb_state(svm_sc, vcpu);
568 ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
569 inout = &vmexit->u.inout;
570 info1 = ctrl->exitinfo1;
571
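/*
 * IOIO EXITINFO1 layout, as decoded below: bit 0 = direction (1 for IN),
 * bit 2 = string operation, bit 3 = REP prefix, bits 4-6 = one-hot operand
 * size (1/2/4 bytes), bits 7-9 = one-hot address size, bits 10-12 =
 * effective segment (see note below), bits 16-31 = port number.
 */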
572 inout->bytes = (info1 >> 4) & 0x7;
573 inout->flags = 0;
574 inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0;
575 inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0;
576 inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0;
577 inout->port = (uint16_t)(info1 >> 16);
578 inout->eax = (uint32_t)(state->rax);
579
580 /*
581 * We'll always need paging and vie info, even if we bail out early
582 * due to missing DecodeAssist.
583 */
584 svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
585 vie = vm_vie_ctx(svm_sc->vm, vcpu);
586
587 if ((inout->flags & INOUT_STR) != 0) {
588 /*
589 * The effective segment number in EXITINFO1[12:10] is populated
590 * only if the processor has the DecodeAssist capability.
591 *
592 * This is not specified explicitly in APMv2 but can be verified
593 * empirically.
594 */
595 if (!has_decode_assist()) {
596 /*
597 * Without decoding assistance, defer the task of
598 * emulating the ins/outs to userspace.
599 */
600 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
601 bzero(&vmexit->u.inst_emul,
602 sizeof (vmexit->u.inst_emul));
603 vie_init_other(vie, &paging);
604 return (UNHANDLED);
605 }
606
607 /*
608 * Bits 7-9 encode the address size of ins/outs operations where
609 * the 1/2/4 values correspond to 16/32/64 bit sizes.
610 */
611 inout->addrsize = 2 * ((info1 >> 7) & 0x7);
612 VERIFY(inout->addrsize == 2 || inout->addrsize == 4 ||
613 inout->addrsize == 8);
614
615 if (inout->flags & INOUT_IN) {
616 /*
617 * For INS instructions, %es (encoded as 0) is the
618 * implied segment for the operation.
619 */
620 inout->segment = 0;
621 } else {
622 /*
623 * Bits 10-12 encode the segment for OUTS.
624 * This value follows the standard x86 segment order.
625 */
626 inout->segment = (info1 >> 10) & 0x7;
627 }
628 }
629
630 vmexit->exitcode = VM_EXITCODE_INOUT;
631 vie_init_inout(vie, inout, vmexit->inst_length, &paging);
632
633 /* The in/out emulation will handle advancing %rip */
634 vmexit->inst_length = 0;
635
636 return (UNHANDLED);
637 }
638
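/*
 * Translate the nested page fault error code (EXITINFO1) into a protection
 * type: the W bit indicates a write and the ID bit an instruction fetch;
 * anything else is treated as a read.
 */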
639 static int
640 npf_fault_type(uint64_t exitinfo1)
641 {
642
643 if (exitinfo1 & VMCB_NPF_INFO1_W)
644 return (PROT_WRITE);
645 else if (exitinfo1 & VMCB_NPF_INFO1_ID)
646 return (PROT_EXEC);
647 else
648 return (PROT_READ);
649 }
650
651 static bool
652 svm_npf_emul_fault(uint64_t exitinfo1)
653 {
654 if (exitinfo1 & VMCB_NPF_INFO1_ID) {
655 return (false);
656 }
657
658 if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
659 return (false);
660 }
661
662 if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
663 return (false);
664 }
665
666 return (true);
667 }
668
669 static void
670 svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
671 uint64_t gpa)
672 {
673 struct vmcb_ctrl *ctrl;
674 struct vmcb *vmcb;
675 struct vie *vie;
676 struct vm_guest_paging paging;
677 struct vmcb_segment *seg;
678 char *inst_bytes = NULL;
679 uint8_t inst_len = 0;
680
681 vmcb = svm_get_vmcb(svm_sc, vcpu);
682 ctrl = &vmcb->ctrl;
683
684 vmexit->exitcode = VM_EXITCODE_MMIO_EMUL;
685 vmexit->u.mmio_emul.gpa = gpa;
686 vmexit->u.mmio_emul.gla = VIE_INVALID_GLA;
687 svm_paging_info(vmcb, &paging);
688
689 switch (paging.cpu_mode) {
690 case CPU_MODE_REAL:
691 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
692 vmexit->u.mmio_emul.cs_base = seg->base;
693 vmexit->u.mmio_emul.cs_d = 0;
694 break;
695 case CPU_MODE_PROTECTED:
696 case CPU_MODE_COMPATIBILITY:
697 seg = vmcb_segptr(vmcb, VM_REG_GUEST_CS);
698 vmexit->u.mmio_emul.cs_base = seg->base;
699
700 /*
701 * Section 4.8.1 of APM2, Default Operand Size or D bit.
702 */
703 vmexit->u.mmio_emul.cs_d = (seg->attrib & VMCB_CS_ATTRIB_D) ?
704 1 : 0;
705 break;
706 default:
707 vmexit->u.mmio_emul.cs_base = 0;
708 vmexit->u.mmio_emul.cs_d = 0;
709 break;
710 }
711
712 /*
713 * Copy the instruction bytes into 'vie' if available.
714 */
715 if (has_decode_assist() && !disable_npf_assist) {
716 inst_len = ctrl->inst_len;
717 inst_bytes = (char *)ctrl->inst_bytes;
718 }
719 vie = vm_vie_ctx(svm_sc->vm, vcpu);
720 vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa);
721 }
722
723 /*
724 * Do not allow CD, NW, or invalid high bits to be asserted in the value of cr0
725 * which is live in the guest. They are visible via the shadow instead.
726 */
727 #define SVM_CR0_MASK ~(CR0_CD | CR0_NW | 0xffffffff00000000)
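/*
 * Illustrative example: a guest write of %cr0 = 0xe0000011 (PG|CD|NW|ET|PE)
 * stores the full value in sctx_cr0_shadow while the VMCB receives only
 * 0x80000011; read intercepts then present the shadowed CD/NW bits back to
 * the guest.
 */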
728
729 static void
730 svm_set_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t val, bool guest_write)
731 {
732 struct vmcb_state *state;
733 struct svm_regctx *regctx;
734 uint64_t masked, old, diff;
735
736 state = svm_get_vmcb_state(svm_sc, vcpu);
737 regctx = svm_get_guest_regctx(svm_sc, vcpu);
738
739 old = state->cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
740 diff = old ^ val;
741
742 /* No further work needed if register contents remain the same */
743 if (diff == 0) {
744 return;
745 }
746
747 /* Flush the TLB if the paging or write-protect bits are changing */
748 if ((diff & CR0_PG) != 0 || (diff & CR0_WP) != 0) {
749 flush_asid(svm_sc, vcpu);
750 }
751
752 /*
753 * If the change in %cr0 is due to a guest action (via interception)
754 * then other CPU state updates may be required.
755 */
756 if (guest_write) {
757 if ((diff & CR0_PG) != 0) {
758 uint64_t efer = state->efer;
759
760 /* Keep the long-mode state in EFER in sync */
761 if ((val & CR0_PG) != 0 && (efer & EFER_LME) != 0) {
762 state->efer |= EFER_LMA;
763 }
764 if ((val & CR0_PG) == 0 && (efer & EFER_LME) != 0) {
765 state->efer &= ~EFER_LMA;
766 }
767 }
768 }
769
770 masked = val & SVM_CR0_MASK;
771 regctx->sctx_cr0_shadow = val;
772 state->cr0 = masked;
773 svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_CR);
774
775 if ((masked ^ val) != 0) {
776 /*
777 * The guest has set bits in %cr0 which we are masking out and
778 * exposing via shadow.
779 *
780 * We must intercept %cr0 reads in order to make the shadowed
781 * view available to the guest.
782 *
783 * Writes to %cr0 must also be intercepted (unconditionally,
784 * unlike the VMCB_INTCPT_CR0_WRITE mechanism) so we can catch
785 * if/when the guest clears those shadowed bits.
786 */
787 svm_enable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
788 BIT(0) | BIT(16));
789 } else {
790 /*
791 * When no bits remain in %cr0 which require shadowing, the
792 * unconditional intercept of reads/writes to %cr0 can be
793 * disabled.
794 *
795 * The selective write intercept (VMCB_INTCPT_CR0_WRITE) remains
796 * in place so we can be notified of operations which change
797 * bits other than TS or MP.
798 */
799 svm_disable_intercept(svm_sc, vcpu, VMCB_CR_INTCPT,
800 BIT(0) | BIT(16));
801 }
802 svm_set_dirty(svm_sc, vcpu, VMCB_CACHE_I);
803 }
804
805 static void
806 svm_get_cr0(struct svm_softc *svm_sc, int vcpu, uint64_t *val)
807 {
808 struct vmcb *vmcb;
809 struct svm_regctx *regctx;
810
811 vmcb = svm_get_vmcb(svm_sc, vcpu);
812 regctx = svm_get_guest_regctx(svm_sc, vcpu);
813
814 /*
815 * Include the %cr0 bits which exist only in the shadow along with those
816 * in the running vCPU state.
817 */
818 *val = vmcb->state.cr0 | (regctx->sctx_cr0_shadow & ~SVM_CR0_MASK);
819 }
820
821 static void
822 svm_handle_cr0_read(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
823 {
824 uint64_t val;
825 int err __maybe_unused;
826
827 svm_get_cr0(svm_sc, vcpu, &val);
828 err = svm_setreg(svm_sc, vcpu, reg, val);
829 ASSERT(err == 0);
830 }
831
832 static void
833 svm_handle_cr0_write(struct svm_softc *svm_sc, int vcpu, enum vm_reg_name reg)
834 {
835 struct vmcb_state *state;
836 uint64_t val;
837 int err __maybe_unused;
838
839 state = svm_get_vmcb_state(svm_sc, vcpu);
840
841 err = svm_getreg(svm_sc, vcpu, reg, &val);
842 ASSERT(err == 0);
843
844 if ((val & CR0_NW) != 0 && (val & CR0_CD) == 0) {
845 /* NW without CD is nonsensical */
846 vm_inject_gp(svm_sc->vm, vcpu);
847 return;
848 }
849 if ((val & CR0_PG) != 0 && (val & CR0_PE) == 0) {
850 /* PG requires PE */
851 vm_inject_gp(svm_sc->vm, vcpu);
852 return;
853 }
854 if ((state->cr0 & CR0_PG) == 0 && (val & CR0_PG) != 0) {
855 /* When enabling paging, PAE must be enabled if LME is. */
856 if ((state->efer & EFER_LME) != 0 &&
857 (state->cr4 & CR4_PAE) == 0) {
858 vm_inject_gp(svm_sc->vm, vcpu);
859 return;
860 }
861 }
862
863 svm_set_cr0(svm_sc, vcpu, val, true);
864 }
865
866 static void
867 svm_inst_emul_other(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
868 {
869 struct vie *vie;
870 struct vm_guest_paging paging;
871
872 /* Let the instruction emulation (hopefully in-kernel) handle it */
873 vmexit->exitcode = VM_EXITCODE_INST_EMUL;
874 bzero(&vmexit->u.inst_emul, sizeof (vmexit->u.inst_emul));
875 vie = vm_vie_ctx(svm_sc->vm, vcpu);
876 svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging);
877 vie_init_other(vie, &paging);
878
879 /* The instruction emulation will handle advancing %rip */
880 vmexit->inst_length = 0;
881 }
882
883 static void
884 svm_update_virqinfo(struct svm_softc *sc, int vcpu)
885 {
886 struct vm *vm;
887 struct vlapic *vlapic;
888 struct vmcb_ctrl *ctrl;
889
890 vm = sc->vm;
891 vlapic = vm_lapic(vm, vcpu);
892 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
893
894 /* Update %cr8 in the emulated vlapic */
895 vlapic_set_cr8(vlapic, ctrl->v_tpr);
896
897 /* Virtual interrupt injection is not used. */
898 KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
899 "v_intr_vector %d", __func__, ctrl->v_intr_vector));
900 }
901
902 CTASSERT(VMCB_EVENTINJ_TYPE_INTR == VM_INTINFO_HWINTR);
903 CTASSERT(VMCB_EVENTINJ_TYPE_NMI == VM_INTINFO_NMI);
904 CTASSERT(VMCB_EVENTINJ_TYPE_EXCEPTION == VM_INTINFO_HWEXCP);
905 CTASSERT(VMCB_EVENTINJ_TYPE_INTn == VM_INTINFO_SWINTR);
906 CTASSERT(VMCB_EVENTINJ_EC_VALID == VM_INTINFO_DEL_ERRCODE);
907 CTASSERT(VMCB_EVENTINJ_VALID == VM_INTINFO_VALID);
908
909 /*
910 * Store SVM-specific event injection info for later handling. This depends on
911 * the bhyve-internal event definitions matching those in the VMCB, as ensured
912 * by the above CTASSERTs.
913 */
914 static void
915 svm_stash_intinfo(struct svm_softc *svm_sc, int vcpu, uint64_t intinfo)
916 {
917 ASSERT(VMCB_EXITINTINFO_VALID(intinfo));
918
919 /*
920 * If stashing a pending NMI injection, ensure that it bears the
921 * correct vector which exit_intinfo expects.
922 */
923 if (VM_INTINFO_TYPE(intinfo) == VM_INTINFO_NMI) {
924 intinfo &= ~VM_INTINFO_MASK_VECTOR;
925 intinfo |= IDT_NMI;
926 }
927
928 VERIFY0(vm_exit_intinfo(svm_sc->vm, vcpu, intinfo));
929 }
930
931 static void
932 svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu)
933 {
934 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
935 uint64_t intinfo = ctrl->exitintinfo;
936
937 if (VMCB_EXITINTINFO_VALID(intinfo)) {
938 /*
939 * If a #VMEXIT happened during event delivery then record the
940 * event that was being delivered.
941 */
942 vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
943
944 svm_stash_intinfo(svm_sc, vcpu, intinfo);
945 }
946 }
947
948 static __inline int
949 vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
950 {
951
952 return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
953 VMCB_INTCPT_VINTR));
954 }
955
956 static void
957 svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
958 {
959 struct vmcb_ctrl *ctrl;
960 struct vmcb_state *state;
961
962 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
963 state = svm_get_vmcb_state(sc, vcpu);
964
965 if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) {
966 KASSERT(ctrl->v_intr_prio & V_IGN_TPR,
967 ("%s: invalid v_ign_tpr", __func__));
968 KASSERT(vintr_intercept_enabled(sc, vcpu),
969 ("%s: vintr intercept should be enabled", __func__));
970 return;
971 }
972
973 /*
974 * We use V_IRQ in conjunction with the VINTR intercept to trap into the
975 * hypervisor as soon as a virtual interrupt can be delivered.
976 *
977 * Since injected events are not subject to intercept checks we need to
978 * ensure that the V_IRQ is not actually going to be delivered on VM
979 * entry.
980 */
981 VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
982 (state->rflags & PSL_I) == 0 || ctrl->intr_shadow);
983
984 ctrl->v_irq |= V_IRQ;
985 ctrl->v_intr_prio |= V_IGN_TPR;
986 ctrl->v_intr_vector = 0;
987 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
988 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
989 }
990
991 static void
992 svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
993 {
994 struct vmcb_ctrl *ctrl;
995
996 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
997
998 if ((ctrl->v_irq & V_IRQ) == 0 && ctrl->v_intr_vector == 0) {
999 KASSERT(!vintr_intercept_enabled(sc, vcpu),
1000 ("%s: vintr intercept should be disabled", __func__));
1001 return;
1002 }
1003
1004 ctrl->v_irq &= ~V_IRQ;
1005 ctrl->v_intr_vector = 0;
1006 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1007 svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
1008 }
1009
1010 /*
1011 * Once an NMI is injected it blocks delivery of further NMIs until the handler
1012 * executes an IRET. The IRET intercept is enabled when an NMI is injected
1013 * to track when the vcpu is done handling the NMI.
1014 */
1015 static int
1016 svm_nmi_blocked(struct svm_softc *sc, int vcpu)
1017 {
1018 return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
1019 VMCB_INTCPT_IRET));
1020 }
1021
1022 static void
1023 svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu)
1024 {
1025 struct vmcb_ctrl *ctrl;
1026
1027 KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
1028 /*
1029 * When the IRET intercept is cleared the vcpu will attempt to execute
1030 * the "iret" when it runs next. However, it is possible to inject
1031 * another NMI into the vcpu before the "iret" has actually executed.
1032 *
1033 * For example, if the "iret" encounters a #NPF when accessing the stack
1034 * it will trap back into the hypervisor. If an NMI is pending for
1035 * the vcpu it will be injected into the guest.
1036 *
1037 * XXX this needs to be fixed
1038 */
1039 svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1040
1041 /*
1042 * Set an interrupt shadow to prevent an NMI from being immediately
1043 * injected on the next VMRUN.
1044 */
1045 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1046 ctrl->intr_shadow = 1;
1047 }
1048
1049 static void
1050 svm_inject_event(struct vmcb_ctrl *ctrl, uint64_t info)
1051 {
1052 ASSERT(VM_INTINFO_PENDING(info));
1053
1054 uint8_t vector = VM_INTINFO_VECTOR(info);
1055 uint32_t type = VM_INTINFO_TYPE(info);
1056
1057 /*
1058 * Correct behavior depends on bhyve intinfo event types lining up with
1059 * those defined by AMD for event injection in the VMCB. The CTASSERTs
1060 * above svm_save_exitintinfo() ensure it.
1061 */
1062 switch (type) {
1063 case VM_INTINFO_NMI:
1064 /* Ensure vector for injected event matches its type (NMI) */
1065 vector = IDT_NMI;
1066 break;
1067 case VM_INTINFO_HWINTR:
1068 case VM_INTINFO_SWINTR:
1069 break;
1070 case VM_INTINFO_HWEXCP:
1071 if (vector == IDT_NMI) {
1072 /*
1073 * NMIs are expected to be injected with
1074 * VMCB_EVENTINJ_TYPE_NMI, rather than as an exception
1075 * with the NMI vector.
1076 */
1077 type = VM_INTINFO_NMI;
1078 }
1079 VERIFY(vector < 32);
1080 break;
1081 default:
1082 /*
1083 * Since there is no strong validation for injected event types
1084 * at this point, fall back to a software interrupt for those we
1085 * do not recognize.
1086 */
1087 type = VM_INTINFO_SWINTR;
1088 break;
1089 }
1090
1091 ctrl->eventinj = VMCB_EVENTINJ_VALID | type | vector;
1092 if (VM_INTINFO_HAS_ERRCODE(info)) {
1093 ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
1094 ctrl->eventinj |= (uint64_t)VM_INTINFO_ERRCODE(info) << 32;
1095 }
1096 }
1097
1098 static void
1099 svm_inject_nmi(struct svm_softc *sc, int vcpu)
1100 {
1101 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1102
1103 ASSERT(!svm_nmi_blocked(sc, vcpu));
1104
1105 ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI;
1106 vm_nmi_clear(sc->vm, vcpu);
1107
1108 /*
1109 * Virtual NMI blocking is now in effect.
1110 *
1111 * Not only does this block a subsequent NMI injection from taking
1112 * place, it also configures an intercept on the IRET so we can track
1113 * when the next injection can take place.
1114 */
1115 svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
1116 }
1117
1118 static void
1119 svm_inject_irq(struct svm_softc *sc, int vcpu, int vector)
1120 {
1121 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1122
1123 ASSERT(vector >= 0 && vector <= 255);
1124
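/*
 * The event type field is omitted here because the external-interrupt
 * (INTR) type encodes as zero in the VMCB event-injection format.
 */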
1125 ctrl->eventinj = VMCB_EVENTINJ_VALID | vector;
1126 }
1127
1128 #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL
1129
1130 static vm_msr_result_t
1131 svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval)
1132 {
1133 struct vmcb_state *state = svm_get_vmcb_state(sc, vcpu);
1134 uint64_t lma;
1135 int error;
1136
1137 newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */
1138
1139 if (newval & EFER_MBZ_BITS) {
1140 return (VMR_GP);
1141 }
1142
1143 /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
1144 const uint64_t changed = state->efer ^ newval;
1145 if (changed & EFER_LME) {
1146 if (state->cr0 & CR0_PG) {
1147 return (VMR_GP);
1148 }
1149 }
1150
1151 /* EFER.LMA = EFER.LME & CR0.PG */
1152 if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) {
1153 lma = EFER_LMA;
1154 } else {
1155 lma = 0;
1156 }
1157 if ((newval & EFER_LMA) != lma) {
1158 return (VMR_GP);
1159 }
1160
1161 if ((newval & EFER_NXE) != 0 &&
1162 !vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) {
1163 return (VMR_GP);
1164 }
1165 if ((newval & EFER_FFXSR) != 0 &&
1166 !vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) {
1167 return (VMR_GP);
1168 }
1169 if ((newval & EFER_TCE) != 0 &&
1170 !vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) {
1171 return (VMR_GP);
1172 }
1173
1174 /*
1175 * Until bhyve has proper support for long-mode segment limits, just
1176 * toss a #GP at the guest if they attempt to use it.
1177 */
1178 if (newval & EFER_LMSLE) {
1179 return (VMR_GP);
1180 }
1181
1182 error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
1183 VERIFY0(error);
1184 return (VMR_OK);
1185 }
1186
1187 static int
1188 svm_handle_msr(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit,
1189 bool is_wrmsr)
1190 {
1191 struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1192 struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1193 const uint32_t ecx = ctx->sctx_rcx;
1194 vm_msr_result_t res;
1195 uint64_t val = 0;
1196
1197 if (is_wrmsr) {
1198 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
1199 val = ctx->sctx_rdx << 32 | (uint32_t)state->rax;
1200
1201 if (vlapic_owned_msr(ecx)) {
1202 struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1203
1204 res = vlapic_wrmsr(vlapic, ecx, val);
1205 } else if (ecx == MSR_EFER) {
1206 res = svm_write_efer(svm_sc, vcpu, val);
1207 } else if (svm_pmu_owned_msr(ecx)) {
1208 res = svm_pmu_wrmsr(svm_sc, vcpu, ecx, val);
1209 } else {
1210 res = svm_wrmsr(svm_sc, vcpu, ecx, val);
1211 }
1212 } else {
1213 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
1214
1215 if (vlapic_owned_msr(ecx)) {
1216 struct vlapic *vlapic = vm_lapic(svm_sc->vm, vcpu);
1217
1218 res = vlapic_rdmsr(vlapic, ecx, &val);
1219 } else if (svm_pmu_owned_msr(ecx)) {
1220 res = svm_pmu_rdmsr(svm_sc, vcpu, ecx, &val);
1221 } else {
1222 res = svm_rdmsr(svm_sc, vcpu, ecx, &val);
1223 }
1224 }
1225
1226 switch (res) {
1227 case VMR_OK:
1228 /* Store rdmsr result in the appropriate registers */
1229 if (!is_wrmsr) {
1230 state->rax = (uint32_t)val;
1231 ctx->sctx_rdx = val >> 32;
1232 }
1233 return (1);
1234 case VMR_GP:
1235 vm_inject_gp(svm_sc->vm, vcpu);
1236 return (1);
1237 case VMR_UNHANLDED:
1238 vmexit->exitcode = is_wrmsr ?
1239 VM_EXITCODE_WRMSR : VM_EXITCODE_RDMSR;
1240 vmexit->u.msr.code = ecx;
1241 vmexit->u.msr.wval = val;
1242 return (0);
1243 default:
1244 panic("unexpected msr result %u\n", res);
1245 }
1246 }
1247
1248 static void
1249 svm_handle_rdpmc(struct svm_softc *svm_sc, int vcpu)
1250 {
1251 struct vmcb_state *state = svm_get_vmcb_state(svm_sc, vcpu);
1252 struct svm_regctx *ctx = svm_get_guest_regctx(svm_sc, vcpu);
1253 const uint32_t ecx = ctx->sctx_rcx;
1254 uint64_t val = 0;
1255
1256 if (svm_pmu_rdpmc(svm_sc, vcpu, ecx, &val)) {
1257 state->rax = (uint32_t)val;
1258 ctx->sctx_rdx = val >> 32;
1259 } else {
1260 vm_inject_gp(svm_sc->vm, vcpu);
1261 }
1262 }
1263
1264 /*
1265 * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
1266 * that are due to instruction intercepts as well as MSR and IOIO intercepts
1267 * and exceptions caused by INT3, INTO and BOUND instructions.
1268 *
1269 * Return 1 if the nRIP is valid and 0 otherwise.
1270 */
1271 static int
1272 nrip_valid(uint64_t exitcode)
1273 {
1274 switch (exitcode) {
1275 case 0x00 ... 0x0F: /* read of CR0 through CR15 */
1276 case 0x10 ... 0x1F: /* write of CR0 through CR15 */
1277 case 0x20 ... 0x2F: /* read of DR0 through DR15 */
1278 case 0x30 ... 0x3F: /* write of DR0 through DR15 */
1279 case 0x43: /* INT3 */
1280 case 0x44: /* INTO */
1281 case 0x45: /* BOUND */
1282 case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
1283 case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */
1284 return (1);
1285 default:
1286 return (0);
1287 }
1288 }
1289
1290 static int
1291 svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
1292 {
1293 struct vmcb *vmcb;
1294 struct vmcb_state *state;
1295 struct vmcb_ctrl *ctrl;
1296 struct svm_regctx *ctx;
1297 uint64_t code, info1, info2;
1298 int handled;
1299
1300 ctx = svm_get_guest_regctx(svm_sc, vcpu);
1301 vmcb = svm_get_vmcb(svm_sc, vcpu);
1302 state = &vmcb->state;
1303 ctrl = &vmcb->ctrl;
1304
1305 handled = 0;
1306 code = ctrl->exitcode;
1307 info1 = ctrl->exitinfo1;
1308 info2 = ctrl->exitinfo2;
1309
1310 vmexit->exitcode = VM_EXITCODE_BOGUS;
1311 vmexit->rip = state->rip;
1312 vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
1313
1314 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
1315
1316 /*
1317 * #VMEXIT(INVALID) needs to be handled early because the VMCB is
1318 * in an inconsistent state and can trigger assertions that would
1319 * never happen otherwise.
1320 */
1321 if (code == VMCB_EXIT_INVALID) {
1322 vm_exit_svm(vmexit, code, info1, info2);
1323 return (0);
1324 }
1325
1326 KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
1327 "injection valid bit is set %lx", __func__, ctrl->eventinj));
1328
1329 KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
1330 ("invalid inst_length %d: code (%lx), info1 (%lx), info2 (%lx)",
1331 vmexit->inst_length, code, info1, info2));
1332
1333 svm_update_virqinfo(svm_sc, vcpu);
1334 svm_save_exitintinfo(svm_sc, vcpu);
1335
1336 switch (code) {
1337 case VMCB_EXIT_CR0_READ:
1338 if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1339 svm_handle_cr0_read(svm_sc, vcpu,
1340 vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1341 handled = 1;
1342 } else {
1343 /*
1344 * If SMSW is used to read the contents of %cr0, then
1345 * the VALID bit will not be set in `info1`, since the
1346 * handling is different from the mov-to-reg case.
1347 *
1348 * Punt to the instruction emulation to handle it.
1349 */
1350 svm_inst_emul_other(svm_sc, vcpu, vmexit);
1351 }
1352 break;
1353 case VMCB_EXIT_CR0_WRITE:
1354 case VMCB_EXIT_CR0_SEL_WRITE:
1355 if (VMCB_CRx_INFO1_VALID(info1) != 0) {
1356 svm_handle_cr0_write(svm_sc, vcpu,
1357 vie_regnum_map(VMCB_CRx_INFO1_GPR(info1)));
1358 handled = 1;
1359 } else {
1360 /*
1361 * Writes to %cr0 without VALID being set in `info1` are
1362 * initiated by the LMSW and CLTS instructions. While
1363 * LMSW (like SMSW) sees little use in modern OSes and
1364 * bootloaders, CLTS is still used for handling FPU
1365 * state transitions.
1366 *
1367 * Punt to the instruction emulation to handle them.
1368 */
1369 svm_inst_emul_other(svm_sc, vcpu, vmexit);
1370 }
1371 break;
1372 case VMCB_EXIT_IRET:
1373 /*
1374 * Restart execution at "iret" but with the intercept cleared.
1375 */
1376 vmexit->inst_length = 0;
1377 svm_clear_nmi_blocking(svm_sc, vcpu);
1378 handled = 1;
1379 break;
1380 case VMCB_EXIT_VINTR: /* interrupt window exiting */
1381 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
1382 svm_disable_intr_window_exiting(svm_sc, vcpu);
1383 handled = 1;
1384 break;
1385 case VMCB_EXIT_INTR: /* external interrupt */
1386 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
1387 handled = 1;
1388 break;
1389 case VMCB_EXIT_NMI:
1390 case VMCB_EXIT_SMI:
1391 case VMCB_EXIT_INIT:
1392 /*
1393 * For external NMI/SMI and physical INIT interrupts, simply
1394 * continue execution, as those host events will be handled by
1395 * the physical CPU.
1396 */
1397 handled = 1;
1398 break;
1399 case VMCB_EXIT_EXCP0 ... VMCB_EXIT_EXCP31: {
1400 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
1401
1402 const uint8_t idtvec = code - VMCB_EXIT_EXCP0;
1403 uint32_t errcode = 0;
1404 bool reflect = true;
1405 bool errcode_valid = false;
1406
1407 switch (idtvec) {
1408 case IDT_MC:
1409 /* The host will handle the MCE itself. */
1410 reflect = false;
1411 vmm_call_trap(T_MCE);
1412 break;
1413 case IDT_PF:
1414 VERIFY0(svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
1415 info2));
1416 /* fallthru */
1417 case IDT_NP:
1418 case IDT_SS:
1419 case IDT_GP:
1420 case IDT_AC:
1421 case IDT_TS:
1422 errcode_valid = true;
1423 errcode = info1;
1424 break;
1425
1426 case IDT_DF:
1427 errcode_valid = true;
1428 break;
1429
1430 case IDT_BP:
1431 case IDT_OF:
1432 case IDT_BR:
1433 /*
1434 * The 'nrip' field is populated for INT3, INTO and
1435 * BOUND exceptions and this also implies that
1436 * 'inst_length' is non-zero.
1437 *
1438 * Reset 'inst_length' to zero so the guest %rip at
1439 * event injection is identical to what it was when
1440 * the exception originally happened.
1441 */
1442 vmexit->inst_length = 0;
1443 /* fallthru */
1444 default:
1445 errcode_valid = false;
1446 break;
1447 }
1448 VERIFY0(vmexit->inst_length);
1449
1450 if (reflect) {
1451 /* Reflect the exception back into the guest */
1452 VERIFY0(vm_inject_exception(svm_sc->vm, vcpu, idtvec,
1453 errcode_valid, errcode, false));
1454 }
1455 handled = 1;
1456 break;
1457 }
1458 case VMCB_EXIT_MSR:
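/* For MSR exits, EXITINFO1 is 1 for WRMSR and 0 for RDMSR. */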
1459 handled = svm_handle_msr(svm_sc, vcpu, vmexit, info1 != 0);
1460 break;
1461 case VMCB_EXIT_RDPMC:
1462 svm_handle_rdpmc(svm_sc, vcpu);
1463 handled = 1;
1464 break;
1465 case VMCB_EXIT_IO:
1466 handled = svm_handle_inout(svm_sc, vcpu, vmexit);
1467 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
1468 break;
1469 case VMCB_EXIT_SHUTDOWN:
1470 (void) vm_suspend(svm_sc->vm, VM_SUSPEND_TRIPLEFAULT, vcpu);
1471 handled = 1;
1472 break;
1473 case VMCB_EXIT_INVLPGA:
1474 /* privileged invalidation instructions */
1475 vm_inject_ud(svm_sc->vm, vcpu);
1476 handled = 1;
1477 break;
1478 case VMCB_EXIT_VMRUN:
1479 case VMCB_EXIT_VMLOAD:
1480 case VMCB_EXIT_VMSAVE:
1481 case VMCB_EXIT_STGI:
1482 case VMCB_EXIT_CLGI:
1483 case VMCB_EXIT_SKINIT:
1484 /* privileged vmm instructions */
1485 vm_inject_ud(svm_sc->vm, vcpu);
1486 handled = 1;
1487 break;
1488 case VMCB_EXIT_INVD:
1489 case VMCB_EXIT_WBINVD:
1490 /* ignore exit */
1491 handled = 1;
1492 break;
1493 case VMCB_EXIT_VMMCALL:
1494 /* No handlers make use of VMMCALL for now */
1495 vm_inject_ud(svm_sc->vm, vcpu);
1496 handled = 1;
1497 break;
1498 case VMCB_EXIT_CPUID:
1499 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
1500 vcpu_emulate_cpuid(svm_sc->vm, vcpu, &state->rax,
1501 &ctx->sctx_rbx, &ctx->sctx_rcx, &ctx->sctx_rdx);
1502 handled = 1;
1503 break;
1504 case VMCB_EXIT_HLT:
1505 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
1506 vmexit->exitcode = VM_EXITCODE_HLT;
1507 vmexit->u.hlt.rflags = state->rflags;
1508 break;
1509 case VMCB_EXIT_PAUSE:
1510 vmexit->exitcode = VM_EXITCODE_PAUSE;
1511 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
1512 break;
1513 case VMCB_EXIT_NPF:
1514 /* EXITINFO2 contains the faulting guest physical address */
1515 if (info1 & VMCB_NPF_INFO1_RSV) {
1516 /* nested fault with reserved bits set */
1517 } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
1518 vmexit->exitcode = VM_EXITCODE_PAGING;
1519 vmexit->u.paging.gpa = info2;
1520 vmexit->u.paging.fault_type = npf_fault_type(info1);
1521 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
1522 } else if (svm_npf_emul_fault(info1)) {
1523 svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2);
1524 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1);
1525 }
1526 break;
1527 case VMCB_EXIT_MONITOR:
1528 vmexit->exitcode = VM_EXITCODE_MONITOR;
1529 break;
1530 case VMCB_EXIT_MWAIT:
1531 vmexit->exitcode = VM_EXITCODE_MWAIT;
1532 break;
1533 default:
1534 vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
1535 break;
1536 }
1537
1538 DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, vmexit->rip, uint32_t,
1539 code);
1540
1541 if (handled) {
1542 vmexit->rip += vmexit->inst_length;
1543 vmexit->inst_length = 0;
1544 state->rip = vmexit->rip;
1545 } else {
1546 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1547 /*
1548 * If this VM exit was not claimed by anybody then
1549 * treat it as a generic SVM exit.
1550 */
1551 vm_exit_svm(vmexit, code, info1, info2);
1552 } else {
1553 /*
1554 * The exitcode and collateral have been populated.
1555 * The VM exit will be processed further in userland.
1556 */
1557 }
1558 }
1559 return (handled);
1560 }
1561
1562 /*
1563 * Inject exceptions, NMIs, and ExtINTs.
1564 *
1565 * The logic behind these are complicated and may involve mutex contention, so
1566 * the injection is performed without the protection of host CPU interrupts
1567 * being disabled. This means a racing notification could be "lost",
1568 * necessitating a later call to svm_inject_recheck() to close that window
1569 * of opportunity.
1570 */
1571 static enum event_inject_state
1572 svm_inject_events(struct svm_softc *sc, int vcpu)
1573 {
1574 struct vmcb_ctrl *ctrl;
1575 struct vmcb_state *state;
1576 struct svm_vcpu *vcpustate;
1577 uint64_t intinfo;
1578 enum event_inject_state ev_state;
1579
1580 state = svm_get_vmcb_state(sc, vcpu);
1581 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1582 vcpustate = svm_get_vcpu(sc, vcpu);
1583 ev_state = EIS_CAN_INJECT;
1584
1585 /* Clear any interrupt shadow if guest %rip has changed */
1586 if (vcpustate->nextrip != state->rip) {
1587 ctrl->intr_shadow = 0;
1588 }
1589
1590 /*
1591 * An event is already pending for injection. This can occur when the
1592 * vCPU exits prior to VM entry (like for an AST).
1593 */
1594 if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
1595 return (EIS_EV_EXISTING | EIS_REQ_EXIT);
1596 }
1597
1598 /*
1599 * Inject pending events or exceptions for this vcpu.
1600 *
1601 * An event might be pending because the previous #VMEXIT happened
1602 * during event delivery (i.e. ctrl->exitintinfo).
1603 *
1604 * An event might also be pending because an exception was injected
1605 * by the hypervisor (e.g. #PF during instruction emulation).
1606 */
1607 if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) {
1608 svm_inject_event(ctrl, intinfo);
1609 vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
1610 ev_state = EIS_EV_INJECTED;
1611 }
1612
1613 /* NMI event has priority over interrupts. */
1614 if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) {
1615 if (ev_state == EIS_CAN_INJECT) {
1616 /* Can't inject NMI if vcpu is in an intr_shadow. */
1617 if (ctrl->intr_shadow) {
1618 return (EIS_GI_BLOCK);
1619 }
1620
1621 svm_inject_nmi(sc, vcpu);
1622 ev_state = EIS_EV_INJECTED;
1623 } else {
1624 return (ev_state | EIS_REQ_EXIT);
1625 }
1626 }
1627
1628 if (vm_extint_pending(sc->vm, vcpu)) {
1629 int vector;
1630
1631 if (ev_state != EIS_CAN_INJECT) {
1632 return (ev_state | EIS_REQ_EXIT);
1633 }
1634
1635 /*
1636 * If the guest has disabled interrupts or is in an interrupt
1637 * shadow then we cannot inject the pending interrupt.
1638 */
1639 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1640 return (EIS_GI_BLOCK);
1641 }
1642
1643 /* Ask the legacy pic for a vector to inject */
1644 vatpic_pending_intr(sc->vm, &vector);
1645 KASSERT(vector >= 0 && vector <= 255,
1646 ("invalid vector %d from INTR", vector));
1647
1648 svm_inject_irq(sc, vcpu, vector);
1649 vm_extint_clear(sc->vm, vcpu);
1650 vatpic_intr_accepted(sc->vm, vector);
1651 ev_state = EIS_EV_INJECTED;
1652 }
1653
1654 return (ev_state);
1655 }
1656
1657 /*
1658 * Synchronize vLAPIC state and inject any interrupts pending on it.
1659 *
1660 * This is done with host CPU interrupts disabled so notification IPIs will be
1661 * queued on the host APIC and recognized when entering SVM guest context.
1662 */
1663 static enum event_inject_state
1664 svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic,
1665 enum event_inject_state ev_state)
1666 {
1667 struct vmcb_ctrl *ctrl;
1668 struct vmcb_state *state;
1669 int vector;
1670 uint8_t v_tpr;
1671
1672 state = svm_get_vmcb_state(sc, vcpu);
1673 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1674
1675 /*
1676 * The guest can modify the TPR by writing to %cr8. In guest mode the
1677 * CPU reflects this write to V_TPR without hypervisor intervention.
1678 *
1679 * The guest can also modify the TPR by writing to it via the memory
1680 * mapped APIC page. In this case, the write will be emulated by the
1681 * hypervisor. For this reason V_TPR must be updated before every
1682 * VMRUN.
1683 */
1684 v_tpr = vlapic_get_cr8(vlapic);
1685 KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr));
1686 if (ctrl->v_tpr != v_tpr) {
1687 ctrl->v_tpr = v_tpr;
1688 svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
1689 }
1690
1691 /* If an event cannot otherwise be injected, we are done for now */
1692 if (ev_state != EIS_CAN_INJECT) {
1693 return (ev_state);
1694 }
1695
1696 if (!vlapic_pending_intr(vlapic, &vector)) {
1697 return (EIS_CAN_INJECT);
1698 }
1699 KASSERT(vector >= 16 && vector <= 255,
1700 ("invalid vector %d from local APIC", vector));
1701
1702 /*
1703 * If the guest has disabled interrupts or is in an interrupt shadow
1704 * then we cannot inject the pending interrupt.
1705 */
1706 if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) {
1707 return (EIS_GI_BLOCK);
1708 }
1709
1710 svm_inject_irq(sc, vcpu, vector);
1711 vlapic_intr_accepted(vlapic, vector);
1712 return (EIS_EV_INJECTED);
1713 }
1714
1715 /*
1716 * Re-check for events to be injected.
1717 *
1718 * Once host CPU interrupts are disabled, check for the presence of any events
1719 * which require injection processing. If an exit is required upon injection,
1720 * or once the guest becomes interruptible, that will be configured too.
1721 */
1722 static bool
1723 svm_inject_recheck(struct svm_softc *sc, int vcpu,
1724 enum event_inject_state ev_state)
1725 {
1726 struct vmcb_ctrl *ctrl;
1727
1728 ctrl = svm_get_vmcb_ctrl(sc, vcpu);
1729
1730 if (ev_state == EIS_CAN_INJECT) {
1731 /*
1732 * An active interrupt shadow would preclude us from injecting
1733 * any events picked up during a re-check.
1734 */
1735 if (ctrl->intr_shadow != 0) {
1736 return (false);
1737 }
1738
1739 if (vm_nmi_pending(sc->vm, vcpu) &&
1740 !svm_nmi_blocked(sc, vcpu)) {
1741 /* queued NMI not blocked by NMI-window-exiting */
1742 return (true);
1743 }
1744 if (vm_extint_pending(sc->vm, vcpu)) {
1745 /* queued ExtINT not blocked by existing injection */
1746 return (true);
1747 }
1748 } else {
1749 if ((ev_state & EIS_REQ_EXIT) != 0) {
1750 /*
1751 * Use a self-IPI to force an immediate exit after
1752 * event injection has occurred.
1753 */
1754 poke_cpu(CPU->cpu_id);
1755 } else {
1756 /*
1757 * If any event is being injected, an exit immediately
1758 * upon becoming interruptible again will allow pending
1759 * or newly queued events to be injected in a timely
1760 * manner.
1761 */
1762 svm_enable_intr_window_exiting(sc, vcpu);
1763 }
1764 }
1765 return (false);
1766 }
1767
1768
1769 static void
1770 check_asid(struct svm_softc *sc, int vcpuid, uint_t thiscpu, uint64_t nptgen)
1771 {
1772 struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1773 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1774 uint8_t flush;
1775
1776 flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1777 vcpustate->nptgen != nptgen);
1778
1779 if (flush != VMCB_TLB_FLUSH_NOTHING) {
1780 ctrl->asid = vcpustate->hma_asid.hsa_asid;
1781 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1782 }
1783 ctrl->tlb_ctrl = flush;
1784 vcpustate->nptgen = nptgen;
1785 }
1786
1787 static void
1788 flush_asid(struct svm_softc *sc, int vcpuid)
1789 {
1790 struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid);
1791 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
1792 uint8_t flush;
1793
1794 /* HMA ASID updates are expected to be done with interrupts disabled */
1795 const ulong_t iflag = intr_clear();
1796 flush = hma_svm_asid_update(&vcpustate->hma_asid, has_flush_by_asid(),
1797 true);
1798 intr_restore(iflag);
1799
1800 ASSERT(flush != VMCB_TLB_FLUSH_NOTHING);
1801 ctrl->asid = vcpustate->hma_asid.hsa_asid;
1802 ctrl->tlb_ctrl = flush;
1803 svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
1804 /*
1805 * A potential future optimization: We could choose to update the nptgen
1806 * associated with the vCPU, since any pending nptgen change requiring a
1807 * flush will be satisfied by the one which has just now been queued.
1808 */
1809 }
1810
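/*
 * Swap host debug-register state for the guest's prior to VMRUN. Guest DR6,
 * DR7, and DEBUGCTL are handled by the VMCB; only DR0-DR3 are loaded by hand,
 * with the host values stashed for svm_dr_leave_guest().
 */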
1811 static __inline void
1812 svm_dr_enter_guest(struct svm_regctx *gctx)
1813 {
1814
1815 /* Save host control debug registers. */
1816 gctx->host_dr7 = rdr7();
1817 gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
1818
1819 /*
1820 * Disable debugging in DR7 and DEBUGCTL to avoid triggering
1821 * exceptions in the host based on the guest DRx values. The
1822 * guest DR6, DR7, and DEBUGCTL are saved/restored in the
1823 * VMCB.
1824 */
1825 load_dr7(0);
1826 wrmsr(MSR_DEBUGCTLMSR, 0);
1827
1828 /* Save host debug registers. */
1829 gctx->host_dr0 = rdr0();
1830 gctx->host_dr1 = rdr1();
1831 gctx->host_dr2 = rdr2();
1832 gctx->host_dr3 = rdr3();
1833 gctx->host_dr6 = rdr6();
1834
1835 /* Restore guest debug registers. */
1836 load_dr0(gctx->sctx_dr0);
1837 load_dr1(gctx->sctx_dr1);
1838 load_dr2(gctx->sctx_dr2);
1839 load_dr3(gctx->sctx_dr3);
1840 }
1841
1842 static __inline void
1843 svm_dr_leave_guest(struct svm_regctx *gctx)
1844 {
1845
1846 /* Save guest debug registers. */
1847 gctx->sctx_dr0 = rdr0();
1848 gctx->sctx_dr1 = rdr1();
1849 gctx->sctx_dr2 = rdr2();
1850 gctx->sctx_dr3 = rdr3();
1851
1852 /*
1853 * Restore host debug registers. Restore DR7 and DEBUGCTL
1854 * last.
1855 */
1856 load_dr0(gctx->host_dr0);
1857 load_dr1(gctx->host_dr1);
1858 load_dr2(gctx->host_dr2);
1859 load_dr3(gctx->host_dr3);
1860 load_dr6(gctx->host_dr6);
1861 wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
1862 load_dr7(gctx->host_dr7);
1863 }
1864
1865 /*
1866 * Apply the TSC offset for a vCPU, including physical CPU and per-vCPU offsets.
1867 */
1868 static void
1869 svm_apply_tsc_adjust(struct svm_softc *svm_sc, int vcpuid)
1870 {
1871 const uint64_t offset = vcpu_tsc_offset(svm_sc->vm, vcpuid, true);
1872 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(svm_sc, vcpuid);
1873
1874 if (ctrl->tsc_offset != offset) {
1875 ctrl->tsc_offset = offset;
1876 svm_set_dirty(svm_sc, vcpuid, VMCB_CACHE_I);
1877 }
1878 }
1879
1880 /*
1881 * Start vcpu with specified RIP.
1882 */
1883 static int
1884 svm_vmrun(void *arg, int vcpu, uint64_t rip)
1885 {
1886 struct svm_regctx *gctx;
1887 struct svm_softc *svm_sc;
1888 struct svm_vcpu *vcpustate;
1889 struct vmcb_state *state;
1890 struct vm_exit *vmexit;
1891 struct vlapic *vlapic;
1892 vm_client_t *vmc;
1893 struct vm *vm;
1894 uint64_t vmcb_pa;
1895 int handled;
1896 uint16_t ldt_sel;
1897
1898 svm_sc = arg;
1899 vm = svm_sc->vm;
1900
1901 vcpustate = svm_get_vcpu(svm_sc, vcpu);
1902 state = svm_get_vmcb_state(svm_sc, vcpu);
1903 vmexit = vm_exitinfo(vm, vcpu);
1904 vlapic = vm_lapic(vm, vcpu);
1905 vmc = vm_get_vmclient(vm, vcpu);
1906
1907 gctx = svm_get_guest_regctx(svm_sc, vcpu);
1908 vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
1909
1910 if (vcpustate->lastcpu != curcpu) {
1911 /*
1912 * Force new ASID allocation by invalidating the generation.
1913 */
1914 vcpustate->hma_asid.hsa_gen = 0;
1915
1916 /*
1917 * Invalidate the VMCB state cache by marking all fields dirty.
1918 */
1919 svm_set_dirty(svm_sc, vcpu, 0xffffffff);
1920
1921 /*
1922 * XXX
1923 * Setting 'vcpustate->lastcpu' here is a bit premature because
1924 * we may return from this function without actually executing
1925 * the VMRUN instruction. This could happen if an AST or yield
1926 * condition is pending on the first time through the loop.
1927 *
1928 * This works for now but any new side-effects of vcpu
1929 * migration should take this case into account.
1930 */
1931 vcpustate->lastcpu = curcpu;
1932 vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
1933 }
1934
1935 svm_apply_tsc_adjust(svm_sc, vcpu);
1936
1937 svm_msr_guest_enter(svm_sc, vcpu);
1938
1939 VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
1940 vcpustate->loaded = B_TRUE;
1941
1942 /* Update Guest RIP */
1943 state->rip = rip;
1944
1945 do {
1946 enum event_inject_state inject_state;
1947 uint64_t nptgen;
1948
1949 /*
1950 * Initial event injection is complex and may involve mutex
1951 * contention, so it must be performed with global interrupts
1952 * still enabled.
1953 */
1954 inject_state = svm_inject_events(svm_sc, vcpu);
1955 handled = 0;
1956
1957 /*
1958 * Disable interrupts while loading VM state and performing
1959 * event injection.
1960 */
1961 const ulong_t iflag = intr_clear();
1962
1963 /*
1964 * Synchronizing and injecting vlapic state is lock-free and is
1965 * safe (and prudent) to perform with interrupts disabled.
1966 */
1967 inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
1968 inject_state);
1969
1970 /*
1971 * Check for vCPU bail-out conditions. This must be done after
1972 * svm_inject_events() to detect a triple-fault condition.
1973 */
1974 if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
1975 intr_restore(iflag);
1976 break;
1977 }
1978
1979 if (vcpu_run_state_pending(vm, vcpu)) {
1980 intr_restore(iflag);
1981 vm_exit_run_state(vm, vcpu, state->rip);
1982 break;
1983 }
1984
1985 /*
1986 * If subsequent activity queued events which require injection
1987 * handling, take another lap to handle them.
1988 */
1989 if (svm_inject_recheck(svm_sc, vcpu, inject_state)) {
1990 intr_restore(iflag);
1991 handled = 1;
1992 continue;
1993 }
1994
1995 /*
1996 * #VMEXIT resumes the host with the guest LDTR, so
1997 * save the current LDT selector so it can be restored
1998 * after an exit. The userspace hypervisor probably
1999 * doesn't use an LDT, but save and restore it to be
2000 * safe.
2001 */
2002 ldt_sel = sldt();
2003
2004 /*
2005 * Check the vmspace and ASID generations to ensure that the
2006 * vcpu does not use stale TLB mappings.
2007 */
2008 nptgen = vmc_table_enter(vmc);
2009 check_asid(svm_sc, vcpu, curcpu, nptgen);
2010
2011 svm_pmu_enter(svm_sc, vcpu);
2012 vcpu_ustate_change(vm, vcpu, VU_RUN);
2013 svm_dr_enter_guest(gctx);
2014 svm_apply_dirty(svm_sc, vcpu);
2015
2016 /*
2017 * Perform VMRUN to enter guest context.
2018 *
2019 * This is done with the protection of clearing the GIF
2020 * (global interrupt flag) as required by SVM.
2021 */
2022 hma_svm_gif_disable();
2023 svm_launch(vmcb_pa, gctx, get_pcpu());
2024 hma_svm_gif_enable();
2025
2026 svm_dr_leave_guest(gctx);
2027 vcpu_ustate_change(vm, vcpu, VU_EMU_KERN);
2028 svm_pmu_exit(svm_sc, vcpu);
2029
2030 /* Restore host LDTR. */
2031 lldt(ldt_sel);
2032
2033 /*
2034 * Re-enable interrupts now that necessary CPU state has been
2035 * restored. Subsequent logic may need to block.
2036 */
2037 intr_restore(iflag);
2038
2039 vmc_table_exit(vmc);
2040
2041 /* Update 'nextrip' */
2042 vcpustate->nextrip = state->rip;
2043
2044 /* Handle #VMEXIT and if required return to user space. */
2045 handled = svm_vmexit(svm_sc, vcpu, vmexit);
2046 } while (handled);
2047
2048 svm_msr_guest_exit(svm_sc, vcpu);
2049
2050 ASSERT(interrupts_enabled());
2051 VERIFY(vcpustate->loaded && curthread->t_preempt != 0);
2052 vcpustate->loaded = B_FALSE;
2053
2054 return (0);
2055 }
2056
2057 static void
2058 svm_vmcleanup(void *arg)
2059 {
2060 struct svm_softc *sc = arg;
2061
2062 vmm_contig_free(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE);
2063 vmm_contig_free(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE);
2064 kmem_free(sc, sizeof (*sc));
2065 }
2066
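/*
 * Translate a VM_REG_GUEST_* identifier into a pointer within the
 * software-maintained register context (registers which SVM does not
 * save/restore via the VMCB). Returns NULL for registers kept elsewhere.
 */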
2067 static uint64_t *
2068 swctx_regptr(struct svm_regctx *regctx, int reg)
2069 {
2070 switch (reg) {
2071 case VM_REG_GUEST_RBX:
2072 return (&regctx->sctx_rbx);
2073 case VM_REG_GUEST_RCX:
2074 return (&regctx->sctx_rcx);
2075 case VM_REG_GUEST_RDX:
2076 return (&regctx->sctx_rdx);
2077 case VM_REG_GUEST_RDI:
2078 return (&regctx->sctx_rdi);
2079 case VM_REG_GUEST_RSI:
2080 return (&regctx->sctx_rsi);
2081 case VM_REG_GUEST_RBP:
2082 return (&regctx->sctx_rbp);
2083 case VM_REG_GUEST_R8:
2084 return (&regctx->sctx_r8);
2085 case VM_REG_GUEST_R9:
2086 return (&regctx->sctx_r9);
2087 case VM_REG_GUEST_R10:
2088 return (&regctx->sctx_r10);
2089 case VM_REG_GUEST_R11:
2090 return (&regctx->sctx_r11);
2091 case VM_REG_GUEST_R12:
2092 return (&regctx->sctx_r12);
2093 case VM_REG_GUEST_R13:
2094 return (&regctx->sctx_r13);
2095 case VM_REG_GUEST_R14:
2096 return (&regctx->sctx_r14);
2097 case VM_REG_GUEST_R15:
2098 return (&regctx->sctx_r15);
2099 case VM_REG_GUEST_DR0:
2100 return (&regctx->sctx_dr0);
2101 case VM_REG_GUEST_DR1:
2102 return (&regctx->sctx_dr1);
2103 case VM_REG_GUEST_DR2:
2104 return (&regctx->sctx_dr2);
2105 case VM_REG_GUEST_DR3:
2106 return (&regctx->sctx_dr3);
2107 default:
2108 return (NULL);
2109 }
2110 }
2111
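/*
 * Fetch a guest register value, checking the software-maintained register
 * context first and falling back to state held in the VMCB (or synthesized,
 * in the case of registers SVM does not track).
 */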
2112 static int
2113 svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
2114 {
2115 struct svm_softc *sc;
2116 struct vmcb *vmcb;
2117 uint64_t *regp;
2118 uint64_t *fieldp;
2119 struct vmcb_segment *seg;
2120
2121 sc = arg;
2122 vmcb = svm_get_vmcb(sc, vcpu);
2123
2124 regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2125 if (regp != NULL) {
2126 *val = *regp;
2127 return (0);
2128 }
2129
2130 switch (ident) {
2131 case VM_REG_GUEST_INTR_SHADOW:
2132 *val = (vmcb->ctrl.intr_shadow != 0) ? 1 : 0;
2133 break;
2134
2135 case VM_REG_GUEST_CR0:
2136 svm_get_cr0(sc, vcpu, val);
2137 break;
2138 case VM_REG_GUEST_CR2:
2139 case VM_REG_GUEST_CR3:
2140 case VM_REG_GUEST_CR4:
2141 case VM_REG_GUEST_DR6:
2142 case VM_REG_GUEST_DR7:
2143 case VM_REG_GUEST_EFER:
2144 case VM_REG_GUEST_RAX:
2145 case VM_REG_GUEST_RFLAGS:
2146 case VM_REG_GUEST_RIP:
2147 case VM_REG_GUEST_RSP:
2148 fieldp = vmcb_regptr(vmcb, ident, NULL);
2149 *val = *fieldp;
2150 break;
2151
2152 case VM_REG_GUEST_CS:
2153 case VM_REG_GUEST_DS:
2154 case VM_REG_GUEST_ES:
2155 case VM_REG_GUEST_FS:
2156 case VM_REG_GUEST_GS:
2157 case VM_REG_GUEST_SS:
2158 case VM_REG_GUEST_LDTR:
2159 case VM_REG_GUEST_TR:
2160 seg = vmcb_segptr(vmcb, ident);
2161 *val = seg->selector;
2162 break;
2163
2164 case VM_REG_GUEST_GDTR:
2165 case VM_REG_GUEST_IDTR:
2166 /* GDTR and IDTR don't have segment selectors */
2167 return (EINVAL);
2168
2169 case VM_REG_GUEST_PDPTE0:
2170 case VM_REG_GUEST_PDPTE1:
2171 case VM_REG_GUEST_PDPTE2:
2172 case VM_REG_GUEST_PDPTE3:
2173 /*
2174 * Unlike VMX, where the PDPTEs are explicitly cached as part of
2175 * several well-defined events related to paging (such as
2176 * loading %cr3), SVM walks the PDPEs (AMD's term for the PDPTEs) as part of
2177 * nested paging lookups. This makes these registers
2178 * effectively irrelevant on SVM.
2179 *
2180 * Rather than tossing an error, emit zeroed values so casual
2181 * consumers do not need to be as careful about that difference.
2182 */
2183 *val = 0;
2184 break;
2185
2186 default:
2187 return (EINVAL);
2188 }
2189
2190 return (0);
2191 }
2192
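/*
 * Update a guest register, writing either into the software register context
 * or into the VMCB, and marking the affected VMCB cache sections dirty as
 * needed.
 */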
2193 static int
2194 svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
2195 {
2196 struct svm_softc *sc;
2197 struct vmcb *vmcb;
2198 uint64_t *regp;
2199 uint64_t *fieldp;
2200 uint32_t dirty;
2201 struct vmcb_segment *seg;
2202
2203 sc = arg;
2204 vmcb = svm_get_vmcb(sc, vcpu);
2205
2206 regp = swctx_regptr(svm_get_guest_regctx(sc, vcpu), ident);
2207 if (regp != NULL) {
2208 *regp = val;
2209 return (0);
2210 }
2211
2212 dirty = VMCB_CACHE_NONE;
2213 switch (ident) {
2214 case VM_REG_GUEST_INTR_SHADOW:
2215 vmcb->ctrl.intr_shadow = (val != 0) ? 1 : 0;
2216 break;
2217
2218 case VM_REG_GUEST_EFER:
2219 fieldp = vmcb_regptr(vmcb, ident, &dirty);
2220 /* EFER_SVM must always be set when the guest is executing */
2221 *fieldp = val | EFER_SVM;
2222 dirty |= VMCB_CACHE_CR;
2223 break;
2224
2225 case VM_REG_GUEST_CR0:
2226 svm_set_cr0(sc, vcpu, val, false);
2227 break;
2228 case VM_REG_GUEST_CR2:
2229 case VM_REG_GUEST_CR3:
2230 case VM_REG_GUEST_CR4:
2231 case VM_REG_GUEST_DR6:
2232 case VM_REG_GUEST_DR7:
2233 case VM_REG_GUEST_RAX:
2234 case VM_REG_GUEST_RFLAGS:
2235 case VM_REG_GUEST_RIP:
2236 case VM_REG_GUEST_RSP:
2237 fieldp = vmcb_regptr(vmcb, ident, &dirty);
2238 *fieldp = val;
2239 break;
2240
2241 case VM_REG_GUEST_CS:
2242 case VM_REG_GUEST_DS:
2243 case VM_REG_GUEST_ES:
2244 case VM_REG_GUEST_SS:
2245 case VM_REG_GUEST_FS:
2246 case VM_REG_GUEST_GS:
2247 case VM_REG_GUEST_LDTR:
2248 case VM_REG_GUEST_TR:
2249 dirty |= VMCB_CACHE_SEG;
2250 seg = vmcb_segptr(vmcb, ident);
2251 seg->selector = (uint16_t)val;
2252 break;
2253
2254 case VM_REG_GUEST_GDTR:
2255 case VM_REG_GUEST_IDTR:
2256 /* GDTR and IDTR don't have segment selectors */
2257 return (EINVAL);
2258
2259 case VM_REG_GUEST_PDPTE0:
2260 case VM_REG_GUEST_PDPTE1:
2261 case VM_REG_GUEST_PDPTE2:
2262 case VM_REG_GUEST_PDPTE3:
2263 /*
2264 * PDPEs (AMD's term for the PDPTEs) are not cached under SVM, so we can
2265 * ignore attempts to set them. See handler in svm_getreg() for
2266 * more details.
2267 */
2268 break;
2269
2270 default:
2271 return (EINVAL);
2272 }
2273
2274 if (dirty != VMCB_CACHE_NONE) {
2275 svm_set_dirty(sc, vcpu, dirty);
2276 }
2277
2278 /*
2279 * XXX deal with CR3 and invalidate TLB entries tagged with the
2280 * vcpu's ASID. This needs to be treated differently depending on
2281 * whether 'running' is true/false.
2282 */
2283
2284 return (0);
2285 }
2286
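/*
 * Install a segment descriptor (base, limit, and access rights) into the
 * VMCB, translating from the processor-independent seg_desc representation.
 */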
2287 static int
2288 svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
2289 {
2290 struct vmcb *vmcb;
2291 struct svm_softc *sc;
2292 struct vmcb_segment *seg;
2293
2294 sc = arg;
2295 vmcb = svm_get_vmcb(sc, vcpu);
2296
2297 switch (reg) {
2298 case VM_REG_GUEST_CS:
2299 case VM_REG_GUEST_DS:
2300 case VM_REG_GUEST_ES:
2301 case VM_REG_GUEST_SS:
2302 case VM_REG_GUEST_FS:
2303 case VM_REG_GUEST_GS:
2304 case VM_REG_GUEST_LDTR:
2305 case VM_REG_GUEST_TR:
2306 svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG);
2307 seg = vmcb_segptr(vmcb, reg);
2308 /*
2309 * Map seg_desc access to VMCB attribute format.
2310 *
2311 * SVM uses the 'P' bit in the segment attributes to indicate a
2312 * NULL segment so clear it if the segment is marked unusable.
2313 */
2314 seg->attrib = VMCB_ACCESS2ATTR(desc->access);
2315 if (SEG_DESC_UNUSABLE(desc->access)) {
2316 seg->attrib &= ~0x80;
2317 }
2318 /*
2319 * Keep CPL synced with the DPL specified for %ss.
2320 *
2321 * KVM notes that a SYSRET to non-cpl-3 is possible on AMD
2322 * (unlike Intel), but accepts such a possible deviation for
2323 * what is otherwise unreasonable behavior for a guest OS, since
2324 * they do the same synchronization.
2325 */
2326 if (reg == VM_REG_GUEST_SS) {
2327 vmcb->state.cpl = SEG_DESC_DPL(desc->access);
2328 }
2329 break;
2330
2331 case VM_REG_GUEST_GDTR:
2332 case VM_REG_GUEST_IDTR:
2333 svm_set_dirty(sc, vcpu, VMCB_CACHE_DT);
2334 seg = vmcb_segptr(vmcb, reg);
2335 break;
2336
2337 default:
2338 return (EINVAL);
2339 }
2340
2341 ASSERT(seg != NULL);
2342 seg->base = desc->base;
2343 seg->limit = desc->limit;
2344
2345 return (0);
2346 }
2347
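/*
 * Read a segment descriptor out of the VMCB into the processor-independent
 * seg_desc representation, converting SVM attribute bits back into the VT-x
 * style access format expected by callers.
 */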
2348 static int
2349 svm_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
2350 {
2351 struct vmcb *vmcb;
2352 struct svm_softc *sc;
2353 struct vmcb_segment *seg;
2354
2355 sc = arg;
2356 vmcb = svm_get_vmcb(sc, vcpu);
2357
2358 switch (reg) {
2359 case VM_REG_GUEST_DS:
2360 case VM_REG_GUEST_ES:
2361 case VM_REG_GUEST_FS:
2362 case VM_REG_GUEST_GS:
2363 case VM_REG_GUEST_SS:
2364 case VM_REG_GUEST_LDTR:
2365 seg = vmcb_segptr(vmcb, reg);
2366 desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2367 /*
2368 * VT-x uses bit 16 to indicate a segment that has been loaded
2369 * with a NULL selector (aka unusable). The 'desc->access'
2370 * field is interpreted in the VT-x format by the
2371 * processor-independent code.
2372 *
2373 * SVM uses the 'P' bit to convey the same information so
2374 * convert it into the VT-x format. For more details refer to
2375 * section "Segment State in the VMCB" in APMv2.
2376 */
2377 if ((desc->access & 0x80) == 0) {
2378 /* Unusable segment */
2379 desc->access |= 0x10000;
2380 }
2381
2382 /*
2383 * Just as CPL (in the VMCB) is kept synced to SS when the
2384 * segment is written, so too shall the segment sync from CPL
2385 * when it is read.
2386 */
2387 if (reg == VM_REG_GUEST_SS) {
2388 desc->access &=
2389 ~(SEG_DESC_DPL_MASK << SEG_DESC_DPL_SHIFT);
2390 desc->access |=
2391 (vmcb->state.cpl & SEG_DESC_DPL_MASK) <<
2392 SEG_DESC_DPL_SHIFT;
2393 }
2394 break;
2395
2396 case VM_REG_GUEST_CS:
2397 case VM_REG_GUEST_TR:
2398 seg = vmcb_segptr(vmcb, reg);
2399 desc->access = VMCB_ATTR2ACCESS(seg->attrib);
2400 break;
2401
2402 case VM_REG_GUEST_GDTR:
2403 case VM_REG_GUEST_IDTR:
2404 seg = vmcb_segptr(vmcb, reg);
2405 /*
2406 * Since there are no access bits associated with the GDTR or
2407 * the IDTR, zero out the field to ensure it does not contain
2408 * garbage which might confuse the consumer.
2409 */
2410 desc->access = 0;
2411 break;
2412
2413 default:
2414 return (EINVAL);
2415 }
2416
2417 ASSERT(seg != NULL);
2418 desc->base = seg->base;
2419 desc->limit = seg->limit;
2420 return (0);
2421 }
2422
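/*
 * Read an MSR value for MSRs whose backing storage lives in the VMCB; all
 * others are rejected with EINVAL.
 */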
2423 static int
2424 svm_get_msr(void *arg, int vcpu, uint32_t msr, uint64_t *valp)
2425 {
2426 struct svm_softc *sc = arg;
2427 struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2428 const uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, NULL);
2429
2430 if (msrp != NULL) {
2431 *valp = *msrp;
2432 return (0);
2433 }
2434
2435 return (EINVAL);
2436 }
2437
2438 static int
2439 svm_set_msr(void *arg, int vcpu, uint32_t msr, uint64_t val)
2440 {
2441 struct svm_softc *sc = arg;
2442 struct vmcb *vmcb = svm_get_vmcb(sc, vcpu);
2443
2444 uint32_t dirty = 0;
2445 uint64_t *msrp = vmcb_msr_ptr(vmcb, msr, &dirty);
2446 if (msrp == NULL) {
2447 return (EINVAL);
2448 }
2449 switch (msr) {
2450 case MSR_EFER:
2451 /*
2452 * For now, just clone the logic from
2453 * svm_setreg():
2454 *
2455 * EFER_SVM must always be set when the guest is
2456 * executing
2457 */
2458 *msrp = val | EFER_SVM;
2459 break;
2460 /* TODO: other necessary MSR masking */
2461 default:
2462 *msrp = val;
2463 break;
2464 }
2465 if (dirty != 0) {
2466 svm_set_dirty(sc, vcpu, dirty);
2467 }
2468 return (0);
2469
2470 }
2471
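/*
 * Set an optional capability by toggling the corresponding VMCB intercept
 * (currently exit-on-HLT and exit-on-PAUSE).
 */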
2472 static int
2473 svm_setcap(void *arg, int vcpu, int type, int val)
2474 {
2475 struct svm_softc *sc;
2476 int error;
2477
2478 sc = arg;
2479 error = 0;
2480 switch (type) {
2481 case VM_CAP_HALT_EXIT:
2482 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2483 VMCB_INTCPT_HLT, val);
2484 break;
2485 case VM_CAP_PAUSE_EXIT:
2486 svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2487 VMCB_INTCPT_PAUSE, val);
2488 break;
2489 default:
2490 error = ENOENT;
2491 break;
2492 }
2493 return (error);
2494 }
2495
2496 static int
2497 svm_getcap(void *arg, int vcpu, int type, int *retval)
2498 {
2499 struct svm_softc *sc;
2500 int error;
2501
2502 sc = arg;
2503 error = 0;
2504
2505 switch (type) {
2506 case VM_CAP_HALT_EXIT:
2507 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2508 VMCB_INTCPT_HLT);
2509 break;
2510 case VM_CAP_PAUSE_EXIT:
2511 *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
2512 VMCB_INTCPT_PAUSE);
2513 break;
2514 default:
2515 error = ENOENT;
2516 break;
2517 }
2518 return (error);
2519 }
2520
2521 static struct vlapic *
2522 svm_vlapic_init(void *arg, int vcpuid)
2523 {
2524 struct svm_softc *svm_sc;
2525 struct vlapic *vlapic;
2526
2527 svm_sc = arg;
2528 vlapic = kmem_zalloc(sizeof (struct vlapic), KM_SLEEP);
2529 vlapic->vm = svm_sc->vm;
2530 vlapic->vcpuid = vcpuid;
2531 vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
2532
2533 vlapic_init(vlapic);
2534
2535 return (vlapic);
2536 }
2537
2538 static void
2539 svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
2540 {
2541 vlapic_cleanup(vlapic);
2542 kmem_free(vlapic, sizeof (struct vlapic));
2543 }
2544
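/*
 * Quiesce a vCPU for pause/suspend: any event still staged for injection in
 * the VMCB is pushed back into the deferred intinfo state, and the related
 * interrupt-window and IRET intercepts are disabled until it runs again.
 */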
2545 static void
2546 svm_pause(void *arg, int vcpu)
2547 {
2548 struct svm_softc *sc = arg;
2549 struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu);
2550
2551 /*
2552 * If an event is pending injection in the VMCB, stash it in
2553 * exit_intinfo as if it were deferred by an exit from guest context.
2554 */
2555 const uint64_t intinfo = ctrl->eventinj;
2556 if ((intinfo & VMCB_EVENTINJ_VALID) != 0) {
2557 svm_stash_intinfo(sc, vcpu, intinfo);
2558 ctrl->eventinj = 0;
2559 }
2560
2561 /*
2562 * Now that no event is pending injection, interrupt-window exiting and
2563 * NMI-blocking can be disabled. If/when this vCPU is made to run
2564 * again, those conditions will be reinstated when the now-queued events
2565 * are re-injected.
2566 */
2567 svm_disable_intr_window_exiting(sc, vcpu);
2568 svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
2569 }
2570
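/*
 * Context-switch hooks: when the thread hosting a loaded vCPU goes off-CPU,
 * guest MSR state is unloaded so the host's values are in place, and it is
 * loaded again when the thread is switched back on.
 */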
2571 static void
2572 svm_savectx(void *arg, int vcpu)
2573 {
2574 struct svm_softc *sc = arg;
2575
2576 /* We should _never_ go off-CPU with the GIF disabled */
2577 ASSERT(!hma_svm_gif_is_disabled());
2578
2579 if (sc->vcpu[vcpu].loaded) {
2580 svm_msr_guest_exit(sc, vcpu);
2581 }
2582 }
2583
2584 static void
2585 svm_restorectx(void *arg, int vcpu)
2586 {
2587 struct svm_softc *sc = arg;
2588
2589 if (sc->vcpu[vcpu].loaded) {
2590 svm_msr_guest_enter(sc, vcpu);
2591 }
2592 }
2593
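/*
 * Compute the fixed-point multiplier used to scale the guest TSC frequency,
 * or report that scaling is unnecessary, unsupported, or out of range.
 *
 * As a rough worked example (assuming AMD_TSCM_FRAC_SIZE describes the 32-bit
 * fractional portion of AMD's 8.32 TSC-ratio format): a 1.5 GHz guest on a
 * 3.0 GHz host would yield a multiplier of (1.5e9 << 32) / 3.0e9 =
 * 0x80000000, i.e. a ratio of 0.5.
 */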
2594 static freqratio_res_t
2595 svm_freq_ratio(uint64_t guest_hz, uint64_t host_hz, uint64_t *mult)
2596 {
2597 /*
2598 * Check whether scaling is needed at all before potentially erroring
2599 * out for other reasons.
2600 */
2601 if (guest_hz == host_hz) {
2602 return (FR_SCALING_NOT_NEEDED);
2603 }
2604
2605 /*
2606 * Confirm that scaling is available.
2607 */
2608 if (!has_tsc_freq_ctl()) {
2609 return (FR_SCALING_NOT_SUPPORTED);
2610 }
2611
2612 /*
2613 * Verify the guest_hz is within the supported range.
2614 */
2615 if ((guest_hz < AMD_TSC_MIN_FREQ) ||
2616 (guest_hz >= (host_hz * AMD_TSC_MAX_FREQ_RATIO))) {
2617 return (FR_OUT_OF_RANGE);
2618 }
2619
2620 /* Calculate the multiplier. */
2621 uint64_t m = vmm_calc_freq_multiplier(guest_hz, host_hz,
2622 AMD_TSCM_FRAC_SIZE);
2623 *mult = m;
2624
2625 return (FR_VALID);
2626 }
2627
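/*
 * Operations vector through which the processor-independent vmm layer drives
 * the AMD SVM backend.
 */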
2628 struct vmm_ops vmm_ops_amd = {
2629 .init = svm_init,
2630 .resume = svm_restore,
2631
2632 .vminit = svm_vminit,
2633 .vmrun = svm_vmrun,
2634 .vmcleanup = svm_vmcleanup,
2635 .vmgetreg = svm_getreg,
2636 .vmsetreg = svm_setreg,
2637 .vmgetdesc = svm_getdesc,
2638 .vmsetdesc = svm_setdesc,
2639 .vmgetcap = svm_getcap,
2640 .vmsetcap = svm_setcap,
2641 .vlapic_init = svm_vlapic_init,
2642 .vlapic_cleanup = svm_vlapic_cleanup,
2643 .vmpause = svm_pause,
2644
2645 .vmsavectx = svm_savectx,
2646 .vmrestorectx = svm_restorectx,
2647
2648 .vmgetmsr = svm_get_msr,
2649 .vmsetmsr = svm_set_msr,
2650
2651 .vmfreqratio = svm_freq_ratio,
2652 .fr_intsize = AMD_TSCM_INT_SIZE,
2653 .fr_fracsize = AMD_TSCM_FRAC_SIZE,
2654 };
2655