1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3 #include <linux/kvm_host.h>
4
5 #include "irq.h"
6 #include "mmu.h"
7 #include "kvm_cache_regs.h"
8 #include "x86.h"
9 #include "smm.h"
10 #include "cpuid.h"
11 #include "pmu.h"
12
13 #include <linux/module.h>
14 #include <linux/mod_devicetable.h>
15 #include <linux/kernel.h>
16 #include <linux/vmalloc.h>
17 #include <linux/highmem.h>
18 #include <linux/amd-iommu.h>
19 #include <linux/sched.h>
20 #include <linux/trace_events.h>
21 #include <linux/slab.h>
22 #include <linux/hashtable.h>
23 #include <linux/objtool.h>
24 #include <linux/psp-sev.h>
25 #include <linux/file.h>
26 #include <linux/pagemap.h>
27 #include <linux/swap.h>
28 #include <linux/rwsem.h>
29 #include <linux/cc_platform.h>
30 #include <linux/smp.h>
31 #include <linux/string_choices.h>
32 #include <linux/mutex.h>
33
34 #include <asm/apic.h>
35 #include <asm/msr.h>
36 #include <asm/perf_event.h>
37 #include <asm/tlbflush.h>
38 #include <asm/desc.h>
39 #include <asm/debugreg.h>
40 #include <asm/kvm_para.h>
41 #include <asm/irq_remapping.h>
42 #include <asm/spec-ctrl.h>
43 #include <asm/cpu_device_id.h>
44 #include <asm/traps.h>
45 #include <asm/reboot.h>
46 #include <asm/fpu/api.h>
47
48 #include <trace/events/ipi.h>
49
50 #include "trace.h"
51
52 #include "svm.h"
53 #include "svm_ops.h"
54
55 #include "kvm_onhyperv.h"
56 #include "svm_onhyperv.h"
57
/* Module metadata. */
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
/*
 * CPU match table, only built when compiled as a module: it matches on
 * X86_FEATURE_SVM and is exported as an x86cpu device table.
 */
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif
69
/* Descriptor type values for system segments (LDT and busy 16-bit TSS). */
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

/*
 * Set by svm_init_erratum_383() on CPUs flagged with X86_BUG_AMD_TLB_MMATCH
 * once MSR_AMD64_DC_CFG has been read and rewritten.
 */
static bool erratum_383_found __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

/*
 * Per-CPU copy of the value last written to MSR_AMD64_TSC_RATIO by
 * __svm_write_tsc_multiplier(); used to skip redundant MSR writes.
 */
static DEFINE_PER_CPU(u64, current_tsc_ratio);
82
/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *	upper bound on the amount of time a guest is allowed to execute in a
 *	pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
130
/*
 * Use nested page tables by default.  Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, 0444);

/* enable/disable Next RIP Save */
int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
int lbrv = true;
module_param(lbrv, int, 0444);

/* enable/disable TSC scaling */
static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

module_param(enable_device_posted_irqs, bool, 0444);

/* Unlike the other parameters, this one is writable at runtime (0644). */
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

bool vnmi = true;
module_param(vnmi, bool, 0444);

module_param(enable_mediated_pmu, bool, 0444);

/*
 * When set, svm_set_efer() intercepts #GP for non-SEV guests while the guest
 * has EFER.SVME enabled.
 */
static bool svm_gp_erratum_intercept = true;

/* Opcode bytes 0F AA (RSM). */
static u8 rsm_ins_bytes[] = "\x0f\xaa";

/*
 * Physical address of the I/O permission map; its pages are released in
 * svm_hardware_unsetup().
 */
static unsigned long iopm_base;

DEFINE_PER_CPU(struct svm_cpu_data, svm_data);

/* NOTE(review): presumably serializes VMCB dumps; no user is visible in this part of the file. */
static DEFINE_MUTEX(vmcb_dump_mutex);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
int tsc_aux_uret_slot __ro_after_init = -1;
193
get_npt_level(void)194 static int get_npt_level(void)
195 {
196 #ifdef CONFIG_X86_64
197 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
198 #else
199 return PT32E_ROOT_LEVEL;
200 #endif
201 }
202
/*
 * Update the guest's EFER.
 *
 * The requested value is recorded in vcpu->arch.efer, while the value written
 * to the VMCB is adjusted for shadow paging and always has EFER.SVME set.
 * A change of EFER.SVME allocates or releases the nested virtualization state.
 *
 * Returns 0 on success, or the error from svm_allocate_nested(), in which
 * case vcpu->arch.efer is restored to its previous value.
 */
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 prev_efer = vcpu->arch.efer;
	int err;

	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available. */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	/* Nested state only needs attention when EFER.SVME flipped. */
	if (!((prev_efer ^ efer) & EFER_SVME))
		goto write_vmcb;

	if (efer & EFER_SVME) {
		err = svm_allocate_nested(svm);
		if (err) {
			vcpu->arch.efer = prev_efer;
			return err;
		}

		/*
		 * Never intercept #GP for SEV guests, KVM can't
		 * decrypt guest memory to workaround the erratum.
		 */
		if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
			set_exception_intercept(svm, GP_VECTOR);
	} else {
		svm_leave_nested(vcpu);

		/* #GP intercept is still needed for vmware backdoor */
		if (!enable_vmware_backdoor)
			clr_exception_intercept(svm, GP_VECTOR);

		/*
		 * Free the nested guest state, unless we are in SMM.
		 * In this case we will return to the nested guest
		 * as soon as we leave SMM.
		 */
		if (!is_smm(vcpu))
			svm_free_nested(svm);
	}

write_vmcb:
	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}
253
svm_get_interrupt_shadow(struct kvm_vcpu * vcpu)254 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
255 {
256 struct vcpu_svm *svm = to_svm(vcpu);
257 u32 ret = 0;
258
259 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
260 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
261 return ret;
262 }
263
svm_set_interrupt_shadow(struct kvm_vcpu * vcpu,int mask)264 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
265 {
266 struct vcpu_svm *svm = to_svm(vcpu);
267
268 if (mask == 0)
269 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
270 else
271 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
272
273 }
274
/*
 * Advance the guest RIP past the current instruction.
 *
 * @emul_type is passed to the emulator when no next RIP is available.
 * @commit_side_effects selects whether the skip may clear the interrupt
 * shadow and keep RFLAGS changes made by the emulator; when false, RFLAGS is
 * restored after emulation and the interrupt shadow is left untouched.
 *
 * Returns 1 on success and 0 if emulation failed.
 */
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
					   int emul_type,
					   bool commit_side_effects)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long saved_rflags;

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	/* Prefer the next RIP provided in the VMCB, when there is one. */
	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (svm->next_rip) {
		kvm_rip_write(vcpu, svm->next_rip);
		goto done;
	}

	/* No next RIP: fall back to the emulator to skip the instruction. */
	if (unlikely(!commit_side_effects))
		saved_rflags = svm->vmcb->save.rflags;

	if (!kvm_emulate_instruction(vcpu, emul_type))
		return 0;

	if (unlikely(!commit_side_effects))
		svm->vmcb->save.rflags = saved_rflags;

done:
	if (likely(commit_side_effects))
		svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}
313
/*
 * Skip the current guest instruction with EMULTYPE_SKIP, committing side
 * effects.  Returns the result of __svm_skip_emulated_instruction().
 */
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
318
/*
 * Compute and record the RIP information needed to inject the soft
 * interrupt/exception @vector.
 *
 * Returns 0 on success, -EIO if the instruction could not be skipped.
 */
static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
	const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
			      EMULTYPE_SET_SOFT_INT_VECTOR(vector);
	unsigned long insn_rip = kvm_rip_read(vcpu);
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long next_rip;

	/*
	 * Due to architectural shortcomings, the CPU doesn't always provide
	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
	 * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
	 * the instruction even if NextRIP is supported to acquire the next
	 * RIP so that it can be shoved into the NextRIP field, otherwise
	 * hardware will fail to advance guest RIP during event injection.
	 * Drop the exception/interrupt if emulation fails and effectively
	 * retry the instruction, it's the least awful option.  If NRIPS is
	 * in use, the skip must not commit any side effects such as clearing
	 * the interrupt shadow or RFLAGS.RF.
	 */
	if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
		return -EIO;

	next_rip = kvm_rip_read(vcpu);

	/*
	 * Save the injection information, even when using next_rip, as the
	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
	 * doesn't complete due to a VM-Exit occurring while the CPU is
	 * vectoring the event.   Decoding the instruction isn't guaranteed to
	 * work as there may be no backing instruction, e.g. if the event is
	 * being injected by L1 for L2, or if the guest is patching INT3 into
	 * a different instruction.
	 */
	svm->soft_int_injected = true;
	svm->soft_int_csbase = svm->vmcb->save.cs.base;
	svm->soft_int_old_rip = insn_rip;
	svm->soft_int_next_rip = next_rip;

	/* With NRIPS the skip was only temporary: rewind to the instruction. */
	if (nrips)
		kvm_rip_write(vcpu, insn_rip);

	if (static_cpu_has(X86_FEATURE_NRIPS))
		svm->vmcb->control.next_rip = next_rip;

	return 0;
}
365
svm_inject_exception(struct kvm_vcpu * vcpu)366 static void svm_inject_exception(struct kvm_vcpu *vcpu)
367 {
368 struct kvm_queued_exception *ex = &vcpu->arch.exception;
369 struct vcpu_svm *svm = to_svm(vcpu);
370
371 kvm_deliver_exception_payload(vcpu, ex);
372
373 if (kvm_exception_is_soft(ex->vector) &&
374 svm_update_soft_interrupt_rip(vcpu, ex->vector))
375 return;
376
377 svm->vmcb->control.event_inj = ex->vector
378 | SVM_EVTINJ_VALID
379 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
380 | SVM_EVTINJ_TYPE_EXEPT;
381 svm->vmcb->control.event_inj_err = ex->error_code;
382 }
383
svm_init_erratum_383(void)384 static void svm_init_erratum_383(void)
385 {
386 u64 val;
387
388 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
389 return;
390
391 /* Use _safe variants to not break nested virtualization */
392 if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val))
393 return;
394
395 val |= (1ULL << 47);
396
397 native_write_msr_safe(MSR_AMD64_DC_CFG, val);
398
399 erratum_383_found = true;
400 }
401
svm_init_osvw(struct kvm_vcpu * vcpu)402 static void svm_init_osvw(struct kvm_vcpu *vcpu)
403 {
404 /*
405 * Guests should see errata 400 and 415 as fixed (assuming that
406 * HLT and IO instructions are intercepted).
407 */
408 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
409 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
410
411 /*
412 * By increasing VCPU's osvw.length to 3 we are telling the guest that
413 * all osvw.status bits inside that length, including bit 0 (which is
414 * reserved for erratum 298), are valid. However, if host processor's
415 * osvw_len is 0 then osvw_status[0] carries no information. We need to
416 * be conservative here and therefore we tell the guest that erratum 298
417 * is present (because we really don't know).
418 */
419 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
420 vcpu->arch.osvw.status |= 1;
421 }
422
__kvm_is_svm_supported(void)423 static bool __kvm_is_svm_supported(void)
424 {
425 int cpu = smp_processor_id();
426 struct cpuinfo_x86 *c = &cpu_data(cpu);
427
428 if (c->x86_vendor != X86_VENDOR_AMD &&
429 c->x86_vendor != X86_VENDOR_HYGON) {
430 pr_err("CPU %d isn't AMD or Hygon\n", cpu);
431 return false;
432 }
433
434 if (!cpu_has(c, X86_FEATURE_SVM)) {
435 pr_err("SVM not supported by CPU %d\n", cpu);
436 return false;
437 }
438
439 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
440 pr_info("KVM is unsupported when running as an SEV guest\n");
441 return false;
442 }
443
444 return true;
445 }
446
kvm_is_svm_supported(void)447 static bool kvm_is_svm_supported(void)
448 {
449 bool supported;
450
451 migrate_disable();
452 supported = __kvm_is_svm_supported();
453 migrate_enable();
454
455 return supported;
456 }
457
svm_check_processor_compat(void)458 static int svm_check_processor_compat(void)
459 {
460 if (!__kvm_is_svm_supported())
461 return -EIO;
462
463 return 0;
464 }
465
/*
 * Write @multiplier to MSR_AMD64_TSC_RATIO on the current CPU, unless the
 * per-CPU cached value shows it is already programmed.
 */
static void __svm_write_tsc_multiplier(u64 multiplier)
{
	if (__this_cpu_read(current_tsc_ratio) != multiplier) {
		wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
		__this_cpu_write(current_tsc_ratio, multiplier);
	}
}
474
/* Return a pointer to the host SEV-ES save area inside @sd's save area. */
static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
	return &sd->save_area->host_sev_es_save;
}
479
kvm_cpu_svm_disable(void)480 static inline void kvm_cpu_svm_disable(void)
481 {
482 uint64_t efer;
483
484 wrmsrq(MSR_VM_HSAVE_PA, 0);
485 rdmsrq(MSR_EFER, efer);
486 if (efer & EFER_SVME) {
487 /*
488 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
489 * NMI aren't blocked.
490 */
491 stgi();
492 wrmsrq(MSR_EFER, efer & ~EFER_SVME);
493 }
494 }
495
/*
 * Emergency variant of disabling SVM on the current CPU: flags that KVM is
 * rebooting, then disables SVM.  Unlike svm_disable_virtualization_cpu(), it
 * does not touch the TSC ratio or the PMU.
 */
static void svm_emergency_disable_virtualization_cpu(void)
{
	kvm_rebooting = true;

	kvm_cpu_svm_disable();
}
502
/*
 * Disable virtualization on the current CPU: restore the default TSC ratio
 * (when TSC scaling is enabled), disable SVM, then call
 * amd_pmu_disable_virt().
 */
static void svm_disable_virtualization_cpu(void)
{
	/* Make sure we clean up behind us */
	if (tsc_scaling)
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);

	kvm_cpu_svm_disable();

	amd_pmu_disable_virt();
}
513
svm_enable_virtualization_cpu(void)514 static int svm_enable_virtualization_cpu(void)
515 {
516
517 struct svm_cpu_data *sd;
518 uint64_t efer;
519 int me = raw_smp_processor_id();
520
521 rdmsrq(MSR_EFER, efer);
522 if (efer & EFER_SVME)
523 return -EBUSY;
524
525 sd = per_cpu_ptr(&svm_data, me);
526 sd->asid_generation = 1;
527 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
528 sd->next_asid = sd->max_asid + 1;
529 sd->min_asid = max_sev_asid + 1;
530
531 wrmsrq(MSR_EFER, efer | EFER_SVME);
532
533 wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);
534
535 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
536 /*
537 * Set the default value, even if we don't use TSC scaling
538 * to avoid having stale value in the msr
539 */
540 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
541 }
542
543
544 /*
545 * Get OSVW bits.
546 *
547 * Note that it is possible to have a system with mixed processor
548 * revisions and therefore different OSVW bits. If bits are not the same
549 * on different processors then choose the worst case (i.e. if erratum
550 * is present on one processor and not on another then assume that the
551 * erratum is present everywhere).
552 */
553 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
554 u64 len, status = 0;
555 int err;
556
557 err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
558 if (!err)
559 err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);
560
561 if (err)
562 osvw_status = osvw_len = 0;
563 else {
564 if (len < osvw_len)
565 osvw_len = len;
566 osvw_status |= status;
567 osvw_status &= (1ULL << osvw_len) - 1;
568 }
569 } else
570 osvw_status = osvw_len = 0;
571
572 svm_init_erratum_383();
573
574 amd_pmu_enable_virt();
575
576 return 0;
577 }
578
svm_cpu_uninit(int cpu)579 static void svm_cpu_uninit(int cpu)
580 {
581 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
582
583 if (!sd->save_area)
584 return;
585
586 kfree(sd->sev_vmcbs);
587 __free_page(__sme_pa_to_page(sd->save_area_pa));
588 sd->save_area_pa = 0;
589 sd->save_area = NULL;
590 }
591
svm_cpu_init(int cpu)592 static int svm_cpu_init(int cpu)
593 {
594 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
595 struct page *save_area_page;
596 int ret = -ENOMEM;
597
598 memset(sd, 0, sizeof(struct svm_cpu_data));
599 save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
600 if (!save_area_page)
601 return ret;
602
603 ret = sev_cpu_init(sd);
604 if (ret)
605 goto free_save_area;
606
607 sd->save_area = page_address(save_area_page);
608 sd->save_area_pa = __sme_page_pa(save_area_page);
609 return 0;
610
611 free_save_area:
612 __free_page(save_area_page);
613 return ret;
614
615 }
616
set_dr_intercepts(struct vcpu_svm * svm)617 static void set_dr_intercepts(struct vcpu_svm *svm)
618 {
619 struct vmcb *vmcb = svm->vmcb01.ptr;
620
621 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
622 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
623 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
624 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
625 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
626 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
627 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
628 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
629 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
630 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
631 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
632 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
633 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
634 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
635 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
636 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
637
638 recalc_intercepts(svm);
639 }
640
clr_dr_intercepts(struct vcpu_svm * svm)641 static void clr_dr_intercepts(struct vcpu_svm *svm)
642 {
643 struct vmcb *vmcb = svm->vmcb01.ptr;
644
645 vmcb->control.intercepts[INTERCEPT_DR] = 0;
646
647 recalc_intercepts(svm);
648 }
649
msr_write_intercepted(struct kvm_vcpu * vcpu,u32 msr)650 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
651 {
652 /*
653 * For non-nested case:
654 * If the L01 MSR bitmap does not intercept the MSR, then we need to
655 * save it.
656 *
657 * For nested case:
658 * If the L02 MSR bitmap does not intercept the MSR, then we need to
659 * save it.
660 */
661 void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
662 to_svm(vcpu)->msrpm;
663
664 return svm_test_msr_bitmap_write(msrpm, msr);
665 }
666
/*
 * Enable (@set == true) or disable interception of @msr in the vCPU's MSR
 * permission map, for the access types selected by @type (MSR_TYPE_R and/or
 * MSR_TYPE_W).  Interception is only disabled if kvm_msr_allowed() permits
 * the access; otherwise the MSR stays intercepted.
 */
void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	void *msrpm = svm->msrpm;

	/* Don't disable interception for MSRs userspace wants to handle. */
	if (type & MSR_TYPE_R) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			svm_set_msr_bitmap_read(msrpm, msr);
		else
			svm_clear_msr_bitmap_read(msrpm, msr);
	}

	if (type & MSR_TYPE_W) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			svm_set_msr_bitmap_write(msrpm, msr);
		else
			svm_clear_msr_bitmap_write(msrpm, msr);
	}

	/* The bitmap changed: flag it for Hyper-V and for the nested merge. */
	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}
690
/*
 * Allocate a permissions map of at least @size bytes (rounded up to a
 * power-of-two number of pages) with every bit set.
 *
 * Returns the kernel virtual address of the map, or NULL on allocation
 * failure.
 */
void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
	const unsigned int order = get_order(size);
	struct page *pages;
	void *pm;

	pages = alloc_pages(gfp_mask, order);
	if (!pages)
		return NULL;

	/*
	 * Set all bits in the permissions map so that all MSR and I/O accesses
	 * are intercepted by default.
	 */
	pm = page_address(pages);
	memset(pm, 0xff, PAGE_SIZE << order);

	return pm;
}
709
svm_recalc_lbr_msr_intercepts(struct kvm_vcpu * vcpu)710 static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
711 {
712 struct vcpu_svm *svm = to_svm(vcpu);
713 bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
714
715 if (intercept == svm->lbr_msrs_intercepted)
716 return;
717
718 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
719 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
720 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
721 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
722
723 if (sev_es_guest(vcpu->kvm))
724 svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
725
726 svm->lbr_msrs_intercepted = intercept;
727 }
728
/* Free the pages backing an MSR permission map of MSRPM_SIZE bytes. */
void svm_vcpu_free_msrpm(void *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
733
svm_recalc_pmu_msr_intercepts(struct kvm_vcpu * vcpu)734 static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
735 {
736 bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu);
737 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
738 int i;
739
740 if (!enable_mediated_pmu)
741 return;
742
743 /* Legacy counters are always available for AMD CPUs with a PMU. */
744 for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++)
745 svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i,
746 MSR_TYPE_RW, intercept);
747
748 intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE);
749 for (i = 0; i < pmu->nr_arch_gp_counters; i++)
750 svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i,
751 MSR_TYPE_RW, intercept);
752
753 for ( ; i < kvm_pmu_cap.num_counters_gp; i++)
754 svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i,
755 MSR_TYPE_RW);
756
757 intercept = kvm_need_perf_global_ctrl_intercept(vcpu);
758 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
759 MSR_TYPE_RW, intercept);
760 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
761 MSR_TYPE_RW, intercept);
762 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
763 MSR_TYPE_RW, intercept);
764 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
765 MSR_TYPE_RW, intercept);
766 }
767
svm_recalc_msr_intercepts(struct kvm_vcpu * vcpu)768 static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
769 {
770 struct vcpu_svm *svm = to_svm(vcpu);
771
772 svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
773 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
774
775 #ifdef CONFIG_X86_64
776 svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
777 svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
778 svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
779 svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
780 svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
781 svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
782 #endif
783
784 if (lbrv)
785 svm_recalc_lbr_msr_intercepts(vcpu);
786
787 if (cpu_feature_enabled(X86_FEATURE_IBPB))
788 svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
789 !guest_has_pred_cmd_msr(vcpu));
790
791 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
792 svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
793 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
794
795 /*
796 * Disable interception of SPEC_CTRL if KVM doesn't need to manually
797 * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
798 * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
799 * using SPEC_CTRL.
800 */
801 if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
802 svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
803 !guest_has_spec_ctrl_msr(vcpu));
804 else
805 svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
806 !svm->spec_ctrl);
807
808 /*
809 * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
810 * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits.
811 */
812 svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
813 guest_cpuid_is_intel_compatible(vcpu));
814 svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
815 guest_cpuid_is_intel_compatible(vcpu));
816
817 if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
818 svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
819 svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
820 }
821
822 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
823 bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
824
825 svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled);
826 svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled);
827 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled);
828 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled);
829 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled);
830 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled);
831 }
832
833 if (sev_es_guest(vcpu->kvm))
834 sev_es_recalc_msr_intercepts(vcpu);
835
836 svm_recalc_pmu_msr_intercepts(vcpu);
837
838 /*
839 * x2APIC intercepts are modified on-demand and cannot be filtered by
840 * userspace.
841 */
842 }
843
svm_copy_lbrs(struct vmcb * to_vmcb,struct vmcb * from_vmcb)844 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
845 {
846 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
847 to_vmcb->save.br_from = from_vmcb->save.br_from;
848 to_vmcb->save.br_to = from_vmcb->save.br_to;
849 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
850 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
851
852 vmcb_mark_dirty(to_vmcb, VMCB_LBR);
853 }
854
__svm_enable_lbrv(struct kvm_vcpu * vcpu)855 static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
856 {
857 to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
858 }
859
/*
 * Enable LBR virtualization in the current VMCB and update the LBR MSR
 * intercepts to match.
 */
void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	__svm_enable_lbrv(vcpu);
	svm_recalc_lbr_msr_intercepts(vcpu);
}
865
__svm_disable_lbrv(struct kvm_vcpu * vcpu)866 static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
867 {
868 KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
869 to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
870 }
871
svm_update_lbrv(struct kvm_vcpu * vcpu)872 void svm_update_lbrv(struct kvm_vcpu *vcpu)
873 {
874 struct vcpu_svm *svm = to_svm(vcpu);
875 bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
876 bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
877 (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
878 (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
879
880 if (enable_lbrv && !current_enable_lbrv)
881 __svm_enable_lbrv(vcpu);
882 else if (!enable_lbrv && current_enable_lbrv)
883 __svm_disable_lbrv(vcpu);
884
885 /*
886 * During nested transitions, it is possible that the current VMCB has
887 * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
888 * In this case, even though LBR_CTL does not need an update, intercepts
889 * do, so always recalculate the intercepts here.
890 */
891 svm_recalc_lbr_msr_intercepts(vcpu);
892 }
893
disable_nmi_singlestep(struct vcpu_svm * svm)894 void disable_nmi_singlestep(struct vcpu_svm *svm)
895 {
896 svm->nmi_singlestep = false;
897
898 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
899 /* Clear our flags if they were not set by the guest */
900 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
901 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
902 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
903 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
904 }
905 }
906
grow_ple_window(struct kvm_vcpu * vcpu)907 static void grow_ple_window(struct kvm_vcpu *vcpu)
908 {
909 struct vcpu_svm *svm = to_svm(vcpu);
910 struct vmcb_control_area *control = &svm->vmcb->control;
911 int old = control->pause_filter_count;
912
913 if (kvm_pause_in_guest(vcpu->kvm))
914 return;
915
916 control->pause_filter_count = __grow_ple_window(old,
917 pause_filter_count,
918 pause_filter_count_grow,
919 pause_filter_count_max);
920
921 if (control->pause_filter_count != old) {
922 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
923 trace_kvm_ple_window_update(vcpu->vcpu_id,
924 control->pause_filter_count, old);
925 }
926 }
927
shrink_ple_window(struct kvm_vcpu * vcpu)928 static void shrink_ple_window(struct kvm_vcpu *vcpu)
929 {
930 struct vcpu_svm *svm = to_svm(vcpu);
931 struct vmcb_control_area *control = &svm->vmcb->control;
932 int old = control->pause_filter_count;
933
934 if (kvm_pause_in_guest(vcpu->kvm))
935 return;
936
937 control->pause_filter_count =
938 __shrink_ple_window(old,
939 pause_filter_count,
940 pause_filter_count_shrink,
941 pause_filter_count);
942 if (control->pause_filter_count != old) {
943 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
944 trace_kvm_ple_window_update(vcpu->vcpu_id,
945 control->pause_filter_count, old);
946 }
947 }
948
/*
 * Undo the module-wide hardware setup: tear down AVIC and SEV, release the
 * per-CPU data of every possible CPU and free the I/O permission map.
 */
static void svm_hardware_unsetup(void)
{
	int cpu;

	avic_hardware_unsetup();

	sev_hardware_unsetup();

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	/* iopm_base holds a physical address; convert it back to its pages. */
	__free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
	iopm_base = 0;
}
963
init_seg(struct vmcb_seg * seg)964 static void init_seg(struct vmcb_seg *seg)
965 {
966 seg->selector = 0;
967 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
968 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
969 seg->limit = 0xffff;
970 seg->base = 0;
971 }
972
/*
 * Initialise @seg as a present system segment of the given @type:
 * selector 0, base 0, limit 0xffff.
 */
static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->selector = 0;
	seg->base = 0;
	seg->limit = 0xffff;
}
980
svm_get_l2_tsc_offset(struct kvm_vcpu * vcpu)981 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
982 {
983 struct vcpu_svm *svm = to_svm(vcpu);
984
985 return svm->nested.ctl.tsc_offset;
986 }
987
svm_get_l2_tsc_multiplier(struct kvm_vcpu * vcpu)988 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
989 {
990 struct vcpu_svm *svm = to_svm(vcpu);
991
992 return svm->tsc_ratio_msr;
993 }
994
svm_write_tsc_offset(struct kvm_vcpu * vcpu)995 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
996 {
997 struct vcpu_svm *svm = to_svm(vcpu);
998
999 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1000 svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
1001 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1002 }
1003
svm_write_tsc_multiplier(struct kvm_vcpu * vcpu)1004 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1005 {
1006 preempt_disable();
1007 if (to_svm(vcpu)->guest_state_loaded)
1008 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1009 preempt_enable();
1010 }
1011
1012 /* Evaluate instruction intercepts that depend on guest CPUID features. */
svm_recalc_instruction_intercepts(struct kvm_vcpu * vcpu)1013 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
1014 {
1015 struct vcpu_svm *svm = to_svm(vcpu);
1016
1017 /*
1018 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1019 * roots, or if INVPCID is disabled in the guest to inject #UD.
1020 */
1021 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1022 if (!npt_enabled ||
1023 !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
1024 svm_set_intercept(svm, INTERCEPT_INVPCID);
1025 else
1026 svm_clr_intercept(svm, INTERCEPT_INVPCID);
1027 }
1028
1029 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1030 if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
1031 svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1032 else
1033 svm_set_intercept(svm, INTERCEPT_RDTSCP);
1034 }
1035
1036 /*
1037 * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is
1038 * always set if vls is enabled. If the intercepts are set, the bit is
1039 * meaningless anyway.
1040 */
1041 if (guest_cpuid_is_intel_compatible(vcpu)) {
1042 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1043 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1044 } else {
1045 /*
1046 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1047 * in VMCB and clear intercepts to avoid #VMEXIT.
1048 */
1049 if (vls) {
1050 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1051 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1052 }
1053 }
1054
1055 if (kvm_need_rdpmc_intercept(vcpu))
1056 svm_set_intercept(svm, INTERCEPT_RDPMC);
1057 else
1058 svm_clr_intercept(svm, INTERCEPT_RDPMC);
1059 }
1060
/*
 * Recompute the vCPU's intercepts: instruction intercepts first, then MSR
 * intercepts.
 */
static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
{
	svm_recalc_instruction_intercepts(vcpu);
	svm_recalc_msr_intercepts(vcpu);
}
1066
/*
 * Program vmcb01 with its initial state: the default set of CR, DR,
 * exception and instruction intercepts, the I/O and MSR permission map
 * addresses, the initial segment and descriptor-table registers, and the
 * feature-dependent controls (NPT, PAUSE filtering, ERAPS, AVIC, vNMI, vGIF,
 * virtual VMLOAD/VMSAVE, bus-lock detection, SEV, Hyper-V).
 *
 * @init_event is only forwarded to sev_init_vmcb().
 *
 * On return, KVM_REQ_RECALC_INTERCEPTS has been requested, the whole VMCB is
 * marked dirty and enable_gif() has been called.
 */
static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct vmcb_control_area *control = &vmcb->control;
	struct vmcb_save_area *save = &vmcb->save;

	/* Control register accesses intercepted by default. */
	svm_set_intercept(svm, INTERCEPT_CR0_READ);
	svm_set_intercept(svm, INTERCEPT_CR3_READ);
	svm_set_intercept(svm, INTERCEPT_CR4_READ);
	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	/* Exceptions intercepted by default. */
	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);

	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);

	if (intercept_smi)
		svm_set_intercept(svm, INTERCEPT_SMI);

	/* Instruction intercepts set by default. */
	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(vcpu->kvm)) {
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
	}

	/* Use the IDLE_HLT intercept instead of HLT when the CPU has it. */
	if (!kvm_hlt_in_guest(vcpu->kvm)) {
		if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
			svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
		else
			svm_set_intercept(svm, INTERCEPT_HLT);
	}

	control->iopm_base_pa = iopm_base;
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.base = 0;
	save->gdtr.limit = 0xffff;
	save->idtr.base = 0;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = vcpu->arch.pat;
		save->cr3 = 0;
	}
	svm->current_vmcb->asid_generation = 0;
	svm->asid = 0;

	svm->nested.vmcb12_gpa = INVALID_GPA;
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	/*
	 * pause_filter_thresh is only written when the file-scope value is
	 * non-zero.
	 */
	if (!kvm_pause_in_guest(vcpu->kvm)) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		svm_set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
	}

	/*
	 * NOTE(review): the writes below that go through svm->vmcb (rather
	 * than the local vmcb/control, i.e. vmcb01) presumably target the
	 * same VMCB at this point -- confirm.
	 */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;

	if (enable_apicv && irqchip_in_kernel(vcpu->kvm))
		avic_init_vmcb(svm, vmcb);

	if (vnmi)
		svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;

	if (vgif) {
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (vls)
		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

	if (vcpu->kvm->arch.bus_lock_detection_enabled)
		svm_set_intercept(svm, INTERCEPT_BUSLOCK);

	if (sev_guest(vcpu->kvm))
		sev_init_vmcb(svm, init_event);

	svm_hv_init_vmcb(vmcb);

	/* Intercepts are recalculated later, when this request is serviced. */
	kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);

	vmcb_mark_all_dirty(vmcb);

	enable_gif(svm);
}
1220
__svm_vcpu_reset(struct kvm_vcpu * vcpu)1221 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1222 {
1223 struct vcpu_svm *svm = to_svm(vcpu);
1224
1225 svm_init_osvw(vcpu);
1226
1227 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
1228 vcpu->arch.microcode_version = 0x01000065;
1229 svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1230
1231 svm->nmi_masked = false;
1232 svm->awaiting_iret_completion = false;
1233 }
1234
/*
 * Reset the vCPU: zero the speculation-control shadow values and
 * re-initialise the VMCB.  __svm_vcpu_reset() additionally runs unless
 * @init_event is set.
 */
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->virt_spec_ctrl = 0;
	svm->spec_ctrl = 0;

	init_vmcb(vcpu, init_event);

	if (init_event)
		return;

	__svm_vcpu_reset(vcpu);
}
1247
svm_switch_vmcb(struct vcpu_svm * svm,struct kvm_vmcb_info * target_vmcb)1248 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1249 {
1250 svm->current_vmcb = target_vmcb;
1251 svm->vmcb = target_vmcb->ptr;
1252 }
1253
/*
 * Pre-creation hook: delegates to avic_alloc_physical_id_table() and returns
 * its result.
 */
static int svm_vcpu_precreate(struct kvm *kvm)
{
	return avic_alloc_physical_id_table(kvm);
}
1258
/*
 * Allocate and wire up the per-vCPU SVM state: the vmcb01 page, SEV and AVIC
 * per-vCPU state, and the MSR permission map.  On success vmcb01 becomes the
 * current VMCB.
 *
 * Returns 0 on success, -ENOMEM if the VMCB page or the MSR permission map
 * cannot be allocated, or the error returned by sev_vcpu_create() /
 * avic_init_vcpu().  On failure everything acquired so far is released.
 */
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;
	struct page *vmcb01_page;
	int err;

	/* to_svm() relies on the kvm_vcpu being the first member. */
	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	err = -ENOMEM;
	vmcb01_page = snp_safe_alloc_page();
	if (!vmcb01_page)
		goto out;

	err = sev_vcpu_create(vcpu);
	if (err)
		goto error_free_vmcb_page;

	err = avic_init_vcpu(svm);
	if (err)
		goto error_free_sev;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		err = -ENOMEM;
		goto error_free_sev;
	}

	svm->x2avic_msrs_intercepted = true;
	svm->lbr_msrs_intercepted = true;

	svm->vmcb01.ptr = page_address(vmcb01_page);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	svm->guest_state_loaded = false;

	return 0;

	/* Unwind in reverse order of acquisition. */
error_free_sev:
	sev_free_vcpu(vcpu);
error_free_vmcb_page:
	__free_page(vmcb01_page);
out:
	return err;
}
1305
/*
 * Release the per-vCPU SVM state: leave and free nested state, free the SEV
 * per-vCPU state, then free the vmcb01 page and the MSR permission map.
 * Warns (once) if the vCPU's ir_list is not empty.
 */
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	WARN_ON_ONCE(!list_empty(&svm->ir_list));

	svm_leave_nested(vcpu);
	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	/* The page is looked up from the physical address stored at creation. */
	__free_page(__sme_pa_to_page(svm->vmcb01.pa));
	svm_vcpu_free_msrpm(svm->msrpm);
}
1320
1321 #ifdef CONFIG_CPU_MITIGATIONS
/*
 * srso_nr_vms is incremented by svm_srso_vm_init() and decremented by
 * svm_srso_vm_destroy().  srso_lock is taken by those functions around the
 * counter's 0 => 1 and 1 => 0 transitions.
 */
static DEFINE_SPINLOCK(srso_lock);
static atomic_t srso_nr_vms;
1324
/*
 * Per-CPU callback, run via on_each_cpu() from svm_srso_vm_destroy(): clear
 * the BP_SPEC_REDUCE bit in MSR_ZEN4_BP_CFG on this CPU, but only if this
 * CPU's svm_data records that the bit was set.  @ign is unused.
 */
static void svm_srso_clear_bp_spec_reduce(void *ign)
{
	struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);

	if (!sd->bp_spec_reduce_set)
		return;

	msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
	sd->bp_spec_reduce_set = false;
}
1335
/*
 * Drop one reference on srso_nr_vms.  When the count reaches zero, clear
 * BP_SPEC_REDUCE on every CPU under srso_lock, unless the count has become
 * non-zero again by the time the lock is held.  No-op when
 * X86_FEATURE_SRSO_BP_SPEC_REDUCE is not enabled.
 */
static void svm_srso_vm_destroy(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
		return;

	/* Not the last reference: nothing to do. */
	if (atomic_dec_return(&srso_nr_vms))
		return;

	/* Scope-based guard: the lock is held until the function returns. */
	guard(spinlock)(&srso_lock);

	/*
	 * Verify a new VM didn't come along, acquire the lock, and increment
	 * the count before this task acquired the lock.
	 */
	if (atomic_read(&srso_nr_vms))
		return;

	/* Last argument 1: wait for the callback to finish on all CPUs. */
	on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
}
1355
/*
 * Take one reference on srso_nr_vms.  The increment is lockless when the
 * count is already non-zero; a 0 => 1 transition is done under srso_lock.
 * No-op when X86_FEATURE_SRSO_BP_SPEC_REDUCE is not enabled.
 */
static void svm_srso_vm_init(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
		return;

	/*
	 * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
	 * transition, i.e. destroying the last VM, is fully complete, e.g. so
	 * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
	 */
	if (atomic_inc_not_zero(&srso_nr_vms))
		return;

	/* Scope-based guard: the lock is held until the function returns. */
	guard(spinlock)(&srso_lock);

	atomic_inc(&srso_nr_vms);
}
1373 #else
/* Empty stubs for builds without CONFIG_CPU_MITIGATIONS. */
static void svm_srso_vm_init(void) { }
static void svm_srso_vm_destroy(void) { }
1376 #endif
1377
/*
 * Prepare the CPU for running the guest.  For SEV-ES guests the GHCB is
 * always unmapped first.  The rest is skipped when guest_state_loaded is
 * already set: saving host state with VMSAVE, SEV-ES specific preparation,
 * the TSC multiplier write, the TSC_AUX user-return MSR and the
 * BP_SPEC_REDUCE bit.  guest_state_loaded is set on completion.
 */
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);

	if (sev_es_guest(vcpu->kvm))
		sev_es_unmap_ghcb(svm);

	if (svm->guest_state_loaded)
		return;

	/*
	 * Save additional host state that will be restored on VMEXIT (sev-es)
	 * or subsequent vmload of host save area.
	 */
	vmsave(sd->save_area_pa);
	if (sev_es_guest(vcpu->kvm))
		sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));

	if (tsc_scaling)
		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);

	/*
	 * TSC_AUX is always virtualized (context switched by hardware) for
	 * SEV-ES guests when the feature is available.  For non-SEV-ES guests,
	 * context switch TSC_AUX via the user_return MSR infrastructure (not
	 * all CPUs support TSC_AUX virtualization).
	 */
	if (likely(tsc_aux_uret_slot >= 0) &&
	    (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

	/*
	 * Set BP_SPEC_REDUCE at most once per CPU; the per-CPU flag records
	 * that the bit is set so svm_srso_clear_bp_spec_reduce() can undo it.
	 */
	if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
	    !sd->bp_spec_reduce_set) {
		sd->bp_spec_reduce_set = true;
		msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
	}
	svm->guest_state_loaded = true;
}
1417
svm_prepare_host_switch(struct kvm_vcpu * vcpu)1418 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1419 {
1420 to_svm(vcpu)->guest_state_loaded = false;
1421 }
1422
svm_vcpu_load(struct kvm_vcpu * vcpu,int cpu)1423 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1424 {
1425 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
1426 shrink_ple_window(vcpu);
1427
1428 if (kvm_vcpu_apicv_active(vcpu))
1429 avic_vcpu_load(vcpu, cpu);
1430 }
1431
svm_vcpu_put(struct kvm_vcpu * vcpu)1432 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1433 {
1434 if (kvm_vcpu_apicv_active(vcpu))
1435 avic_vcpu_put(vcpu);
1436
1437 svm_prepare_host_switch(vcpu);
1438
1439 ++vcpu->stat.host_state_reload;
1440 }
1441
svm_get_rflags(struct kvm_vcpu * vcpu)1442 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1443 {
1444 struct vcpu_svm *svm = to_svm(vcpu);
1445 unsigned long rflags = svm->vmcb->save.rflags;
1446
1447 if (svm->nmi_singlestep) {
1448 /* Hide our flags if they were not set by the guest */
1449 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1450 rflags &= ~X86_EFLAGS_TF;
1451 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1452 rflags &= ~X86_EFLAGS_RF;
1453 }
1454 return rflags;
1455 }
1456
svm_set_rflags(struct kvm_vcpu * vcpu,unsigned long rflags)1457 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1458 {
1459 if (to_svm(vcpu)->nmi_singlestep)
1460 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1461
1462 /*
1463 * Any change of EFLAGS.VM is accompanied by a reload of SS
1464 * (caused by either a task switch or an inter-privilege IRET),
1465 * so we do not need to update the CPL here.
1466 */
1467 to_svm(vcpu)->vmcb->save.rflags = rflags;
1468 }
1469
svm_get_if_flag(struct kvm_vcpu * vcpu)1470 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1471 {
1472 struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1473
1474 return sev_es_guest(vcpu->kvm)
1475 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1476 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1477 }
1478
svm_cache_reg(struct kvm_vcpu * vcpu,enum kvm_reg reg)1479 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1480 {
1481 kvm_register_mark_available(vcpu, reg);
1482
1483 switch (reg) {
1484 case VCPU_EXREG_PDPTR:
1485 /*
1486 * When !npt_enabled, mmu->pdptrs[] is already available since
1487 * it is always updated per SDM when moving to CRs.
1488 */
1489 if (npt_enabled)
1490 load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1491 break;
1492 default:
1493 KVM_BUG_ON(1, vcpu->kvm);
1494 }
1495 }
1496
svm_set_vintr(struct vcpu_svm * svm)1497 static void svm_set_vintr(struct vcpu_svm *svm)
1498 {
1499 struct vmcb_control_area *control;
1500
1501 /*
1502 * The following fields are ignored when AVIC is enabled
1503 */
1504 WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1505
1506 svm_set_intercept(svm, INTERCEPT_VINTR);
1507
1508 /*
1509 * Recalculating intercepts may have cleared the VINTR intercept. If
1510 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
1511 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
1512 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
1513 * interrupts will never be unblocked while L2 is running.
1514 */
1515 if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1516 return;
1517
1518 /*
1519 * This is just a dummy VINTR to actually cause a vmexit to happen.
1520 * Actual injection of virtual interrupts happens through EVENTINJ.
1521 */
1522 control = &svm->vmcb->control;
1523 control->int_vector = 0x0;
1524 control->int_ctl &= ~V_INTR_PRIO_MASK;
1525 control->int_ctl |= V_IRQ_MASK |
1526 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1527 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1528 }
1529
svm_clear_vintr(struct vcpu_svm * svm)1530 static void svm_clear_vintr(struct vcpu_svm *svm)
1531 {
1532 svm_clr_intercept(svm, INTERCEPT_VINTR);
1533
1534 /* Drop int_ctl fields related to VINTR injection. */
1535 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1536 if (is_guest_mode(&svm->vcpu)) {
1537 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1538
1539 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1540 (svm->nested.ctl.int_ctl & V_TPR_MASK));
1541
1542 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1543 V_IRQ_INJECTION_BITS_MASK;
1544
1545 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1546 }
1547
1548 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1549 }
1550
svm_seg(struct kvm_vcpu * vcpu,int seg)1551 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1552 {
1553 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1554 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1555
1556 switch (seg) {
1557 case VCPU_SREG_CS: return &save->cs;
1558 case VCPU_SREG_DS: return &save->ds;
1559 case VCPU_SREG_ES: return &save->es;
1560 case VCPU_SREG_FS: return &save01->fs;
1561 case VCPU_SREG_GS: return &save01->gs;
1562 case VCPU_SREG_SS: return &save->ss;
1563 case VCPU_SREG_TR: return &save01->tr;
1564 case VCPU_SREG_LDTR: return &save01->ldtr;
1565 }
1566 BUG();
1567 return NULL;
1568 }
1569
svm_get_segment_base(struct kvm_vcpu * vcpu,int seg)1570 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1571 {
1572 struct vmcb_seg *s = svm_seg(vcpu, seg);
1573
1574 return s->base;
1575 }
1576
svm_get_segment(struct kvm_vcpu * vcpu,struct kvm_segment * var,int seg)1577 static void svm_get_segment(struct kvm_vcpu *vcpu,
1578 struct kvm_segment *var, int seg)
1579 {
1580 struct vmcb_seg *s = svm_seg(vcpu, seg);
1581
1582 var->base = s->base;
1583 var->limit = s->limit;
1584 var->selector = s->selector;
1585 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1586 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1587 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1588 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1589 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1590 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1591 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1592
1593 /*
1594 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1595 * However, the SVM spec states that the G bit is not observed by the
1596 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1597 * So let's synthesize a legal G bit for all segments, this helps
1598 * running KVM nested. It also helps cross-vendor migration, because
1599 * Intel's vmentry has a check on the 'G' bit.
1600 */
1601 var->g = s->limit > 0xfffff;
1602
1603 /*
1604 * AMD's VMCB does not have an explicit unusable field, so emulate it
1605 * for cross vendor migration purposes by "not present"
1606 */
1607 var->unusable = !var->present;
1608
1609 switch (seg) {
1610 case VCPU_SREG_TR:
1611 /*
1612 * Work around a bug where the busy flag in the tr selector
1613 * isn't exposed
1614 */
1615 var->type |= 0x2;
1616 break;
1617 case VCPU_SREG_DS:
1618 case VCPU_SREG_ES:
1619 case VCPU_SREG_FS:
1620 case VCPU_SREG_GS:
1621 /*
1622 * The accessed bit must always be set in the segment
1623 * descriptor cache, although it can be cleared in the
1624 * descriptor, the cached bit always remains at 1. Since
1625 * Intel has a check on this, set it here to support
1626 * cross-vendor migration.
1627 */
1628 if (!var->unusable)
1629 var->type |= 0x1;
1630 break;
1631 case VCPU_SREG_SS:
1632 /*
1633 * On AMD CPUs sometimes the DB bit in the segment
1634 * descriptor is left as 1, although the whole segment has
1635 * been made unusable. Clear it here to pass an Intel VMX
1636 * entry check when cross vendor migrating.
1637 */
1638 if (var->unusable)
1639 var->db = 0;
1640 /* This is symmetric with svm_set_segment() */
1641 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1642 break;
1643 }
1644 }
1645
svm_get_cpl(struct kvm_vcpu * vcpu)1646 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1647 {
1648 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1649
1650 return save->cpl;
1651 }
1652
svm_get_cs_db_l_bits(struct kvm_vcpu * vcpu,int * db,int * l)1653 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1654 {
1655 struct kvm_segment cs;
1656
1657 svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1658 *db = cs.db;
1659 *l = cs.l;
1660 }
1661
svm_get_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1662 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1663 {
1664 struct vcpu_svm *svm = to_svm(vcpu);
1665
1666 dt->size = svm->vmcb->save.idtr.limit;
1667 dt->address = svm->vmcb->save.idtr.base;
1668 }
1669
svm_set_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1670 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1671 {
1672 struct vcpu_svm *svm = to_svm(vcpu);
1673
1674 svm->vmcb->save.idtr.limit = dt->size;
1675 svm->vmcb->save.idtr.base = dt->address ;
1676 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1677 }
1678
svm_get_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1679 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1680 {
1681 struct vcpu_svm *svm = to_svm(vcpu);
1682
1683 dt->size = svm->vmcb->save.gdtr.limit;
1684 dt->address = svm->vmcb->save.gdtr.base;
1685 }
1686
svm_set_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1687 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1688 {
1689 struct vcpu_svm *svm = to_svm(vcpu);
1690
1691 svm->vmcb->save.gdtr.limit = dt->size;
1692 svm->vmcb->save.gdtr.base = dt->address ;
1693 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1694 }
1695
sev_post_set_cr3(struct kvm_vcpu * vcpu,unsigned long cr3)1696 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1697 {
1698 struct vcpu_svm *svm = to_svm(vcpu);
1699
1700 /*
1701 * For guests that don't set guest_state_protected, the cr3 update is
1702 * handled via kvm_mmu_load() while entering the guest. For guests
1703 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1704 * VMCB save area now, since the save area will become the initial
1705 * contents of the VMSA, and future VMCB save area updates won't be
1706 * seen.
1707 */
1708 if (sev_es_guest(vcpu->kvm)) {
1709 svm->vmcb->save.cr3 = cr3;
1710 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1711 }
1712 }
1713
/* CR0 validity hook: accepts every @cr0 value (always returns true). */
static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	return true;
}
1718
/*
 * Set the guest's CR0 to @cr0.
 *
 * vcpu->arch.cr0 receives @cr0 unmodified, while the value written to the
 * VMCB (hcr0) may differ: PG and WP are forced on without NPT, and CD/NW are
 * cleared when KVM_X86_QUIRK_CD_NW_CLEARED is active.  On 64-bit builds,
 * EFER.LMA is updated when paging is toggled while EFER.LME is set.  Except
 * for SEV-ES guests, the CR0 read/write intercepts are then cleared if hcr0
 * equals @cr0 and set otherwise.
 */
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 hcr0 = cr0;
	bool old_paging = is_paging(vcpu);

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME) {
		/* Paging is being enabled with LME set: enter long mode. */
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
			vcpu->arch.efer |= EFER_LMA;
			if (!vcpu->arch.guest_state_protected)
				svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
		}

		/* Paging is being disabled: leave long mode. */
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
			vcpu->arch.efer &= ~EFER_LMA;
			if (!vcpu->arch.guest_state_protected)
				svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
		}
	}
#endif
	vcpu->arch.cr0 = cr0;

	if (!npt_enabled) {
		hcr0 |= X86_CR0_PG | X86_CR0_WP;
		/* The CR4 value written to the VMCB depends on is_paging(). */
		if (old_paging != is_paging(vcpu))
			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
	}

	/*
	 * re-enable caching here because the QEMU bios
	 * does not do it - this results in some delay at
	 * reboot
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	svm->vmcb->save.cr0 = hcr0;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);

	/*
	 * SEV-ES guests must always keep the CR intercepts cleared. CR
	 * tracking is done using the CR write traps.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (hcr0 == cr0) {
		/* Selective CR0 write remains on. */
		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	} else {
		svm_set_intercept(svm, INTERCEPT_CR0_READ);
		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	}
}
1775
/* CR4 validity hook: accepts every @cr4 value (always returns true). */
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return true;
}
1780
svm_set_cr4(struct kvm_vcpu * vcpu,unsigned long cr4)1781 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1782 {
1783 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1784 unsigned long old_cr4 = vcpu->arch.cr4;
1785
1786 vcpu->arch.cr4 = cr4;
1787 if (!npt_enabled) {
1788 cr4 |= X86_CR4_PAE;
1789
1790 if (!is_paging(vcpu))
1791 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1792 }
1793 cr4 |= host_cr4_mce;
1794 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1795 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1796
1797 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1798 vcpu->arch.cpuid_dynamic_bits_dirty = true;
1799 }
1800
svm_set_segment(struct kvm_vcpu * vcpu,struct kvm_segment * var,int seg)1801 static void svm_set_segment(struct kvm_vcpu *vcpu,
1802 struct kvm_segment *var, int seg)
1803 {
1804 struct vcpu_svm *svm = to_svm(vcpu);
1805 struct vmcb_seg *s = svm_seg(vcpu, seg);
1806
1807 s->base = var->base;
1808 s->limit = var->limit;
1809 s->selector = var->selector;
1810 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1811 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1812 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1813 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1814 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1815 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1816 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1817 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1818
1819 /*
1820 * This is always accurate, except if SYSRET returned to a segment
1821 * with SS.DPL != 3. Intel does not have this quirk, and always
1822 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1823 * would entail passing the CPL to userspace and back.
1824 */
1825 if (seg == VCPU_SREG_SS)
1826 /* This is symmetric with svm_get_segment() */
1827 svm->vmcb->save.cpl = (var->dpl & 3);
1828
1829 vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1830 }
1831
svm_update_exception_bitmap(struct kvm_vcpu * vcpu)1832 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1833 {
1834 struct vcpu_svm *svm = to_svm(vcpu);
1835
1836 clr_exception_intercept(svm, BP_VECTOR);
1837
1838 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1839 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1840 set_exception_intercept(svm, BP_VECTOR);
1841 }
1842 }
1843
new_asid(struct vcpu_svm * svm,struct svm_cpu_data * sd)1844 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1845 {
1846 if (sd->next_asid > sd->max_asid) {
1847 ++sd->asid_generation;
1848 sd->next_asid = sd->min_asid;
1849 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1850 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1851 }
1852
1853 svm->current_vmcb->asid_generation = sd->asid_generation;
1854 svm->asid = sd->next_asid++;
1855 }
1856
svm_set_dr6(struct kvm_vcpu * vcpu,unsigned long value)1857 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1858 {
1859 struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1860
1861 if (vcpu->arch.guest_state_protected)
1862 return;
1863
1864 if (unlikely(value != vmcb->save.dr6)) {
1865 vmcb->save.dr6 = value;
1866 vmcb_mark_dirty(vmcb, VMCB_DR);
1867 }
1868 }
1869
/*
 * Pull the guest's debug registers back into vcpu->arch: DR0-DR3 are read
 * with get_debugreg(), DR6/DR7 from the VMCB save area.  Afterwards
 * KVM_DEBUGREG_WONT_EXIT is cleared and the DR intercepts are set again.
 * Must not be called for SEV-ES guests (warns once and returns).
 */
static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
		return;

	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	/*
	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
	 * because db_interception might need it.  We can do it before vmentry.
	 */
	vcpu->arch.dr6 = svm->vmcb->save.dr6;
	vcpu->arch.dr7 = svm->vmcb->save.dr7;
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	set_dr_intercepts(svm);
}
1890
svm_set_dr7(struct kvm_vcpu * vcpu,unsigned long value)1891 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1892 {
1893 struct vcpu_svm *svm = to_svm(vcpu);
1894
1895 if (vcpu->arch.guest_state_protected)
1896 return;
1897
1898 svm->vmcb->save.dr7 = value;
1899 vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1900 }
1901
pf_interception(struct kvm_vcpu * vcpu)1902 static int pf_interception(struct kvm_vcpu *vcpu)
1903 {
1904 struct vcpu_svm *svm = to_svm(vcpu);
1905
1906 u64 fault_address = svm->vmcb->control.exit_info_2;
1907 u64 error_code = svm->vmcb->control.exit_info_1;
1908
1909 return kvm_handle_page_fault(vcpu, error_code, fault_address,
1910 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1911 svm->vmcb->control.insn_bytes : NULL,
1912 svm->vmcb->control.insn_len);
1913 }
1914
/* Forward declaration: npf_interception() calls this before its definition. */
static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					 void *insn, int insn_len);
1917
/*
 * Nested page fault (#NPF) exit handler.  The error code comes from
 * exit_info_1 and the faulting guest physical address from exit_info_2.
 *
 * Reserved-bit faults taken outside guest mode first try the fast MMIO
 * path.  Everything else is handled by kvm_mmu_page_fault(), followed by
 * sev_handle_rmp_fault() when that returns > 0 and the error code has
 * PFERR_GUEST_RMP_MASK set.
 *
 * Returns the result of the path taken.
 */
static int npf_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int rc;

	u64 error_code = svm->vmcb->control.exit_info_1;
	gpa_t gpa = svm->vmcb->control.exit_info_2;

	/*
	 * WARN if hardware generates a fault with an error code that collides
	 * with KVM-defined synthetic flags.  Clear the flags and continue on,
	 * i.e. don't terminate the VM, as KVM can't possibly be relying on a
	 * flag that KVM doesn't know about.
	 */
	if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
		error_code &= ~PFERR_SYNTHETIC_MASK;

	/*
	 * Expedite fast MMIO kicks if the next RIP is known and KVM is allowed
	 * to emulate a page fault, e.g. skipping the current instruction is
	 * wrong if the #NPF occurred while vectoring an event.
	 */
	if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) {
		const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE;

		if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0))
			return 1;

		if (nrips && svm->vmcb->control.next_rip &&
		    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
			trace_kvm_fast_mmio(gpa);
			return kvm_skip_emulated_instruction(vcpu);
		}
	}

	/* For SNP guests, flag faults with PFERR_GUEST_ENC_MASK as private. */
	if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
		error_code |= PFERR_PRIVATE_ACCESS;

	trace_kvm_page_fault(vcpu, gpa, error_code);
	rc = kvm_mmu_page_fault(vcpu, gpa, error_code,
				static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
				svm->vmcb->control.insn_bytes : NULL,
				svm->vmcb->control.insn_len);

	if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
		sev_handle_rmp_fault(vcpu, gpa, error_code);

	return rc;
}
1967
db_interception(struct kvm_vcpu * vcpu)1968 static int db_interception(struct kvm_vcpu *vcpu)
1969 {
1970 struct kvm_run *kvm_run = vcpu->run;
1971 struct vcpu_svm *svm = to_svm(vcpu);
1972
1973 if (!(vcpu->guest_debug &
1974 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1975 !svm->nmi_singlestep) {
1976 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
1977 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
1978 return 1;
1979 }
1980
1981 if (svm->nmi_singlestep) {
1982 disable_nmi_singlestep(svm);
1983 /* Make sure we check for pending NMIs upon entry */
1984 kvm_make_request(KVM_REQ_EVENT, vcpu);
1985 }
1986
1987 if (vcpu->guest_debug &
1988 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1989 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1990 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
1991 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
1992 kvm_run->debug.arch.pc =
1993 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1994 kvm_run->debug.arch.exception = DB_VECTOR;
1995 return 0;
1996 }
1997
1998 return 1;
1999 }
2000
bp_interception(struct kvm_vcpu * vcpu)2001 static int bp_interception(struct kvm_vcpu *vcpu)
2002 {
2003 struct vcpu_svm *svm = to_svm(vcpu);
2004 struct kvm_run *kvm_run = vcpu->run;
2005
2006 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2007 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2008 kvm_run->debug.arch.exception = BP_VECTOR;
2009 return 0;
2010 }
2011
/* #UD intercept: delegate entirely to handle_ud() and return its result. */
static int ud_interception(struct kvm_vcpu *vcpu)
{
	int ret = handle_ud(vcpu);

	return ret;
}
2016
ac_interception(struct kvm_vcpu * vcpu)2017 static int ac_interception(struct kvm_vcpu *vcpu)
2018 {
2019 kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2020 return 1;
2021 }
2022
is_erratum_383(void)2023 static bool is_erratum_383(void)
2024 {
2025 int i;
2026 u64 value;
2027
2028 if (!erratum_383_found)
2029 return false;
2030
2031 if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value))
2032 return false;
2033
2034 /* Bit 62 may or may not be set for this mce */
2035 value &= ~(1ULL << 62);
2036
2037 if (value != 0xb600000000010015ULL)
2038 return false;
2039
2040 /* Clear MCi_STATUS registers */
2041 for (i = 0; i < 6; ++i)
2042 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0);
2043
2044 if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) {
2045 value &= ~(1ULL << 2);
2046 native_write_msr_safe(MSR_IA32_MCG_STATUS, value);
2047 }
2048
2049 /* Flush tlb to evict multi-match entries */
2050 __flush_tlb_all();
2051
2052 return true;
2053 }
2054
svm_handle_mce(struct kvm_vcpu * vcpu)2055 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2056 {
2057 if (is_erratum_383()) {
2058 /*
2059 * Erratum 383 triggered. Guest state is corrupt so kill the
2060 * guest.
2061 */
2062 pr_err("Guest triggered AMD Erratum 383\n");
2063
2064 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2065
2066 return;
2067 }
2068
2069 /*
2070 * On an #MC intercept the MCE handler is not called automatically in
2071 * the host. So do it by hand here.
2072 */
2073 kvm_machine_check();
2074 }
2075
/*
 * #MC intercept handler: performs no work of its own.
 *
 * Return: always 1.
 */
static int mc_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2080
shutdown_interception(struct kvm_vcpu * vcpu)2081 static int shutdown_interception(struct kvm_vcpu *vcpu)
2082 {
2083 struct kvm_run *kvm_run = vcpu->run;
2084 struct vcpu_svm *svm = to_svm(vcpu);
2085
2086
2087 /*
2088 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
2089 * the VMCB in a known good state. Unfortuately, KVM doesn't have
2090 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2091 * userspace. At a platform view, INIT is acceptable behavior as
2092 * there exist bare metal platforms that automatically INIT the CPU
2093 * in response to shutdown.
2094 *
2095 * The VM save area for SEV-ES guests has already been encrypted so it
2096 * cannot be reinitialized, i.e. synthesizing INIT is futile.
2097 */
2098 if (!sev_es_guest(vcpu->kvm)) {
2099 clear_page(svm->vmcb);
2100 #ifdef CONFIG_KVM_SMM
2101 if (is_smm(vcpu))
2102 kvm_smm_changed(vcpu, false);
2103 #endif
2104 kvm_vcpu_reset(vcpu, true);
2105 }
2106
2107 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2108 return 0;
2109 }
2110
io_interception(struct kvm_vcpu * vcpu)2111 static int io_interception(struct kvm_vcpu *vcpu)
2112 {
2113 struct vcpu_svm *svm = to_svm(vcpu);
2114 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2115 int size, in, string;
2116 unsigned port;
2117
2118 ++vcpu->stat.io_exits;
2119 string = (io_info & SVM_IOIO_STR_MASK) != 0;
2120 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2121 port = io_info >> 16;
2122 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2123
2124 if (string) {
2125 if (sev_es_guest(vcpu->kvm))
2126 return sev_es_string_io(svm, size, port, in);
2127 else
2128 return kvm_emulate_instruction(vcpu, 0);
2129 }
2130
2131 svm->next_rip = svm->vmcb->control.exit_info_2;
2132
2133 return kvm_fast_pio(vcpu, size, port, in);
2134 }
2135
/*
 * NMI intercept handler: performs no work of its own.
 *
 * Return: always 1.
 */
static int nmi_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2140
/*
 * SMI intercept handler: performs no work of its own.
 *
 * Return: always 1.
 */
static int smi_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2145
intr_interception(struct kvm_vcpu * vcpu)2146 static int intr_interception(struct kvm_vcpu *vcpu)
2147 {
2148 ++vcpu->stat.irq_exits;
2149 return 1;
2150 }
2151
/*
 * Common handler for intercepted VMLOAD (@vmload == true) and VMSAVE
 * (@vmload == false).
 *
 * After the nested permission check, the guest page named by the VMCB's RAX
 * is mapped as "vmcb12".  The instruction is skipped and state is copied
 * with svm_copy_vmloadsave_state() between vmcb12 and vmcb01; the two
 * directions swap the helper's arguments (presumably destination first --
 * confirm against the helper).  On VMLOAD the cached high halves of the
 * SYSENTER EIP/ESP MSRs are also reset to 0.
 *
 * Return: 1 if the permission check or the mapping failed (a #GP(0) is
 * injected when the mapping fails with -EINVAL), otherwise the result of
 * kvm_skip_emulated_instruction().
 */
static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map;
	struct vmcb *vmcb12;
	int ret;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret == -EINVAL)
		kvm_inject_gp(vcpu, 0);
	if (ret)
		return 1;

	vmcb12 = map.hva;

	ret = kvm_skip_emulated_instruction(vcpu);

	/* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */
	if (!vmload) {
		svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr);
	} else {
		svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12);
		svm->sysenter_eip_hi = 0;
		svm->sysenter_esp_hi = 0;
	}

	kvm_vcpu_unmap(vcpu, &map);

	return ret;
}
2186
vmload_interception(struct kvm_vcpu * vcpu)2187 static int vmload_interception(struct kvm_vcpu *vcpu)
2188 {
2189 return vmload_vmsave_interception(vcpu, true);
2190 }
2191
vmsave_interception(struct kvm_vcpu * vcpu)2192 static int vmsave_interception(struct kvm_vcpu *vcpu)
2193 {
2194 return vmload_vmsave_interception(vcpu, false);
2195 }
2196
/*
 * VMRUN intercept.
 *
 * Return: 1 if nested_svm_check_permissions() reports a failure, otherwise
 * the result of nested_svm_vmrun().
 */
static int vmrun_interception(struct kvm_vcpu *vcpu)
{
	return nested_svm_check_permissions(vcpu) ? 1 : nested_svm_vmrun(vcpu);
}
2204
2205 enum {
2206 NONE_SVM_INSTR,
2207 SVM_INSTR_VMRUN,
2208 SVM_INSTR_VMLOAD,
2209 SVM_INSTR_VMSAVE,
2210 };
2211
2212 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
svm_instr_opcode(struct kvm_vcpu * vcpu)2213 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2214 {
2215 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2216
2217 if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2218 return NONE_SVM_INSTR;
2219
2220 switch (ctxt->modrm) {
2221 case 0xd8: /* VMRUN */
2222 return SVM_INSTR_VMRUN;
2223 case 0xda: /* VMLOAD */
2224 return SVM_INSTR_VMLOAD;
2225 case 0xdb: /* VMSAVE */
2226 return SVM_INSTR_VMSAVE;
2227 default:
2228 break;
2229 }
2230
2231 return NONE_SVM_INSTR;
2232 }
2233
emulate_svm_instr(struct kvm_vcpu * vcpu,int opcode)2234 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2235 {
2236 const int guest_mode_exit_codes[] = {
2237 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2238 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2239 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2240 };
2241 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2242 [SVM_INSTR_VMRUN] = vmrun_interception,
2243 [SVM_INSTR_VMLOAD] = vmload_interception,
2244 [SVM_INSTR_VMSAVE] = vmsave_interception,
2245 };
2246 struct vcpu_svm *svm = to_svm(vcpu);
2247 int ret;
2248
2249 if (is_guest_mode(vcpu)) {
2250 /* Returns '1' or -errno on failure, '0' on success. */
2251 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2252 if (ret)
2253 return ret;
2254 return 1;
2255 }
2256 return svm_instr_handlers[opcode](vcpu);
2257 }
2258
2259 /*
2260 * #GP handling code. Note that #GP can be triggered under the following two
2261 * cases:
2262 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2263 * some AMD CPUs when EAX of these instructions are in the reserved memory
2264 * regions (e.g. SMM memory on host).
2265 * 2) VMware backdoor
2266 */
gp_interception(struct kvm_vcpu * vcpu)2267 static int gp_interception(struct kvm_vcpu *vcpu)
2268 {
2269 struct vcpu_svm *svm = to_svm(vcpu);
2270 u32 error_code = svm->vmcb->control.exit_info_1;
2271 int opcode;
2272
2273 /* Both #GP cases have zero error_code */
2274 if (error_code)
2275 goto reinject;
2276
2277 /* Decode the instruction for usage later */
2278 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2279 goto reinject;
2280
2281 opcode = svm_instr_opcode(vcpu);
2282
2283 if (opcode == NONE_SVM_INSTR) {
2284 if (!enable_vmware_backdoor)
2285 goto reinject;
2286
2287 /*
2288 * VMware backdoor emulation on #GP interception only handles
2289 * IN{S}, OUT{S}, and RDPMC.
2290 */
2291 if (!is_guest_mode(vcpu))
2292 return kvm_emulate_instruction(vcpu,
2293 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2294 } else {
2295 /* All SVM instructions expect page aligned RAX */
2296 if (svm->vmcb->save.rax & ~PAGE_MASK)
2297 goto reinject;
2298
2299 return emulate_svm_instr(vcpu, opcode);
2300 }
2301
2302 reinject:
2303 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2304 return 1;
2305 }
2306
/*
 * Set (@value == true) or clear (@value == false) the global interrupt flag
 * of @svm and adjust the related intercepts.
 */
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	if (!value) {
		disable_gif(svm);

		/*
		 * No interrupts should arrive after a CLGI.  With vGIF in
		 * use, though, the VINTR intercept (rather than STGI) is
		 * still what detects an open interrupt window, so keep it.
		 */
		if (!vgif)
			svm_clear_vintr(svm);
		return;
	}

	/*
	 * With VGIF enabled, the STGI intercept only exists to detect the
	 * opening of the SMI/NMI window; drop it now.  Likewise clear the
	 * VINTR intercept; it is set again while processing KVM_REQ_EVENT
	 * if needed.
	 */
	if (vgif)
		svm_clr_intercept(svm, INTERCEPT_STGI);
	if (svm_is_intercept(svm, INTERCEPT_VINTR))
		svm_clear_vintr(svm);

	enable_gif(svm);

	/* Anything pending must be re-evaluated now that GIF is set. */
	if (vcpu->arch.smi_pending ||
	    vcpu->arch.nmi_pending ||
	    kvm_cpu_has_injectable_intr(vcpu) ||
	    kvm_apic_has_pending_init_or_sipi(vcpu))
		kvm_make_request(KVM_REQ_EVENT, vcpu);
}
2339
stgi_interception(struct kvm_vcpu * vcpu)2340 static int stgi_interception(struct kvm_vcpu *vcpu)
2341 {
2342 int ret;
2343
2344 if (nested_svm_check_permissions(vcpu))
2345 return 1;
2346
2347 ret = kvm_skip_emulated_instruction(vcpu);
2348 svm_set_gif(to_svm(vcpu), true);
2349 return ret;
2350 }
2351
clgi_interception(struct kvm_vcpu * vcpu)2352 static int clgi_interception(struct kvm_vcpu *vcpu)
2353 {
2354 int ret;
2355
2356 if (nested_svm_check_permissions(vcpu))
2357 return 1;
2358
2359 ret = kvm_skip_emulated_instruction(vcpu);
2360 svm_set_gif(to_svm(vcpu), false);
2361 return ret;
2362 }
2363
invlpga_interception(struct kvm_vcpu * vcpu)2364 static int invlpga_interception(struct kvm_vcpu *vcpu)
2365 {
2366 gva_t gva = kvm_rax_read(vcpu);
2367 u32 asid = kvm_rcx_read(vcpu);
2368
2369 /* FIXME: Handle an address size prefix. */
2370 if (!is_long_mode(vcpu))
2371 gva = (u32)gva;
2372
2373 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2374
2375 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2376 kvm_mmu_invlpg(vcpu, gva);
2377
2378 return kvm_skip_emulated_instruction(vcpu);
2379 }
2380
skinit_interception(struct kvm_vcpu * vcpu)2381 static int skinit_interception(struct kvm_vcpu *vcpu)
2382 {
2383 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2384
2385 kvm_queue_exception(vcpu, UD_VECTOR);
2386 return 1;
2387 }
2388
task_switch_interception(struct kvm_vcpu * vcpu)2389 static int task_switch_interception(struct kvm_vcpu *vcpu)
2390 {
2391 struct vcpu_svm *svm = to_svm(vcpu);
2392 u16 tss_selector;
2393 int reason;
2394 int int_type = svm->vmcb->control.exit_int_info &
2395 SVM_EXITINTINFO_TYPE_MASK;
2396 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2397 uint32_t type =
2398 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2399 uint32_t idt_v =
2400 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2401 bool has_error_code = false;
2402 u32 error_code = 0;
2403
2404 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2405
2406 if (svm->vmcb->control.exit_info_2 &
2407 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2408 reason = TASK_SWITCH_IRET;
2409 else if (svm->vmcb->control.exit_info_2 &
2410 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2411 reason = TASK_SWITCH_JMP;
2412 else if (idt_v)
2413 reason = TASK_SWITCH_GATE;
2414 else
2415 reason = TASK_SWITCH_CALL;
2416
2417 if (reason == TASK_SWITCH_GATE) {
2418 switch (type) {
2419 case SVM_EXITINTINFO_TYPE_NMI:
2420 vcpu->arch.nmi_injected = false;
2421 break;
2422 case SVM_EXITINTINFO_TYPE_EXEPT:
2423 if (svm->vmcb->control.exit_info_2 &
2424 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2425 has_error_code = true;
2426 error_code =
2427 (u32)svm->vmcb->control.exit_info_2;
2428 }
2429 kvm_clear_exception_queue(vcpu);
2430 break;
2431 case SVM_EXITINTINFO_TYPE_INTR:
2432 case SVM_EXITINTINFO_TYPE_SOFT:
2433 kvm_clear_interrupt_queue(vcpu);
2434 break;
2435 default:
2436 break;
2437 }
2438 }
2439
2440 if (reason != TASK_SWITCH_GATE ||
2441 int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2442 (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2443 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2444 if (!svm_skip_emulated_instruction(vcpu))
2445 return 0;
2446 }
2447
2448 if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2449 int_vec = -1;
2450
2451 return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2452 has_error_code, error_code);
2453 }
2454
svm_clr_iret_intercept(struct vcpu_svm * svm)2455 static void svm_clr_iret_intercept(struct vcpu_svm *svm)
2456 {
2457 if (!sev_es_guest(svm->vcpu.kvm))
2458 svm_clr_intercept(svm, INTERCEPT_IRET);
2459 }
2460
svm_set_iret_intercept(struct vcpu_svm * svm)2461 static void svm_set_iret_intercept(struct vcpu_svm *svm)
2462 {
2463 if (!sev_es_guest(svm->vcpu.kvm))
2464 svm_set_intercept(svm, INTERCEPT_IRET);
2465 }
2466
iret_interception(struct kvm_vcpu * vcpu)2467 static int iret_interception(struct kvm_vcpu *vcpu)
2468 {
2469 struct vcpu_svm *svm = to_svm(vcpu);
2470
2471 WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
2472
2473 ++vcpu->stat.nmi_window_exits;
2474 svm->awaiting_iret_completion = true;
2475
2476 svm_clr_iret_intercept(svm);
2477 svm->nmi_iret_rip = kvm_rip_read(vcpu);
2478
2479 kvm_make_request(KVM_REQ_EVENT, vcpu);
2480 return 1;
2481 }
2482
invlpg_interception(struct kvm_vcpu * vcpu)2483 static int invlpg_interception(struct kvm_vcpu *vcpu)
2484 {
2485 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2486 return kvm_emulate_instruction(vcpu, 0);
2487
2488 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2489 return kvm_skip_emulated_instruction(vcpu);
2490 }
2491
/*
 * Generic intercept handler: emulate the current instruction with an
 * emulation type of 0 and return the emulator's result.
 */
static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
	const int emul_type = 0;

	return kvm_emulate_instruction(vcpu, emul_type);
}
2496
rsm_interception(struct kvm_vcpu * vcpu)2497 static int rsm_interception(struct kvm_vcpu *vcpu)
2498 {
2499 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2500 }
2501
check_selective_cr0_intercepted(struct kvm_vcpu * vcpu,unsigned long val)2502 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2503 unsigned long val)
2504 {
2505 struct vcpu_svm *svm = to_svm(vcpu);
2506 unsigned long cr0 = vcpu->arch.cr0;
2507 bool ret = false;
2508
2509 if (!is_guest_mode(vcpu) ||
2510 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2511 return false;
2512
2513 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2514 val &= ~SVM_CR0_SELECTIVE_MASK;
2515
2516 if (cr0 ^ val) {
2517 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2518 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2519 }
2520
2521 return ret;
2522 }
2523
2524 #define CR_VALID (1ULL << 63)
2525
cr_interception(struct kvm_vcpu * vcpu)2526 static int cr_interception(struct kvm_vcpu *vcpu)
2527 {
2528 struct vcpu_svm *svm = to_svm(vcpu);
2529 int reg, cr;
2530 unsigned long val;
2531 int err;
2532
2533 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2534 return emulate_on_interception(vcpu);
2535
2536 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2537 return emulate_on_interception(vcpu);
2538
2539 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2540 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2541 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2542 else
2543 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2544
2545 err = 0;
2546 if (cr >= 16) { /* mov to cr */
2547 cr -= 16;
2548 val = kvm_register_read(vcpu, reg);
2549 trace_kvm_cr_write(cr, val);
2550 switch (cr) {
2551 case 0:
2552 if (!check_selective_cr0_intercepted(vcpu, val))
2553 err = kvm_set_cr0(vcpu, val);
2554 else
2555 return 1;
2556
2557 break;
2558 case 3:
2559 err = kvm_set_cr3(vcpu, val);
2560 break;
2561 case 4:
2562 err = kvm_set_cr4(vcpu, val);
2563 break;
2564 case 8:
2565 err = kvm_set_cr8(vcpu, val);
2566 break;
2567 default:
2568 WARN(1, "unhandled write to CR%d", cr);
2569 kvm_queue_exception(vcpu, UD_VECTOR);
2570 return 1;
2571 }
2572 } else { /* mov from cr */
2573 switch (cr) {
2574 case 0:
2575 val = kvm_read_cr0(vcpu);
2576 break;
2577 case 2:
2578 val = vcpu->arch.cr2;
2579 break;
2580 case 3:
2581 val = kvm_read_cr3(vcpu);
2582 break;
2583 case 4:
2584 val = kvm_read_cr4(vcpu);
2585 break;
2586 case 8:
2587 val = kvm_get_cr8(vcpu);
2588 break;
2589 default:
2590 WARN(1, "unhandled read from CR%d", cr);
2591 kvm_queue_exception(vcpu, UD_VECTOR);
2592 return 1;
2593 }
2594 kvm_register_write(vcpu, reg, val);
2595 trace_kvm_cr_read(cr, val);
2596 }
2597 return kvm_complete_insn_gp(vcpu, err);
2598 }
2599
cr_trap(struct kvm_vcpu * vcpu)2600 static int cr_trap(struct kvm_vcpu *vcpu)
2601 {
2602 struct vcpu_svm *svm = to_svm(vcpu);
2603 unsigned long old_value, new_value;
2604 unsigned int cr;
2605 int ret = 0;
2606
2607 new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2608
2609 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2610 switch (cr) {
2611 case 0:
2612 old_value = kvm_read_cr0(vcpu);
2613 svm_set_cr0(vcpu, new_value);
2614
2615 kvm_post_set_cr0(vcpu, old_value, new_value);
2616 break;
2617 case 4:
2618 old_value = kvm_read_cr4(vcpu);
2619 svm_set_cr4(vcpu, new_value);
2620
2621 kvm_post_set_cr4(vcpu, old_value, new_value);
2622 break;
2623 case 8:
2624 ret = kvm_set_cr8(vcpu, new_value);
2625 break;
2626 default:
2627 WARN(1, "unhandled CR%d write trap", cr);
2628 kvm_queue_exception(vcpu, UD_VECTOR);
2629 return 1;
2630 }
2631
2632 return kvm_complete_insn_gp(vcpu, ret);
2633 }
2634
dr_interception(struct kvm_vcpu * vcpu)2635 static int dr_interception(struct kvm_vcpu *vcpu)
2636 {
2637 struct vcpu_svm *svm = to_svm(vcpu);
2638 int reg, dr;
2639 int err = 0;
2640
2641 /*
2642 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
2643 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
2644 */
2645 if (sev_es_guest(vcpu->kvm))
2646 return 1;
2647
2648 if (vcpu->guest_debug == 0) {
2649 /*
2650 * No more DR vmexits; force a reload of the debug registers
2651 * and reenter on this instruction. The next vmexit will
2652 * retrieve the full state of the debug registers.
2653 */
2654 clr_dr_intercepts(svm);
2655 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2656 return 1;
2657 }
2658
2659 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2660 return emulate_on_interception(vcpu);
2661
2662 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2663 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2664 if (dr >= 16) { /* mov to DRn */
2665 dr -= 16;
2666 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
2667 } else {
2668 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
2669 }
2670
2671 return kvm_complete_insn_gp(vcpu, err);
2672 }
2673
cr8_write_interception(struct kvm_vcpu * vcpu)2674 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2675 {
2676 u8 cr8_prev = kvm_get_cr8(vcpu);
2677 int r;
2678
2679 WARN_ON_ONCE(kvm_vcpu_apicv_active(vcpu));
2680
2681 /* instruction emulation calls kvm_set_cr8() */
2682 r = cr_interception(vcpu);
2683 if (lapic_in_kernel(vcpu))
2684 return r;
2685 if (cr8_prev <= kvm_get_cr8(vcpu))
2686 return r;
2687 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2688 return 0;
2689 }
2690
efer_trap(struct kvm_vcpu * vcpu)2691 static int efer_trap(struct kvm_vcpu *vcpu)
2692 {
2693 struct msr_data msr_info;
2694 int ret;
2695
2696 /*
2697 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2698 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2699 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2700 * the guest doesn't have X86_FEATURE_SVM.
2701 */
2702 msr_info.host_initiated = false;
2703 msr_info.index = MSR_EFER;
2704 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2705 ret = kvm_set_msr_common(vcpu, &msr_info);
2706
2707 return kvm_complete_insn_gp(vcpu, ret);
2708 }
2709
/*
 * Report the value of a feature MSR.
 *
 * @msr:  MSR index being queried.
 * @data: output; always zeroed first.
 *
 * Only MSR_AMD64_DE_CFG is handled: its LFENCE_SERIALIZE bit is reported
 * when X86_FEATURE_LFENCE_RDTSC is enabled.
 *
 * Return: 0 for a handled MSR, KVM_MSR_RET_UNSUPPORTED otherwise.
 */
static int svm_get_feature_msr(u32 msr, u64 *data)
{
	*data = 0;

	if (msr != MSR_AMD64_DE_CFG)
		return KVM_MSR_RET_UNSUPPORTED;

	if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
		*data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;

	return 0;
}
2725
sev_es_prevent_msr_access(struct kvm_vcpu * vcpu,struct msr_data * msr_info)2726 static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
2727 struct msr_data *msr_info)
2728 {
2729 return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected &&
2730 msr_info->index != MSR_IA32_XSS &&
2731 !msr_write_intercepted(vcpu, msr_info->index);
2732 }
2733
svm_get_msr(struct kvm_vcpu * vcpu,struct msr_data * msr_info)2734 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2735 {
2736 struct vcpu_svm *svm = to_svm(vcpu);
2737
2738 if (sev_es_prevent_msr_access(vcpu, msr_info)) {
2739 msr_info->data = 0;
2740 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
2741 }
2742
2743 switch (msr_info->index) {
2744 case MSR_AMD64_TSC_RATIO:
2745 if (!msr_info->host_initiated &&
2746 !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
2747 return 1;
2748 msr_info->data = svm->tsc_ratio_msr;
2749 break;
2750 case MSR_STAR:
2751 msr_info->data = svm->vmcb01.ptr->save.star;
2752 break;
2753 #ifdef CONFIG_X86_64
2754 case MSR_LSTAR:
2755 msr_info->data = svm->vmcb01.ptr->save.lstar;
2756 break;
2757 case MSR_CSTAR:
2758 msr_info->data = svm->vmcb01.ptr->save.cstar;
2759 break;
2760 case MSR_GS_BASE:
2761 msr_info->data = svm->vmcb01.ptr->save.gs.base;
2762 break;
2763 case MSR_FS_BASE:
2764 msr_info->data = svm->vmcb01.ptr->save.fs.base;
2765 break;
2766 case MSR_KERNEL_GS_BASE:
2767 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2768 break;
2769 case MSR_SYSCALL_MASK:
2770 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2771 break;
2772 #endif
2773 case MSR_IA32_SYSENTER_CS:
2774 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2775 break;
2776 case MSR_IA32_SYSENTER_EIP:
2777 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2778 if (guest_cpuid_is_intel_compatible(vcpu))
2779 msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2780 break;
2781 case MSR_IA32_SYSENTER_ESP:
2782 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2783 if (guest_cpuid_is_intel_compatible(vcpu))
2784 msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2785 break;
2786 case MSR_IA32_S_CET:
2787 msr_info->data = svm->vmcb->save.s_cet;
2788 break;
2789 case MSR_IA32_INT_SSP_TAB:
2790 msr_info->data = svm->vmcb->save.isst_addr;
2791 break;
2792 case MSR_KVM_INTERNAL_GUEST_SSP:
2793 msr_info->data = svm->vmcb->save.ssp;
2794 break;
2795 case MSR_TSC_AUX:
2796 msr_info->data = svm->tsc_aux;
2797 break;
2798 case MSR_IA32_DEBUGCTLMSR:
2799 msr_info->data = svm->vmcb->save.dbgctl;
2800 break;
2801 case MSR_IA32_LASTBRANCHFROMIP:
2802 msr_info->data = svm->vmcb->save.br_from;
2803 break;
2804 case MSR_IA32_LASTBRANCHTOIP:
2805 msr_info->data = svm->vmcb->save.br_to;
2806 break;
2807 case MSR_IA32_LASTINTFROMIP:
2808 msr_info->data = svm->vmcb->save.last_excp_from;
2809 break;
2810 case MSR_IA32_LASTINTTOIP:
2811 msr_info->data = svm->vmcb->save.last_excp_to;
2812 break;
2813 case MSR_VM_HSAVE_PA:
2814 msr_info->data = svm->nested.hsave_msr;
2815 break;
2816 case MSR_VM_CR:
2817 msr_info->data = svm->nested.vm_cr_msr;
2818 break;
2819 case MSR_IA32_SPEC_CTRL:
2820 if (!msr_info->host_initiated &&
2821 !guest_has_spec_ctrl_msr(vcpu))
2822 return 1;
2823
2824 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2825 msr_info->data = svm->vmcb->save.spec_ctrl;
2826 else
2827 msr_info->data = svm->spec_ctrl;
2828 break;
2829 case MSR_AMD64_VIRT_SPEC_CTRL:
2830 if (!msr_info->host_initiated &&
2831 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
2832 return 1;
2833
2834 msr_info->data = svm->virt_spec_ctrl;
2835 break;
2836 case MSR_F15H_IC_CFG: {
2837
2838 int family, model;
2839
2840 family = guest_cpuid_family(vcpu);
2841 model = guest_cpuid_model(vcpu);
2842
2843 if (family < 0 || model < 0)
2844 return kvm_get_msr_common(vcpu, msr_info);
2845
2846 msr_info->data = 0;
2847
2848 if (family == 0x15 &&
2849 (model >= 0x2 && model < 0x20))
2850 msr_info->data = 0x1E;
2851 }
2852 break;
2853 case MSR_AMD64_DE_CFG:
2854 msr_info->data = svm->msr_decfg;
2855 break;
2856 default:
2857 return kvm_get_msr_common(vcpu, msr_info);
2858 }
2859 return 0;
2860 }
2861
svm_complete_emulated_msr(struct kvm_vcpu * vcpu,int err)2862 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2863 {
2864 struct vcpu_svm *svm = to_svm(vcpu);
2865 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2866 return kvm_complete_insn_gp(vcpu, err);
2867
2868 svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
2869 return 1;
2870 }
2871
/*
 * Write the emulated VM_CR MSR.
 *
 * Bits outside SVM_VM_CR_VALID_MASK are rejected.  Once SVM_DIS is set in
 * the stored value, the SVM_LOCK and SVM_DIS bits can no longer be changed;
 * all other valid bits are taken from @data.
 *
 * Return: 0 on success; 1 if @data has invalid bits, or if SVM_DIS ends up
 * set while EFER.SVME is set (the stored value has already been updated by
 * then).
 */
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int chg_mask = SVM_VM_CR_VALID_MASK;
	int svm_dis;

	if (data & ~SVM_VM_CR_VALID_MASK)
		return 1;

	/* With SVM_DIS already set, LOCK and DIS become read-only. */
	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);

	svm->nested.vm_cr_msr &= ~chg_mask;
	svm->nested.vm_cr_msr |= (data & chg_mask);

	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;

	/* check for svm_disable while efer.svme is set */
	return (svm_dis && (vcpu->arch.efer & EFER_SVME)) ? 1 : 0;
}
2896
svm_set_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)2897 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2898 {
2899 struct vcpu_svm *svm = to_svm(vcpu);
2900 int ret = 0;
2901
2902 u32 ecx = msr->index;
2903 u64 data = msr->data;
2904
2905 if (sev_es_prevent_msr_access(vcpu, msr))
2906 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
2907
2908 switch (ecx) {
2909 case MSR_AMD64_TSC_RATIO:
2910
2911 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {
2912
2913 if (!msr->host_initiated)
2914 return 1;
2915 /*
2916 * In case TSC scaling is not enabled, always
2917 * leave this MSR at the default value.
2918 *
2919 * Due to bug in qemu 6.2.0, it would try to set
2920 * this msr to 0 if tsc scaling is not enabled.
2921 * Ignore this value as well.
2922 */
2923 if (data != 0 && data != svm->tsc_ratio_msr)
2924 return 1;
2925 break;
2926 }
2927
2928 if (data & SVM_TSC_RATIO_RSVD)
2929 return 1;
2930
2931 svm->tsc_ratio_msr = data;
2932
2933 if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
2934 is_guest_mode(vcpu))
2935 nested_svm_update_tsc_ratio_msr(vcpu);
2936
2937 break;
2938 case MSR_IA32_CR_PAT:
2939 ret = kvm_set_msr_common(vcpu, msr);
2940 if (ret)
2941 break;
2942
2943 svm->vmcb01.ptr->save.g_pat = data;
2944 if (is_guest_mode(vcpu))
2945 nested_vmcb02_compute_g_pat(svm);
2946 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2947 break;
2948 case MSR_IA32_SPEC_CTRL:
2949 if (!msr->host_initiated &&
2950 !guest_has_spec_ctrl_msr(vcpu))
2951 return 1;
2952
2953 if (kvm_spec_ctrl_test_value(data))
2954 return 1;
2955
2956 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2957 svm->vmcb->save.spec_ctrl = data;
2958 else
2959 svm->spec_ctrl = data;
2960 if (!data)
2961 break;
2962
2963 /*
2964 * For non-nested:
2965 * When it's written (to non-zero) for the first time, pass
2966 * it through.
2967 *
2968 * For nested:
2969 * The handling of the MSR bitmap for L2 guests is done in
2970 * nested_svm_merge_msrpm().
2971 * We update the L1 MSR bit as well since it will end up
2972 * touching the MSR anyway now.
2973 */
2974 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
2975 break;
2976 case MSR_AMD64_VIRT_SPEC_CTRL:
2977 if (!msr->host_initiated &&
2978 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
2979 return 1;
2980
2981 if (data & ~SPEC_CTRL_SSBD)
2982 return 1;
2983
2984 svm->virt_spec_ctrl = data;
2985 break;
2986 case MSR_STAR:
2987 svm->vmcb01.ptr->save.star = data;
2988 break;
2989 #ifdef CONFIG_X86_64
2990 case MSR_LSTAR:
2991 svm->vmcb01.ptr->save.lstar = data;
2992 break;
2993 case MSR_CSTAR:
2994 svm->vmcb01.ptr->save.cstar = data;
2995 break;
2996 case MSR_GS_BASE:
2997 svm->vmcb01.ptr->save.gs.base = data;
2998 break;
2999 case MSR_FS_BASE:
3000 svm->vmcb01.ptr->save.fs.base = data;
3001 break;
3002 case MSR_KERNEL_GS_BASE:
3003 svm->vmcb01.ptr->save.kernel_gs_base = data;
3004 break;
3005 case MSR_SYSCALL_MASK:
3006 svm->vmcb01.ptr->save.sfmask = data;
3007 break;
3008 #endif
3009 case MSR_IA32_SYSENTER_CS:
3010 svm->vmcb01.ptr->save.sysenter_cs = data;
3011 break;
3012 case MSR_IA32_SYSENTER_EIP:
3013 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3014 /*
3015 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
3016 * when we spoof an Intel vendor ID (for cross vendor migration).
3017 * In this case we use this intercept to track the high
3018 * 32 bit part of these msrs to support Intel's
3019 * implementation of SYSENTER/SYSEXIT.
3020 */
3021 svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
3022 break;
3023 case MSR_IA32_SYSENTER_ESP:
3024 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3025 svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
3026 break;
3027 case MSR_IA32_S_CET:
3028 svm->vmcb->save.s_cet = data;
3029 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
3030 break;
3031 case MSR_IA32_INT_SSP_TAB:
3032 svm->vmcb->save.isst_addr = data;
3033 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
3034 break;
3035 case MSR_KVM_INTERNAL_GUEST_SSP:
3036 svm->vmcb->save.ssp = data;
3037 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
3038 break;
3039 case MSR_TSC_AUX:
3040 /*
3041 * TSC_AUX is always virtualized for SEV-ES guests when the
3042 * feature is available. The user return MSR support is not
3043 * required in this case because TSC_AUX is restored on #VMEXIT
3044 * from the host save area.
3045 */
3046 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
3047 break;
3048
3049 /*
3050 * TSC_AUX is usually changed only during boot and never read
3051 * directly. Intercept TSC_AUX and switch it via user return.
3052 */
3053 preempt_disable();
3054 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3055 preempt_enable();
3056 if (ret)
3057 break;
3058
3059 svm->tsc_aux = data;
3060 break;
3061 case MSR_IA32_DEBUGCTLMSR:
3062 if (!lbrv) {
3063 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3064 break;
3065 }
3066
3067 /*
3068 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
3069 * way to communicate lack of support to the guest.
3070 */
3071 if (data & DEBUGCTLMSR_BTF) {
3072 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
3073 data &= ~DEBUGCTLMSR_BTF;
3074 }
3075
3076 if (data & DEBUGCTL_RESERVED_BITS)
3077 return 1;
3078
3079 if (svm->vmcb->save.dbgctl == data)
3080 break;
3081
3082 svm->vmcb->save.dbgctl = data;
3083 vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
3084 svm_update_lbrv(vcpu);
3085 break;
3086 case MSR_VM_HSAVE_PA:
3087 /*
3088 * Old kernels did not validate the value written to
3089 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
3090 * value to allow live migrating buggy or malicious guests
3091 * originating from those kernels.
3092 */
3093 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3094 return 1;
3095
3096 svm->nested.hsave_msr = data & PAGE_MASK;
3097 break;
3098 case MSR_VM_CR:
3099 return svm_set_vm_cr(vcpu, data);
3100 case MSR_VM_IGNNE:
3101 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3102 break;
3103 case MSR_AMD64_DE_CFG: {
3104 u64 supported_de_cfg;
3105
3106 if (svm_get_feature_msr(ecx, &supported_de_cfg))
3107 return 1;
3108
3109 if (data & ~supported_de_cfg)
3110 return 1;
3111
3112 svm->msr_decfg = data;
3113 break;
3114 }
3115 default:
3116 return kvm_set_msr_common(vcpu, msr);
3117 }
3118 return ret;
3119 }
3120
msr_interception(struct kvm_vcpu * vcpu)3121 static int msr_interception(struct kvm_vcpu *vcpu)
3122 {
3123 if (to_svm(vcpu)->vmcb->control.exit_info_1)
3124 return kvm_emulate_wrmsr(vcpu);
3125 else
3126 return kvm_emulate_rdmsr(vcpu);
3127 }
3128
interrupt_window_interception(struct kvm_vcpu * vcpu)3129 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3130 {
3131 kvm_make_request(KVM_REQ_EVENT, vcpu);
3132 svm_clear_vintr(to_svm(vcpu));
3133
3134 /*
3135 * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3136 * In this case AVIC was temporarily disabled for
3137 * requesting the IRQ window and we have to re-enable it.
3138 *
3139 * If running nested, still remove the VM wide AVIC inhibit to
3140 * support case in which the interrupt window was requested when the
3141 * vCPU was not running nested.
3142
3143 * All vCPUs which run still run nested, will remain to have their
3144 * AVIC still inhibited due to per-cpu AVIC inhibition.
3145 */
3146 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3147
3148 ++vcpu->stat.irq_window_exits;
3149 return 1;
3150 }
3151
pause_interception(struct kvm_vcpu * vcpu)3152 static int pause_interception(struct kvm_vcpu *vcpu)
3153 {
3154 bool in_kernel;
3155 /*
3156 * CPL is not made available for an SEV-ES guest, therefore
3157 * vcpu->arch.preempted_in_kernel can never be true. Just
3158 * set in_kernel to false as well.
3159 */
3160 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3161
3162 grow_ple_window(vcpu);
3163
3164 kvm_vcpu_on_spin(vcpu, in_kernel);
3165 return kvm_skip_emulated_instruction(vcpu);
3166 }
3167
invpcid_interception(struct kvm_vcpu * vcpu)3168 static int invpcid_interception(struct kvm_vcpu *vcpu)
3169 {
3170 struct vcpu_svm *svm = to_svm(vcpu);
3171 unsigned long type;
3172 gva_t gva;
3173
3174 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
3175 kvm_queue_exception(vcpu, UD_VECTOR);
3176 return 1;
3177 }
3178
3179 /*
3180 * For an INVPCID intercept:
3181 * EXITINFO1 provides the linear address of the memory operand.
3182 * EXITINFO2 provides the contents of the register operand.
3183 */
3184 type = svm->vmcb->control.exit_info_2;
3185 gva = svm->vmcb->control.exit_info_1;
3186
3187 /*
3188 * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the
3189 * stack segment is used. The intercept takes priority over all
3190 * #GP checks except CPL>0, but somehow still generates a linear
3191 * address? The APM is sorely lacking.
3192 */
3193 if (is_noncanonical_address(gva, vcpu, 0)) {
3194 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3195 return 1;
3196 }
3197
3198 return kvm_handle_invpcid(vcpu, type, gva);
3199 }
3200
complete_userspace_buslock(struct kvm_vcpu * vcpu)3201 static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
3202 {
3203 struct vcpu_svm *svm = to_svm(vcpu);
3204
3205 /*
3206 * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
3207 * execute the bus-locking instruction. Set the bus lock counter to '1'
3208 * to effectively step past the bus lock.
3209 */
3210 if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
3211 svm->vmcb->control.bus_lock_counter = 1;
3212
3213 return 1;
3214 }
3215
bus_lock_exit(struct kvm_vcpu * vcpu)3216 static int bus_lock_exit(struct kvm_vcpu *vcpu)
3217 {
3218 struct vcpu_svm *svm = to_svm(vcpu);
3219
3220 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
3221 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
3222
3223 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
3224 vcpu->arch.complete_userspace_io = complete_userspace_buslock;
3225
3226 if (is_guest_mode(vcpu))
3227 svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
3228
3229 return 0;
3230 }
3231
/*
 * Exit handler dispatch table, indexed by SVM exit code.  Exit codes that
 * have no entry here are left NULL; svm_invoke_exit_handler() treats a NULL
 * slot, or an exit code beyond the end of the table, as an unexpected exit.
 */
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	/* Control register accesses. */
	[SVM_EXIT_READ_CR0] = cr_interception,
	[SVM_EXIT_READ_CR3] = cr_interception,
	[SVM_EXIT_READ_CR4] = cr_interception,
	[SVM_EXIT_READ_CR8] = cr_interception,
	[SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
	[SVM_EXIT_WRITE_CR0] = cr_interception,
	[SVM_EXIT_WRITE_CR3] = cr_interception,
	[SVM_EXIT_WRITE_CR4] = cr_interception,
	[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
	/* Debug register accesses. */
	[SVM_EXIT_READ_DR0] = dr_interception,
	[SVM_EXIT_READ_DR1] = dr_interception,
	[SVM_EXIT_READ_DR2] = dr_interception,
	[SVM_EXIT_READ_DR3] = dr_interception,
	[SVM_EXIT_READ_DR4] = dr_interception,
	[SVM_EXIT_READ_DR5] = dr_interception,
	[SVM_EXIT_READ_DR6] = dr_interception,
	[SVM_EXIT_READ_DR7] = dr_interception,
	[SVM_EXIT_WRITE_DR0] = dr_interception,
	[SVM_EXIT_WRITE_DR1] = dr_interception,
	[SVM_EXIT_WRITE_DR2] = dr_interception,
	[SVM_EXIT_WRITE_DR3] = dr_interception,
	[SVM_EXIT_WRITE_DR4] = dr_interception,
	[SVM_EXIT_WRITE_DR5] = dr_interception,
	[SVM_EXIT_WRITE_DR6] = dr_interception,
	[SVM_EXIT_WRITE_DR7] = dr_interception,
	/* Intercepted exceptions, indexed relative to SVM_EXIT_EXCP_BASE. */
	[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
	[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
	[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
	[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
	[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
	[SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
	[SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
	/* Interrupts and intercepted instructions. */
	[SVM_EXIT_INTR] = intr_interception,
	[SVM_EXIT_NMI] = nmi_interception,
	[SVM_EXIT_SMI] = smi_interception,
	[SVM_EXIT_VINTR] = interrupt_window_interception,
	[SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
	[SVM_EXIT_CPUID] = kvm_emulate_cpuid,
	[SVM_EXIT_IRET] = iret_interception,
	[SVM_EXIT_INVD] = kvm_emulate_invd,
	[SVM_EXIT_PAUSE] = pause_interception,
	[SVM_EXIT_HLT] = kvm_emulate_halt,
	[SVM_EXIT_INVLPG] = invlpg_interception,
	[SVM_EXIT_INVLPGA] = invlpga_interception,
	[SVM_EXIT_IOIO] = io_interception,
	[SVM_EXIT_MSR] = msr_interception,
	[SVM_EXIT_TASK_SWITCH] = task_switch_interception,
	[SVM_EXIT_SHUTDOWN] = shutdown_interception,
	[SVM_EXIT_VMRUN] = vmrun_interception,
	[SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
	[SVM_EXIT_VMLOAD] = vmload_interception,
	[SVM_EXIT_VMSAVE] = vmsave_interception,
	[SVM_EXIT_STGI] = stgi_interception,
	[SVM_EXIT_CLGI] = clgi_interception,
	[SVM_EXIT_SKINIT] = skinit_interception,
	[SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
	[SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
	[SVM_EXIT_MONITOR] = kvm_emulate_monitor,
	[SVM_EXIT_MWAIT] = kvm_emulate_mwait,
	[SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
	[SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
	[SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
	[SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
	[SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
	[SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
	[SVM_EXIT_INVPCID] = invpcid_interception,
	/* Shares the handler used for SVM_EXIT_HLT. */
	[SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
	[SVM_EXIT_NPF] = npf_interception,
	[SVM_EXIT_BUS_LOCK] = bus_lock_exit,
	[SVM_EXIT_RSM] = rsm_interception,
	[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
	/* The VMGEXIT handler is only present when SEV support is built in. */
#ifdef CONFIG_KVM_AMD_SEV
	[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
#endif
};
3309
dump_vmcb(struct kvm_vcpu * vcpu)3310 static void dump_vmcb(struct kvm_vcpu *vcpu)
3311 {
3312 struct vcpu_svm *svm = to_svm(vcpu);
3313 struct vmcb_control_area *control = &svm->vmcb->control;
3314 struct vmcb_save_area *save = &svm->vmcb->save;
3315 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3316 char *vm_type;
3317
3318 if (!dump_invalid_vmcb) {
3319 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3320 return;
3321 }
3322
3323 guard(mutex)(&vmcb_dump_mutex);
3324
3325 vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
3326 sev_es_guest(vcpu->kvm) ? "SEV-ES" :
3327 sev_guest(vcpu->kvm) ? "SEV" : "SVM";
3328
3329 pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
3330 vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3331 pr_err("VMCB Control Area:\n");
3332 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3333 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3334 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3335 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3336 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3337 pr_err("%-20s%08x %08x\n", "intercepts:",
3338 control->intercepts[INTERCEPT_WORD3],
3339 control->intercepts[INTERCEPT_WORD4]);
3340 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3341 pr_err("%-20s%d\n", "pause filter threshold:",
3342 control->pause_filter_thresh);
3343 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3344 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3345 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3346 pr_err("%-20s%d\n", "asid:", control->asid);
3347 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3348 pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
3349 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3350 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3351 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3352 pr_err("%-20s%016llx\n", "exit_code:", control->exit_code);
3353 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3354 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3355 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3356 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3357 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3358 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3359 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3360 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3361 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3362 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3363 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3364 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3365 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3366 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3367 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3368 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3369 pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
3370 pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
3371
3372 if (sev_es_guest(vcpu->kvm)) {
3373 save = sev_decrypt_vmsa(vcpu);
3374 if (!save)
3375 goto no_vmsa;
3376
3377 save01 = save;
3378 }
3379
3380 pr_err("VMCB State Save Area:\n");
3381 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3382 "es:",
3383 save->es.selector, save->es.attrib,
3384 save->es.limit, save->es.base);
3385 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3386 "cs:",
3387 save->cs.selector, save->cs.attrib,
3388 save->cs.limit, save->cs.base);
3389 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3390 "ss:",
3391 save->ss.selector, save->ss.attrib,
3392 save->ss.limit, save->ss.base);
3393 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3394 "ds:",
3395 save->ds.selector, save->ds.attrib,
3396 save->ds.limit, save->ds.base);
3397 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3398 "fs:",
3399 save01->fs.selector, save01->fs.attrib,
3400 save01->fs.limit, save01->fs.base);
3401 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3402 "gs:",
3403 save01->gs.selector, save01->gs.attrib,
3404 save01->gs.limit, save01->gs.base);
3405 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3406 "gdtr:",
3407 save->gdtr.selector, save->gdtr.attrib,
3408 save->gdtr.limit, save->gdtr.base);
3409 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3410 "ldtr:",
3411 save01->ldtr.selector, save01->ldtr.attrib,
3412 save01->ldtr.limit, save01->ldtr.base);
3413 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3414 "idtr:",
3415 save->idtr.selector, save->idtr.attrib,
3416 save->idtr.limit, save->idtr.base);
3417 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3418 "tr:",
3419 save01->tr.selector, save01->tr.attrib,
3420 save01->tr.limit, save01->tr.base);
3421 pr_err("vmpl: %d cpl: %d efer: %016llx\n",
3422 save->vmpl, save->cpl, save->efer);
3423 pr_err("%-15s %016llx %-13s %016llx\n",
3424 "cr0:", save->cr0, "cr2:", save->cr2);
3425 pr_err("%-15s %016llx %-13s %016llx\n",
3426 "cr3:", save->cr3, "cr4:", save->cr4);
3427 pr_err("%-15s %016llx %-13s %016llx\n",
3428 "dr6:", save->dr6, "dr7:", save->dr7);
3429 pr_err("%-15s %016llx %-13s %016llx\n",
3430 "rip:", save->rip, "rflags:", save->rflags);
3431 pr_err("%-15s %016llx %-13s %016llx\n",
3432 "rsp:", save->rsp, "rax:", save->rax);
3433 pr_err("%-15s %016llx %-13s %016llx\n",
3434 "s_cet:", save->s_cet, "ssp:", save->ssp);
3435 pr_err("%-15s %016llx\n",
3436 "isst_addr:", save->isst_addr);
3437 pr_err("%-15s %016llx %-13s %016llx\n",
3438 "star:", save01->star, "lstar:", save01->lstar);
3439 pr_err("%-15s %016llx %-13s %016llx\n",
3440 "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3441 pr_err("%-15s %016llx %-13s %016llx\n",
3442 "kernel_gs_base:", save01->kernel_gs_base,
3443 "sysenter_cs:", save01->sysenter_cs);
3444 pr_err("%-15s %016llx %-13s %016llx\n",
3445 "sysenter_esp:", save01->sysenter_esp,
3446 "sysenter_eip:", save01->sysenter_eip);
3447 pr_err("%-15s %016llx %-13s %016llx\n",
3448 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3449 pr_err("%-15s %016llx %-13s %016llx\n",
3450 "br_from:", save->br_from, "br_to:", save->br_to);
3451 pr_err("%-15s %016llx %-13s %016llx\n",
3452 "excp_from:", save->last_excp_from,
3453 "excp_to:", save->last_excp_to);
3454
3455 if (sev_es_guest(vcpu->kvm)) {
3456 struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
3457
3458 pr_err("%-15s %016llx\n",
3459 "sev_features", vmsa->sev_features);
3460
3461 pr_err("%-15s %016llx %-13s %016llx\n",
3462 "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp);
3463 pr_err("%-15s %016llx %-13s %016llx\n",
3464 "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp);
3465 pr_err("%-15s %016llx\n",
3466 "u_cet:", vmsa->u_cet);
3467
3468 pr_err("%-15s %016llx %-13s %016llx\n",
3469 "rax:", vmsa->rax, "rbx:", vmsa->rbx);
3470 pr_err("%-15s %016llx %-13s %016llx\n",
3471 "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
3472 pr_err("%-15s %016llx %-13s %016llx\n",
3473 "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
3474 pr_err("%-15s %016llx %-13s %016llx\n",
3475 "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
3476 pr_err("%-15s %016llx %-13s %016llx\n",
3477 "r8:", vmsa->r8, "r9:", vmsa->r9);
3478 pr_err("%-15s %016llx %-13s %016llx\n",
3479 "r10:", vmsa->r10, "r11:", vmsa->r11);
3480 pr_err("%-15s %016llx %-13s %016llx\n",
3481 "r12:", vmsa->r12, "r13:", vmsa->r13);
3482 pr_err("%-15s %016llx %-13s %016llx\n",
3483 "r14:", vmsa->r14, "r15:", vmsa->r15);
3484 pr_err("%-15s %016llx %-13s %016llx\n",
3485 "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
3486 } else {
3487 pr_err("%-15s %016llx %-13s %016lx\n",
3488 "rax:", save->rax, "rbx:",
3489 vcpu->arch.regs[VCPU_REGS_RBX]);
3490 pr_err("%-15s %016lx %-13s %016lx\n",
3491 "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
3492 "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
3493 pr_err("%-15s %016lx %-13s %016lx\n",
3494 "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
3495 "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
3496 pr_err("%-15s %016lx %-13s %016llx\n",
3497 "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
3498 "rsp:", save->rsp);
3499 #ifdef CONFIG_X86_64
3500 pr_err("%-15s %016lx %-13s %016lx\n",
3501 "r8:", vcpu->arch.regs[VCPU_REGS_R8],
3502 "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
3503 pr_err("%-15s %016lx %-13s %016lx\n",
3504 "r10:", vcpu->arch.regs[VCPU_REGS_R10],
3505 "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
3506 pr_err("%-15s %016lx %-13s %016lx\n",
3507 "r12:", vcpu->arch.regs[VCPU_REGS_R12],
3508 "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
3509 pr_err("%-15s %016lx %-13s %016lx\n",
3510 "r14:", vcpu->arch.regs[VCPU_REGS_R14],
3511 "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
3512 #endif
3513 }
3514
3515 no_vmsa:
3516 if (sev_es_guest(vcpu->kvm))
3517 sev_free_decrypted_vmsa(vcpu, save);
3518 }
3519
/*
 * Dispatch a #VMEXIT to its handler.
 *
 * Returns the handler's return value.  If no handler can be found for
 * @__exit_code, the VMCB is dumped, an "unexpected exit reason" exit is
 * prepared and 0 is returned.
 */
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
{
	u32 exit_code = __exit_code;

	/*
	 * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN
	 * failed.  Report all such errors to userspace (note, VMEXIT_INVALID,
	 * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()).  Skip
	 * the check when running as a VM, as KVM has historically left garbage
	 * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if
	 * the underlying kernel is buggy.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) &&
	    (u64)exit_code != __exit_code)
		goto unexpected_vmexit;

#ifdef CONFIG_MITIGATION_RETPOLINE
	/*
	 * With retpolines enabled, call the handlers for these exit codes
	 * directly instead of going through the table (presumably to avoid
	 * the cost of an indirect call on frequent exits).
	 */
	if (exit_code == SVM_EXIT_MSR)
		return msr_interception(vcpu);
	if (exit_code == SVM_EXIT_VINTR)
		return interrupt_window_interception(vcpu);
	if (exit_code == SVM_EXIT_INTR)
		return intr_interception(vcpu);
	if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT)
		return kvm_emulate_halt(vcpu);
	if (exit_code == SVM_EXIT_NPF)
		return npf_interception(vcpu);
#ifdef CONFIG_KVM_AMD_SEV
	if (exit_code == SVM_EXIT_VMGEXIT)
		return sev_handle_vmgexit(vcpu);
#endif
#endif
	if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
		goto unexpected_vmexit;

	/* Clamp the index under speculation before using it on the table. */
	exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers));
	if (svm_exit_handlers[exit_code])
		return svm_exit_handlers[exit_code](vcpu);

unexpected_vmexit:
	dump_vmcb(vcpu);
	kvm_prepare_unexpected_reason_exit(vcpu, __exit_code);
	return 0;
}
3566
/*
 * Copy the exit information of the current VMCB to the caller.
 *
 * @reason:     receives the exit code (narrowed to 32 bits by the assignment).
 * @info1:      receives EXITINFO1.
 * @info2:      receives EXITINFO2.
 * @intr_info:  receives EXITINTINFO.
 * @error_code: receives the EXITINTINFO error code if EXITINTINFO is valid
 *              and flagged as carrying an error code, otherwise 0.
 */
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
			      u64 *info1, u64 *info2,
			      u32 *intr_info, u32 *error_code)
{
	const u32 has_err = SVM_EXITINTINFO_VALID | SVM_EXITINTINFO_VALID_ERR;
	struct vmcb_control_area *ctl = &to_svm(vcpu)->vmcb->control;
	u32 int_info = ctl->exit_int_info;

	*reason = ctl->exit_code;
	*info1 = ctl->exit_info_1;
	*info2 = ctl->exit_info_2;
	*intr_info = int_info;

	/* Both flags must be set for the error code to be reported. */
	*error_code = ((int_info & has_err) == has_err) ?
		      ctl->exit_int_info_err : 0;
}
3583
/*
 * Report the contents of the current VMCB's event injection field.
 *
 * @intr_info:  receives the raw event_inj value.
 * @error_code: receives event_inj_err if event_inj is valid and flagged as
 *              carrying an error code, otherwise 0.
 */
static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
			       u32 *error_code)
{
	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

	*intr_info = control->event_inj;

	/*
	 * Decode event_inj with the SVM_EVTINJ_* flags, i.e. the same family
	 * used when the field is written, rather than with the
	 * SVM_EXITINTINFO_* flags that describe exit_int_info.
	 */
	if ((*intr_info & SVM_EVTINJ_VALID) &&
	    (*intr_info & SVM_EVTINJ_VALID_ERR))
		*error_code = control->event_inj_err;
	else
		*error_code = 0;
}
3598
/*
 * Top-level #VMEXIT handling.
 *
 * Returns 1 when the exit was consumed by nested handling or by the fastpath
 * (@exit_fastpath is not EXIT_FASTPATH_NONE), 0 after reporting a failed
 * VMRUN as KVM_EXIT_FAIL_ENTRY, and otherwise whatever
 * svm_invoke_exit_handler() returns.
 */
static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *run = vcpu->run;
	u64 exit_code;

	/* SEV-ES guests must use the CR write traps to track CR registers. */
	if (!sev_es_guest(vcpu->kvm)) {
		/* Refresh the cached CR0 only when CR0 writes are not intercepted. */
		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
			vcpu->arch.cr0 = svm->vmcb->save.cr0;
		if (npt_enabled)
			vcpu->arch.cr3 = svm->vmcb->save.cr3;
	}

	if (is_guest_mode(vcpu)) {
		int nested_rc;

		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);

		nested_rc = nested_svm_exit_special(svm);
		if (nested_rc == NESTED_EXIT_CONTINUE)
			nested_rc = nested_svm_exit_handled(svm);

		if (nested_rc == NESTED_EXIT_DONE)
			return 1;
	}

	/* Read the exit code only after the nested handling above has run. */
	exit_code = svm->vmcb->control.exit_code;

	if (svm_is_vmrun_failure(exit_code)) {
		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		run->fail_entry.hardware_entry_failure_reason = exit_code;
		run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		dump_vmcb(vcpu);
		return 0;
	}

	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	return svm_invoke_exit_handler(vcpu, exit_code);
}
3640
pre_svm_run(struct kvm_vcpu * vcpu)3641 static int pre_svm_run(struct kvm_vcpu *vcpu)
3642 {
3643 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3644 struct vcpu_svm *svm = to_svm(vcpu);
3645
3646 /*
3647 * If the previous vmrun of the vmcb occurred on a different physical
3648 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
3649 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3650 */
3651 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3652 svm->current_vmcb->asid_generation = 0;
3653 vmcb_mark_all_dirty(svm->vmcb);
3654 svm->current_vmcb->cpu = vcpu->cpu;
3655 }
3656
3657 if (sev_guest(vcpu->kvm))
3658 return pre_sev_run(svm, vcpu->cpu);
3659
3660 /* FIXME: handle wraparound of asid_generation */
3661 if (svm->current_vmcb->asid_generation != sd->asid_generation)
3662 new_asid(svm, sd);
3663
3664 return 0;
3665 }
3666
svm_inject_nmi(struct kvm_vcpu * vcpu)3667 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3668 {
3669 struct vcpu_svm *svm = to_svm(vcpu);
3670
3671 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3672
3673 if (svm->nmi_l1_to_l2)
3674 return;
3675
3676 /*
3677 * No need to manually track NMI masking when vNMI is enabled, hardware
3678 * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
3679 * case where software directly injects an NMI.
3680 */
3681 if (!is_vnmi_enabled(svm)) {
3682 svm->nmi_masked = true;
3683 svm_set_iret_intercept(svm);
3684 }
3685 ++vcpu->stat.nmi_injections;
3686 }
3687
svm_is_vnmi_pending(struct kvm_vcpu * vcpu)3688 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3689 {
3690 struct vcpu_svm *svm = to_svm(vcpu);
3691
3692 if (!is_vnmi_enabled(svm))
3693 return false;
3694
3695 return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3696 }
3697
svm_set_vnmi_pending(struct kvm_vcpu * vcpu)3698 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
3699 {
3700 struct vcpu_svm *svm = to_svm(vcpu);
3701
3702 if (!is_vnmi_enabled(svm))
3703 return false;
3704
3705 if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
3706 return false;
3707
3708 svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
3709 vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3710
3711 /*
3712 * Because the pending NMI is serviced by hardware, KVM can't know when
3713 * the NMI is "injected", but for all intents and purposes, passing the
3714 * NMI off to hardware counts as injection.
3715 */
3716 ++vcpu->stat.nmi_injections;
3717
3718 return true;
3719 }
3720
/*
 * Queue the interrupt described by vcpu->arch.interrupt for injection via
 * the VMCB's event injection field.  @reinjected is only passed on to the
 * tracepoint.
 */
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
	struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
	u32 type = SVM_EVTINJ_TYPE_INTR;

	if (intr->soft) {
		/* Nothing is injected if the soft-interrupt RIP update fails. */
		if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
			return;

		type = SVM_EVTINJ_TYPE_SOFT;
	}

	trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
	vcpu->stat.irq_injections++;

	to_svm(vcpu)->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
3741
svm_complete_interrupt_delivery(struct kvm_vcpu * vcpu,int delivery_mode,int trig_mode,int vector)3742 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3743 int trig_mode, int vector)
3744 {
3745 /*
3746 * apic->apicv_active must be read after vcpu->mode.
3747 * Pairs with smp_store_release in vcpu_enter_guest.
3748 */
3749 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3750
3751 /* Note, this is called iff the local APIC is in-kernel. */
3752 if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3753 /* Process the interrupt via kvm_check_and_inject_events(). */
3754 kvm_make_request(KVM_REQ_EVENT, vcpu);
3755 kvm_vcpu_kick(vcpu);
3756 return;
3757 }
3758
3759 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3760 if (in_guest_mode) {
3761 /*
3762 * Signal the doorbell to tell hardware to inject the IRQ. If
3763 * the vCPU exits the guest before the doorbell chimes, hardware
3764 * will automatically process AVIC interrupts at the next VMRUN.
3765 */
3766 avic_ring_doorbell(vcpu);
3767 } else {
3768 /*
3769 * Wake the vCPU if it was blocking. KVM will then detect the
3770 * pending IRQ when checking if the vCPU has a wake event.
3771 */
3772 kvm_vcpu_wake_up(vcpu);
3773 }
3774 }
3775
svm_deliver_interrupt(struct kvm_lapic * apic,int delivery_mode,int trig_mode,int vector)3776 static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
3777 int trig_mode, int vector)
3778 {
3779 kvm_lapic_set_irr(vector, apic);
3780
3781 /*
3782 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3783 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3784 * the read of guest_mode. This guarantees that either VMRUN will see
3785 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3786 * will signal the doorbell if the CPU has already entered the guest.
3787 */
3788 smp_mb__after_atomic();
3789 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3790 }
3791
svm_update_cr8_intercept(struct kvm_vcpu * vcpu,int tpr,int irr)3792 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3793 {
3794 struct vcpu_svm *svm = to_svm(vcpu);
3795
3796 /*
3797 * SEV-ES guests must always keep the CR intercepts cleared. CR
3798 * tracking is done using the CR write traps.
3799 */
3800 if (sev_es_guest(vcpu->kvm))
3801 return;
3802
3803 if (nested_svm_virtualize_tpr(vcpu))
3804 return;
3805
3806 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3807
3808 if (irr == -1)
3809 return;
3810
3811 if (tpr >= irr)
3812 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3813 }
3814
svm_get_nmi_mask(struct kvm_vcpu * vcpu)3815 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3816 {
3817 struct vcpu_svm *svm = to_svm(vcpu);
3818
3819 if (is_vnmi_enabled(svm))
3820 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3821 else
3822 return svm->nmi_masked;
3823 }
3824
/*
 * Mask or unmask NMIs according to @masked.
 */
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!is_vnmi_enabled(svm)) {
		/* Software tracking: keep the IRET intercept in step with the flag. */
		svm->nmi_masked = masked;
		if (masked)
			svm_set_iret_intercept(svm);
		else
			svm_clr_iret_intercept(svm);
		return;
	}

	/* vNMI: the blocking state is a bit in the VMCB's int_ctl. */
	if (masked)
		svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
	else
		svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
}
3843
svm_nmi_blocked(struct kvm_vcpu * vcpu)3844 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3845 {
3846 struct vcpu_svm *svm = to_svm(vcpu);
3847 struct vmcb *vmcb = svm->vmcb;
3848
3849 if (!gif_set(svm))
3850 return true;
3851
3852 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3853 return false;
3854
3855 if (svm_get_nmi_mask(vcpu))
3856 return true;
3857
3858 return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
3859 }
3860
/*
 * Return 1 if an NMI may be delivered now, 0 if NMIs are blocked, and -EBUSY
 * if a nested run is pending or the NMI must cause a nested VM-Exit instead
 * of being injected.
 */
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool must_vmexit;

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_nmi_blocked(vcpu))
		return 0;

	/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
	must_vmexit = for_injection && is_guest_mode(vcpu) &&
		      nested_exit_on_nmi(svm);

	return must_vmexit ? -EBUSY : 1;
}
3875
svm_interrupt_blocked(struct kvm_vcpu * vcpu)3876 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3877 {
3878 struct vcpu_svm *svm = to_svm(vcpu);
3879 struct vmcb *vmcb = svm->vmcb;
3880
3881 if (!gif_set(svm))
3882 return true;
3883
3884 if (is_guest_mode(vcpu)) {
3885 /* As long as interrupts are being delivered... */
3886 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3887 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3888 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3889 return true;
3890
3891 /* ... vmexits aren't blocked by the interrupt shadow */
3892 if (nested_exit_on_intr(svm))
3893 return false;
3894 } else {
3895 if (!svm_get_if_flag(vcpu))
3896 return true;
3897 }
3898
3899 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3900 }
3901
/*
 * Decide whether an IRQ may be delivered.
 *
 * Returns -EBUSY while a nested VMRUN is pending, 0 if IRQs are currently
 * blocked, -EBUSY if the caller wants to inject but the IRQ is supposed to
 * exit from L2 instead, and 1 otherwise.
 */
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_interrupt_blocked(vcpu))
		return 0;

	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	return (for_injection && is_guest_mode(vcpu) &&
		nested_exit_on_intr(svm)) ? -EBUSY : 1;
}
3921
svm_enable_irq_window(struct kvm_vcpu * vcpu)3922 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3923 {
3924 struct vcpu_svm *svm = to_svm(vcpu);
3925
3926 /*
3927 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3928 * 1, because that's a separate STGI/VMRUN intercept. The next time we
3929 * get that intercept, this function will be called again though and
3930 * we'll get the vintr intercept. However, if the vGIF feature is
3931 * enabled, the STGI interception will not occur. Enable the irq
3932 * window under the assumption that the hardware will set the GIF.
3933 */
3934 if (vgif || gif_set(svm)) {
3935 /*
3936 * IRQ window is not needed when AVIC is enabled,
3937 * unless we have pending ExtINT since it cannot be injected
3938 * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3939 * and fallback to injecting IRQ via V_IRQ.
3940 *
3941 * If running nested, AVIC is already locally inhibited
3942 * on this vCPU, therefore there is no need to request
3943 * the VM wide AVIC inhibition.
3944 */
3945 if (!is_guest_mode(vcpu))
3946 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3947
3948 svm_set_vintr(svm);
3949 }
3950 }
3951
/*
 * Arrange for a VM-Exit once a currently undeliverable NMI may become
 * injectable.  Depending on what is blocking the NMI this relies on the
 * already-armed IRET intercept, intercepts STGI (when vGIF is in use), or
 * single-steps the guest by setting RFLAGS.TF and RFLAGS.RF.  SEV-ES guests
 * are left alone, see below.
 */
static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * If NMIs are outright masked, i.e. the vCPU is already handling an
	 * NMI, and KVM has not yet intercepted an IRET, then there is nothing
	 * more to do at this time as KVM has already enabled IRET intercepts.
	 * If KVM has already intercepted IRET, then single-step over the IRET,
	 * as NMIs aren't architecturally unmasked until the IRET completes.
	 *
	 * If vNMI is enabled, KVM should never request an NMI window if NMIs
	 * are masked, as KVM allows at most one to-be-injected NMI and one
	 * pending NMI.  If two NMIs arrive simultaneously, KVM will inject one
	 * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are
	 * unmasked.  KVM _will_ request an NMI window in some situations, e.g.
	 * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately
	 * inject the NMI.  In those situations, KVM needs to single-step over
	 * the STI shadow or intercept STGI.
	 */
	if (svm_get_nmi_mask(vcpu)) {
		WARN_ON_ONCE(is_vnmi_enabled(svm));

		if (!svm->awaiting_iret_completion)
			return; /* IRET will cause a vm exit */
	}

	/*
	 * SEV-ES guests are responsible for signaling when a vCPU is ready to
	 * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
	 * KVM can't intercept and single-step IRET to detect when NMIs are
	 * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
	 *
	 * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
	 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
	 * supported NAEs in the GHCB protocol.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (!gif_set(svm)) {
		if (vgif)
			svm_set_intercept(svm, INTERCEPT_STGI);
		return; /* STGI will cause a vm exit */
	}

	/*
	 * Something prevents the NMI from being injected.  Single-step over
	 * the possible problem (IRET, exception injection or interrupt
	 * shadow).  The guest's RFLAGS is snapshotted first, before TF and RF
	 * are forced on in the VMCB.
	 */
	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
	svm->nmi_singlestep = true;
	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
4006
svm_flush_tlb_asid(struct kvm_vcpu * vcpu)4007 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
4008 {
4009 struct vcpu_svm *svm = to_svm(vcpu);
4010
4011 /*
4012 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
4013 * A TLB flush for the current ASID flushes both "host" and "guest" TLB
4014 * entries, and thus is a superset of Hyper-V's fine grained flushing.
4015 */
4016 kvm_hv_vcpu_purge_flush_tlb(vcpu);
4017
4018 /*
4019 * Flush only the current ASID even if the TLB flush was invoked via
4020 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
4021 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
4022 * unconditionally does a TLB flush on both nested VM-Enter and nested
4023 * VM-Exit (via kvm_mmu_reset_context()).
4024 */
4025 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4026 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4027 else
4028 svm->current_vmcb->asid_generation--;
4029 }
4030
svm_flush_tlb_current(struct kvm_vcpu * vcpu)4031 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
4032 {
4033 hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
4034
4035 /*
4036 * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
4037 * flush the NPT mappings via hypercall as flushing the ASID only
4038 * affects virtual to physical mappings, it does not invalidate guest
4039 * physical to host physical mappings.
4040 */
4041 if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
4042 hyperv_flush_guest_mapping(root_tdp);
4043
4044 svm_flush_tlb_asid(vcpu);
4045 }
4046
/*
 * Flush the vCPU's current ASID.  If the Hyper-V enlightened TLB is enabled,
 * this warns once and additionally calls hv_flush_remote_tlbs() first.
 */
static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
	 * flushes should be routed to hv_flush_remote_tlbs() without requesting
	 * a "regular" remote flush.  Reaching this point means either there's
	 * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
	 * which might be fatal to the guest.  Yell, but try to recover.
	 */
	if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
		hv_flush_remote_tlbs(vcpu->kvm);

	svm_flush_tlb_asid(vcpu);
}
4061
/*
 * Invalidate the TLB entry for a single guest virtual address, using INVLPGA
 * with the ASID currently programmed in the VMCB.
 */
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
	invlpga(gva, to_svm(vcpu)->vmcb->control.asid);
}
4068
/*
 * Flush the guest's TLB entries by flushing the current ASID.
 *
 * VCPU_EXREG_ERAPS is also marked dirty; svm_vcpu_run() checks that dirty bit
 * (for guests with X86_FEATURE_ERAPS) to decide whether to set
 * ERAP_CONTROL_CLEAR_RAP for the next VMRUN.
 */
static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);

	svm_flush_tlb_asid(vcpu);
}
4075
sync_cr8_to_lapic(struct kvm_vcpu * vcpu)4076 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4077 {
4078 struct vcpu_svm *svm = to_svm(vcpu);
4079
4080 if (nested_svm_virtualize_tpr(vcpu))
4081 return;
4082
4083 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
4084 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4085 kvm_set_cr8(vcpu, cr8);
4086 }
4087 }
4088
sync_lapic_to_cr8(struct kvm_vcpu * vcpu)4089 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
4090 {
4091 struct vcpu_svm *svm = to_svm(vcpu);
4092 u64 cr8;
4093
4094 if (nested_svm_virtualize_tpr(vcpu))
4095 return;
4096
4097 cr8 = kvm_get_cr8(vcpu);
4098 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
4099 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
4100 }
4101
/*
 * Fix up RIP state after a VM-Exit that follows injection of a soft
 * interrupt/exception.  @vector and @type are taken from exit_int_info by the
 * caller.
 */
static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
					int type)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool soft_intr = (type == SVM_EXITINTINFO_TYPE_SOFT);
	bool exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);

	if (nrips) {
		/*
		 * With NRIPS, KVM must snapshot the pre-VMRUN next_rip that
		 * belongs to the original soft exception/interrupt.  next_rip
		 * is cleared on all exits that can occur while vectoring an
		 * event, so it has to be set manually for re-injection.
		 * Unlike the !nrips case, this is done if and only if KVM is
		 * re-injecting the same event, i.e. if the event is a soft
		 * exception/interrupt; otherwise next_rip is unused on VMRUN.
		 */
		if ((soft_intr ||
		     (exception && kvm_exception_is_soft(vector))) &&
		    kvm_is_linear_rip(vcpu, svm->soft_int_old_rip +
					    svm->soft_int_csbase))
			svm->vmcb->control.next_rip = svm->soft_int_next_rip;
		return;
	}

	/*
	 * Without NRIPS, KVM manually advanced RIP before injecting the soft
	 * exception/interrupt, and that advancement must be unwound if
	 * vectoring didn't complete.  Note, the new event may not be the
	 * injected event, e.g. if KVM injected an INTn, the INTn hit a #NP in
	 * the guest, and the #NP encountered a #PF, the #NP will be the
	 * reported vectored event, but RIP still needs to be unwound.
	 */
	if ((soft_intr || exception) &&
	    kvm_is_linear_rip(vcpu, svm->soft_int_next_rip +
				    svm->soft_int_csbase))
		kvm_rip_write(vcpu, svm->soft_int_old_rip);
}
4133
/*
 * Event bookkeeping after a VM-Exit (also used by svm_cancel_injection()).
 *
 * Clears KVM's record of injected NMIs, exceptions and interrupts.  If
 * exit_int_info is marked valid, the event it describes is queued again so
 * that it gets re-injected, and KVM_REQ_EVENT is raised.
 */
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 vector;
	int type;
	u32 exitintinfo = svm->vmcb->control.exit_int_info;
	/* Snapshot the per-run flags, they are reset unconditionally below. */
	bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
	bool soft_int_injected = svm->soft_int_injected;

	svm->nmi_l1_to_l2 = false;
	svm->soft_int_injected = false;

	/*
	 * If we've made progress since setting awaiting_iret_completion, we've
	 * executed an IRET and can allow NMI injection.
	 */
	if (svm->awaiting_iret_completion &&
	    kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
		svm->awaiting_iret_completion = false;
		svm->nmi_masked = false;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	/* Nothing to re-queue if exit_int_info doesn't hold a valid event. */
	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

	/* Fix up next_rip/RIP if the previous run injected a soft event. */
	if (soft_int_injected)
		svm_complete_soft_interrupt(vcpu, vector, type);

	switch (type) {
	case SVM_EXITINTINFO_TYPE_NMI:
		vcpu->arch.nmi_injected = true;
		/* Carry the L1=>L2 marker over to the re-injected NMI. */
		svm->nmi_l1_to_l2 = nmi_l1_to_l2;
		break;
	case SVM_EXITINTINFO_TYPE_EXEPT: {
		u32 error_code = 0;

		/*
		 * Never re-inject a #VC exception.
		 */
		if (vector == X86_TRAP_VC)
			break;

		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
			error_code = svm->vmcb->control.exit_int_info_err;

		kvm_requeue_exception(vcpu, vector,
				      exitintinfo & SVM_EXITINTINFO_VALID_ERR,
				      error_code);
		break;
	}
	case SVM_EXITINTINFO_TYPE_INTR:
		kvm_queue_interrupt(vcpu, vector, false);
		break;
	case SVM_EXITINTINFO_TYPE_SOFT:
		kvm_queue_interrupt(vcpu, vector, true);
		break;
	default:
		break;
	}

}
4205
svm_cancel_injection(struct kvm_vcpu * vcpu)4206 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4207 {
4208 struct vcpu_svm *svm = to_svm(vcpu);
4209 struct vmcb_control_area *control = &svm->vmcb->control;
4210
4211 control->exit_int_info = control->event_inj;
4212 control->exit_int_info_err = control->event_inj_err;
4213 control->event_inj = 0;
4214 svm_complete_interrupts(vcpu);
4215 }
4216
svm_vcpu_pre_run(struct kvm_vcpu * vcpu)4217 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
4218 {
4219 if (to_kvm_sev_info(vcpu->kvm)->need_init)
4220 return -EINVAL;
4221
4222 return 1;
4223 }
4224
svm_exit_handlers_fastpath(struct kvm_vcpu * vcpu)4225 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4226 {
4227 struct vcpu_svm *svm = to_svm(vcpu);
4228 struct vmcb_control_area *control = &svm->vmcb->control;
4229
4230 /*
4231 * Next RIP must be provided as IRQs are disabled, and accessing guest
4232 * memory to decode the instruction might fault, i.e. might sleep.
4233 */
4234 if (!nrips || !control->next_rip)
4235 return EXIT_FASTPATH_NONE;
4236
4237 if (is_guest_mode(vcpu))
4238 return EXIT_FASTPATH_NONE;
4239
4240 switch (control->exit_code) {
4241 case SVM_EXIT_MSR:
4242 if (!control->exit_info_1)
4243 break;
4244 return handle_fastpath_wrmsr(vcpu);
4245 case SVM_EXIT_HLT:
4246 return handle_fastpath_hlt(vcpu);
4247 case SVM_EXIT_INVD:
4248 return handle_fastpath_invd(vcpu);
4249 default:
4250 break;
4251 }
4252
4253 return EXIT_FASTPATH_NONE;
4254 }
4255
/*
 * Innermost guest entry/exit: runs the guest through __svm_vcpu_run(), or
 * __svm_sev_es_vcpu_run() for SEV-ES guests, bracketed by the guest_state
 * enter/exit transitions.  @spec_ctrl_intercepted is passed straight through
 * to the low-level run routine.
 */
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	guest_state_enter_irqoff();

	/*
	 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
	 * VMRUN controls whether or not physical IRQs are masked (KVM always
	 * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
	 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
	 * into guest state if delivery of an event during VMRUN triggers a
	 * #VMEXIT, and the guest_state transitions already tell lockdep that
	 * IRQs are being enabled/disabled.  Note!  GIF=0 for the entirety of
	 * this path, so IRQs aren't actually unmasked while running host code.
	 */
	raw_local_irq_enable();

	amd_clear_divider();

	/* SEV-ES guests additionally need the per-CPU host save area. */
	if (sev_es_guest(vcpu->kvm))
		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
				      sev_es_host_save_area(sd));
	else
		__svm_vcpu_run(svm, spec_ctrl_intercepted);

	raw_local_irq_disable();

	guest_state_exit_irqoff();
}
4287
/*
 * Run the vCPU once: load guest state into the VMCB, enter the guest, and on
 * VM-Exit pull state back out and do the immediate post-exit bookkeeping.
 *
 * @run_flags: KVM_RUN_* flags; KVM_RUN_FORCE_IMMEDIATE_EXIT and
 *	       KVM_RUN_LOAD_GUEST_DR6 are consumed here.
 *
 * Returns EXIT_FASTPATH_EXIT_USERSPACE if pre_svm_run() fails (the guest is
 * not entered), otherwise the result of svm_exit_handlers_fastpath().
 */
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
	struct vcpu_svm *svm = to_svm(vcpu);
	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* RAX, RSP and RIP live in the VMCB save area, copy them in. */
	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	/*
	 * Disable singlestep if we're injecting an interrupt/exception.
	 * We don't want our modified rflags to be pushed on the stack where
	 * we might not be able to easily reset them if we disabled NMI
	 * singlestep later.
	 */
	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
		/*
		 * Event injection happens before external interrupts cause a
		 * vmexit and interrupts are disabled here, so smp_send_reschedule
		 * is enough to force an immediate vmexit.
		 */
		disable_nmi_singlestep(svm);
		force_immediate_exit = true;
	}

	if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	/* Report an entry failure to userspace if pre_svm_run() fails. */
	if (pre_svm_run(vcpu)) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
		vcpu->run->fail_entry.cpu = vcpu->cpu;
		return EXIT_FASTPATH_EXIT_USERSPACE;
	}

	sync_lapic_to_cr8(vcpu);

	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
		svm->vmcb->control.asid = svm->asid;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}
	svm->vmcb->save.cr2 = vcpu->arch.cr2;

	/* Request a RAP clear if VCPU_EXREG_ERAPS was marked dirty. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
	    kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;

	svm_hv_update_vp_id(svm->vmcb, vcpu);

	/*
	 * Run with all-zero DR6 unless the guest can write DR6 freely, so that
	 * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
	 * KVM's snapshot is only necessary when DR accesses won't exit.
	 */
	if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
		svm_set_dr6(vcpu, vcpu->arch.dr6);
	else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
		svm_set_dr6(vcpu, DR6_ACTIVE_LOW);

	clgi();

	/*
	 * Hardware only context switches DEBUGCTL if LBR virtualization is
	 * enabled.  Manually load DEBUGCTL if necessary (and restore it after
	 * VM-Exit), as running with the host's DEBUGCTL can negatively affect
	 * guest state and can even be fatal, e.g. due to Bus Lock Detect.
	 */
	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
		update_debugctlmsr(svm->vmcb->save.dbgctl);

	kvm_wait_lapic_expire(vcpu);

	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);

	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);

	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);

	/* For SEV-ES guests these registers are not read back from the VMCB. */
	if (!sev_es_guest(vcpu->kvm)) {
		vcpu->arch.cr2 = svm->vmcb->save.cr2;
		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
	}
	vcpu->arch.regs_dirty = 0;

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

	/* Restore the host's DEBUGCTL if the guest's value was loaded above. */
	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	stgi();

	/* Any pending NMI will happen here */

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_after_interrupt(vcpu);

	sync_cr8_to_lapic(vcpu);

	svm->next_rip = 0;
	if (is_guest_mode(vcpu)) {
		nested_sync_control_from_vmcb02(svm);

		/* Track VMRUNs that have made past consistency checking */
		if (svm->nested.nested_run_pending &&
		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
			++vcpu->stat.nested_run;

		svm->nested.nested_run_pending = 0;
	}

	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

	/*
	 * Mask off the CLEAR_RAP bit without first checking whether it was
	 * set, the AND is just as cheap as the TEST+Jcc to avoid it.
	 */
	if (cpu_feature_enabled(X86_FEATURE_ERAPS))
		svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;

	vmcb_mark_all_clean(svm->vmcb);

	/* if exit due to PF check for async PF */
	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
		vcpu->arch.apf.host_apf_flags =
			kvm_read_and_reset_apf_flags();

	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;

	/*
	 * If writes to PERF_CNTR_GLOBAL_CTL aren't intercepted, re-read the
	 * MSR into the vPMU's global_ctrl.
	 */
	if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
		rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);

	trace_kvm_exit(vcpu, KVM_ISA_SVM);

	svm_complete_interrupts(vcpu);

	return svm_exit_handlers_fastpath(vcpu);
}
4441
/*
 * Program the MMU root into the VMCB.
 *
 * With NPT, @root_hpa becomes nested_cr3 and the guest's own CR3 value is
 * written to save.cr3.  Without NPT, save.cr3 is built from @root_hpa
 * itself, combined with the active PCID for 4-level and larger roots.
 */
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
			     int root_level)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long new_cr3;

	if (!npt_enabled) {
		if (root_level < PT64_ROOT_4LEVEL) {
			/* PCID in the guest should be impossible with a 32-bit MMU. */
			WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
			new_cr3 = root_hpa;
		} else {
			new_cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
		}
	} else {
		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);

		hv_track_root_tdp(vcpu, root_hpa);

		new_cr3 = vcpu->arch.cr3;
	}

	svm->vmcb->save.cr3 = new_cr3;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
4466
/*
 * Write the three-byte VMMCALL instruction (0f 01 d9) to @hypercall.  The
 * caller must provide room for at least three bytes; @vcpu is unused.
 */
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	static const unsigned char vmmcall_insn[] = { 0x0f, 0x01, 0xd9 };
	unsigned int i;

	for (i = 0; i < sizeof(vmmcall_insn); i++)
		hypercall[i] = vmmcall_insn[i];
}
4477
4478 /*
4479 * The kvm parameter can be NULL (module initialization, or invocation before
4480 * VM creation). Be sure to check the kvm parameter before using it.
4481 */
svm_has_emulated_msr(struct kvm * kvm,u32 index)4482 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4483 {
4484 switch (index) {
4485 case MSR_IA32_MCG_EXT_CTL:
4486 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
4487 return false;
4488 case MSR_IA32_SMBASE:
4489 if (!IS_ENABLED(CONFIG_KVM_SMM))
4490 return false;
4491 /* SEV-ES guests do not support SMM, so report false */
4492 if (kvm && sev_es_guest(kvm))
4493 return false;
4494 break;
4495 default:
4496 break;
4497 }
4498
4499 return true;
4500 }
4501
svm_vcpu_after_set_cpuid(struct kvm_vcpu * vcpu)4502 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4503 {
4504 struct vcpu_svm *svm = to_svm(vcpu);
4505
4506 /*
4507 * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
4508 * can only disable all variants of by disallowing CR4.OSXSAVE from
4509 * being set. As a result, if the host has XSAVE and XSAVES, and the
4510 * guest has XSAVE enabled, the guest can execute XSAVES without
4511 * faulting. Treat XSAVES as enabled in this case regardless of
4512 * whether it's advertised to the guest so that KVM context switches
4513 * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
4514 * the guest read/write access to the host's XSS.
4515 */
4516 guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
4517 boot_cpu_has(X86_FEATURE_XSAVES) &&
4518 guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));
4519
4520 /*
4521 * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
4522 * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
4523 * SVM on Intel is bonkers and extremely unlikely to work).
4524 */
4525 if (guest_cpuid_is_intel_compatible(vcpu))
4526 guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4527
4528 if (sev_guest(vcpu->kvm))
4529 sev_vcpu_after_set_cpuid(svm);
4530 }
4531
/*
 * Always returns true.
 * NOTE(review): presumably this tells common code that WBINVD can cause a
 * VM-Exit on SVM (cf. SVM_EXIT_WBINVD) -- confirm against the caller.
 */
static bool svm_has_wbinvd_exit(void)
{
	return true;
}
4536
/*
 * Initializers for x86_intercept_map entries: each pairs an SVM exit code
 * with the emulation stage at which svm_check_intercept() acts on it.
 */
#define PRE_EX(exit)  { .exit_code = (exit), \
			.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_MEMACCESS, }

/*
 * Table indexed by the emulator's x86_intercept_* identifier.  For the CR, DR,
 * MSR and IOIO entries the exit code here is only a base value that
 * svm_check_intercept() refines from the decoded instruction.
 */
static const struct __x86_intercept {
	u32 exit_code;
	enum x86_intercept_stage stage;
} x86_intercept_map[] = {
	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM
4600
/*
 * Check whether an instruction being emulated is intercepted by the nested
 * (vmcb12) configuration.
 *
 * @info:      decoded instruction details supplied by the emulator
 * @stage:     emulation stage at which the check is being made
 * @exception: unused here
 *
 * Looks up @info->intercept in x86_intercept_map, refines the exit code and
 * fills exit_info_1/exit_info_2 in the current VMCB where the exit type needs
 * it, then asks nested_svm_exit_handled() for the verdict.
 *
 * Returns X86EMUL_INTERCEPTED if the nested exit was handled
 * (NESTED_EXIT_DONE).  Returns X86EMUL_CONTINUE otherwise, including when the
 * intercept is outside the table or belongs to a different stage.
 */
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	/* Work on a copy, the exit code may be adjusted below. */
	icpt_info = x86_intercept_map[info->intercept];

	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		/* The exit codes for CR reads are offset by the CR number. */
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;

		/*
		 * Adjust the exit code accordingly if a CR other than CR0 is
		 * being written, and skip straight to the common handling as
		 * only CR0 has an additional selective intercept.
		 */
		if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
			icpt_info.exit_code += info->modrm_reg;
			break;
		}

		/*
		 * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
		 * selective CR0 intercept is triggered (the common logic will
		 * treat the selective intercept as being enabled).  Note, the
		 * unconditional intercept has higher priority, i.e. this is
		 * only relevant if *only* the selective intercept is enabled.
		 */
		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
		    !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
			break;

		/* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
		if (info->intercept == x86_intercept_clts)
			break;

		/* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
		if (info->intercept == x86_intercept_lmsw) {
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
			break;
		}

		/*
		 * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
		 * other than SVM_CR0_SELECTIVE_MASK is changed.
		 */
		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		/* The exit codes for DR accesses are offset by the DR number. */
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		/* exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * The emulator reports this intercept for NOP.  PAUSE is
		 * encoded as REP NOP, so only treat the instruction as PAUSE
		 * when the REP prefix is present.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		/* Build the IOIO exit information: port, direction, flags, sizes. */
		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip  = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}
4734
svm_handle_exit_irqoff(struct kvm_vcpu * vcpu)4735 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4736 {
4737 switch (to_svm(vcpu)->vmcb->control.exit_code) {
4738 case SVM_EXIT_EXCP_BASE + MC_VECTOR:
4739 svm_handle_mce(vcpu);
4740 break;
4741 case SVM_EXIT_INTR:
4742 vcpu->arch.at_instruction_boundary = true;
4743 break;
4744 default:
4745 break;
4746 }
4747 }
4748
/*
 * Restrict the vCPU's MCG_CAP value to its low nine bits (mask 0x1ff); all
 * higher bits are cleared.
 */
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}
4754
4755 #ifdef CONFIG_KVM_SMM
svm_smi_blocked(struct kvm_vcpu * vcpu)4756 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4757 {
4758 struct vcpu_svm *svm = to_svm(vcpu);
4759
4760 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4761 if (!gif_set(svm))
4762 return true;
4763
4764 return is_smm(vcpu);
4765 }
4766
/*
 * Decide whether an SMI may be delivered.
 *
 * Returns -EBUSY while a nested VMRUN is pending, 0 if SMIs are currently
 * blocked, -EBUSY if the caller wants to inject but the SMI is supposed to
 * exit from L2 instead, and 1 otherwise.
 */
static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_smi_blocked(vcpu))
		return 0;

	/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
	return (for_injection && is_guest_mode(vcpu) &&
		nested_exit_on_smi(svm)) ? -EBUSY : 1;
}
4782
/*
 * SVM-specific work when the vCPU enters SMM.
 *
 * Nothing is done unless the vCPU is in guest mode (L2).  In that case the
 * nested state is recorded in the 64-bit SMRAM image, a nested VM-Exit with
 * SVM_EXIT_SW is performed, and L1's VMRUN state from VMCB01 is stashed in
 * the host save area addressed by svm->nested.hsave_msr.
 *
 * Returns 0 on success, 1 if the guest lacks long mode or the host save area
 * cannot be mapped, or the error from nested_svm_simple_vmexit().
 */
static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map_save;
	int ret;

	if (!is_guest_mode(vcpu))
		return 0;

	/*
	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
	 */

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return 1;

	/* Record that the SMI arrived in guest mode, and the vmcb12 address. */
	smram->smram64.svm_guest_flag = 1;
	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;

	/* Bring the VMCB's RAX/RSP/RIP up to date before the nested exit. */
	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
	if (ret)
		return ret;

	/*
	 * KVM uses VMCB01 to store L1 host state while L2 runs but
	 * VMCB01 is going to be used during SMM and thus the state will
	 * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
	 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
	 * format of the area is identical to the guest save area offset
	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
	 * within 'struct vmcb'). Note: HSAVE area may also be used by
	 * L1 hypervisor to save additional host context (e.g. KVM does
	 * that, see svm_prepare_switch_to_guest()) which must be
	 * preserved.
	 */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		return 1;

	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

	svm_copy_vmrun_state(map_save.hva + 0x400,
			     &svm->vmcb01.ptr->save);

	kvm_vcpu_unmap(vcpu, &map_save);
	return 0;
}
4834
svm_leave_smm(struct kvm_vcpu * vcpu,const union kvm_smram * smram)4835 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4836 {
4837 struct vcpu_svm *svm = to_svm(vcpu);
4838 struct kvm_host_map map, map_save;
4839 struct vmcb *vmcb12;
4840 int ret;
4841
4842 const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4843
4844 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
4845 return 0;
4846
4847 /* Non-zero if SMI arrived while vCPU was in guest mode. */
4848 if (!smram64->svm_guest_flag)
4849 return 0;
4850
4851 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
4852 return 1;
4853
4854 if (!(smram64->efer & EFER_SVME))
4855 return 1;
4856
4857 if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4858 return 1;
4859
4860 ret = 1;
4861 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4862 goto unmap_map;
4863
4864 if (svm_allocate_nested(svm))
4865 goto unmap_save;
4866
4867 /*
4868 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4869 * used during SMM (see svm_enter_smm())
4870 */
4871
4872 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4873
4874 /*
4875 * Enter the nested guest now
4876 */
4877
4878 vmcb_mark_all_dirty(svm->vmcb01.ptr);
4879
4880 vmcb12 = map.hva;
4881 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4882 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4883
4884 if (nested_svm_check_cached_vmcb12(vcpu) < 0)
4885 goto unmap_save;
4886
4887 if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa,
4888 vmcb12, false) != 0)
4889 goto unmap_save;
4890
4891 ret = 0;
4892 svm->nested.nested_run_pending = 1;
4893
4894 unmap_save:
4895 kvm_vcpu_unmap(vcpu, &map_save);
4896 unmap_map:
4897 kvm_vcpu_unmap(vcpu, &map);
4898 return ret;
4899 }
4900
svm_enable_smi_window(struct kvm_vcpu * vcpu)4901 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4902 {
4903 struct vcpu_svm *svm = to_svm(vcpu);
4904
4905 if (!gif_set(svm)) {
4906 if (vgif)
4907 svm_set_intercept(svm, INTERCEPT_STGI);
4908 /* STGI will cause a vm exit */
4909 } else {
4910 /* We must be in SMM; RSM will cause a vmexit anyway. */
4911 }
4912 }
4913 #endif
4914
svm_check_emulate_instruction(struct kvm_vcpu * vcpu,int emul_type,void * insn,int insn_len)4915 static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4916 void *insn, int insn_len)
4917 {
4918 struct vcpu_svm *svm = to_svm(vcpu);
4919 bool smep, smap, is_user;
4920 u64 error_code;
4921
4922 /* Check that emulation is possible during event vectoring */
4923 if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
4924 !kvm_can_emulate_event_vectoring(emul_type))
4925 return X86EMUL_UNHANDLEABLE_VECTORING;
4926
4927 /* Emulation is always possible when KVM has access to all guest state. */
4928 if (!sev_guest(vcpu->kvm))
4929 return X86EMUL_CONTINUE;
4930
4931 /* #UD and #GP should never be intercepted for SEV guests. */
4932 WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4933 EMULTYPE_TRAP_UD_FORCED |
4934 EMULTYPE_VMWARE_GP));
4935
4936 /*
4937 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4938 * to guest register state.
4939 */
4940 if (sev_es_guest(vcpu->kvm))
4941 return X86EMUL_RETRY_INSTR;
4942
4943 /*
4944 * Emulation is possible if the instruction is already decoded, e.g.
4945 * when completing I/O after returning from userspace.
4946 */
4947 if (emul_type & EMULTYPE_NO_DECODE)
4948 return X86EMUL_CONTINUE;
4949
4950 /*
4951 * Emulation is possible for SEV guests if and only if a prefilled
4952 * buffer containing the bytes of the intercepted instruction is
4953 * available. SEV guest memory is encrypted with a guest specific key
4954 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4955 * decode garbage.
4956 *
4957 * If KVM is NOT trying to simply skip an instruction, inject #UD if
4958 * KVM reached this point without an instruction buffer. In practice,
4959 * this path should never be hit by a well-behaved guest, e.g. KVM
4960 * doesn't intercept #UD or #GP for SEV guests, but this path is still
4961 * theoretically reachable, e.g. via unaccelerated fault-like AVIC
4962 * access, and needs to be handled by KVM to avoid putting the guest
4963 * into an infinite loop. Injecting #UD is somewhat arbitrary, but
4964 * its the least awful option given lack of insight into the guest.
4965 *
4966 * If KVM is trying to skip an instruction, simply resume the guest.
4967 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
4968 * will attempt to re-inject the INT3/INTO and skip the instruction.
4969 * In that scenario, retrying the INT3/INTO and hoping the guest will
4970 * make forward progress is the only option that has a chance of
4971 * success (and in practice it will work the vast majority of the time).
4972 */
4973 if (unlikely(!insn)) {
4974 if (emul_type & EMULTYPE_SKIP)
4975 return X86EMUL_UNHANDLEABLE;
4976
4977 kvm_queue_exception(vcpu, UD_VECTOR);
4978 return X86EMUL_PROPAGATE_FAULT;
4979 }
4980
4981 /*
4982 * Emulate for SEV guests if the insn buffer is not empty. The buffer
4983 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4984 * the faulting instruction because the code fetch itself faulted, e.g.
4985 * the guest attempted to fetch from emulated MMIO or a guest page
4986 * table used to translate CS:RIP resides in emulated MMIO.
4987 */
4988 if (likely(insn_len))
4989 return X86EMUL_CONTINUE;
4990
4991 /*
4992 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4993 *
4994 * Errata:
4995 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4996 * possible that CPU microcode implementing DecodeAssist will fail to
4997 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4998 * be '0'. This happens because microcode reads CS:RIP using a _data_
4999 * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
5000 * gives up and does not fill the instruction bytes buffer.
5001 *
5002 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
5003 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
5004 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
5005 * GuestIntrBytes field of the VMCB.
5006 *
5007 * This does _not_ mean that the erratum has been encountered, as the
5008 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
5009 * #PF, e.g. if the guest attempt to execute from emulated MMIO and
5010 * encountered a reserved/not-present #PF.
5011 *
5012 * To hit the erratum, the following conditions must be true:
5013 * 1. CR4.SMAP=1 (obviously).
5014 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
5015 * have been hit as the guest would have encountered a SMEP
5016 * violation #PF, not a #NPF.
5017 * 3. The #NPF is not due to a code fetch, in which case failure to
5018 * retrieve the instruction bytes is legitimate (see abvoe).
5019 *
5020 * In addition, don't apply the erratum workaround if the #NPF occurred
5021 * while translating guest page tables (see below).
5022 */
5023 error_code = svm->vmcb->control.exit_info_1;
5024 if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
5025 goto resume_guest;
5026
5027 smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
5028 smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
5029 is_user = svm_get_cpl(vcpu) == 3;
5030 if (smap && (!smep || is_user)) {
5031 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
5032
5033 /*
5034 * If the fault occurred in userspace, arbitrarily inject #GP
5035 * to avoid killing the guest and to hopefully avoid confusing
5036 * the guest kernel too much, e.g. injecting #PF would not be
5037 * coherent with respect to the guest's page tables. Request
5038 * triple fault if the fault occurred in the kernel as there's
5039 * no fault that KVM can inject without confusing the guest.
5040 * In practice, the triple fault is moot as no sane SEV kernel
5041 * will execute from user memory while also running with SMAP=1.
5042 */
5043 if (is_user)
5044 kvm_inject_gp(vcpu, 0);
5045 else
5046 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5047 return X86EMUL_PROPAGATE_FAULT;
5048 }
5049
5050 resume_guest:
5051 /*
5052 * If the erratum was not hit, simply resume the guest and let it fault
5053 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
5054 * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
5055 * userspace will kill the guest, and letting the emulator read garbage
5056 * will yield random behavior and potentially corrupt the guest.
5057 *
5058 * Simply resuming the guest is technically not a violation of the SEV
5059 * architecture. AMD's APM states that all code fetches and page table
5060 * accesses for SEV guest are encrypted, regardless of the C-Bit. The
5061 * APM also states that encrypted accesses to MMIO are "ignored", but
5062 * doesn't explicitly define "ignored", i.e. doing nothing and letting
5063 * the guest spin is technically "ignoring" the access.
5064 */
5065 return X86EMUL_RETRY_INSTR;
5066 }
5067
svm_apic_init_signal_blocked(struct kvm_vcpu * vcpu)5068 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
5069 {
5070 struct vcpu_svm *svm = to_svm(vcpu);
5071
5072 return !gif_set(svm);
5073 }
5074
/*
 * ->vcpu_deliver_sipi_vector() hook: SEV-ES guests take the SEV-specific
 * delivery path, every other guest uses the common KVM implementation.
 */
static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
	if (sev_es_guest(vcpu->kvm))
		sev_vcpu_deliver_sipi_vector(vcpu, vector);
	else
		kvm_vcpu_deliver_sipi_vector(vcpu, vector);
}
5082
/*
 * ->vm_destroy() hook: tear down the per-VM AVIC and SEV state, then run the
 * SRSO VM-destroy hook (the counterpart of the svm_srso_vm_init() call made
 * by svm_vm_init()).
 */
static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);

	svm_srso_vm_destroy();
}
5090
svm_vm_init(struct kvm * kvm)5091 static int svm_vm_init(struct kvm *kvm)
5092 {
5093 int type = kvm->arch.vm_type;
5094
5095 if (type != KVM_X86_DEFAULT_VM &&
5096 type != KVM_X86_SW_PROTECTED_VM) {
5097 kvm->arch.has_protected_state =
5098 (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
5099 to_kvm_sev_info(kvm)->need_init = true;
5100
5101 kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
5102 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
5103 }
5104
5105 if (!pause_filter_count || !pause_filter_thresh)
5106 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
5107
5108 if (enable_apicv) {
5109 int ret = avic_vm_init(kvm);
5110 if (ret)
5111 return ret;
5112 }
5113
5114 svm_srso_vm_init();
5115 return 0;
5116 }
5117
svm_alloc_apic_backing_page(struct kvm_vcpu * vcpu)5118 static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
5119 {
5120 struct page *page = snp_safe_alloc_page();
5121
5122 if (!page)
5123 return NULL;
5124
5125 return page_address(page);
5126 }
5127
/*
 * SVM implementations of the kvm_x86_ops callbacks.  The table is handed to
 * common KVM through svm_init_ops.runtime_ops.  It is __initdata and is
 * patched by svm_hardware_setup(), which NULLs the AVIC blocking/inhibit
 * hooks when APICv is unavailable and the vNMI hooks when vNMI is disabled.
 */
struct kvm_x86_ops svm_x86_ops __initdata = {
	.name = KBUILD_MODNAME,

	.check_processor_compatibility = svm_check_processor_compat,

	/* Hardware enable/disable. */
	.hardware_unsetup = svm_hardware_unsetup,
	.enable_virtualization_cpu = svm_enable_virtualization_cpu,
	.disable_virtualization_cpu = svm_disable_virtualization_cpu,
	.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
	.has_emulated_msr = svm_has_emulated_msr,

	/* vCPU lifecycle. */
	.vcpu_precreate = svm_vcpu_precreate,
	.vcpu_create = svm_vcpu_create,
	.vcpu_free = svm_vcpu_free,
	.vcpu_reset = svm_vcpu_reset,

	/* VM lifecycle. */
	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	/* vCPU load/put and blocking. */
	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = avic_vcpu_blocking,
	.vcpu_unblocking = avic_vcpu_unblocking,

	/* Guest register, segment, MSR and control register accessors. */
	.update_exception_bitmap = svm_update_exception_bitmap,
	.get_feature_msr = svm_get_feature_msr,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cpl_no_cache = svm_get_cpl,
	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
	.is_valid_cr0 = svm_is_valid_cr0,
	.set_cr0 = svm_set_cr0,
	.post_set_cr3 = sev_post_set_cr3,
	.is_valid_cr4 = svm_is_valid_cr4,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,
	.get_if_flag = svm_get_if_flag,

	/* TLB maintenance. */
	.flush_tlb_all = svm_flush_tlb_all,
	.flush_tlb_current = svm_flush_tlb_current,
	.flush_tlb_gva = svm_flush_tlb_gva,
	.flush_tlb_guest = svm_flush_tlb_guest,

	/* Run loop, exit handling and event injection. */
	.vcpu_pre_run = svm_vcpu_pre_run,
	.vcpu_run = svm_vcpu_run,
	.handle_exit = svm_handle_exit,
	.skip_emulated_instruction = svm_skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.inject_irq = svm_inject_irq,
	.inject_nmi = svm_inject_nmi,
	.is_vnmi_pending = svm_is_vnmi_pending,
	.set_vnmi_pending = svm_set_vnmi_pending,
	.inject_exception = svm_inject_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = svm_enable_nmi_window,
	.enable_irq_window = svm_enable_irq_window,
	.update_cr8_intercept = svm_update_cr8_intercept,

	/* APIC virtualization (AVIC). */
	.x2apic_icr_is_split = true,
	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
	.apicv_post_state_restore = avic_apicv_post_state_restore,
	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,

	.get_exit_info = svm_get_exit_info,
	.get_entry_info = svm_get_entry_info,

	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	/* TSC offsetting and scaling. */
	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
	.write_tsc_offset = svm_write_tsc_offset,
	.write_tsc_multiplier = svm_write_tsc_multiplier,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.nested_ops = &svm_nested_ops,

	.deliver_interrupt = svm_deliver_interrupt,
	.pi_update_irte = avic_pi_update_irte,
	.setup_mce = svm_setup_mce,

	/* SMM hooks exist only when KVM is built with SMM support. */
#ifdef CONFIG_KVM_SMM
	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
	.leave_smm = svm_leave_smm,
	.enable_smi_window = svm_enable_smi_window,
#endif

	/* SEV memory-encryption hooks exist only with CONFIG_KVM_AMD_SEV. */
#ifdef CONFIG_KVM_AMD_SEV
	.dev_get_attr = sev_dev_get_attr,
	.mem_enc_ioctl = sev_mem_enc_ioctl,
	.mem_enc_register_region = sev_mem_enc_register_region,
	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
	.guest_memory_reclaimed = sev_guest_memory_reclaimed,

	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
#endif
	.check_emulate_instruction = svm_check_emulate_instruction,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,

	.recalc_intercepts = svm_recalc_intercepts,
	.complete_emulated_msr = svm_complete_emulated_msr,

	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
	.alloc_apic_backing_page = svm_alloc_apic_backing_page,

	/* guest_memfd hooks, implemented by the SEV code. */
	.gmem_prepare = sev_gmem_prepare,
	.gmem_invalidate = sev_gmem_invalidate,
	.gmem_max_mapping_level = sev_gmem_max_mapping_level,
};
5269
5270 /*
5271 * The default MMIO mask is a single bit (excluding the present bit),
5272 * which could conflict with the memory encryption bit. Check for
5273 * memory encryption support and override the default MMIO mask if
5274 * memory encryption is enabled.
5275 */
svm_adjust_mmio_mask(void)5276 static __init void svm_adjust_mmio_mask(void)
5277 {
5278 unsigned int enc_bit, mask_bit;
5279 u64 msr, mask;
5280
5281 /* If there is no memory encryption support, use existing mask */
5282 if (cpuid_eax(0x80000000) < 0x8000001f)
5283 return;
5284
5285 /* If memory encryption is not enabled, use existing mask */
5286 rdmsrq(MSR_AMD64_SYSCFG, msr);
5287 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
5288 return;
5289
5290 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
5291 mask_bit = boot_cpu_data.x86_phys_bits;
5292
5293 /* Increment the mask bit if it is the same as the encryption bit */
5294 if (enc_bit == mask_bit)
5295 mask_bit++;
5296
5297 /*
5298 * If the mask bit location is below 52, then some bits above the
5299 * physical addressing limit will always be reserved, so use the
5300 * rsvd_bits() function to generate the mask. This mask, along with
5301 * the present bit, will be used to generate a page fault with
5302 * PFER.RSV = 1.
5303 *
5304 * If the mask bit location is 52 (or above), then clear the mask.
5305 */
5306 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
5307
5308 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
5309 }
5310
/*
 * Build KVM's CPU capability set for SVM.  All adjustments are made between
 * kvm_initialize_cpu_caps() and kvm_finalize_cpu_caps(); the nested-only
 * features are advertised only when 'nested' is set and the corresponding
 * knob or host CPU feature is present.
 */
static __init void svm_set_cpu_caps(void)
{
	kvm_initialize_cpu_caps();

	kvm_caps.supported_perf_cap = 0;

	kvm_cpu_cap_clear(X86_FEATURE_IBT);

	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
		kvm_cpu_cap_set(X86_FEATURE_SVM);
		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);

		/*
		 * KVM currently flushes TLBs on *every* nested SVM transition,
		 * and so for all intents and purposes KVM supports flushing by
		 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
		 */
		kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);

		if (nrips)
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);

		if (tsc_scaling)
			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);

		if (vls)
			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
		if (lbrv)
			kvm_cpu_cap_set(X86_FEATURE_LBRV);

		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);

		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

		if (vgif)
			kvm_cpu_cap_set(X86_FEATURE_VGIF);

		if (vnmi)
			kvm_cpu_cap_set(X86_FEATURE_VNMI);

		/* Nested VM can receive #VMEXIT instead of triggering #GP */
		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
	}

	if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
		kvm_caps.has_bus_lock_exit = true;

	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	if (enable_pmu) {
		/*
		 * Enumerate support for PERFCTR_CORE if and only if KVM has
		 * access to enough counters to virtualize "core" support,
		 * otherwise limit vPMU support to the legacy number of counters.
		 */
		if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
			kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
							  kvm_pmu_cap.num_counters_gp);
		else
			kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);

		/* PERFMON_V2 is kept only for PMU version 2 with PERFCTR_CORE. */
		if (kvm_pmu_cap.version != 2 ||
		    !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
			kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
	}

	/* CPUID 0x8000001F (SME/SEV features) */
	sev_set_cpu_caps();

	/*
	 * Clear capabilities that are automatically configured by common code,
	 * but that require explicit SVM support (that isn't yet implemented).
	 */
	kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
	kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM);

	kvm_setup_xss_caps();
	kvm_finalize_cpu_caps();
}
5399
/*
 * One-time SVM hardware setup, installed as svm_init_ops.hardware_setup.
 *
 * Reconciles the feature knobs (tsc_scaling, nested, npt_enabled, nrips,
 * lbrv, vls, vgif, vnmi, pause filtering) with what the boot CPU reports,
 * configures the MMU, allocates the I/O permission map, runs SEV, Hyper-V and
 * AVIC setup, computes the CPU capabilities and finally initializes per-CPU
 * state for every possible CPU.
 *
 * Returns 0 on success, -EOPNOTSUPP if the CPU lacks NX, -ENOMEM if the I/O
 * permission map cannot be allocated, or the error returned by
 * nested_svm_init_msrpm_merge_offsets() or svm_cpu_init().  Only a
 * svm_cpu_init() failure unwinds via svm_hardware_unsetup(); the earlier
 * failures return directly.
 */
static __init int svm_hardware_setup(void)
{
	void *iopm_va;
	int cpu, r;

	/*
	 * NX is required for shadow paging and for NPT if the NX huge pages
	 * mitigation is enabled.
	 */
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}
	kvm_enable_efer_bits(EFER_NX);

	/* The BNDREGS/BNDCSR XCR0 features are never exposed on SVM. */
	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
				     XFEATURE_MASK_BNDCSR);

	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

	if (tsc_scaling) {
		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
			tsc_scaling = false;
		} else {
			pr_info("TSC scaling supported\n");
			kvm_caps.has_tsc_control = true;
		}
	}
	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
	kvm_caps.tsc_scaling_ratio_frac_bits = 32;

	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

	if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
		kvm_enable_efer_bits(EFER_AUTOIBRS);

	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

	if (nested) {
		pr_info("Nested Virtualization enabled\n");
		kvm_enable_efer_bits(EFER_SVME);
		if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
			kvm_enable_efer_bits(EFER_LMSLE);

		r = nested_svm_init_msrpm_merge_offsets();
		if (r)
			return r;
	}

	/*
	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
	 * NPT isn't supported if the host is using 2-level paging since host
	 * CR4 is unchanged on VMRUN.
	 */
	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
		npt_enabled = false;

	if (!boot_cpu_has(X86_FEATURE_NPT))
		npt_enabled = false;

	/* Force VM NPT level equal to the host's paging level */
	kvm_configure_mmu(npt_enabled, get_npt_level(),
			  get_npt_level(), PG_LEVEL_1G);
	pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));

	/*
	 * It seems that on AMD processors PTE's accessed bit is
	 * being set by the CPU hardware before the NPF vmexit.
	 * This is not expected behaviour and our tests fail because
	 * of it.
	 * A workaround here is to disable support for
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
	 * In this case userspace can know if there is support using
	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
	 * it
	 * If future AMD CPU models change the behaviour described above,
	 * this variable can be changed accordingly
	 */
	allow_smaller_maxphyaddr = !npt_enabled;

	/* Setup shadow_me_value and shadow_me_mask */
	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);

	svm_adjust_mmio_mask();

	nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);

	if (lbrv) {
		if (!boot_cpu_has(X86_FEATURE_LBRV))
			lbrv = false;
		else
			pr_info("LBR virtualization supported\n");
	}

	iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
	if (!iopm_va)
		return -ENOMEM;

	iopm_base = __sme_set(__pa(iopm_va));

	/*
	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
	 * may be modified by svm_adjust_mmio_mask()), as well as nrips.
	 */
	sev_hardware_setup();

	svm_hv_hardware_setup();

	/* Without APICv, drop the AVIC-only hooks from the ops table. */
	enable_apicv = avic_hardware_setup();
	if (!enable_apicv) {
		enable_ipiv = false;
		svm_x86_ops.vcpu_blocking = NULL;
		svm_x86_ops.vcpu_unblocking = NULL;
		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
	}

	/* Virtual VMLOAD/VMSAVE needs NPT, CPU support and a 64-bit build. */
	if (vls) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
		svm_gp_erratum_intercept = false;

	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

	/* vNMI additionally requires vGIF. */
	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
	if (vnmi)
		pr_info("Virtual NMI enabled\n");

	if (!vnmi) {
		svm_x86_ops.is_vnmi_pending = NULL;
		svm_x86_ops.set_vnmi_pending = NULL;
	}

	if (!enable_pmu)
		pr_info("PMU virtualization is disabled\n");

	/* Must run after the feature knobs above have been finalized. */
	svm_set_cpu_caps();

	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;

	for_each_possible_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err;
	}

	return 0;

err:
	svm_hardware_unsetup();
	return r;
}
5571
5572
/*
 * Init-time ops passed to kvm_x86_vendor_init() by svm_init(): the hardware
 * setup routine plus the runtime and PMU ops tables.
 */
static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.hardware_setup = svm_hardware_setup,

	.runtime_ops = &svm_x86_ops,
	.pmu_ops = &amd_pmu_ops,
};
5579
/*
 * Undo kvm_x86_vendor_init().  Shared by the svm_init() error path and
 * svm_exit(), hence not marked __exit.
 */
static void __svm_exit(void)
{
	kvm_x86_vendor_exit();
}
5584
svm_init(void)5585 static int __init svm_init(void)
5586 {
5587 int r;
5588
5589 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
5590
5591 __unused_size_checks();
5592
5593 if (!kvm_is_svm_supported())
5594 return -EOPNOTSUPP;
5595
5596 r = kvm_x86_vendor_init(&svm_init_ops);
5597 if (r)
5598 return r;
5599
5600 /*
5601 * Common KVM initialization _must_ come last, after this, /dev/kvm is
5602 * exposed to userspace!
5603 */
5604 r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5605 THIS_MODULE);
5606 if (r)
5607 goto err_kvm_init;
5608
5609 return 0;
5610
5611 err_kvm_init:
5612 __svm_exit();
5613 return r;
5614 }
5615
/*
 * Module exit point: tears down in the reverse order of svm_init(), i.e.
 * kvm_exit() first, then the vendor registration via __svm_exit().
 */
static void __exit svm_exit(void)
{
	kvm_exit();
	__svm_exit();
}
5621
/* Register svm_init()/svm_exit() as the module's load and unload handlers. */
module_init(svm_init)
module_exit(svm_exit)
5624