1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3 #include <linux/kvm_host.h>
4
5 #include "irq.h"
6 #include "mmu.h"
7 #include "kvm_cache_regs.h"
8 #include "x86.h"
9 #include "smm.h"
10 #include "cpuid.h"
11 #include "pmu.h"
12
13 #include <linux/module.h>
14 #include <linux/mod_devicetable.h>
15 #include <linux/kernel.h>
16 #include <linux/vmalloc.h>
17 #include <linux/highmem.h>
18 #include <linux/amd-iommu.h>
19 #include <linux/sched.h>
20 #include <linux/trace_events.h>
21 #include <linux/slab.h>
22 #include <linux/hashtable.h>
23 #include <linux/objtool.h>
24 #include <linux/psp-sev.h>
25 #include <linux/file.h>
26 #include <linux/pagemap.h>
27 #include <linux/swap.h>
28 #include <linux/rwsem.h>
29 #include <linux/cc_platform.h>
30 #include <linux/smp.h>
31 #include <linux/string_choices.h>
32 #include <linux/mutex.h>
33
34 #include <asm/apic.h>
35 #include <asm/msr.h>
36 #include <asm/perf_event.h>
37 #include <asm/tlbflush.h>
38 #include <asm/desc.h>
39 #include <asm/debugreg.h>
40 #include <asm/kvm_para.h>
41 #include <asm/irq_remapping.h>
42 #include <asm/spec-ctrl.h>
43 #include <asm/cpu_device_id.h>
44 #include <asm/traps.h>
45 #include <asm/reboot.h>
46 #include <asm/fpu/api.h>
47
48 #include <trace/events/ipi.h>
49
50 #include "trace.h"
51
52 #include "svm.h"
53 #include "svm_ops.h"
54
55 #include "kvm_onhyperv.h"
56 #include "svm_onhyperv.h"
57
58 MODULE_AUTHOR("Qumranet");
59 MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
60 MODULE_LICENSE("GPL");
61
62 #ifdef MODULE
63 static const struct x86_cpu_id svm_cpu_id[] = {
64 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
65 {}
66 };
67 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
68 #endif
69
70 #define SEG_TYPE_LDT 2
71 #define SEG_TYPE_BUSY_TSS16 3
72
73 static bool erratum_383_found __read_mostly;
74
75 /*
76 * Set osvw_len to higher value when updated Revision Guides
77 * are published and we know what the new status bits are
78 */
79 static uint64_t osvw_len = 4, osvw_status;
80
81 static DEFINE_PER_CPU(u64, current_tsc_ratio);
82
83 /*
84 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
85 * pause_filter_count: On processors that support Pause filtering(indicated
86 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
87 * count value. On VMRUN this value is loaded into an internal counter.
88 * Each time a pause instruction is executed, this counter is decremented
89 * until it reaches zero at which time a #VMEXIT is generated if pause
90 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
91 * Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
93 *
94 * pause_filter_thresh: In addition, some processor families support advanced
95 * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
96 * the amount of time a guest is allowed to execute in a pause loop.
97 * In this mode, a 16-bit pause filter threshold field is added in the
98 * VMCB. The threshold value is a cycle count that is used to reset the
99 * pause counter. As with simple pause filtering, VMRUN loads the pause
100 * count value from VMCB into an internal counter. Then, on each pause
101 * instruction the hardware checks the elapsed number of cycles since
102 * the most recent pause instruction against the pause filter threshold.
103 * If the elapsed cycle count is greater than the pause filter threshold,
104 * then the internal pause count is reloaded from the VMCB and execution
105 * continues. If the elapsed cycle count is less than the pause filter
106 * threshold, then the internal pause count is decremented. If the count
107 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
108 * triggered. If advanced pause filtering is supported and pause filter
109 * threshold field is set to zero, the filter will operate in the simpler,
110 * count only mode.
111 */
112
113 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
114 module_param(pause_filter_thresh, ushort, 0444);
115
116 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
117 module_param(pause_filter_count, ushort, 0444);
118
119 /* Default doubles per-vcpu window every exit. */
120 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
121 module_param(pause_filter_count_grow, ushort, 0444);
122
123 /* Default resets per-vcpu window every exit to pause_filter_count. */
124 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
125 module_param(pause_filter_count_shrink, ushort, 0444);
126
127 /* Default is to compute the maximum so we can never overflow. */
128 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
129 module_param(pause_filter_count_max, ushort, 0444);
130
131 /*
132 * Use nested page tables by default. Note, NPT may get forced off by
133 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
134 */
135 bool npt_enabled = true;
136 module_param_named(npt, npt_enabled, bool, 0444);
137
138 /* allow nested virtualization in KVM/SVM */
139 static int nested = true;
140 module_param(nested, int, 0444);
141
142 /* enable/disable Next RIP Save */
143 int nrips = true;
144 module_param(nrips, int, 0444);
145
146 /* enable/disable Virtual VMLOAD VMSAVE */
147 static int vls = true;
148 module_param(vls, int, 0444);
149
150 /* enable/disable Virtual GIF */
151 int vgif = true;
152 module_param(vgif, int, 0444);
153
154 /* enable/disable LBR virtualization */
155 int lbrv = true;
156 module_param(lbrv, int, 0444);
157
158 static int tsc_scaling = true;
159 module_param(tsc_scaling, int, 0444);
160
161 module_param(enable_device_posted_irqs, bool, 0444);
162
163 bool __read_mostly dump_invalid_vmcb;
164 module_param(dump_invalid_vmcb, bool, 0644);
165
166
167 bool intercept_smi = true;
168 module_param(intercept_smi, bool, 0444);
169
170 bool vnmi = true;
171 module_param(vnmi, bool, 0444);
172
173 module_param(enable_mediated_pmu, bool, 0444);
174
175 static bool svm_gp_erratum_intercept = true;
176
177 static u8 rsm_ins_bytes[] = "\x0f\xaa";
178
179 static unsigned long iopm_base;
180
181 DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
182
183 static DEFINE_MUTEX(vmcb_dump_mutex);
184
185 /*
186 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
187 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
188 *
189 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
190 * defer the restoration of TSC_AUX until the CPU returns to userspace.
191 */
192 int tsc_aux_uret_slot __ro_after_init = -1;
193
get_npt_level(void)194 static int get_npt_level(void)
195 {
196 #ifdef CONFIG_X86_64
197 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
198 #else
199 return PT32E_ROOT_LEVEL;
200 #endif
201 }
202
/*
 * Update the guest's EFER, handling the side effects of toggling EFER.SVME
 * (nested virtualization): nested state allocation/free and the #GP erratum
 * intercept.  Returns 0 on success, or a negative errno if allocating nested
 * state fails, in which case the old EFER value is restored.
 */
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	/* Only SVME transitions need the nested-state work below. */
	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(vcpu);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				/* Roll back so callers see an unchanged EFER. */
				vcpu->arch.efer = old_efer;
				return ret;
			}

			/*
			 * Never intercept #GP for SEV guests, KVM can't
			 * decrypt guest memory to workaround the erratum.
			 */
			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	/* The VMCB copy always has SVME set (required to run the guest). */
	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}
253
svm_get_interrupt_shadow(struct kvm_vcpu * vcpu)254 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
255 {
256 struct vcpu_svm *svm = to_svm(vcpu);
257 u32 ret = 0;
258
259 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
260 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
261 return ret;
262 }
263
svm_set_interrupt_shadow(struct kvm_vcpu * vcpu,int mask)264 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
265 {
266 struct vcpu_svm *svm = to_svm(vcpu);
267
268 if (mask == 0)
269 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
270 else
271 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
272
273 }
274
/*
 * Advance guest RIP past the current instruction, using the hardware-provided
 * NextRIP when available and falling back to instruction emulation otherwise.
 * Returns 1 on success, 0 if emulation failed.  When @commit_side_effects is
 * false, architecturally visible side effects of the skip (RFLAGS changes,
 * the interrupt shadow) are suppressed.
 */
static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
					   int emul_type,
					   bool commit_side_effects)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long old_rflags;

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		/* No NextRIP; emulate the skip, preserving RFLAGS if asked. */
		if (unlikely(!commit_side_effects))
			old_rflags = svm->vmcb->save.rflags;

		if (!kvm_emulate_instruction(vcpu, emul_type))
			return 0;

		if (unlikely(!commit_side_effects))
			svm->vmcb->save.rflags = old_rflags;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	/* A completed instruction always clears the interrupt shadow. */
	if (likely(commit_side_effects))
		svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}
313
/* Skip the current instruction, committing all side effects of the skip. */
static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true);
}
318
/*
 * Determine and record the RIP of the instruction following a soft
 * interrupt/exception (INT3/INTO/INT n) so the event can be injected with a
 * correct NextRIP and correctly re-injected if a VM-Exit interrupts delivery.
 * Returns 0 on success, -EIO if the instruction skip fails.
 */
static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector)
{
	const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT |
			      EMULTYPE_SET_SOFT_INT_VECTOR(vector);
	unsigned long rip, old_rip = kvm_rip_read(vcpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Due to architectural shortcomings, the CPU doesn't always provide
	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
	 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
	 * the instruction even if NextRIP is supported to acquire the next
	 * RIP so that it can be shoved into the NextRIP field, otherwise
	 * hardware will fail to advance guest RIP during event injection.
	 * Drop the exception/interrupt if emulation fails and effectively
	 * retry the instruction, it's the least awful option. If NRIPS is
	 * in use, the skip must not commit any side effects such as clearing
	 * the interrupt shadow or RFLAGS.RF.
	 */
	if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips))
		return -EIO;

	rip = kvm_rip_read(vcpu);

	/*
	 * Save the injection information, even when using next_rip, as the
	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
	 * doesn't complete due to a VM-Exit occurring while the CPU is
	 * vectoring the event. Decoding the instruction isn't guaranteed to
	 * work as there may be no backing instruction, e.g. if the event is
	 * being injected by L1 for L2, or if the guest is patching INT3 into
	 * a different instruction.
	 */
	svm->soft_int_injected = true;
	svm->soft_int_csbase = svm->vmcb->save.cs.base;
	svm->soft_int_old_rip = old_rip;
	svm->soft_int_next_rip = rip;

	/* With NRIPS, undo the temporary skip; hardware advances RIP itself. */
	if (nrips)
		kvm_rip_write(vcpu, old_rip);

	if (static_cpu_has(X86_FEATURE_NRIPS))
		svm->vmcb->control.next_rip = rip;

	return 0;
}
365
svm_inject_exception(struct kvm_vcpu * vcpu)366 static void svm_inject_exception(struct kvm_vcpu *vcpu)
367 {
368 struct kvm_queued_exception *ex = &vcpu->arch.exception;
369 struct vcpu_svm *svm = to_svm(vcpu);
370
371 kvm_deliver_exception_payload(vcpu, ex);
372
373 if (kvm_exception_is_soft(ex->vector) &&
374 svm_update_soft_interrupt_rip(vcpu, ex->vector))
375 return;
376
377 svm->vmcb->control.event_inj = ex->vector
378 | SVM_EVTINJ_VALID
379 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
380 | SVM_EVTINJ_TYPE_EXEPT;
381 svm->vmcb->control.event_inj_err = ex->error_code;
382 }
383
svm_init_erratum_383(void)384 static void svm_init_erratum_383(void)
385 {
386 u64 val;
387
388 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
389 return;
390
391 /* Use _safe variants to not break nested virtualization */
392 if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val))
393 return;
394
395 val |= (1ULL << 47);
396
397 native_write_msr_safe(MSR_AMD64_DC_CFG, val);
398
399 erratum_383_found = true;
400 }
401
svm_init_osvw(struct kvm_vcpu * vcpu)402 static void svm_init_osvw(struct kvm_vcpu *vcpu)
403 {
404 /*
405 * Guests should see errata 400 and 415 as fixed (assuming that
406 * HLT and IO instructions are intercepted).
407 */
408 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
409 vcpu->arch.osvw.status = osvw_status & ~(6ULL);
410
411 /*
412 * By increasing VCPU's osvw.length to 3 we are telling the guest that
413 * all osvw.status bits inside that length, including bit 0 (which is
414 * reserved for erratum 298), are valid. However, if host processor's
415 * osvw_len is 0 then osvw_status[0] carries no information. We need to
416 * be conservative here and therefore we tell the guest that erratum 298
417 * is present (because we really don't know).
418 */
419 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
420 vcpu->arch.osvw.status |= 1;
421 }
422
__kvm_is_svm_supported(void)423 static bool __kvm_is_svm_supported(void)
424 {
425 int cpu = smp_processor_id();
426 struct cpuinfo_x86 *c = &cpu_data(cpu);
427
428 if (c->x86_vendor != X86_VENDOR_AMD &&
429 c->x86_vendor != X86_VENDOR_HYGON) {
430 pr_err("CPU %d isn't AMD or Hygon\n", cpu);
431 return false;
432 }
433
434 if (!cpu_has(c, X86_FEATURE_SVM)) {
435 pr_err("SVM not supported by CPU %d\n", cpu);
436 return false;
437 }
438
439 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
440 pr_info("KVM is unsupported when running as an SEV guest\n");
441 return false;
442 }
443
444 return true;
445 }
446
kvm_is_svm_supported(void)447 static bool kvm_is_svm_supported(void)
448 {
449 bool supported;
450
451 migrate_disable();
452 supported = __kvm_is_svm_supported();
453 migrate_enable();
454
455 return supported;
456 }
457
svm_check_processor_compat(void)458 static int svm_check_processor_compat(void)
459 {
460 if (!__kvm_is_svm_supported())
461 return -EIO;
462
463 return 0;
464 }
465
/* Program the TSC ratio MSR, skipping the WRMSR if it's already current. */
static void __svm_write_tsc_multiplier(u64 multiplier)
{
	if (__this_cpu_read(current_tsc_ratio) == multiplier)
		return;

	wrmsrq(MSR_AMD64_TSC_RATIO, multiplier);
	__this_cpu_write(current_tsc_ratio, multiplier);
}
474
/* Return this CPU's host save area used for SEV-ES world switches. */
static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
	return &sd->save_area->host_sev_es_save;
}
479
kvm_cpu_svm_disable(void)480 static inline void kvm_cpu_svm_disable(void)
481 {
482 uint64_t efer;
483
484 wrmsrq(MSR_VM_HSAVE_PA, 0);
485 rdmsrq(MSR_EFER, efer);
486 if (efer & EFER_SVME) {
487 /*
488 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
489 * NMI aren't blocked.
490 */
491 stgi();
492 wrmsrq(MSR_EFER, efer & ~EFER_SVME);
493 }
494 }
495
svm_emergency_disable_virtualization_cpu(void)496 static void svm_emergency_disable_virtualization_cpu(void)
497 {
498 kvm_rebooting = true;
499
500 kvm_cpu_svm_disable();
501 }
502
svm_disable_virtualization_cpu(void)503 static void svm_disable_virtualization_cpu(void)
504 {
505 /* Make sure we clean up behind us */
506 if (tsc_scaling)
507 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
508
509 kvm_cpu_svm_disable();
510
511 amd_pmu_disable_virt();
512 }
513
/*
 * Enable SVM on the current CPU: initialize per-CPU ASID bookkeeping, set
 * EFER.SVME, program the host save area, reset the TSC ratio, and harvest
 * OSVW data.  Returns -EBUSY if SVME is already set (another hypervisor?),
 * 0 on success.
 */
static int svm_enable_virtualization_cpu(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	int me = raw_smp_processor_id();

	rdmsrq(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	sd = per_cpu_ptr(&svm_data, me);
	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	wrmsrq(MSR_EFER, efer | EFER_SVME);

	wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa);

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		/*
		 * Set the default value, even if we don't use TSC scaling
		 * to avoid having stale value in the msr
		 */
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
	}


	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		u64 len, status = 0;
		int err;

		/* Use _safe reads; OSVW MSRs may fault under nested virt. */
		err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len);
		if (!err)
			err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}
578
svm_cpu_uninit(int cpu)579 static void svm_cpu_uninit(int cpu)
580 {
581 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
582
583 if (!sd->save_area)
584 return;
585
586 kfree(sd->sev_vmcbs);
587 __free_page(__sme_pa_to_page(sd->save_area_pa));
588 sd->save_area_pa = 0;
589 sd->save_area = NULL;
590 }
591
svm_cpu_init(int cpu)592 static int svm_cpu_init(int cpu)
593 {
594 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
595 struct page *save_area_page;
596 int ret = -ENOMEM;
597
598 memset(sd, 0, sizeof(struct svm_cpu_data));
599 save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
600 if (!save_area_page)
601 return ret;
602
603 ret = sev_cpu_init(sd);
604 if (ret)
605 goto free_save_area;
606
607 sd->save_area = page_address(save_area_page);
608 sd->save_area_pa = __sme_page_pa(save_area_page);
609 return 0;
610
611 free_save_area:
612 __free_page(save_area_page);
613 return ret;
614
615 }
616
set_dr_intercepts(struct vcpu_svm * svm)617 static void set_dr_intercepts(struct vcpu_svm *svm)
618 {
619 struct vmcb *vmcb = svm->vmcb01.ptr;
620
621 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
622 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
623 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
624 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
625 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
626 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
627 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
628 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
629 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
630 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
631 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
632 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
633 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
634 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
635 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
636 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
637
638 recalc_intercepts(svm);
639 }
640
clr_dr_intercepts(struct vcpu_svm * svm)641 static void clr_dr_intercepts(struct vcpu_svm *svm)
642 {
643 struct vmcb *vmcb = svm->vmcb01.ptr;
644
645 vmcb->control.intercepts[INTERCEPT_DR] = 0;
646
647 recalc_intercepts(svm);
648 }
649
msr_write_intercepted(struct kvm_vcpu * vcpu,u32 msr)650 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
651 {
652 /*
653 * For non-nested case:
654 * If the L01 MSR bitmap does not intercept the MSR, then we need to
655 * save it.
656 *
657 * For nested case:
658 * If the L02 MSR bitmap does not intercept the MSR, then we need to
659 * save it.
660 */
661 void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
662 to_svm(vcpu)->msrpm;
663
664 return svm_test_msr_bitmap_write(msrpm, msr);
665 }
666
/*
 * Set (@set == true) or clear MSR interception for the read and/or write
 * halves selected by @type.  Interception is never cleared for MSRs that
 * userspace's MSR filter wants to handle.
 */
void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	void *bitmap = svm->msrpm;

	if (type & MSR_TYPE_R) {
		/* Don't disable interception for MSRs userspace wants to handle. */
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
			svm_set_msr_bitmap_read(bitmap, msr);
		else
			svm_clear_msr_bitmap_read(bitmap, msr);
	}

	if (type & MSR_TYPE_W) {
		if (set || !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
			svm_set_msr_bitmap_write(bitmap, msr);
		else
			svm_clear_msr_bitmap_write(bitmap, msr);
	}

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}
690
/*
 * Allocate a permissions bitmap (MSRPM/IOPM) of at least @size bytes with
 * every bit set, i.e. all MSR and I/O accesses intercepted by default.
 * Returns NULL on allocation failure.
 */
void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
	unsigned int order = get_order(size);
	struct page *pages;
	void *map;

	pages = alloc_pages(gfp_mask, order);
	if (!pages)
		return NULL;

	map = page_address(pages);
	memset(map, 0xff, PAGE_SIZE << order);

	return map;
}
709
svm_recalc_lbr_msr_intercepts(struct kvm_vcpu * vcpu)710 static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
711 {
712 struct vcpu_svm *svm = to_svm(vcpu);
713 bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
714
715 if (intercept == svm->lbr_msrs_intercepted)
716 return;
717
718 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
719 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
720 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
721 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
722
723 if (sev_es_guest(vcpu->kvm))
724 svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
725
726 svm->lbr_msrs_intercepted = intercept;
727 }
728
/* Free an MSR permissions map allocated via svm_alloc_permissions_map(). */
void svm_vcpu_free_msrpm(void *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
733
/*
 * Update PMU counter MSR intercepts for the mediated PMU: counters the guest
 * may own are passed through, everything else stays intercepted.  No-op when
 * the mediated PMU is disabled module-wide.
 */
static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu)
{
	bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	int i;

	if (!enable_mediated_pmu)
		return;

	/* Legacy counters are always available for AMD CPUs with a PMU. */
	for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++)
		svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i,
					  MSR_TYPE_RW, intercept);

	/* Extended counters additionally require PERFCTR_CORE in guest CPUID. */
	intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE);
	for (i = 0; i < pmu->nr_arch_gp_counters; i++)
		svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i,
					  MSR_TYPE_RW, intercept);

	/*
	 * Note, 'i' carries over from the loop above: host counters beyond
	 * what the guest can see are always intercepted.
	 */
	for ( ; i < kvm_pmu_cap.num_counters_gp; i++)
		svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i,
					     MSR_TYPE_RW);

	intercept = kvm_need_perf_global_ctrl_intercept(vcpu);
	svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
				  MSR_TYPE_RW, intercept);
	svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
				  MSR_TYPE_RW, intercept);
	svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
				  MSR_TYPE_RW, intercept);
	svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
				  MSR_TYPE_RW, intercept);
}
767
/*
 * Recompute the vCPU's full set of MSR intercepts based on module params,
 * host CPU features, guest CPUID, and userspace's MSR filter.
 */
static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);

#ifdef CONFIG_X86_64
	svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
#endif

	if (lbrv)
		svm_recalc_lbr_msr_intercepts(vcpu);

	/* Pass PRED_CMD/FLUSH_CMD through only if the guest can use them. */
	if (cpu_feature_enabled(X86_FEATURE_IBPB))
		svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
					  !guest_has_pred_cmd_msr(vcpu));

	if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
		svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

	/*
	 * Disable interception of SPEC_CTRL if KVM doesn't need to manually
	 * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
	 * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
	 * using SPEC_CTRL.
	 */
	if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
		svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
					  !guest_has_spec_ctrl_msr(vcpu));
	else
		svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
					  !svm->spec_ctrl);

	/*
	 * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
	 * as AMD hardware only stores 32 bits, whereas Intel CPUs track 64 bits.
	 */
	svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
				  guest_cpuid_is_intel_compatible(vcpu));
	svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
				  guest_cpuid_is_intel_compatible(vcpu));

	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
		svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
		svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
	}

	/* CET MSRs are passed through iff the guest has shadow stacks. */
	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
		bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);

		svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled);
		svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled);
		svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled);
		svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled);
		svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled);
		svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled);
	}

	if (sev_es_guest(vcpu->kvm))
		sev_es_recalc_msr_intercepts(vcpu);

	svm_recalc_pmu_msr_intercepts(vcpu);

	/*
	 * x2APIC intercepts are modified on-demand and cannot be filtered by
	 * userspace.
	 */
}
843
svm_copy_lbrs(struct vmcb * to_vmcb,struct vmcb * from_vmcb)844 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
845 {
846 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
847 to_vmcb->save.br_from = from_vmcb->save.br_from;
848 to_vmcb->save.br_to = from_vmcb->save.br_to;
849 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
850 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
851
852 vmcb_mark_dirty(to_vmcb, VMCB_LBR);
853 }
854
/* Turn on LBR virtualization in the current VMCB (control bit only). */
static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
}
859
/* Enable LBR virtualization and align the LBR MSR intercepts with it. */
void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	__svm_enable_lbrv(vcpu);
	svm_recalc_lbr_msr_intercepts(vcpu);
}
865
/*
 * Turn off LBR virtualization in the current VMCB.  Must never be reached
 * for SEV-ES guests (their LBR state lives in the encrypted VMSA).
 */
static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
	KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
	to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
}
871
svm_update_lbrv(struct kvm_vcpu * vcpu)872 void svm_update_lbrv(struct kvm_vcpu *vcpu)
873 {
874 struct vcpu_svm *svm = to_svm(vcpu);
875 bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
876 bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
877 (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
878 (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
879
880 if (enable_lbrv && !current_enable_lbrv)
881 __svm_enable_lbrv(vcpu);
882 else if (!enable_lbrv && current_enable_lbrv)
883 __svm_disable_lbrv(vcpu);
884
885 /*
886 * During nested transitions, it is possible that the current VMCB has
887 * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
888 * In this case, even though LBR_CTL does not need an update, intercepts
889 * do, so always recalculate the intercepts here.
890 */
891 svm_recalc_lbr_msr_intercepts(vcpu);
892 }
893
disable_nmi_singlestep(struct vcpu_svm * svm)894 void disable_nmi_singlestep(struct vcpu_svm *svm)
895 {
896 svm->nmi_singlestep = false;
897
898 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
899 /* Clear our flags if they were not set by the guest */
900 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
901 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
902 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
903 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
904 }
905 }
906
grow_ple_window(struct kvm_vcpu * vcpu)907 static void grow_ple_window(struct kvm_vcpu *vcpu)
908 {
909 struct vcpu_svm *svm = to_svm(vcpu);
910 struct vmcb_control_area *control = &svm->vmcb->control;
911 int old = control->pause_filter_count;
912
913 if (kvm_pause_in_guest(vcpu->kvm))
914 return;
915
916 control->pause_filter_count = __grow_ple_window(old,
917 pause_filter_count,
918 pause_filter_count_grow,
919 pause_filter_count_max);
920
921 if (control->pause_filter_count != old) {
922 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
923 trace_kvm_ple_window_update(vcpu->vcpu_id,
924 control->pause_filter_count, old);
925 }
926 }
927
shrink_ple_window(struct kvm_vcpu * vcpu)928 static void shrink_ple_window(struct kvm_vcpu *vcpu)
929 {
930 struct vcpu_svm *svm = to_svm(vcpu);
931 struct vmcb_control_area *control = &svm->vmcb->control;
932 int old = control->pause_filter_count;
933
934 if (kvm_pause_in_guest(vcpu->kvm))
935 return;
936
937 control->pause_filter_count =
938 __shrink_ple_window(old,
939 pause_filter_count,
940 pause_filter_count_shrink,
941 pause_filter_count);
942 if (control->pause_filter_count != old) {
943 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
944 trace_kvm_ple_window_update(vcpu->vcpu_id,
945 control->pause_filter_count, old);
946 }
947 }
948
svm_hardware_unsetup(void)949 static void svm_hardware_unsetup(void)
950 {
951 int cpu;
952
953 avic_hardware_unsetup();
954
955 sev_hardware_unsetup();
956
957 for_each_possible_cpu(cpu)
958 svm_cpu_uninit(cpu);
959
960 __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
961 iopm_base = 0;
962 }
963
init_seg(struct vmcb_seg * seg)964 static void init_seg(struct vmcb_seg *seg)
965 {
966 seg->selector = 0;
967 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
968 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
969 seg->limit = 0xffff;
970 seg->base = 0;
971 }
972
/* Reset a system segment (LDTR/TR) with the given descriptor type. */
static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->base = 0;
	seg->limit = 0xffff;
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
}
980
svm_get_l2_tsc_offset(struct kvm_vcpu * vcpu)981 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
982 {
983 struct vcpu_svm *svm = to_svm(vcpu);
984
985 return svm->nested.ctl.tsc_offset;
986 }
987
svm_get_l2_tsc_multiplier(struct kvm_vcpu * vcpu)988 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
989 {
990 struct vcpu_svm *svm = to_svm(vcpu);
991
992 return svm->tsc_ratio_msr;
993 }
994
/*
 * Propagate KVM's computed TSC offsets into the VMCBs: L1's offset always
 * lands in vmcb01, the current effective offset (which accounts for nesting)
 * lands in the active VMCB.
 */
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
	svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
	/* NOTE(review): tsc_offset is presumably covered by the INTERCEPTS clean bit — confirm against the VMCB clean-bit layout. */
	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
1003
svm_write_tsc_multiplier(struct kvm_vcpu * vcpu)1004 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1005 {
1006 preempt_disable();
1007 if (to_svm(vcpu)->guest_state_loaded)
1008 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1009 preempt_enable();
1010 }
1011
1012 /* Evaluate instruction intercepts that depend on guest CPUID features. */
svm_recalc_instruction_intercepts(struct kvm_vcpu * vcpu)1013 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
1014 {
1015 struct vcpu_svm *svm = to_svm(vcpu);
1016
1017 /*
1018 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1019 * roots, or if INVPCID is disabled in the guest to inject #UD.
1020 */
1021 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1022 if (!npt_enabled ||
1023 !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID))
1024 svm_set_intercept(svm, INTERCEPT_INVPCID);
1025 else
1026 svm_clr_intercept(svm, INTERCEPT_INVPCID);
1027 }
1028
1029 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1030 if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP))
1031 svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1032 else
1033 svm_set_intercept(svm, INTERCEPT_RDTSCP);
1034 }
1035
1036 /*
1037 * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is
1038 * always set if vls is enabled. If the intercepts are set, the bit is
1039 * meaningless anyway.
1040 */
1041 if (guest_cpuid_is_intel_compatible(vcpu)) {
1042 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1043 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1044 } else {
1045 /*
1046 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1047 * in VMCB and clear intercepts to avoid #VMEXIT.
1048 */
1049 if (vls) {
1050 svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1051 svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1052 }
1053 }
1054
1055 if (kvm_need_rdpmc_intercept(vcpu))
1056 svm_set_intercept(svm, INTERCEPT_RDPMC);
1057 else
1058 svm_clr_intercept(svm, INTERCEPT_RDPMC);
1059 }
1060
/* Recompute all dynamic intercepts (instruction and MSR) for @vcpu. */
static void svm_recalc_intercepts(struct kvm_vcpu *vcpu)
{
	svm_recalc_instruction_intercepts(vcpu);
	svm_recalc_msr_intercepts(vcpu);
}
1066
/*
 * (Re)initialize vmcb01 to its post-RESET/INIT state: baseline intercepts,
 * segment caches, control fields, and optional-feature enables (NPT, AVIC,
 * vNMI, vGIF, VLS, SEV).  Called from svm_vcpu_reset() for both RESET and
 * INIT; @init_event distinguishes the two for SEV.
 */
static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct vmcb_control_area *control = &vmcb->control;
	struct vmcb_save_area *save = &vmcb->save;

	svm_set_intercept(svm, INTERCEPT_CR0_READ);
	svm_set_intercept(svm, INTERCEPT_CR3_READ);
	svm_set_intercept(svm, INTERCEPT_CR4_READ);
	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
	/* With AVIC active, hardware virtualizes TPR (CR8) accesses. */
	if (!kvm_vcpu_apicv_active(vcpu))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);

	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);

	if (intercept_smi)
		svm_set_intercept(svm, INTERCEPT_SMI);

	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(vcpu->kvm)) {
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
	}

	if (!kvm_hlt_in_guest(vcpu->kvm)) {
		/* Prefer the combined idle-HLT intercept when available. */
		if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT))
			svm_set_intercept(svm, INTERCEPT_IDLE_HLT);
		else
			svm_set_intercept(svm, INTERCEPT_HLT);
	}

	control->iopm_base_pa = iopm_base;
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	/* Architectural reset state: CS base 0xffff0000, selector 0xf000. */
	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.base = 0;
	save->gdtr.limit = 0xffff;
	save->idtr.base = 0;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		/* With NPT, #PF and CR3/INVLPG need no interception. */
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = vcpu->arch.pat;
		save->cr3 = 0;
	}
	/* Generation 0 forces a new ASID before the first VMRUN. */
	svm->current_vmcb->asid_generation = 0;
	svm->asid = 0;

	svm->nested.vmcb12_gpa = INVALID_GPA;
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	if (!kvm_pause_in_guest(vcpu->kvm)) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		svm_set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
	}

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;

	if (kvm_vcpu_apicv_active(vcpu))
		avic_init_vmcb(svm, vmcb);

	if (vnmi)
		svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;

	if (vgif) {
		/* With vGIF, STGI/CLGI run without exits; GIF lives in int_ctl. */
		svm_set_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (vls)
		svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

	if (vcpu->kvm->arch.bus_lock_detection_enabled)
		svm_set_intercept(svm, INTERCEPT_BUSLOCK);

	if (sev_guest(vcpu->kvm))
		sev_init_vmcb(svm, init_event);

	svm_hv_init_vmcb(vmcb);

	/* Feature-dependent intercepts are recomputed before the next run. */
	kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu);

	vmcb_mark_all_dirty(vmcb);

	enable_gif(svm);
}
1221
/*
 * One-time vCPU initialization performed only on RESET (not INIT): OSVW
 * state, the reported microcode revision, and the default TSC ratio MSR.
 */
static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_init_osvw(vcpu);

	/* Report a non-zero microcode revision unless the quirk is disabled. */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
		vcpu->arch.microcode_version = 0x01000065;
	svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;

	svm->nmi_masked = false;
	svm->awaiting_iret_completion = false;
}
1235
/* Emulate vCPU RESET/INIT: clear speculation-control shadows, rebuild the VMCB. */
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->spec_ctrl = 0;
	svm->virt_spec_ctrl = 0;

	init_vmcb(vcpu, init_event);

	/* INIT preserves state that only a full RESET reinitializes. */
	if (!init_event)
		__svm_vcpu_reset(vcpu);
}
1248
svm_switch_vmcb(struct vcpu_svm * svm,struct kvm_vmcb_info * target_vmcb)1249 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1250 {
1251 svm->current_vmcb = target_vmcb;
1252 svm->vmcb = target_vmcb->ptr;
1253 }
1254
/* Allocate the per-VM AVIC physical ID table before the first vCPU exists. */
static int svm_vcpu_precreate(struct kvm *kvm)
{
	return avic_alloc_physical_id_table(kvm);
}
1259
/*
 * Allocate per-vCPU SVM resources: vmcb01, SEV state, AVIC backing, and the
 * MSR permission bitmap.  On failure, unwinds in reverse allocation order
 * via the goto chain.  Returns 0 on success, -errno on failure.
 */
static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;
	struct page *vmcb01_page;
	int err;

	/* to_svm() relies on vcpu being the first member of vcpu_svm. */
	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	err = -ENOMEM;
	/* SNP-aware allocation: the VMCB page must be usable under SEV-SNP. */
	vmcb01_page = snp_safe_alloc_page();
	if (!vmcb01_page)
		goto out;

	err = sev_vcpu_create(vcpu);
	if (err)
		goto error_free_vmcb_page;

	err = avic_init_vcpu(svm);
	if (err)
		goto error_free_sev;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		err = -ENOMEM;
		goto error_free_sev;
	}

	/* Start with all dynamic MSR intercepts armed; relaxed on recalc. */
	svm->x2avic_msrs_intercepted = true;
	svm->lbr_msrs_intercepted = true;

	svm->vmcb01.ptr = page_address(vmcb01_page);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	svm->guest_state_loaded = false;

	return 0;

error_free_sev:
	sev_free_vcpu(vcpu);
error_free_vmcb_page:
	__free_page(vmcb01_page);
out:
	return err;
}
1306
/*
 * Free all per-vCPU SVM resources.  Nested state is force-exited and freed
 * first, then SEV state, then vmcb01 and the MSR permission bitmap.
 */
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* All IRTE routing entries should have been torn down by now. */
	WARN_ON_ONCE(!list_empty(&svm->ir_list));

	svm_leave_nested(vcpu);
	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	/* vmcb01.pa carries the SME C-bit; strip it to recover the page. */
	__free_page(__sme_pa_to_page(svm->vmcb01.pa));
	svm_vcpu_free_msrpm(svm->msrpm);
}
1321
1322 #ifdef CONFIG_CPU_MITIGATIONS
1323 static DEFINE_SPINLOCK(srso_lock);
1324 static atomic_t srso_nr_vms;
1325
/*
 * IPI callback: clear BP_SPEC_REDUCE on this CPU if this CPU previously set
 * it for the SRSO mitigation.  Runs with the last VM's destruction.
 */
static void svm_srso_clear_bp_spec_reduce(void *ign)
{
	struct svm_cpu_data *sd = this_cpu_ptr(&svm_data);

	if (!sd->bp_spec_reduce_set)
		return;

	msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
	sd->bp_spec_reduce_set = false;
}
1336
/*
 * On VM destruction, drop the SRSO VM refcount; when it reaches zero, clear
 * BP_SPEC_REDUCE on all CPUs.  The lock serializes the 1=>0 transition
 * against a concurrent 0=>1 in svm_srso_vm_init().
 */
static void svm_srso_vm_destroy(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
		return;

	/* Non-zero after decrement => other VMs still need the mitigation. */
	if (atomic_dec_return(&srso_nr_vms))
		return;

	guard(spinlock)(&srso_lock);

	/*
	 * Verify a new VM didn't come along, acquire the lock, and increment
	 * the count before this task acquired the lock.
	 */
	if (atomic_read(&srso_nr_vms))
		return;

	on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1);
}
1356
/*
 * On VM creation, take a reference on the SRSO mitigation.  The fast path
 * only increments when the count is already non-zero; the 0=>1 transition
 * goes through the lock to serialize against a racing teardown.
 */
static void svm_srso_vm_init(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE))
		return;

	/*
	 * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0
	 * transition, i.e. destroying the last VM, is fully complete, e.g. so
	 * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs.
	 */
	if (atomic_inc_not_zero(&srso_nr_vms))
		return;

	guard(spinlock)(&srso_lock);

	atomic_inc(&srso_nr_vms);
}
1374 #else
/* SRSO BP_SPEC_REDUCE tracking is a CPU mitigation; no-ops when mitigations are compiled out. */
static void svm_srso_vm_init(void) { }
static void svm_srso_vm_destroy(void) { }
1377 #endif
1378
/*
 * Load guest state that is context-switched lazily, just before entering the
 * guest.  Idempotent per load: does nothing if guest state is already loaded.
 */
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);

	/* The GHCB mapping must be dropped before (re)entering an SEV-ES guest. */
	if (sev_es_guest(vcpu->kvm))
		sev_es_unmap_ghcb(svm);

	if (svm->guest_state_loaded)
		return;

	/*
	 * Save additional host state that will be restored on VMEXIT (sev-es)
	 * or subsequent vmload of host save area.
	 */
	vmsave(sd->save_area_pa);
	if (sev_es_guest(vcpu->kvm))
		sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));

	if (tsc_scaling)
		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);

	/*
	 * TSC_AUX is always virtualized (context switched by hardware) for
	 * SEV-ES guests when the feature is available.  For non-SEV-ES guests,
	 * context switch TSC_AUX via the user_return MSR infrastructure (not
	 * all CPUs support TSC_AUX virtualization).
	 */
	if (likely(tsc_aux_uret_slot >= 0) &&
	    (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

	/* SRSO mitigation: set BP_SPEC_REDUCE lazily, once per CPU. */
	if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) &&
	    !sd->bp_spec_reduce_set) {
		sd->bp_spec_reduce_set = true;
		msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT);
	}
	svm->guest_state_loaded = true;
}
1418
svm_prepare_host_switch(struct kvm_vcpu * vcpu)1419 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1420 {
1421 to_svm(vcpu)->guest_state_loaded = false;
1422 }
1423
svm_vcpu_load(struct kvm_vcpu * vcpu,int cpu)1424 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1425 {
1426 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
1427 shrink_ple_window(vcpu);
1428
1429 if (kvm_vcpu_apicv_active(vcpu))
1430 avic_vcpu_load(vcpu, cpu);
1431 }
1432
/* vCPU is being scheduled out: quiesce AVIC and mark host state for reload. */
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_put(vcpu);

	svm_prepare_host_switch(vcpu);

	++vcpu->stat.host_state_reload;
}
1442
svm_get_rflags(struct kvm_vcpu * vcpu)1443 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1444 {
1445 struct vcpu_svm *svm = to_svm(vcpu);
1446 unsigned long rflags = svm->vmcb->save.rflags;
1447
1448 if (svm->nmi_singlestep) {
1449 /* Hide our flags if they were not set by the guest */
1450 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1451 rflags &= ~X86_EFLAGS_TF;
1452 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1453 rflags &= ~X86_EFLAGS_RF;
1454 }
1455 return rflags;
1456 }
1457
svm_set_rflags(struct kvm_vcpu * vcpu,unsigned long rflags)1458 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1459 {
1460 if (to_svm(vcpu)->nmi_singlestep)
1461 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1462
1463 /*
1464 * Any change of EFLAGS.VM is accompanied by a reload of SS
1465 * (caused by either a task switch or an inter-privilege IRET),
1466 * so we do not need to update the CPL here.
1467 */
1468 to_svm(vcpu)->vmcb->save.rflags = rflags;
1469 }
1470
svm_get_if_flag(struct kvm_vcpu * vcpu)1471 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1472 {
1473 struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1474
1475 return sev_es_guest(vcpu->kvm)
1476 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1477 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1478 }
1479
/*
 * Demand-load a lazily cached register.  On SVM only the PDPTRs are cached
 * this way; any other register is a KVM bug.
 */
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_EXREG_PDPTR:
		/*
		 * When !npt_enabled, mmu->pdptrs[] is already available since
		 * it is always updated per SDM when moving to CRs.
		 */
		if (npt_enabled)
			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
		break;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
	}
}
1497
/*
 * Arm a dummy virtual interrupt so that the next interrupt window causes a
 * VINTR exit.  Real interrupt injection happens via EVENTINJ; this only
 * exists to get a VM-Exit when the guest becomes interruptible.
 */
static void svm_set_vintr(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control;

	/*
	 * The following fields are ignored when AVIC is enabled
	 */
	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));

	svm_set_intercept(svm, INTERCEPT_VINTR);

	/*
	 * Recalculating intercepts may have cleared the VINTR intercept.  If
	 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
	 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
	 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
	 * interrupts will never be unblocked while L2 is running.
	 */
	if (!svm_is_intercept(svm, INTERCEPT_VINTR))
		return;

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	control = &svm->vmcb->control;
	control->int_vector = 0x0;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	/* Highest priority so the virtual interrupt is never masked by TPR. */
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
1530
/*
 * Tear down the dummy VINTR request.  When L2 is active, restore the V_IRQ
 * injection bits L1 asked for in vmcb12 so nested virtual interrupts keep
 * working.
 */
static void svm_clear_vintr(struct vcpu_svm *svm)
{
	svm_clr_intercept(svm, INTERCEPT_VINTR);

	/* Drop int_ctl fields related to VINTR injection. */
	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
	if (is_guest_mode(&svm->vcpu)) {
		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

		/* The TPR shadow must stay in sync with L1's view. */
		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
			(svm->nested.ctl.int_ctl & V_TPR_MASK));

		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
			V_IRQ_INJECTION_BITS_MASK;

		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
	}

	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
1551
/*
 * Return the VMCB segment cache for @seg.  FS, GS, TR and LDTR are read from
 * vmcb01 (NOTE(review): presumably because those registers are swapped via
 * VMLOAD/VMSAVE against the vmcb01 save area — confirm); the others come
 * from the currently active VMCB.
 */
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save01->fs;
	case VCPU_SREG_GS: return &save01->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save01->tr;
	case VCPU_SREG_LDTR: return &save01->ldtr;
	}
	BUG();
	return NULL;
}
1570
svm_get_segment_base(struct kvm_vcpu * vcpu,int seg)1571 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1572 {
1573 struct vmcb_seg *s = svm_seg(vcpu, seg);
1574
1575 return s->base;
1576 }
1577
/*
 * Translate a VMCB segment cache entry into KVM's generic kvm_segment,
 * applying several fixups needed for cross-vendor migration and nested
 * virtualization (see inline comments).
 */
static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	/* Unpack the packed attrib field into individual descriptor bits. */
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;

	/*
	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
	 * However, the SVM spec states that the G bit is not observed by the
	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
	 * So let's synthesize a legal G bit for all segments, this helps
	 * running KVM nested.  It also helps cross-vendor migration, because
	 * Intel's vmentry has a check on the 'G' bit.
	 */
	var->g = s->limit > 0xfffff;

	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
	 * for cross vendor migration purposes by "not present"
	 */
	var->unusable = !var->present;

	switch (seg) {
	case VCPU_SREG_TR:
		/*
		 * Work around a bug where the busy flag in the tr selector
		 * isn't exposed
		 */
		var->type |= 0x2;
		break;
	case VCPU_SREG_DS:
	case VCPU_SREG_ES:
	case VCPU_SREG_FS:
	case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache, although it can be cleared in the
		 * descriptor, the cached bit always remains at 1.  Since
		 * Intel has a check on this, set it here to support
		 * cross-vendor migration.
		 */
		if (!var->unusable)
			var->type |= 0x1;
		break;
	case VCPU_SREG_SS:
		/*
		 * On AMD CPUs sometimes the DB bit in the segment
		 * descriptor is left as 1, although the whole segment has
		 * been made unusable.  Clear it here to pass an Intel VMX
		 * entry check when cross vendor migrating.
		 */
		if (var->unusable)
			var->db = 0;
		/* This is symmetric with svm_set_segment() */
		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
		break;
	}
}
1646
svm_get_cpl(struct kvm_vcpu * vcpu)1647 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1648 {
1649 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1650
1651 return save->cpl;
1652 }
1653
svm_get_cs_db_l_bits(struct kvm_vcpu * vcpu,int * db,int * l)1654 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1655 {
1656 struct kvm_segment cs;
1657
1658 svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1659 *db = cs.db;
1660 *l = cs.l;
1661 }
1662
svm_get_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1663 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1664 {
1665 struct vcpu_svm *svm = to_svm(vcpu);
1666
1667 dt->size = svm->vmcb->save.idtr.limit;
1668 dt->address = svm->vmcb->save.idtr.base;
1669 }
1670
svm_set_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1671 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1672 {
1673 struct vcpu_svm *svm = to_svm(vcpu);
1674
1675 svm->vmcb->save.idtr.limit = dt->size;
1676 svm->vmcb->save.idtr.base = dt->address ;
1677 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1678 }
1679
svm_get_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1680 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1681 {
1682 struct vcpu_svm *svm = to_svm(vcpu);
1683
1684 dt->size = svm->vmcb->save.gdtr.limit;
1685 dt->address = svm->vmcb->save.gdtr.base;
1686 }
1687
svm_set_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1688 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1689 {
1690 struct vcpu_svm *svm = to_svm(vcpu);
1691
1692 svm->vmcb->save.gdtr.limit = dt->size;
1693 svm->vmcb->save.gdtr.base = dt->address ;
1694 vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1695 }
1696
sev_post_set_cr3(struct kvm_vcpu * vcpu,unsigned long cr3)1697 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1698 {
1699 struct vcpu_svm *svm = to_svm(vcpu);
1700
1701 /*
1702 * For guests that don't set guest_state_protected, the cr3 update is
1703 * handled via kvm_mmu_load() while entering the guest. For guests
1704 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1705 * VMCB save area now, since the save area will become the initial
1706 * contents of the VMSA, and future VMCB save area updates won't be
1707 * seen.
1708 */
1709 if (sev_es_guest(vcpu->kvm)) {
1710 svm->vmcb->save.cr3 = cr3;
1711 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1712 }
1713 }
1714
/* SVM imposes no CR0 restrictions beyond the common x86 checks. */
static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	return true;
}
1719
/*
 * Propagate a guest CR0 write: update EFER.LMA on long-mode transitions,
 * compute the hardware CR0 (which may differ from the guest view under
 * shadow paging), and adjust CR0 intercepts accordingly.
 */
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 hcr0 = cr0;
	bool old_paging = is_paging(vcpu);

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME) {
		/* Enabling paging with LME set activates long mode (LMA). */
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
			vcpu->arch.efer |= EFER_LMA;
			if (!vcpu->arch.guest_state_protected)
				svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
		}

		/* Disabling paging deactivates long mode. */
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
			vcpu->arch.efer &= ~EFER_LMA;
			if (!vcpu->arch.guest_state_protected)
				svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
		}
	}
#endif
	vcpu->arch.cr0 = cr0;

	if (!npt_enabled) {
		/* Shadow paging always runs with hardware paging enabled. */
		hcr0 |= X86_CR0_PG | X86_CR0_WP;
		if (old_paging != is_paging(vcpu))
			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
	}

	/*
	 * re-enable caching here because the QEMU bios
	 * does not do it - this results in some delay at
	 * reboot
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	svm->vmcb->save.cr0 = hcr0;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);

	/*
	 * SEV-ES guests must always keep the CR intercepts cleared.  CR
	 * tracking is done using the CR write traps.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (hcr0 == cr0) {
		/* Selective CR0 write remains on. */
		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	} else {
		/* Hardware CR0 diverges from the guest view; intercept accesses. */
		svm_set_intercept(svm, INTERCEPT_CR0_READ);
		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	}
}
1776
/* SVM imposes no CR4 restrictions beyond the common x86 checks. */
static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return true;
}
1781
svm_set_cr4(struct kvm_vcpu * vcpu,unsigned long cr4)1782 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1783 {
1784 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1785 unsigned long old_cr4 = vcpu->arch.cr4;
1786
1787 vcpu->arch.cr4 = cr4;
1788 if (!npt_enabled) {
1789 cr4 |= X86_CR4_PAE;
1790
1791 if (!is_paging(vcpu))
1792 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1793 }
1794 cr4 |= host_cr4_mce;
1795 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1796 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1797
1798 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1799 vcpu->arch.cpuid_dynamic_bits_dirty = true;
1800 }
1801
/*
 * Pack a generic kvm_segment back into the VMCB segment cache, rebuilding
 * the attrib bitfield from the individual descriptor bits.
 */
static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
	/* "Unusable" (an Intel concept) is encoded as not-present on SVM. */
	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

	/*
	 * This is always accurate, except if SYSRET returned to a segment
	 * with SS.DPL != 3.  Intel does not have this quirk, and always
	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
	 * would entail passing the CPL to userspace and back.
	 */
	if (seg == VCPU_SREG_SS)
		/* This is symmetric with svm_get_segment() */
		svm->vmcb->save.cpl = (var->dpl & 3);

	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
1832
svm_update_exception_bitmap(struct kvm_vcpu * vcpu)1833 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1834 {
1835 struct vcpu_svm *svm = to_svm(vcpu);
1836
1837 clr_exception_intercept(svm, BP_VECTOR);
1838
1839 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1840 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1841 set_exception_intercept(svm, BP_VECTOR);
1842 }
1843 }
1844
/*
 * Assign the vCPU a fresh ASID on this CPU.  When the per-CPU ASID space is
 * exhausted, bump the generation (invalidating all previously handed-out
 * ASIDs), restart from min_asid, and request a full TLB flush.
 */
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
	if (sd->next_asid > sd->max_asid) {
		++sd->asid_generation;
		sd->next_asid = sd->min_asid;
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}

	/* Record the generation so staleness can be detected on future runs. */
	svm->current_vmcb->asid_generation = sd->asid_generation;
	svm->asid = sd->next_asid++;
}
1857
svm_set_dr6(struct kvm_vcpu * vcpu,unsigned long value)1858 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1859 {
1860 struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1861
1862 if (vcpu->arch.guest_state_protected)
1863 return;
1864
1865 if (unlikely(value != vmcb->save.dr6)) {
1866 vmcb->save.dr6 = value;
1867 vmcb_mark_dirty(vmcb, VMCB_DR);
1868 }
1869 }
1870
/*
 * Pull the hardware debug registers back into KVM's cached state after the
 * guest ran with DR intercepts disabled, then re-arm the intercepts.
 */
static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* SEV-ES guests never run with DR intercepts disabled. */
	if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
		return;

	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	/*
	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
	 * because db_interception might need it.  We can do it before vmentry.
	 */
	vcpu->arch.dr6 = svm->vmcb->save.dr6;
	vcpu->arch.dr7 = svm->vmcb->save.dr7;
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	set_dr_intercepts(svm);
}
1891
svm_set_dr7(struct kvm_vcpu * vcpu,unsigned long value)1892 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1893 {
1894 struct vcpu_svm *svm = to_svm(vcpu);
1895
1896 if (vcpu->arch.guest_state_protected)
1897 return;
1898
1899 svm->vmcb->save.dr7 = value;
1900 vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1901 }
1902
pf_interception(struct kvm_vcpu * vcpu)1903 static int pf_interception(struct kvm_vcpu *vcpu)
1904 {
1905 struct vcpu_svm *svm = to_svm(vcpu);
1906
1907 u64 fault_address = svm->vmcb->control.exit_info_2;
1908 u64 error_code = svm->vmcb->control.exit_info_1;
1909
1910 return kvm_handle_page_fault(vcpu, error_code, fault_address,
1911 static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1912 svm->vmcb->control.insn_bytes : NULL,
1913 svm->vmcb->control.insn_len);
1914 }
1915
1916 static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
1917 void *insn, int insn_len);
1918
/*
 * Handle a nested page fault (#NPF) intercept: sanitize the error code,
 * fast-path MMIO when possible, then hand off to the MMU.
 */
static int npf_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int rc;

	u64 error_code = svm->vmcb->control.exit_info_1;
	gpa_t gpa = svm->vmcb->control.exit_info_2;

	/*
	 * WARN if hardware generates a fault with an error code that collides
	 * with KVM-defined sythentic flags.  Clear the flags and continue on,
	 * i.e. don't terminate the VM, as KVM can't possibly be relying on a
	 * flag that KVM doesn't know about.
	 */
	if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
		error_code &= ~PFERR_SYNTHETIC_MASK;

	/*
	 * Expedite fast MMIO kicks if the next RIP is known and KVM is allowed
	 * emulate a page fault, e.g. skipping the current instruction is wrong
	 * if the #NPF occurred while vectoring an event.
	 */
	if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) {
		const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE;

		if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0))
			return 1;

		/* "Fast MMIO": a write the device model ignores; just skip it. */
		if (nrips && svm->vmcb->control.next_rip &&
		    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
			trace_kvm_fast_mmio(gpa);
			return kvm_skip_emulated_instruction(vcpu);
		}
	}

	/* For SNP guests, an encrypted fault targets private memory. */
	if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
		error_code |= PFERR_PRIVATE_ACCESS;

	trace_kvm_page_fault(vcpu, gpa, error_code);
	rc = kvm_mmu_page_fault(vcpu, gpa, error_code,
				static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
				svm->vmcb->control.insn_bytes : NULL,
				svm->vmcb->control.insn_len);

	/* RMP faults (SNP) may need extra handling after the MMU resolves. */
	if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
		sev_handle_rmp_fault(vcpu, gpa, error_code);

	return rc;
}
1968
/*
 * Handle #DB.  If userspace isn't debugging the guest and the #DB wasn't
 * generated by KVM's NMI single-step trick, reflect it into the guest;
 * otherwise exit to userspace with KVM_EXIT_DEBUG.
 */
static int db_interception(struct kvm_vcpu *vcpu)
{
	struct kvm_run *kvm_run = vcpu->run;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!(vcpu->guest_debug &
	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
		!svm->nmi_singlestep) {
		/* DR6 is held active-low in the VMCB; undo that for the payload. */
		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
		return 1;
	}

	if (svm->nmi_singlestep) {
		disable_nmi_singlestep(svm);
		/* Make sure we check for pending NMIs upon entry */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	if (vcpu->guest_debug &
	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
		kvm_run->debug.arch.pc =
			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
		kvm_run->debug.arch.exception = DB_VECTOR;
		return 0;
	}

	return 1;
}
2001
bp_interception(struct kvm_vcpu * vcpu)2002 static int bp_interception(struct kvm_vcpu *vcpu)
2003 {
2004 struct vcpu_svm *svm = to_svm(vcpu);
2005 struct kvm_run *kvm_run = vcpu->run;
2006
2007 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2008 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2009 kvm_run->debug.arch.exception = BP_VECTOR;
2010 return 0;
2011 }
2012
/* Handle #UD by deferring to the common handler (emulation/injection). */
static int ud_interception(struct kvm_vcpu *vcpu)
{
	return handle_ud(vcpu);
}
2017
/* Handle #AC by reflecting it back into the guest (error code 0). */
static int ac_interception(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
	return 1;
}
2023
/*
 * Detect AMD erratum 383 from the machine-check state: a specific MC0
 * status signature identifies the erratum.  If found, clear the MCi/MCG
 * status registers and flush the TLB so the host can survive the event.
 */
static bool is_erratum_383(void)
{
	int i;
	u64 value;

	if (!erratum_383_found)
		return false;

	if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value))
		return false;

	/* Bit 62 may or may not be set for this mce */
	value &= ~(1ULL << 62);

	if (value != 0xb600000000010015ULL)
		return false;

	/* Clear MCi_STATUS registers */
	for (i = 0; i < 6; ++i)
		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0);

	if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) {
		/* Clear bit 2 (MCIP) so the machine check isn't treated as fatal. */
		value &= ~(1ULL << 2);
		native_write_msr_safe(MSR_IA32_MCG_STATUS, value);
	}

	/* Flush tlb to evict multi-match entries */
	__flush_tlb_all();

	return true;
}
2055
/*
 * Process an intercepted #MC.  Erratum 383 corrupts guest state, so the
 * guest is killed via triple fault; otherwise the machine check is
 * forwarded to the host's #MC handler.
 */
static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
	if (is_erratum_383()) {
		/*
		 * Erratum 383 triggered. Guest state is corrupt so kill the
		 * guest.
		 */
		pr_err("Guest triggered AMD Erratum 383\n");

		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

		return;
	}

	/*
	 * On an #MC intercept the MCE handler is not called automatically in
	 * the host. So do it by hand here.
	 */
	kvm_machine_check();
}
2076
/*
 * #MC exit handler: just resume the guest.  NOTE(review): the actual #MC
 * processing appears to happen via svm_handle_mce() on the exit path
 * outside this chunk — confirm against the exit-handling code.
 */
static int mc_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2081
/* Handle a SHUTDOWN intercept: reset the vCPU and exit to userspace. */
static int shutdown_interception(struct kvm_vcpu *vcpu)
{
	struct kvm_run *kvm_run = vcpu->run;
	struct vcpu_svm *svm = to_svm(vcpu);


	/*
	 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
	 * the VMCB in a known good state. Unfortunately, KVM doesn't have
	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
	 * userspace. At a platform view, INIT is acceptable behavior as
	 * there exist bare metal platforms that automatically INIT the CPU
	 * in response to shutdown.
	 *
	 * The VM save area for SEV-ES guests has already been encrypted so it
	 * cannot be reinitialized, i.e. synthesizing INIT is futile.
	 */
	if (!sev_es_guest(vcpu->kvm)) {
		clear_page(svm->vmcb);
#ifdef CONFIG_KVM_SMM
		/* INIT implies leaving SMM; keep KVM's SMM tracking coherent. */
		if (is_smm(vcpu))
			kvm_smm_changed(vcpu, false);
#endif
		kvm_vcpu_reset(vcpu, true);
	}

	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
	return 0;
}
2111
io_interception(struct kvm_vcpu * vcpu)2112 static int io_interception(struct kvm_vcpu *vcpu)
2113 {
2114 struct vcpu_svm *svm = to_svm(vcpu);
2115 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2116 int size, in, string;
2117 unsigned port;
2118
2119 ++vcpu->stat.io_exits;
2120 string = (io_info & SVM_IOIO_STR_MASK) != 0;
2121 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2122 port = io_info >> 16;
2123 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2124
2125 if (string) {
2126 if (sev_es_guest(vcpu->kvm))
2127 return sev_es_string_io(svm, size, port, in);
2128 else
2129 return kvm_emulate_instruction(vcpu, 0);
2130 }
2131
2132 svm->next_rip = svm->vmcb->control.exit_info_2;
2133
2134 return kvm_fast_pio(vcpu, size, port, in);
2135 }
2136
/* NMI exit: nothing to do beyond resuming; the NMI was taken on the host. */
static int nmi_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2141
/* SMI exit: simply resume the guest. */
static int smi_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2146
/* Physical interrupt exit: count it and resume the guest. */
static int intr_interception(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.irq_exits;
	return 1;
}
2152
/*
 * Common handler for intercepted VMLOAD/VMSAVE.  Maps the guest VMCB
 * (GPA taken from RAX) and copies the VMLOAD/VMSAVE-affected state
 * between it and VMCB01.
 */
static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb12;
	struct kvm_host_map map;
	int ret;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret) {
		/* An unmappable GPA warrants #GP; other errors just resume. */
		if (ret == -EINVAL)
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	vmcb12 = map.hva;

	ret = kvm_skip_emulated_instruction(vcpu);

	/* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */
	if (vmload) {
		svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12);
		/* The high SYSENTER bits are only meaningful on Intel-compatible vCPUs. */
		svm->sysenter_eip_hi = 0;
		svm->sysenter_esp_hi = 0;
	} else {
		svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr);
	}

	kvm_vcpu_unmap(vcpu, &map);

	return ret;
}
2187
/* VMLOAD intercept: copy state from the guest VMCB into VMCB01. */
static int vmload_interception(struct kvm_vcpu *vcpu)
{
	return vmload_vmsave_interception(vcpu, true);
}
2192
/* VMSAVE intercept: copy state from VMCB01 into the guest VMCB. */
static int vmsave_interception(struct kvm_vcpu *vcpu)
{
	return vmload_vmsave_interception(vcpu, false);
}
2197
/* VMRUN intercept: after permission checks, enter the nested guest. */
static int vmrun_interception(struct kvm_vcpu *vcpu)
{
	if (nested_svm_check_permissions(vcpu))
		return 1;

	return nested_svm_vmrun(vcpu);
}
2205
/*
 * Decode results for svm_instr_opcode(); also used as indices into the
 * exit-code/handler tables in emulate_svm_instr().
 */
enum {
	NONE_SVM_INSTR,
	SVM_INSTR_VMRUN,
	SVM_INSTR_VMLOAD,
	SVM_INSTR_VMSAVE,
};
2212
2213 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
svm_instr_opcode(struct kvm_vcpu * vcpu)2214 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2215 {
2216 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2217
2218 if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2219 return NONE_SVM_INSTR;
2220
2221 switch (ctxt->modrm) {
2222 case 0xd8: /* VMRUN */
2223 return SVM_INSTR_VMRUN;
2224 case 0xda: /* VMLOAD */
2225 return SVM_INSTR_VMLOAD;
2226 case 0xdb: /* VMSAVE */
2227 return SVM_INSTR_VMSAVE;
2228 default:
2229 break;
2230 }
2231
2232 return NONE_SVM_INSTR;
2233 }
2234
/*
 * Emulate a VMRUN/VMLOAD/VMSAVE that trapped as #GP (see gp_interception()).
 * While L2 is active the instruction is forwarded to L1 as a synthetic
 * vmexit instead of being handled locally.
 */
static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
{
	/* Tables indexed by the SVM_INSTR_* decode results. */
	const int guest_mode_exit_codes[] = {
		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
	};
	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
		[SVM_INSTR_VMRUN] = vmrun_interception,
		[SVM_INSTR_VMLOAD] = vmload_interception,
		[SVM_INSTR_VMSAVE] = vmsave_interception,
	};
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	if (is_guest_mode(vcpu)) {
		/* Returns '1' or -errno on failure, '0' on success. */
		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
		if (ret)
			return ret;
		return 1;
	}
	return svm_instr_handlers[opcode](vcpu);
}
2259
2260 /*
2261 * #GP handling code. Note that #GP can be triggered under the following two
2262 * cases:
2263 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2264 * some AMD CPUs when EAX of these instructions are in the reserved memory
2265 * regions (e.g. SMM memory on host).
2266 * 2) VMware backdoor
2267 */
gp_interception(struct kvm_vcpu * vcpu)2268 static int gp_interception(struct kvm_vcpu *vcpu)
2269 {
2270 struct vcpu_svm *svm = to_svm(vcpu);
2271 u32 error_code = svm->vmcb->control.exit_info_1;
2272 int opcode;
2273
2274 /* Both #GP cases have zero error_code */
2275 if (error_code)
2276 goto reinject;
2277
2278 /* Decode the instruction for usage later */
2279 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2280 goto reinject;
2281
2282 opcode = svm_instr_opcode(vcpu);
2283
2284 if (opcode == NONE_SVM_INSTR) {
2285 if (!enable_vmware_backdoor)
2286 goto reinject;
2287
2288 /*
2289 * VMware backdoor emulation on #GP interception only handles
2290 * IN{S}, OUT{S}, and RDPMC.
2291 */
2292 if (!is_guest_mode(vcpu))
2293 return kvm_emulate_instruction(vcpu,
2294 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2295 } else {
2296 /* All SVM instructions expect page aligned RAX */
2297 if (svm->vmcb->save.rax & ~PAGE_MASK)
2298 goto reinject;
2299
2300 return emulate_svm_instr(vcpu, opcode);
2301 }
2302
2303 reinject:
2304 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2305 return 1;
2306 }
2307
/*
 * Set or clear the virtual GIF (global interrupt flag), adjusting the
 * STGI/VINTR intercepts KVM uses to detect interrupt windows, and kick
 * event processing when enabling GIF may unblock a pending event.
 */
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
	if (value) {
		/*
		 * If VGIF is enabled, the STGI intercept is only added to
		 * detect the opening of the SMI/NMI window; remove it now.
		 * Likewise, clear the VINTR intercept, we will set it
		 * again while processing KVM_REQ_EVENT if needed.
		 */
		if (vgif)
			svm_clr_intercept(svm, INTERCEPT_STGI);
		if (svm_is_intercept(svm, INTERCEPT_VINTR))
			svm_clear_vintr(svm);

		enable_gif(svm);
		/* GIF=1 may unblock a pending SMI/NMI/IRQ/INIT/SIPI. */
		if (svm->vcpu.arch.smi_pending ||
		    svm->vcpu.arch.nmi_pending ||
		    kvm_cpu_has_injectable_intr(&svm->vcpu) ||
		    kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	} else {
		disable_gif(svm);

		/*
		 * After a CLGI no interrupts should come. But if vGIF is
		 * in use, we still rely on the VINTR intercept (rather than
		 * STGI) to detect an open interrupt window.
		 */
		if (!vgif)
			svm_clear_vintr(svm);
	}
}
2340
/* STGI intercept: skip the instruction, then set GIF. */
static int stgi_interception(struct kvm_vcpu *vcpu)
{
	int ret;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	ret = kvm_skip_emulated_instruction(vcpu);
	svm_set_gif(to_svm(vcpu), true);
	return ret;
}
2352
/* CLGI intercept: skip the instruction, then clear GIF. */
static int clgi_interception(struct kvm_vcpu *vcpu)
{
	int ret;

	if (nested_svm_check_permissions(vcpu))
		return 1;

	ret = kvm_skip_emulated_instruction(vcpu);
	svm_set_gif(to_svm(vcpu), false);
	return ret;
}
2364
/*
 * INVLPGA intercept: RAX holds the virtual address, ECX the ASID.
 * KVM ignores the ASID and invalidates the page like a plain INVLPG.
 */
static int invlpga_interception(struct kvm_vcpu *vcpu)
{
	gva_t gva = kvm_rax_read(vcpu);
	u32 asid = kvm_rcx_read(vcpu);

	/* FIXME: Handle an address size prefix. */
	if (!is_long_mode(vcpu))
		gva = (u32)gva;

	trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);

	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
	kvm_mmu_invlpg(vcpu, gva);

	return kvm_skip_emulated_instruction(vcpu);
}
2381
/* SKINIT is not supported by KVM; trace it and inject #UD. */
static int skinit_interception(struct kvm_vcpu *vcpu)
{
	trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));

	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}
2389
/*
 * Handle an intercepted task switch.  Decodes the switch reason and any
 * event that was being vectored (exit_int_info), drops now-stale queued
 * events, and hands off to the common kvm_task_switch() emulation.
 */
static int task_switch_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u16 tss_selector;
	int reason;
	int int_type = svm->vmcb->control.exit_int_info &
		SVM_EXITINTINFO_TYPE_MASK;
	/*
	 * NOTE(review): masks EXITINTINFO with the EVTINJ vector mask; the
	 * two vector masks appear interchangeable — confirm against
	 * SVM_EXITINTINFO_VEC_MASK in asm/svm.h.
	 */
	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
	uint32_t type =
		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
	uint32_t idt_v =
		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
	bool has_error_code = false;
	u32 error_code = 0;

	/* exit_info_1 holds the destination TSS selector. */
	tss_selector = (u16)svm->vmcb->control.exit_info_1;

	if (svm->vmcb->control.exit_info_2 &
	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
		reason = TASK_SWITCH_IRET;
	else if (svm->vmcb->control.exit_info_2 &
		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
		reason = TASK_SWITCH_JMP;
	else if (idt_v)
		reason = TASK_SWITCH_GATE;
	else
		reason = TASK_SWITCH_CALL;

	if (reason == TASK_SWITCH_GATE) {
		/* The vectored event is consumed by the task switch itself. */
		switch (type) {
		case SVM_EXITINTINFO_TYPE_NMI:
			vcpu->arch.nmi_injected = false;
			break;
		case SVM_EXITINTINFO_TYPE_EXEPT:
			if (svm->vmcb->control.exit_info_2 &
			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
				has_error_code = true;
				error_code =
					(u32)svm->vmcb->control.exit_info_2;
			}
			kvm_clear_exception_queue(vcpu);
			break;
		case SVM_EXITINTINFO_TYPE_INTR:
		case SVM_EXITINTINFO_TYPE_SOFT:
			kvm_clear_interrupt_queue(vcpu);
			break;
		default:
			break;
		}
	}

	/* Skip the instruction unless the switch was caused by a gate/event. */
	if (reason != TASK_SWITCH_GATE ||
	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
		if (!svm_skip_emulated_instruction(vcpu))
			return 0;
	}

	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
		int_vec = -1;

	return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
			       has_error_code, error_code);
}
2455
/* Drop the IRET intercept; it is never toggled for SEV-ES guests. */
static void svm_clr_iret_intercept(struct vcpu_svm *svm)
{
	if (!sev_es_guest(svm->vcpu.kvm))
		svm_clr_intercept(svm, INTERCEPT_IRET);
}
2461
/* Arm the IRET intercept; it is never toggled for SEV-ES guests. */
static void svm_set_iret_intercept(struct vcpu_svm *svm)
{
	if (!sev_es_guest(svm->vcpu.kvm))
		svm_set_intercept(svm, INTERCEPT_IRET);
}
2467
/*
 * IRET intercept: used to detect the end of NMI handling in the guest so
 * KVM can open the NMI window.  Record the RIP of the IRET and request
 * event processing.
 */
static int iret_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* IRET is never intercepted for SEV-ES guests, see the helpers above. */
	WARN_ON_ONCE(sev_es_guest(vcpu->kvm));

	++vcpu->stat.nmi_window_exits;
	svm->awaiting_iret_completion = true;

	svm_clr_iret_intercept(svm);
	svm->nmi_iret_rip = kvm_rip_read(vcpu);

	kvm_make_request(KVM_REQ_EVENT, vcpu);
	return 1;
}
2483
/*
 * INVLPG intercept: with decode assists, exit_info_1 already holds the
 * linear address; otherwise fall back to full instruction emulation.
 */
static int invlpg_interception(struct kvm_vcpu *vcpu)
{
	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
		return kvm_emulate_instruction(vcpu, 0);

	kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
	return kvm_skip_emulated_instruction(vcpu);
}
2492
/* Generic fallback handler: fully emulate the intercepted instruction. */
static int emulate_on_interception(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_instruction(vcpu, 0);
}
2497
/* RSM intercept: emulate from the canned two-byte RSM opcode buffer. */
static int rsm_interception(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
}
2502
check_selective_cr0_intercepted(struct kvm_vcpu * vcpu,unsigned long val)2503 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2504 unsigned long val)
2505 {
2506 struct vcpu_svm *svm = to_svm(vcpu);
2507 unsigned long cr0 = vcpu->arch.cr0;
2508 bool ret = false;
2509
2510 if (!is_guest_mode(vcpu) ||
2511 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2512 return false;
2513
2514 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2515 val &= ~SVM_CR0_SELECTIVE_MASK;
2516
2517 if (cr0 ^ val) {
2518 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2519 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2520 }
2521
2522 return ret;
2523 }
2524
2525 #define CR_VALID (1ULL << 63)
2526
/*
 * Handle CR accesses using decode-assist information: the exit code
 * encodes which CR and the direction, exit_info_1 the GPR involved.
 * Falls back to full emulation when decode assists are unavailable.
 */
static int cr_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int reg, cr;
	unsigned long val;
	int err;

	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
		return emulate_on_interception(vcpu);

	/* CR_VALID (bit 63) flags usable decode info in exit_info_1. */
	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
		return emulate_on_interception(vcpu);

	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
	else
		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;

	err = 0;
	if (cr >= 16) { /* mov to cr */
		cr -= 16;
		val = kvm_register_read(vcpu, reg);
		trace_kvm_cr_write(cr, val);
		switch (cr) {
		case 0:
			if (!check_selective_cr0_intercepted(vcpu, val))
				err = kvm_set_cr0(vcpu, val);
			else
				return 1;

			break;
		case 3:
			err = kvm_set_cr3(vcpu, val);
			break;
		case 4:
			err = kvm_set_cr4(vcpu, val);
			break;
		case 8:
			err = kvm_set_cr8(vcpu, val);
			break;
		default:
			WARN(1, "unhandled write to CR%d", cr);
			kvm_queue_exception(vcpu, UD_VECTOR);
			return 1;
		}
	} else { /* mov from cr */
		switch (cr) {
		case 0:
			val = kvm_read_cr0(vcpu);
			break;
		case 2:
			val = vcpu->arch.cr2;
			break;
		case 3:
			val = kvm_read_cr3(vcpu);
			break;
		case 4:
			val = kvm_read_cr4(vcpu);
			break;
		case 8:
			val = kvm_get_cr8(vcpu);
			break;
		default:
			WARN(1, "unhandled read from CR%d", cr);
			kvm_queue_exception(vcpu, UD_VECTOR);
			return 1;
		}
		kvm_register_write(vcpu, reg, val);
		trace_kvm_cr_read(cr, val);
	}
	return kvm_complete_insn_gp(vcpu, err);
}
2600
/*
 * Handle a CR write *trap* (post-write exit): the new value is already
 * architecturally committed and delivered in exit_info_1, so only KVM's
 * shadow state needs updating — no instruction skip or #GP on CR0/CR4.
 */
static int cr_trap(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long old_value, new_value;
	unsigned int cr;
	int ret = 0;

	new_value = (unsigned long)svm->vmcb->control.exit_info_1;

	cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
	switch (cr) {
	case 0:
		old_value = kvm_read_cr0(vcpu);
		svm_set_cr0(vcpu, new_value);

		kvm_post_set_cr0(vcpu, old_value, new_value);
		break;
	case 4:
		old_value = kvm_read_cr4(vcpu);
		svm_set_cr4(vcpu, new_value);

		kvm_post_set_cr4(vcpu, old_value, new_value);
		break;
	case 8:
		ret = kvm_set_cr8(vcpu, new_value);
		break;
	default:
		WARN(1, "unhandled CR%d write trap", cr);
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	return kvm_complete_insn_gp(vcpu, ret);
}
2635
/*
 * Handle DR accesses.  The exit code encodes which DR and the direction,
 * exit_info_1 the GPR involved (with decode assists).
 */
static int dr_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int reg, dr;
	int err = 0;

	/*
	 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
	 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
	 */
	if (sev_es_guest(vcpu->kvm))
		return 1;

	if (vcpu->guest_debug == 0) {
		/*
		 * No more DR vmexits; force a reload of the debug registers
		 * and reenter on this instruction. The next vmexit will
		 * retrieve the full state of the debug registers.
		 */
		clr_dr_intercepts(svm);
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
		return 1;
	}

	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
		return emulate_on_interception(vcpu);

	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
	if (dr >= 16) { /* mov to DRn */
		dr -= 16;
		err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
	} else {
		kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
	}

	return kvm_complete_insn_gp(vcpu, err);
}
2674
cr8_write_interception(struct kvm_vcpu * vcpu)2675 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2676 {
2677 int r;
2678
2679 u8 cr8_prev = kvm_get_cr8(vcpu);
2680 /* instruction emulation calls kvm_set_cr8() */
2681 r = cr_interception(vcpu);
2682 if (lapic_in_kernel(vcpu))
2683 return r;
2684 if (cr8_prev <= kvm_get_cr8(vcpu))
2685 return r;
2686 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2687 return 0;
2688 }
2689
/*
 * Handle an EFER write trap: exit_info_1 carries the new EFER value,
 * which is forwarded to the common MSR write path.
 */
static int efer_trap(struct kvm_vcpu *vcpu)
{
	struct msr_data msr_info;
	int ret;

	/*
	 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
	 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
	 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
	 * the guest doesn't have X86_FEATURE_SVM.
	 */
	msr_info.host_initiated = false;
	msr_info.index = MSR_EFER;
	msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
	ret = kvm_set_msr_common(vcpu, &msr_info);

	return kvm_complete_insn_gp(vcpu, ret);
}
2708
/*
 * Read an SVM "feature" MSR (host capability reporting, not guest state).
 * Only MSR_AMD64_DE_CFG is supported; *data is always initialized.
 */
static int svm_get_feature_msr(u32 msr, u64 *data)
{
	*data = 0;

	if (msr != MSR_AMD64_DE_CFG)
		return KVM_MSR_RET_UNSUPPORTED;

	/* Report LFENCE serialization when the host has it enabled. */
	if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
		*data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;

	return 0;
}
2724
/*
 * True if an MSR access must be blocked for a protected SEV-ES guest:
 * once guest state is encrypted, only MSRs the guest explicitly exposes
 * (non-intercepted) or MSR_IA32_XSS remain accessible to KVM/userspace.
 */
static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
				      struct msr_data *msr_info)
{
	return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected &&
	       msr_info->index != MSR_IA32_XSS &&
	       !msr_write_intercepted(vcpu, msr_info->index);
}
2732
/*
 * Read an MSR on behalf of the guest or userspace.  SVM-specific MSRs
 * are served from the VMCB / vcpu_svm state; everything else falls
 * through to kvm_get_msr_common().
 */
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Protected SEV-ES state: return 0 (or fail for fully protected VMs). */
	if (sev_es_prevent_msr_access(vcpu, msr_info)) {
		msr_info->data = 0;
		return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
	}

	switch (msr_info->index) {
	case MSR_AMD64_TSC_RATIO:
		if (!msr_info->host_initiated &&
		    !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR))
			return 1;
		msr_info->data = svm->tsc_ratio_msr;
		break;
	case MSR_STAR:
		msr_info->data = svm->vmcb01.ptr->save.star;
		break;
#ifdef CONFIG_X86_64
	case MSR_LSTAR:
		msr_info->data = svm->vmcb01.ptr->save.lstar;
		break;
	case MSR_CSTAR:
		msr_info->data = svm->vmcb01.ptr->save.cstar;
		break;
	case MSR_GS_BASE:
		msr_info->data = svm->vmcb01.ptr->save.gs.base;
		break;
	case MSR_FS_BASE:
		msr_info->data = svm->vmcb01.ptr->save.fs.base;
		break;
	case MSR_KERNEL_GS_BASE:
		msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
		break;
	case MSR_SYSCALL_MASK:
		msr_info->data = svm->vmcb01.ptr->save.sfmask;
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
		break;
	case MSR_IA32_SYSENTER_EIP:
		/* High 32 bits are tracked separately for Intel-compatible vCPUs. */
		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
		if (guest_cpuid_is_intel_compatible(vcpu))
			msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
		if (guest_cpuid_is_intel_compatible(vcpu))
			msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
		break;
	case MSR_IA32_S_CET:
		msr_info->data = svm->vmcb->save.s_cet;
		break;
	case MSR_IA32_INT_SSP_TAB:
		msr_info->data = svm->vmcb->save.isst_addr;
		break;
	case MSR_KVM_INTERNAL_GUEST_SSP:
		msr_info->data = svm->vmcb->save.ssp;
		break;
	case MSR_TSC_AUX:
		msr_info->data = svm->tsc_aux;
		break;
	case MSR_IA32_DEBUGCTLMSR:
		msr_info->data = svm->vmcb->save.dbgctl;
		break;
	case MSR_IA32_LASTBRANCHFROMIP:
		msr_info->data = svm->vmcb->save.br_from;
		break;
	case MSR_IA32_LASTBRANCHTOIP:
		msr_info->data = svm->vmcb->save.br_to;
		break;
	case MSR_IA32_LASTINTFROMIP:
		msr_info->data = svm->vmcb->save.last_excp_from;
		break;
	case MSR_IA32_LASTINTTOIP:
		msr_info->data = svm->vmcb->save.last_excp_to;
		break;
	case MSR_VM_HSAVE_PA:
		msr_info->data = svm->nested.hsave_msr;
		break;
	case MSR_VM_CR:
		msr_info->data = svm->nested.vm_cr_msr;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		/* With V_SPEC_CTRL the value lives in the VMCB save area. */
		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
			msr_info->data = svm->vmcb->save.spec_ctrl;
		else
			msr_info->data = svm->spec_ctrl;
		break;
	case MSR_AMD64_VIRT_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
			return 1;

		msr_info->data = svm->virt_spec_ctrl;
		break;
	case MSR_F15H_IC_CFG: {

		int family, model;

		family = guest_cpuid_family(vcpu);
		model = guest_cpuid_model(vcpu);

		if (family < 0 || model < 0)
			return kvm_get_msr_common(vcpu, msr_info);

		msr_info->data = 0;

		/* Family 15h models 02h-1Fh report the disable-streaming bits. */
		if (family == 0x15 &&
		    (model >= 0x2 && model < 0x20))
			msr_info->data = 0x1E;
		}
		break;
	case MSR_AMD64_DE_CFG:
		msr_info->data = svm->msr_decfg;
		break;
	default:
		return kvm_get_msr_common(vcpu, msr_info);
	}
	return 0;
}
2860
/*
 * Complete an emulated MSR access.  For SEV-ES guests a failed access
 * is reported back through the GHCB as a #GP instead of being injected.
 */
static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
		return kvm_complete_insn_gp(vcpu, err);

	svm_vmgexit_inject_exception(svm, X86_TRAP_GP);
	return 1;
}
2870
/*
 * Emulate a write to MSR_VM_CR.  Once SVM_LOCK/SVM_DIS are latched they
 * cannot be cleared, and SVM_DIS cannot be set while EFER.SVME is on.
 * Returns 0 on success, 1 to signal #GP.
 */
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int svm_dis, chg_mask;

	if (data & ~SVM_VM_CR_VALID_MASK)
		return 1;

	chg_mask = SVM_VM_CR_VALID_MASK;

	/* SVM_DIS locks itself and SVM_LOCK against further changes. */
	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);

	svm->nested.vm_cr_msr &= ~chg_mask;
	svm->nested.vm_cr_msr |= (data & chg_mask);

	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;

	/* check for svm_disable while efer.svme is set */
	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
		return 1;

	return 0;
}
2895
svm_set_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)2896 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2897 {
2898 struct vcpu_svm *svm = to_svm(vcpu);
2899 int ret = 0;
2900
2901 u32 ecx = msr->index;
2902 u64 data = msr->data;
2903
2904 if (sev_es_prevent_msr_access(vcpu, msr))
2905 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
2906
2907 switch (ecx) {
2908 case MSR_AMD64_TSC_RATIO:
2909
2910 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) {
2911
2912 if (!msr->host_initiated)
2913 return 1;
2914 /*
2915 * In case TSC scaling is not enabled, always
2916 * leave this MSR at the default value.
2917 *
2918 * Due to bug in qemu 6.2.0, it would try to set
2919 * this msr to 0 if tsc scaling is not enabled.
2920 * Ignore this value as well.
2921 */
2922 if (data != 0 && data != svm->tsc_ratio_msr)
2923 return 1;
2924 break;
2925 }
2926
2927 if (data & SVM_TSC_RATIO_RSVD)
2928 return 1;
2929
2930 svm->tsc_ratio_msr = data;
2931
2932 if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
2933 is_guest_mode(vcpu))
2934 nested_svm_update_tsc_ratio_msr(vcpu);
2935
2936 break;
2937 case MSR_IA32_CR_PAT:
2938 ret = kvm_set_msr_common(vcpu, msr);
2939 if (ret)
2940 break;
2941
2942 svm->vmcb01.ptr->save.g_pat = data;
2943 if (is_guest_mode(vcpu))
2944 nested_vmcb02_compute_g_pat(svm);
2945 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2946 break;
2947 case MSR_IA32_SPEC_CTRL:
2948 if (!msr->host_initiated &&
2949 !guest_has_spec_ctrl_msr(vcpu))
2950 return 1;
2951
2952 if (kvm_spec_ctrl_test_value(data))
2953 return 1;
2954
2955 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2956 svm->vmcb->save.spec_ctrl = data;
2957 else
2958 svm->spec_ctrl = data;
2959 if (!data)
2960 break;
2961
2962 /*
2963 * For non-nested:
2964 * When it's written (to non-zero) for the first time, pass
2965 * it through.
2966 *
2967 * For nested:
2968 * The handling of the MSR bitmap for L2 guests is done in
2969 * nested_svm_merge_msrpm().
2970 * We update the L1 MSR bit as well since it will end up
2971 * touching the MSR anyway now.
2972 */
2973 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
2974 break;
2975 case MSR_AMD64_VIRT_SPEC_CTRL:
2976 if (!msr->host_initiated &&
2977 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD))
2978 return 1;
2979
2980 if (data & ~SPEC_CTRL_SSBD)
2981 return 1;
2982
2983 svm->virt_spec_ctrl = data;
2984 break;
2985 case MSR_STAR:
2986 svm->vmcb01.ptr->save.star = data;
2987 break;
2988 #ifdef CONFIG_X86_64
2989 case MSR_LSTAR:
2990 svm->vmcb01.ptr->save.lstar = data;
2991 break;
2992 case MSR_CSTAR:
2993 svm->vmcb01.ptr->save.cstar = data;
2994 break;
2995 case MSR_GS_BASE:
2996 svm->vmcb01.ptr->save.gs.base = data;
2997 break;
2998 case MSR_FS_BASE:
2999 svm->vmcb01.ptr->save.fs.base = data;
3000 break;
3001 case MSR_KERNEL_GS_BASE:
3002 svm->vmcb01.ptr->save.kernel_gs_base = data;
3003 break;
3004 case MSR_SYSCALL_MASK:
3005 svm->vmcb01.ptr->save.sfmask = data;
3006 break;
3007 #endif
3008 case MSR_IA32_SYSENTER_CS:
3009 svm->vmcb01.ptr->save.sysenter_cs = data;
3010 break;
3011 case MSR_IA32_SYSENTER_EIP:
3012 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3013 /*
3014 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
3015 * when we spoof an Intel vendor ID (for cross vendor migration).
3016 * In this case we use this intercept to track the high
3017 * 32 bit part of these msrs to support Intel's
3018 * implementation of SYSENTER/SYSEXIT.
3019 */
3020 svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
3021 break;
3022 case MSR_IA32_SYSENTER_ESP:
3023 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3024 svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
3025 break;
3026 case MSR_IA32_S_CET:
3027 svm->vmcb->save.s_cet = data;
3028 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
3029 break;
3030 case MSR_IA32_INT_SSP_TAB:
3031 svm->vmcb->save.isst_addr = data;
3032 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
3033 break;
3034 case MSR_KVM_INTERNAL_GUEST_SSP:
3035 svm->vmcb->save.ssp = data;
3036 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET);
3037 break;
3038 case MSR_TSC_AUX:
3039 /*
3040 * TSC_AUX is always virtualized for SEV-ES guests when the
3041 * feature is available. The user return MSR support is not
3042 * required in this case because TSC_AUX is restored on #VMEXIT
3043 * from the host save area.
3044 */
3045 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
3046 break;
3047
3048 /*
3049 * TSC_AUX is usually changed only during boot and never read
3050 * directly. Intercept TSC_AUX and switch it via user return.
3051 */
3052 preempt_disable();
3053 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3054 preempt_enable();
3055 if (ret)
3056 break;
3057
3058 svm->tsc_aux = data;
3059 break;
3060 case MSR_IA32_DEBUGCTLMSR:
3061 if (!lbrv) {
3062 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3063 break;
3064 }
3065
3066 /*
3067 * Suppress BTF as KVM doesn't virtualize BTF, but there's no
3068 * way to communicate lack of support to the guest.
3069 */
3070 if (data & DEBUGCTLMSR_BTF) {
3071 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
3072 data &= ~DEBUGCTLMSR_BTF;
3073 }
3074
3075 if (data & DEBUGCTL_RESERVED_BITS)
3076 return 1;
3077
3078 if (svm->vmcb->save.dbgctl == data)
3079 break;
3080
3081 svm->vmcb->save.dbgctl = data;
3082 vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
3083 svm_update_lbrv(vcpu);
3084 break;
3085 case MSR_VM_HSAVE_PA:
3086 /*
3087 * Old kernels did not validate the value written to
3088 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
3089 * value to allow live migrating buggy or malicious guests
3090 * originating from those kernels.
3091 */
3092 if (!msr->host_initiated && !page_address_valid(vcpu, data))
3093 return 1;
3094
3095 svm->nested.hsave_msr = data & PAGE_MASK;
3096 break;
3097 case MSR_VM_CR:
3098 return svm_set_vm_cr(vcpu, data);
3099 case MSR_VM_IGNNE:
3100 kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3101 break;
3102 case MSR_AMD64_DE_CFG: {
3103 u64 supported_de_cfg;
3104
3105 if (svm_get_feature_msr(ecx, &supported_de_cfg))
3106 return 1;
3107
3108 if (data & ~supported_de_cfg)
3109 return 1;
3110
3111 svm->msr_decfg = data;
3112 break;
3113 }
3114 default:
3115 return kvm_set_msr_common(vcpu, msr);
3116 }
3117 return ret;
3118 }
3119
msr_interception(struct kvm_vcpu * vcpu)3120 static int msr_interception(struct kvm_vcpu *vcpu)
3121 {
3122 if (to_svm(vcpu)->vmcb->control.exit_info_1)
3123 return kvm_emulate_wrmsr(vcpu);
3124 else
3125 return kvm_emulate_rdmsr(vcpu);
3126 }
3127
/*
 * Handle a VINTR (interrupt window) exit: the guest can now accept an
 * interrupt, so drop the virtual interrupt request and let the common event
 * injection code (KVM_REQ_EVENT) deliver the pending IRQ.
 */
static int interrupt_window_interception(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	svm_clear_vintr(to_svm(vcpu));

	/*
	 * If not running nested, for AVIC, the only reason to end up here is
	 * ExtINTs.  In this case AVIC was temporarily disabled for requesting
	 * the IRQ window and we have to re-enable it.
	 *
	 * If running nested, still remove the VM wide AVIC inhibit to support
	 * the case in which the interrupt window was requested when the vCPU
	 * was not running nested.
	 *
	 * All vCPUs which still run nested will remain to have their AVIC
	 * inhibited due to the per-cpu AVIC inhibition.
	 */
	kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);

	++vcpu->stat.irq_window_exits;
	return 1;
}
3150
pause_interception(struct kvm_vcpu * vcpu)3151 static int pause_interception(struct kvm_vcpu *vcpu)
3152 {
3153 bool in_kernel;
3154 /*
3155 * CPL is not made available for an SEV-ES guest, therefore
3156 * vcpu->arch.preempted_in_kernel can never be true. Just
3157 * set in_kernel to false as well.
3158 */
3159 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3160
3161 grow_ple_window(vcpu);
3162
3163 kvm_vcpu_on_spin(vcpu, in_kernel);
3164 return kvm_skip_emulated_instruction(vcpu);
3165 }
3166
/*
 * Handle an INVPCID intercept.  #UD takes priority when the guest doesn't
 * enumerate INVPCID; otherwise the operands are pulled out of EXITINFO and
 * passed to the common emulation helper.
 */
static int invpcid_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long type;
	gva_t gva;

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/*
	 * For an INVPCID intercept:
	 * EXITINFO1 provides the linear address of the memory operand.
	 * EXITINFO2 provides the contents of the register operand.
	 */
	type = svm->vmcb->control.exit_info_2;
	gva = svm->vmcb->control.exit_info_1;

	/*
	 * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the
	 * stack segment is used.  The intercept takes priority over all
	 * #GP checks except CPL>0, but somehow still generates a linear
	 * address?  The APM is sorely lacking.
	 */
	if (is_noncanonical_address(gva, vcpu, 0)) {
		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
		return 1;
	}

	return kvm_handle_invpcid(vcpu, type, gva);
}
3199
complete_userspace_buslock(struct kvm_vcpu * vcpu)3200 static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
3201 {
3202 struct vcpu_svm *svm = to_svm(vcpu);
3203
3204 /*
3205 * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
3206 * execute the bus-locking instruction. Set the bus lock counter to '1'
3207 * to effectively step past the bus lock.
3208 */
3209 if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
3210 svm->vmcb->control.bus_lock_counter = 1;
3211
3212 return 1;
3213 }
3214
/*
 * Exit to userspace with KVM_EXIT_X86_BUS_LOCK, recording the linear RIP so
 * that complete_userspace_buslock() can tell whether userspace moved RIP
 * before resuming the guest.
 */
static int bus_lock_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
	vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;

	vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
	vcpu->arch.complete_userspace_io = complete_userspace_buslock;

	/* Stash the RIP in nested state so it survives nested transitions. */
	if (is_guest_mode(vcpu))
		svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;

	return 0;
}
3230
/*
 * Dispatch table mapping VMCB EXITCODE values to their handlers.  Entries
 * not listed here are NULL; svm_invoke_exit_handler() reports exits with no
 * handler to userspace as unexpected VM-Exits.
 */
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
	[SVM_EXIT_READ_CR0]			= cr_interception,
	[SVM_EXIT_READ_CR3]			= cr_interception,
	[SVM_EXIT_READ_CR4]			= cr_interception,
	[SVM_EXIT_READ_CR8]			= cr_interception,
	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
	[SVM_EXIT_WRITE_CR0]			= cr_interception,
	[SVM_EXIT_WRITE_CR3]			= cr_interception,
	[SVM_EXIT_WRITE_CR4]			= cr_interception,
	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
	[SVM_EXIT_READ_DR0]			= dr_interception,
	[SVM_EXIT_READ_DR1]			= dr_interception,
	[SVM_EXIT_READ_DR2]			= dr_interception,
	[SVM_EXIT_READ_DR3]			= dr_interception,
	[SVM_EXIT_READ_DR4]			= dr_interception,
	[SVM_EXIT_READ_DR5]			= dr_interception,
	[SVM_EXIT_READ_DR6]			= dr_interception,
	[SVM_EXIT_READ_DR7]			= dr_interception,
	[SVM_EXIT_WRITE_DR0]			= dr_interception,
	[SVM_EXIT_WRITE_DR1]			= dr_interception,
	[SVM_EXIT_WRITE_DR2]			= dr_interception,
	[SVM_EXIT_WRITE_DR3]			= dr_interception,
	[SVM_EXIT_WRITE_DR4]			= dr_interception,
	[SVM_EXIT_WRITE_DR5]			= dr_interception,
	[SVM_EXIT_WRITE_DR6]			= dr_interception,
	[SVM_EXIT_WRITE_DR7]			= dr_interception,
	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
	[SVM_EXIT_INTR]				= intr_interception,
	[SVM_EXIT_NMI]				= nmi_interception,
	[SVM_EXIT_SMI]				= smi_interception,
	[SVM_EXIT_VINTR]			= interrupt_window_interception,
	[SVM_EXIT_RDPMC]			= kvm_emulate_rdpmc,
	[SVM_EXIT_CPUID]			= kvm_emulate_cpuid,
	[SVM_EXIT_IRET]				= iret_interception,
	[SVM_EXIT_INVD]				= kvm_emulate_invd,
	[SVM_EXIT_PAUSE]			= pause_interception,
	[SVM_EXIT_HLT]				= kvm_emulate_halt,
	[SVM_EXIT_INVLPG]			= invlpg_interception,
	[SVM_EXIT_INVLPGA]			= invlpga_interception,
	[SVM_EXIT_IOIO]				= io_interception,
	[SVM_EXIT_MSR]				= msr_interception,
	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
	[SVM_EXIT_VMRUN]			= vmrun_interception,
	[SVM_EXIT_VMMCALL]			= kvm_emulate_hypercall,
	[SVM_EXIT_VMLOAD]			= vmload_interception,
	[SVM_EXIT_VMSAVE]			= vmsave_interception,
	[SVM_EXIT_STGI]				= stgi_interception,
	[SVM_EXIT_CLGI]				= clgi_interception,
	[SVM_EXIT_SKINIT]			= skinit_interception,
	[SVM_EXIT_RDTSCP]			= kvm_handle_invalid_op,
	[SVM_EXIT_WBINVD]			= kvm_emulate_wbinvd,
	[SVM_EXIT_MONITOR]			= kvm_emulate_monitor,
	[SVM_EXIT_MWAIT]			= kvm_emulate_mwait,
	[SVM_EXIT_XSETBV]			= kvm_emulate_xsetbv,
	[SVM_EXIT_RDPRU]			= kvm_handle_invalid_op,
	[SVM_EXIT_EFER_WRITE_TRAP]		= efer_trap,
	[SVM_EXIT_CR0_WRITE_TRAP]		= cr_trap,
	[SVM_EXIT_CR4_WRITE_TRAP]		= cr_trap,
	[SVM_EXIT_CR8_WRITE_TRAP]		= cr_trap,
	[SVM_EXIT_INVPCID]			= invpcid_interception,
	[SVM_EXIT_IDLE_HLT]			= kvm_emulate_halt,
	[SVM_EXIT_NPF]				= npf_interception,
	[SVM_EXIT_BUS_LOCK]			= bus_lock_exit,
	[SVM_EXIT_RSM]				= rsm_interception,
	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
#ifdef CONFIG_KVM_AMD_SEV
	[SVM_EXIT_VMGEXIT]			= sev_handle_vmgexit,
#endif
};
3308
/*
 * Dump the active VMCB (control area and state save area) to the kernel log,
 * for debugging failed or unexpected VM-Exits.  Gated behind the
 * kvm_amd.dump_invalid_vmcb module parameter so guest state isn't leaked to
 * the log by default.
 */
static void dump_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;
	/* vmcb01 carries L1 state that isn't reflected in the current VMCB. */
	struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
	char *vm_type;

	if (!dump_invalid_vmcb) {
		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
		return;
	}

	/* Serialize dumps so output from concurrent vCPUs doesn't interleave. */
	guard(mutex)(&vmcb_dump_mutex);

	vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
		  sev_es_guest(vcpu->kvm) ? "SEV-ES" :
		  sev_guest(vcpu->kvm) ? "SEV" : "SVM";

	pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
	       vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
	pr_err("VMCB Control Area:\n");
	pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
	pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
	pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
	pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
	pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
	pr_err("%-20s%08x %08x\n", "intercepts:",
	       control->intercepts[INTERCEPT_WORD3],
	       control->intercepts[INTERCEPT_WORD4]);
	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
	pr_err("%-20s%d\n", "pause filter threshold:",
	       control->pause_filter_thresh);
	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
	pr_err("%-20s%d\n", "asid:", control->asid);
	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
	pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
	pr_err("%-20s%08x\n", "int_state:", control->int_state);
	pr_err("%-20s%016llx\n", "exit_code:", control->exit_code);
	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
	pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
	pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
	pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
	pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);

	/*
	 * For SEV-ES the save area lives in the encrypted VMSA; dump a
	 * decrypted copy if one can be obtained (debug-enabled guests).
	 */
	if (sev_es_guest(vcpu->kvm)) {
		save = sev_decrypt_vmsa(vcpu);
		if (!save)
			goto no_vmsa;

		save01 = save;
	}

	pr_err("VMCB State Save Area:\n");
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "es:",
	       save->es.selector, save->es.attrib,
	       save->es.limit, save->es.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "cs:",
	       save->cs.selector, save->cs.attrib,
	       save->cs.limit, save->cs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ss:",
	       save->ss.selector, save->ss.attrib,
	       save->ss.limit, save->ss.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ds:",
	       save->ds.selector, save->ds.attrib,
	       save->ds.limit, save->ds.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "fs:",
	       save01->fs.selector, save01->fs.attrib,
	       save01->fs.limit, save01->fs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "gs:",
	       save01->gs.selector, save01->gs.attrib,
	       save01->gs.limit, save01->gs.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "gdtr:",
	       save->gdtr.selector, save->gdtr.attrib,
	       save->gdtr.limit, save->gdtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "ldtr:",
	       save01->ldtr.selector, save01->ldtr.attrib,
	       save01->ldtr.limit, save01->ldtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "idtr:",
	       save->idtr.selector, save->idtr.attrib,
	       save->idtr.limit, save->idtr.base);
	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
	       "tr:",
	       save01->tr.selector, save01->tr.attrib,
	       save01->tr.limit, save01->tr.base);
	pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
	       save->vmpl, save->cpl, save->efer);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cr0:", save->cr0, "cr2:", save->cr2);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cr3:", save->cr3, "cr4:", save->cr4);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "dr6:", save->dr6, "dr7:", save->dr7);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "rip:", save->rip, "rflags:", save->rflags);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "rsp:", save->rsp, "rax:", save->rax);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "s_cet:", save->s_cet, "ssp:", save->ssp);
	pr_err("%-15s %016llx\n",
	       "isst_addr:", save->isst_addr);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "star:", save01->star, "lstar:", save01->lstar);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "cstar:", save01->cstar, "sfmask:", save01->sfmask);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "kernel_gs_base:", save01->kernel_gs_base,
	       "sysenter_cs:", save01->sysenter_cs);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "sysenter_esp:", save01->sysenter_esp,
	       "sysenter_eip:", save01->sysenter_eip);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "br_from:", save->br_from, "br_to:", save->br_to);
	pr_err("%-15s %016llx %-13s %016llx\n",
	       "excp_from:", save->last_excp_from,
	       "excp_to:", save->last_excp_to);

	if (sev_es_guest(vcpu->kvm)) {
		/* The decrypted VMSA uses the extended SEV-ES layout. */
		struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;

		pr_err("%-15s %016llx\n",
		       "sev_features", vmsa->sev_features);

		pr_err("%-15s %016llx %-13s %016llx\n",
		       "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp);
		pr_err("%-15s %016llx\n",
		       "u_cet:", vmsa->u_cet);

		pr_err("%-15s %016llx %-13s %016llx\n",
		       "rax:", vmsa->rax, "rbx:", vmsa->rbx);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "r8:", vmsa->r8, "r9:", vmsa->r9);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "r10:", vmsa->r10, "r11:", vmsa->r11);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "r12:", vmsa->r12, "r13:", vmsa->r13);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "r14:", vmsa->r14, "r15:", vmsa->r15);
		pr_err("%-15s %016llx %-13s %016llx\n",
		       "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
	} else {
		/* Non-SEV-ES: GPRs other than RAX/RSP live in vcpu->arch.regs. */
		pr_err("%-15s %016llx %-13s %016lx\n",
		       "rax:", save->rax, "rbx:",
		       vcpu->arch.regs[VCPU_REGS_RBX]);
		pr_err("%-15s %016lx %-13s %016lx\n",
		       "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
		       "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
		pr_err("%-15s %016lx %-13s %016lx\n",
		       "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
		       "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
		pr_err("%-15s %016lx %-13s %016llx\n",
		       "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
		       "rsp:", save->rsp);
#ifdef CONFIG_X86_64
		pr_err("%-15s %016lx %-13s %016lx\n",
		       "r8:", vcpu->arch.regs[VCPU_REGS_R8],
		       "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
		pr_err("%-15s %016lx %-13s %016lx\n",
		       "r10:", vcpu->arch.regs[VCPU_REGS_R10],
		       "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
		pr_err("%-15s %016lx %-13s %016lx\n",
		       "r12:", vcpu->arch.regs[VCPU_REGS_R12],
		       "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
		pr_err("%-15s %016lx %-13s %016lx\n",
		       "r14:", vcpu->arch.regs[VCPU_REGS_R14],
		       "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
#endif
	}

no_vmsa:
	/* Free the decrypted VMSA copy, if one was allocated above. */
	if (sev_es_guest(vcpu->kvm))
		sev_free_decrypted_vmsa(vcpu, save);
}
3518
/*
 * Invoke the handler for the given exit code.  Bogus exit codes and exits
 * with no registered handler are dumped and reported to userspace as
 * unexpected VM-Exits.
 */
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code)
{
	u32 exit_code = __exit_code;

	/*
	 * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN
	 * failed.  Report all such errors to userspace (note, VMEXIT_INVALID,
	 * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()).  Skip
	 * the check when running as a VM, as KVM has historically left garbage
	 * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if
	 * the underlying kernel is buggy.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) &&
	    (u64)exit_code != __exit_code)
		goto unexpected_vmexit;

#ifdef CONFIG_MITIGATION_RETPOLINE
	/* Direct-dispatch the hottest exits to dodge a retpolined indirect call. */
	if (exit_code == SVM_EXIT_MSR)
		return msr_interception(vcpu);
	else if (exit_code == SVM_EXIT_VINTR)
		return interrupt_window_interception(vcpu);
	else if (exit_code == SVM_EXIT_INTR)
		return intr_interception(vcpu);
	else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT)
		return kvm_emulate_halt(vcpu);
	else if (exit_code == SVM_EXIT_NPF)
		return npf_interception(vcpu);
#ifdef CONFIG_KVM_AMD_SEV
	else if (exit_code == SVM_EXIT_VMGEXIT)
		return sev_handle_vmgexit(vcpu);
#endif
#endif
	if (exit_code >= ARRAY_SIZE(svm_exit_handlers))
		goto unexpected_vmexit;

	/* Clamp the (guest-influenced) index against speculative OOB access. */
	exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers));
	if (!svm_exit_handlers[exit_code])
		goto unexpected_vmexit;

	return svm_exit_handlers[exit_code](vcpu);

unexpected_vmexit:
	dump_vmcb(vcpu);
	kvm_prepare_unexpected_reason_exit(vcpu, __exit_code);
	return 0;
}
3565
/*
 * Report the last VM-Exit's reason, EXITINFO payloads and interrupt info,
 * for tracing and common x86 code.  The error code is reported only when
 * EXITINTINFO flags it as both valid and carrying an error code.
 */
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
			      u64 *info1, u64 *info2,
			      u32 *intr_info, u32 *error_code)
{
	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
	u32 int_info = control->exit_int_info;

	*reason = control->exit_code;
	*info1 = control->exit_info_1;
	*info2 = control->exit_info_2;
	*intr_info = int_info;

	if ((int_info & SVM_EXITINTINFO_VALID) &&
	    (int_info & SVM_EXITINTINFO_VALID_ERR))
		*error_code = control->exit_int_info_err;
	else
		*error_code = 0;
}
3582
/*
 * Report the event being injected on VM-Entry (EVENTINJ) and its error code.
 * Note, the SVM_EXITINTINFO_* masks are applied to event_inj, i.e. EVENTINJ
 * is treated as having the same bit layout as EXITINTINFO.
 */
static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info,
			       u32 *error_code)
{
	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;

	*intr_info = control->event_inj;

	if ((*intr_info & SVM_EXITINTINFO_VALID) &&
	    (*intr_info & SVM_EXITINTINFO_VALID_ERR))
		*error_code = control->event_inj_err;
	else
		*error_code = 0;

}
3597
/*
 * Top-level VM-Exit handler: sync CR0/CR3 for non-SEV-ES guests, give a
 * nested (L1) hypervisor first crack at the exit, report VMRUN failures to
 * userspace, and otherwise dispatch via svm_invoke_exit_handler().
 *
 * Returns 1 to resume the guest, 0 to exit to userspace.
 */
static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *kvm_run = vcpu->run;

	/* SEV-ES guests must use the CR write traps to track CR registers. */
	if (!sev_es_guest(vcpu->kvm)) {
		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
			vcpu->arch.cr0 = svm->vmcb->save.cr0;
		if (npt_enabled)
			vcpu->arch.cr3 = svm->vmcb->save.cr3;
	}

	if (is_guest_mode(vcpu)) {
		int vmexit;

		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);

		/* Exits KVM must handle regardless of L1's intercepts. */
		vmexit = nested_svm_exit_special(svm);

		if (vmexit == NESTED_EXIT_CONTINUE)
			vmexit = nested_svm_exit_handled(svm);

		/* NESTED_EXIT_DONE: the exit was reflected to L1. */
		if (vmexit == NESTED_EXIT_DONE)
			return 1;
	}

	if (svm_is_vmrun_failure(svm->vmcb->control.exit_code)) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= svm->vmcb->control.exit_code;
		kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		dump_vmcb(vcpu);
		return 0;
	}

	/* Fastpath exits were fully handled in the inner run loop. */
	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code);
}
3639
/*
 * Per-VMRUN preparation: invalidate VMCB clean bits and refresh the ASID
 * when the vCPU has moved to a different physical CPU since its last VMRUN.
 * Returns non-zero (from pre_sev_run()) if entry must be aborted.
 */
static int pre_svm_run(struct kvm_vcpu *vcpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * If the previous vmrun of the vmcb occurred on a different physical
	 * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
	 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
	 */
	if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
		svm->current_vmcb->asid_generation = 0;
		vmcb_mark_all_dirty(svm->vmcb);
		svm->current_vmcb->cpu = vcpu->cpu;
	}

	/* SEV guests have their own ASID management. */
	if (sev_guest(vcpu->kvm))
		return pre_sev_run(svm, vcpu->cpu);

	/* FIXME: handle wraparound of asid_generation */
	if (svm->current_vmcb->asid_generation != sd->asid_generation)
		new_asid(svm, sd);

	return 0;
}
3665
/*
 * Inject an NMI into the guest via EVENTINJ.  Without vNMI, KVM manually
 * tracks NMI masking and intercepts IRET to detect when the guest unmasks
 * NMIs again.
 */
static void svm_inject_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

	/* Don't update masking state when re-injecting L1's NMI into L2. */
	if (svm->nmi_l1_to_l2)
		return;

	/*
	 * No need to manually track NMI masking when vNMI is enabled, hardware
	 * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
	 * case where software directly injects an NMI.
	 */
	if (!is_vnmi_enabled(svm)) {
		svm->nmi_masked = true;
		svm_set_iret_intercept(svm);
	}
	++vcpu->stat.nmi_injections;
}
3686
svm_is_vnmi_pending(struct kvm_vcpu * vcpu)3687 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3688 {
3689 struct vcpu_svm *svm = to_svm(vcpu);
3690
3691 if (!is_vnmi_enabled(svm))
3692 return false;
3693
3694 return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3695 }
3696
/*
 * Try to hand a pending NMI to hardware via V_NMI.  Returns false if vNMI
 * is not in use or a vNMI is already pending (only one can be tracked), in
 * which case the caller must deliver the NMI by other means.
 */
static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!is_vnmi_enabled(svm))
		return false;

	if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
		return false;

	svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);

	/*
	 * Because the pending NMI is serviced by hardware, KVM can't know when
	 * the NMI is "injected", but for all intents and purposes, passing the
	 * NMI off to hardware counts as injection.
	 */
	++vcpu->stat.nmi_injections;

	return true;
}
3719
/*
 * Inject the queued interrupt via EVENTINJ.  Soft interrupts (INTn) need
 * next_rip set up first so the guest resumes at the right instruction.
 */
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
	struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt;
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 type = SVM_EVTINJ_TYPE_INTR;

	if (intr->soft) {
		/* Abort the injection if the soft-int RIP can't be recorded. */
		if (svm_update_soft_interrupt_rip(vcpu, intr->nr))
			return;

		type = SVM_EVTINJ_TYPE_SOFT;
	}

	trace_kvm_inj_virq(intr->nr, intr->soft, reinjected);
	++vcpu->stat.irq_injections;

	svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type;
}
3740
/*
 * Complete delivery of an interrupt to an AVIC-enabled vCPU after its vIRR
 * was updated: ring the doorbell if the target is running in the guest,
 * otherwise kick/wake it so the interrupt is evaluated at the next entry.
 */
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
				     int trig_mode, int vector)
{
	/*
	 * apic->apicv_active must be read after vcpu->mode.
	 * Pairs with smp_store_release in vcpu_enter_guest.
	 */
	bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);

	/* Note, this is called iff the local APIC is in-kernel. */
	if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
		/* Process the interrupt via kvm_check_and_inject_events(). */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_vcpu_kick(vcpu);
		return;
	}

	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
	if (in_guest_mode) {
		/*
		 * Signal the doorbell to tell hardware to inject the IRQ.  If
		 * the vCPU exits the guest before the doorbell chimes, hardware
		 * will automatically process AVIC interrupts at the next VMRUN.
		 */
		avic_ring_doorbell(vcpu);
	} else {
		/*
		 * Wake the vCPU if it was blocking.  KVM will then detect the
		 * pending IRQ when checking if the vCPU has a wake event.
		 */
		kvm_vcpu_wake_up(vcpu);
	}
}
3774
/* Set the vector in the target's vIRR, then complete AVIC delivery. */
static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
				  int trig_mode, int vector)
{
	kvm_lapic_set_irr(vector, apic);

	/*
	 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
	 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
	 * the read of guest_mode.  This guarantees that either VMRUN will see
	 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
	 * will signal the doorbell if the CPU has already entered the guest.
	 */
	smp_mb__after_atomic();
	svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
}
3790
svm_update_cr8_intercept(struct kvm_vcpu * vcpu,int tpr,int irr)3791 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3792 {
3793 struct vcpu_svm *svm = to_svm(vcpu);
3794
3795 /*
3796 * SEV-ES guests must always keep the CR intercepts cleared. CR
3797 * tracking is done using the CR write traps.
3798 */
3799 if (sev_es_guest(vcpu->kvm))
3800 return;
3801
3802 if (nested_svm_virtualize_tpr(vcpu))
3803 return;
3804
3805 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3806
3807 if (irr == -1)
3808 return;
3809
3810 if (tpr >= irr)
3811 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3812 }
3813
svm_get_nmi_mask(struct kvm_vcpu * vcpu)3814 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3815 {
3816 struct vcpu_svm *svm = to_svm(vcpu);
3817
3818 if (is_vnmi_enabled(svm))
3819 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3820 else
3821 return svm->nmi_masked;
3822 }
3823
/*
 * Set or clear NMI masking.  With vNMI, update the hardware-tracked blocking
 * bit; otherwise update KVM's software state and mirror it into the IRET
 * intercept used to detect unmasking.
 */
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!is_vnmi_enabled(svm)) {
		svm->nmi_masked = masked;
		if (masked)
			svm_set_iret_intercept(svm);
		else
			svm_clr_iret_intercept(svm);
		return;
	}

	if (masked)
		svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
	else
		svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
}
3842
/* Return true if NMI delivery to the guest is currently blocked. */
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;

	/* GIF=0 blocks all interrupts, NMIs included. */
	if (!gif_set(svm))
		return true;

	/* An NMI that will cause a nested VM-Exit is never blocked by L2. */
	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
		return false;

	if (svm_get_nmi_mask(vcpu))
		return true;

	/* Treat the guest's interrupt shadow as blocking NMIs as well. */
	return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
}
3859
/*
 * Check whether an NMI can be injected now.  Returns -EBUSY if event
 * evaluation must be retried (pending nested VMRUN, or the NMI is destined
 * to cause a nested VM-Exit), 0 if NMIs are blocked, and 1 if injection is
 * allowed.
 *
 * Fix: add the blank line after local declarations required by kernel
 * coding style (checkpatch "Missing a blank line after declarations").
 */
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_nmi_blocked(vcpu))
		return 0;

	/* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
		return -EBUSY;
	return 1;
}
3874
/* Return true if maskable interrupt delivery to the guest is blocked. */
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;

	/* GIF=0 blocks all interrupts. */
	if (!gif_set(svm))
		return true;

	if (is_guest_mode(vcpu)) {
		/*
		 * As long as interrupts are being delivered...  With
		 * V_INTR_MASKING, L1's EFLAGS.IF (in vmcb01) gates delivery;
		 * otherwise L2's EFLAGS.IF does.
		 */
		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
		    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
			return true;

		/* ... vmexits aren't blocked by the interrupt shadow */
		if (nested_exit_on_intr(svm))
			return false;
	} else {
		if (!svm_get_if_flag(vcpu))
			return true;
	}

	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
}
3900
/*
 * Returns 1 if an IRQ can be injected, 0 if IRQs are currently blocked,
 * and -EBUSY if KVM should retry after a pending transition completes.
 */
static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Nothing can be injected until the pending (nested) VMRUN completes. */
	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_interrupt_blocked(vcpu))
		return 0;

	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
		return -EBUSY;

	return 1;
}
3920
svm_enable_irq_window(struct kvm_vcpu * vcpu)3921 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3922 {
3923 struct vcpu_svm *svm = to_svm(vcpu);
3924
3925 /*
3926 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3927 * 1, because that's a separate STGI/VMRUN intercept. The next time we
3928 * get that intercept, this function will be called again though and
3929 * we'll get the vintr intercept. However, if the vGIF feature is
3930 * enabled, the STGI interception will not occur. Enable the irq
3931 * window under the assumption that the hardware will set the GIF.
3932 */
3933 if (vgif || gif_set(svm)) {
3934 /*
3935 * IRQ window is not needed when AVIC is enabled,
3936 * unless we have pending ExtINT since it cannot be injected
3937 * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3938 * and fallback to injecting IRQ via V_IRQ.
3939 *
3940 * If running nested, AVIC is already locally inhibited
3941 * on this vCPU, therefore there is no need to request
3942 * the VM wide AVIC inhibition.
3943 */
3944 if (!is_guest_mode(vcpu))
3945 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3946
3947 svm_set_vintr(svm);
3948 }
3949 }
3950
/*
 * Arrange for a VM-Exit at the point NMI injection becomes possible, e.g.
 * by single-stepping over an IRET/STI shadow or intercepting STGI.
 */
static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * If NMIs are outright masked, i.e. the vCPU is already handling an
	 * NMI, and KVM has not yet intercepted an IRET, then there is nothing
	 * more to do at this time as KVM has already enabled IRET intercepts.
	 * If KVM has already intercepted IRET, then single-step over the IRET,
	 * as NMIs aren't architecturally unmasked until the IRET completes.
	 *
	 * If vNMI is enabled, KVM should never request an NMI window if NMIs
	 * are masked, as KVM allows at most one to-be-injected NMI and one
	 * pending NMI.  If two NMIs arrive simultaneously, KVM will inject one
	 * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are
	 * unmasked.  KVM _will_ request an NMI window in some situations, e.g.
	 * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately
	 * inject the NMI.  In those situations, KVM needs to single-step over
	 * the STI shadow or intercept STGI.
	 */
	if (svm_get_nmi_mask(vcpu)) {
		WARN_ON_ONCE(is_vnmi_enabled(svm));

		if (!svm->awaiting_iret_completion)
			return; /* IRET will cause a vm exit */
	}

	/*
	 * SEV-ES guests are responsible for signaling when a vCPU is ready to
	 * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
	 * KVM can't intercept and single-step IRET to detect when NMIs are
	 * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
	 *
	 * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
	 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
	 * supported NAEs in the GHCB protocol.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (!gif_set(svm)) {
		/* Without vGIF there is no STGI intercept to arm (see above). */
		if (vgif)
			svm_set_intercept(svm, INTERCEPT_STGI);
		return; /* STGI will cause a vm exit */
	}

	/*
	 * Something prevents the NMI from being injected.  Single-step over
	 * the possible problem (IRET, exception injection, or interrupt
	 * shadow) by forcing a #DB via RFLAGS.TF.
	 */
	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
	svm->nmi_singlestep = true;
	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
4005
svm_flush_tlb_asid(struct kvm_vcpu * vcpu)4006 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
4007 {
4008 struct vcpu_svm *svm = to_svm(vcpu);
4009
4010 /*
4011 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
4012 * A TLB flush for the current ASID flushes both "host" and "guest" TLB
4013 * entries, and thus is a superset of Hyper-V's fine grained flushing.
4014 */
4015 kvm_hv_vcpu_purge_flush_tlb(vcpu);
4016
4017 /*
4018 * Flush only the current ASID even if the TLB flush was invoked via
4019 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
4020 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
4021 * unconditionally does a TLB flush on both nested VM-Enter and nested
4022 * VM-Exit (via kvm_mmu_reset_context()).
4023 */
4024 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
4025 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
4026 else
4027 svm->current_vmcb->asid_generation--;
4028 }
4029
svm_flush_tlb_current(struct kvm_vcpu * vcpu)4030 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
4031 {
4032 hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
4033
4034 /*
4035 * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
4036 * flush the NPT mappings via hypercall as flushing the ASID only
4037 * affects virtual to physical mappings, it does not invalidate guest
4038 * physical to host physical mappings.
4039 */
4040 if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
4041 hyperv_flush_guest_mapping(root_tdp);
4042
4043 svm_flush_tlb_asid(vcpu);
4044 }
4045
/* Flush all TLB entries for the vCPU (KVM uses a single ASID, see below). */
static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	/*
	 * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
	 * flushes should be routed to hv_flush_remote_tlbs() without requesting
	 * a "regular" remote flush.  Reaching this point means either there's
	 * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
	 * which might be fatal to the guest.  Yell, but try to recover.
	 */
	if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
		hv_flush_remote_tlbs(vcpu->kvm);

	/* Flushing the current ASID flushes everything KVM uses for the vCPU. */
	svm_flush_tlb_asid(vcpu);
}
4060
/* Flush a single guest virtual address from the vCPU's current ASID. */
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
	invlpga(gva, to_svm(vcpu)->vmcb->control.asid);
}
4067
static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
	/*
	 * Mark the ERAPS "register" dirty so the next VMRUN also clears the
	 * Return Address Predictor (see the ERAP_CONTROL_CLEAR_RAP handling
	 * in svm_vcpu_run()).
	 */
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);

	svm_flush_tlb_asid(vcpu);
}
4074
sync_cr8_to_lapic(struct kvm_vcpu * vcpu)4075 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
4076 {
4077 struct vcpu_svm *svm = to_svm(vcpu);
4078
4079 if (nested_svm_virtualize_tpr(vcpu))
4080 return;
4081
4082 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
4083 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
4084 kvm_set_cr8(vcpu, cr8);
4085 }
4086 }
4087
sync_lapic_to_cr8(struct kvm_vcpu * vcpu)4088 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
4089 {
4090 struct vcpu_svm *svm = to_svm(vcpu);
4091 u64 cr8;
4092
4093 if (nested_svm_virtualize_tpr(vcpu))
4094 return;
4095
4096 cr8 = kvm_get_cr8(vcpu);
4097 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
4098 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
4099 }
4100
/*
 * Fix up RIP/next_rip state when a soft interrupt or soft exception (INTn,
 * INT3, INTO) was being vectored when a VM-Exit occurred, so that the event
 * can be correctly re-injected on the next VMRUN.
 */
static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
					int type)
{
	bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
	bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
	 * associated with the original soft exception/interrupt.  next_rip is
	 * cleared on all exits that can occur while vectoring an event, so KVM
	 * needs to manually set next_rip for re-injection.  Unlike the !nrips
	 * case below, this needs to be done if and only if KVM is re-injecting
	 * the same event, i.e. if the event is a soft exception/interrupt,
	 * otherwise next_rip is unused on VMRUN.
	 */
	if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
	    kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
		svm->vmcb->control.next_rip = svm->soft_int_next_rip;
	/*
	 * If NRIPS isn't enabled, KVM must manually advance RIP prior to
	 * injecting the soft exception/interrupt.  That advancement needs to
	 * be unwound if vectoring didn't complete.  Note, the new event may
	 * not be the injected event, e.g. if KVM injected an INTn, the INTn
	 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
	 * be the reported vectored event, but RIP still needs to be unwound.
	 */
	else if (!nrips && (is_soft || is_exception) &&
		 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
		kvm_rip_write(vcpu, svm->soft_int_old_rip);
}
4132
/*
 * Process exit_int_info after a VM-Exit: if an event was being vectored when
 * the exit occurred, re-queue it in KVM's software structures so it is
 * re-injected on the next VMRUN.  Also detects IRET completion for NMI
 * unmasking when KVM is single-stepping an IRET.
 */
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 vector;
	int type;
	u32 exitintinfo = svm->vmcb->control.exit_int_info;
	/* Snapshot and clear the per-run flags before any early return. */
	bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
	bool soft_int_injected = svm->soft_int_injected;

	svm->nmi_l1_to_l2 = false;
	svm->soft_int_injected = false;

	/*
	 * If we've made progress since setting awaiting_iret_completion, we've
	 * executed an IRET and can allow NMI injection.
	 */
	if (svm->awaiting_iret_completion &&
	    kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
		svm->awaiting_iret_completion = false;
		svm->nmi_masked = false;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	/* No event was in flight when the VM-Exit occurred, nothing to do. */
	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

	/* Unwind/fix up RIP state for re-injecting soft events. */
	if (soft_int_injected)
		svm_complete_soft_interrupt(vcpu, vector, type);

	switch (type) {
	case SVM_EXITINTINFO_TYPE_NMI:
		vcpu->arch.nmi_injected = true;
		svm->nmi_l1_to_l2 = nmi_l1_to_l2;
		break;
	case SVM_EXITINTINFO_TYPE_EXEPT: {
		u32 error_code = 0;

		/*
		 * Never re-inject a #VC exception.
		 */
		if (vector == X86_TRAP_VC)
			break;

		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
			error_code = svm->vmcb->control.exit_int_info_err;

		kvm_requeue_exception(vcpu, vector,
				      exitintinfo & SVM_EXITINTINFO_VALID_ERR,
				      error_code);
		break;
	}
	case SVM_EXITINTINFO_TYPE_INTR:
		kvm_queue_interrupt(vcpu, vector, false);
		break;
	case SVM_EXITINTINFO_TYPE_SOFT:
		kvm_queue_interrupt(vcpu, vector, true);
		break;
	default:
		break;
	}

}
4204
svm_cancel_injection(struct kvm_vcpu * vcpu)4205 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4206 {
4207 struct vcpu_svm *svm = to_svm(vcpu);
4208 struct vmcb_control_area *control = &svm->vmcb->control;
4209
4210 control->exit_int_info = control->event_inj;
4211 control->exit_int_info_err = control->event_inj_err;
4212 control->event_inj = 0;
4213 svm_complete_interrupts(vcpu);
4214 }
4215
svm_vcpu_pre_run(struct kvm_vcpu * vcpu)4216 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
4217 {
4218 if (to_kvm_sev_info(vcpu->kvm)->need_init)
4219 return -EINVAL;
4220
4221 return 1;
4222 }
4223
svm_exit_handlers_fastpath(struct kvm_vcpu * vcpu)4224 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4225 {
4226 struct vcpu_svm *svm = to_svm(vcpu);
4227 struct vmcb_control_area *control = &svm->vmcb->control;
4228
4229 /*
4230 * Next RIP must be provided as IRQs are disabled, and accessing guest
4231 * memory to decode the instruction might fault, i.e. might sleep.
4232 */
4233 if (!nrips || !control->next_rip)
4234 return EXIT_FASTPATH_NONE;
4235
4236 if (is_guest_mode(vcpu))
4237 return EXIT_FASTPATH_NONE;
4238
4239 switch (control->exit_code) {
4240 case SVM_EXIT_MSR:
4241 if (!control->exit_info_1)
4242 break;
4243 return handle_fastpath_wrmsr(vcpu);
4244 case SVM_EXIT_HLT:
4245 return handle_fastpath_hlt(vcpu);
4246 case SVM_EXIT_INVD:
4247 return handle_fastpath_invd(vcpu);
4248 default:
4249 break;
4250 }
4251
4252 return EXIT_FASTPATH_NONE;
4253 }
4254
/*
 * The noinstr core of the run loop: enter guest context, execute VMRUN via
 * the assembly helpers, and return to host context.  Must not call any
 * instrumentable code between guest_state_enter_irqoff() and
 * guest_state_exit_irqoff().
 */
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	guest_state_enter_irqoff();

	/*
	 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
	 * VMRUN controls whether or not physical IRQs are masked (KVM always
	 * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
	 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
	 * into guest state if delivery of an event during VMRUN triggers a
	 * #VMEXIT, and the guest_state transitions already tell lockdep that
	 * IRQs are being enabled/disabled.  Note! GIF=0 for the entirety of
	 * this path, so IRQs aren't actually unmasked while running host code.
	 */
	raw_local_irq_enable();

	/* Mitigation for the SRSO/DIV0 speculation issue (erratum dependent). */
	amd_clear_divider();

	/* SEV-ES guests need the host save area for VMSA-based state. */
	if (sev_es_guest(vcpu->kvm))
		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
				      sev_es_host_save_area(sd));
	else
		__svm_vcpu_run(svm, spec_ctrl_intercepted);

	raw_local_irq_disable();

	guest_state_exit_irqoff();
}
4286
/*
 * Top-level VM-Entry/VM-Exit path: sync vCPU state into the VMCB, run the
 * guest, then harvest exit state, complete event injection, and return a
 * fastpath verdict so trivial exits can be handled with IRQs still off.
 */
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
	struct vcpu_svm *svm = to_svm(vcpu);
	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* RAX/RSP/RIP are loaded/saved by VMRUN via the VMCB save area. */
	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	/*
	 * Disable singlestep if we're injecting an interrupt/exception.
	 * We don't want our modified rflags to be pushed on the stack where
	 * we might not be able to easily reset them if we disabled NMI
	 * singlestep later.
	 */
	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
		/*
		 * Event injection happens before external interrupts cause a
		 * vmexit and interrupts are disabled here, so smp_send_reschedule
		 * is enough to force an immediate vmexit.
		 */
		disable_nmi_singlestep(svm);
		force_immediate_exit = true;
	}

	if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	if (pre_svm_run(vcpu)) {
		/* ASID/VMCB setup failed; punt the failure to userspace. */
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
		vcpu->run->fail_entry.cpu = vcpu->cpu;
		return EXIT_FASTPATH_EXIT_USERSPACE;
	}

	sync_lapic_to_cr8(vcpu);

	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
		svm->vmcb->control.asid = svm->asid;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}
	svm->vmcb->save.cr2 = vcpu->arch.cr2;

	/* Clear the Return Address Predictor if a flush marked it dirty. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
	    kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;

	svm_hv_update_vp_id(svm->vmcb, vcpu);

	/*
	 * Run with all-zero DR6 unless the guest can write DR6 freely, so that
	 * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
	 * KVM's snapshot is only necessary when DR accesses won't exit.
	 */
	if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
		svm_set_dr6(vcpu, vcpu->arch.dr6);
	else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
		svm_set_dr6(vcpu, DR6_ACTIVE_LOW);

	clgi();

	/*
	 * Hardware only context switches DEBUGCTL if LBR virtualization is
	 * enabled.  Manually load DEBUGCTL if necessary (and restore it after
	 * VM-Exit), as running with the host's DEBUGCTL can negatively affect
	 * guest state and can even be fatal, e.g. due to Bus Lock Detect.
	 */
	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
		update_debugctlmsr(svm->vmcb->save.dbgctl);

	kvm_wait_lapic_expire(vcpu);

	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero.  Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);

	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);

	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);

	/* SEV-ES state lives in the encrypted VMSA; don't read it back. */
	if (!sev_es_guest(vcpu->kvm)) {
		vcpu->arch.cr2 = svm->vmcb->save.cr2;
		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
	}
	vcpu->arch.regs_dirty = 0;

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

	/* Restore the host's DEBUGCTL if it was manually loaded above. */
	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	stgi();

	/* Any pending NMI will happen here */

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_after_interrupt(vcpu);

	sync_cr8_to_lapic(vcpu);

	svm->next_rip = 0;
	if (is_guest_mode(vcpu)) {
		nested_sync_control_from_vmcb02(svm);

		/* Track VMRUNs that have made past consistency checking */
		if (svm->nested.nested_run_pending &&
		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
			++vcpu->stat.nested_run;

		svm->nested.nested_run_pending = 0;
	}

	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

	/*
	 * Unconditionally mask off the CLEAR_RAP bit, the AND is just as cheap
	 * as the TEST+Jcc to avoid it.
	 */
	if (cpu_feature_enabled(X86_FEATURE_ERAPS))
		svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;

	vmcb_mark_all_clean(svm->vmcb);

	/* if exit due to PF check for async PF */
	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
		vcpu->arch.apf.host_apf_flags =
			kvm_read_and_reset_apf_flags();

	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;

	/* Resync the PMU global control if the guest can write it directly. */
	if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
		rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);

	trace_kvm_exit(vcpu, KVM_ISA_SVM);

	svm_complete_interrupts(vcpu);

	return svm_exit_handlers_fastpath(vcpu);
}
4440
/*
 * Load the new MMU root: with NPT the root goes into nested_cr3 and the
 * guest's own CR3 is kept in the save area; without NPT the root itself
 * becomes the effective CR3.
 */
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
			     int root_level)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long cr3;

	if (npt_enabled) {
		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);

		hv_track_root_tdp(vcpu, root_hpa);

		/* With NPT, the guest's CR3 is used verbatim. */
		cr3 = vcpu->arch.cr3;
	} else if (root_level >= PT64_ROOT_4LEVEL) {
		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
	} else {
		/* PCID in the guest should be impossible with a 32-bit MMU. */
		WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
		cr3 = root_hpa;
	}

	svm->vmcb->save.cr3 = cr3;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
4465
/*
 * Rewrite the guest's hypercall site with VMMCALL (0F 01 D9), the native
 * hypercall instruction on SVM hardware.
 */
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	static const unsigned char vmmcall[3] = { 0x0f, 0x01, 0xd9 };

	memcpy(hypercall, vmmcall, sizeof(vmmcall));
}
4476
4477 /*
4478 * The kvm parameter can be NULL (module initialization, or invocation before
4479 * VM creation). Be sure to check the kvm parameter before using it.
4480 */
/*
 * Report whether KVM emulates @index for this VM.  Returns false for MSRs
 * that only exist on VMX (the emulated VMX range, MCG_EXT_CTL) and for
 * SMBASE when SMM is compiled out or unsupported (SEV-ES).
 *
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation).  Be sure to check the kvm parameter before using it.
 */
static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
	switch (index) {
	case MSR_IA32_MCG_EXT_CTL:
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		return false;
	case MSR_IA32_SMBASE:
		if (!IS_ENABLED(CONFIG_KVM_SMM))
			return false;
		/* SEV-ES guests do not support SMM, so report false */
		if (kvm && sev_es_guest(kvm))
			return false;
		break;
	default:
		break;
	}

	return true;
}
4500
/* Recompute SVM-specific capability state after userspace sets guest CPUID. */
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
	 * can only disable all variants of by disallowing CR4.OSXSAVE from
	 * being set.  As a result, if the host has XSAVE and XSAVES, and the
	 * guest has XSAVE enabled, the guest can execute XSAVES without
	 * faulting.  Treat XSAVES as enabled in this case regardless of
	 * whether it's advertised to the guest so that KVM context switches
	 * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
	 * the guest read/write access to the host's XSS.
	 */
	guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
			     boot_cpu_has(X86_FEATURE_XSAVES) &&
			     guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));

	/*
	 * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
	 * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
	 * SVM on Intel is bonkers and extremely unlikely to work).
	 */
	if (guest_cpuid_is_intel_compatible(vcpu))
		guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);

	/* SEV has additional CPUID-driven adjustments (see sev.c). */
	if (sev_guest(vcpu->kvm))
		sev_vcpu_after_set_cpuid(svm);
}
4530
/* SVM always supports intercepting WBINVD. */
static bool svm_has_wbinvd_exit(void)
{
	return true;
}
4535
/*
 * Map each emulator intercept point to its SVM exit code and the emulation
 * stage at which the intercept check applies (before exception checks, after
 * exception checks, or after the memory access).  Consumed by
 * svm_check_intercept() when emulating on behalf of a nested guest.
 */
#define PRE_EX(exit)  { .exit_code = (exit), \
			.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_MEMACCESS, }

static const struct __x86_intercept {
	u32 exit_code;
	enum x86_intercept_stage stage;
} x86_intercept_map[] = {
	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM
4599
/*
 * Called by the emulator when emulating an instruction on behalf of L2:
 * determine whether the instruction would have been intercepted by L1's
 * vmcb12 controls, and if so synthesize the corresponding nested VM-Exit
 * (exit code + exit_info) and report X86EMUL_INTERCEPTED.
 */
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	/* Instructions with no mapping can't be intercepted by L1. */
	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	/* Only evaluate the intercept at its designated emulation stage. */
	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		/* CRn reads share one map entry; offset by the CR number. */
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;

		/*
		 * Adjust the exit code accordingly if a CR other than CR0 is
		 * being written, and skip straight to the common handling as
		 * only CR0 has an additional selective intercept.
		 */
		if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
			icpt_info.exit_code += info->modrm_reg;
			break;
		}

		/*
		 * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
		 * selective CR0 intercept is triggered (the common logic will
		 * treat the selective intercept as being enabled).  Note, the
		 * unconditional intercept has higher priority, i.e. this is
		 * only relevant if *only* the selective intercept is enabled.
		 */
		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
		    !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
			break;

		/* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
		if (info->intercept == x86_intercept_clts)
			break;

		/* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
		if (info->intercept == x86_intercept_lmsw) {
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
			break;
		}

		/*
		 * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
		 * other than SVM_CR0_SELECTIVE_MASK is changed.
		 */
		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		/* DRn accesses share one map entry; offset by the DR number. */
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		/* exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * We get this for NOP only, but pause
		 * is rep not, check this here
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		/* Build exit_info_1 in the architectural IOIO format. */
		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip  = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}
4733
svm_handle_exit_irqoff(struct kvm_vcpu * vcpu)4734 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4735 {
4736 switch (to_svm(vcpu)->vmcb->control.exit_code) {
4737 case SVM_EXIT_EXCP_BASE + MC_VECTOR:
4738 svm_handle_mce(vcpu);
4739 break;
4740 case SVM_EXIT_INTR:
4741 vcpu->arch.at_instruction_boundary = true;
4742 break;
4743 default:
4744 break;
4745 }
4746 }
4747
/* Constrain the guest's MCG_CAP to the bits KVM supports on SVM. */
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}
4753
4754 #ifdef CONFIG_KVM_SMM
svm_smi_blocked(struct kvm_vcpu * vcpu)4755 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4756 {
4757 struct vcpu_svm *svm = to_svm(vcpu);
4758
4759 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4760 if (!gif_set(svm))
4761 return true;
4762
4763 return is_smm(vcpu);
4764 }
4765
/*
 * Returns 1 if an SMI can be injected, 0 if SMIs are currently blocked,
 * and -EBUSY if KVM should retry after a pending transition completes.
 */
static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Nothing can be injected until the pending (nested) VMRUN completes. */
	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_smi_blocked(vcpu))
		return 0;

	/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
		return -EBUSY;

	return 1;
}
4781
/*
 * Save nested (L2) state into SMRAM and synthesize a VM-Exit to L1 before
 * entering SMM, so that RSM can later resume L2.  Returns 0 on success,
 * non-zero to signal failure to the common SMM code.
 */
static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map_save;
	int ret;

	/* Nothing special to do if the SMI arrived while running L1. */
	if (!is_guest_mode(vcpu))
		return 0;

	/*
	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
	 */

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return 1;

	/* Record that an L2 guest was active, and where its VMCB12 lives. */
	smram->smram64.svm_guest_flag = 1;
	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;

	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	/* Force a nested VM-Exit so SMM runs in L1 context. */
	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
	if (ret)
		return ret;

	/*
	 * KVM uses VMCB01 to store L1 host state while L2 runs but
	 * VMCB01 is going to be used during SMM and thus the state will
	 * be lost.  Temporarily save non-VMLOAD/VMSAVE state to the host save
	 * area pointed to by MSR_VM_HSAVE_PA.  APM guarantees that the
	 * format of the area is identical to guest save area offsetted
	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
	 * within 'struct vmcb').  Note: HSAVE area may also be used by
	 * L1 hypervisor to save additional host context (e.g. KVM does
	 * that, see svm_prepare_switch_to_guest()) which must be
	 * preserved.
	 */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		return 1;

	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

	svm_copy_vmrun_state(map_save.hva + 0x400,
			     &svm->vmcb01.ptr->save);

	kvm_vcpu_unmap(vcpu, &map_save);
	return 0;
}
4833
/*
 * Undo svm_enter_smm() on RSM: if SMRAM records that the SMI interrupted an
 * L2 guest, restore L1's host state from the HSAVE area and re-enter the
 * nested guest using the cached VMCB12.  Sanity-checks that the (possibly
 * SMM-modified) state still permits nested SVM before doing so.
 */
static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map, map_save;
	struct vmcb *vmcb12;
	int ret;

	const struct kvm_smram_state_64 *smram64 = &smram->smram64;

	/* 32-bit SMRAM has no SVM state to restore (see svm_enter_smm()). */
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return 0;

	/* Non-zero if SMI arrived while vCPU was in guest mode. */
	if (!smram64->svm_guest_flag)
		return 0;

	/* SMRAM claims guest mode, but nested SVM isn't even possible. */
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
		return 1;

	if (!(smram64->efer & EFER_SVME))
		return 1;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
		return 1;

	ret = 1;
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		goto unmap_map;

	if (svm_allocate_nested(svm))
		goto unmap_save;

	/*
	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
	 * used during SMM (see svm_enter_smm())
	 */

	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);

	/*
	 * Enter the nested guest now
	 */

	vmcb_mark_all_dirty(svm->vmcb01.ptr);

	vmcb12 = map.hva;
	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
	ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);

	if (ret)
		goto unmap_save;

	svm->nested.nested_run_pending = 1;

unmap_save:
	kvm_vcpu_unmap(vcpu, &map_save);
unmap_map:
	kvm_vcpu_unmap(vcpu, &map);
	return ret;
}
4895
svm_enable_smi_window(struct kvm_vcpu * vcpu)4896 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4897 {
4898 struct vcpu_svm *svm = to_svm(vcpu);
4899
4900 if (!gif_set(svm)) {
4901 if (vgif)
4902 svm_set_intercept(svm, INTERCEPT_STGI);
4903 /* STGI will cause a vm exit */
4904 } else {
4905 /* We must be in SMM; RSM will cause a vmexit anyway. */
4906 }
4907 }
4908 #endif
4909
/*
 * Decide whether KVM can legally emulate the intercepted instruction.
 * Returns an X86EMUL_* code: CONTINUE to emulate, RETRY_INSTR to resume the
 * guest without emulating, PROPAGATE_FAULT after injecting an event, or
 * UNHANDLEABLE{,_VECTORING} to punt.  The bulk of this handles SEV guests,
 * whose encrypted state/memory restricts what the emulator may read.
 */
static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					 void *insn, int insn_len)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool smep, smap, is_user;
	u64 error_code;

	/* Check that emulation is possible during event vectoring */
	if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
	    !kvm_can_emulate_event_vectoring(emul_type))
		return X86EMUL_UNHANDLEABLE_VECTORING;

	/* Emulation is always possible when KVM has access to all guest state. */
	if (!sev_guest(vcpu->kvm))
		return X86EMUL_CONTINUE;

	/* #UD and #GP should never be intercepted for SEV guests. */
	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
				  EMULTYPE_TRAP_UD_FORCED |
				  EMULTYPE_VMWARE_GP));

	/*
	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
	 * to guest register state.
	 */
	if (sev_es_guest(vcpu->kvm))
		return X86EMUL_RETRY_INSTR;

	/*
	 * Emulation is possible if the instruction is already decoded, e.g.
	 * when completing I/O after returning from userspace.
	 */
	if (emul_type & EMULTYPE_NO_DECODE)
		return X86EMUL_CONTINUE;

	/*
	 * Emulation is possible for SEV guests if and only if a prefilled
	 * buffer containing the bytes of the intercepted instruction is
	 * available. SEV guest memory is encrypted with a guest specific key
	 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
	 * decode garbage.
	 *
	 * If KVM is NOT trying to simply skip an instruction, inject #UD if
	 * KVM reached this point without an instruction buffer. In practice,
	 * this path should never be hit by a well-behaved guest, e.g. KVM
	 * doesn't intercept #UD or #GP for SEV guests, but this path is still
	 * theoretically reachable, e.g. via unaccelerated fault-like AVIC
	 * access, and needs to be handled by KVM to avoid putting the guest
	 * into an infinite loop. Injecting #UD is somewhat arbitrary, but
	 * it's the least awful option given lack of insight into the guest.
	 *
	 * If KVM is trying to skip an instruction, simply resume the guest.
	 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
	 * will attempt to re-inject the INT3/INTO and skip the instruction.
	 * In that scenario, retrying the INT3/INTO and hoping the guest will
	 * make forward progress is the only option that has a chance of
	 * success (and in practice it will work the vast majority of the time).
	 */
	if (unlikely(!insn)) {
		if (emul_type & EMULTYPE_SKIP)
			return X86EMUL_UNHANDLEABLE;

		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/*
	 * Emulate for SEV guests if the insn buffer is not empty. The buffer
	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
	 * the faulting instruction because the code fetch itself faulted, e.g.
	 * the guest attempted to fetch from emulated MMIO or a guest page
	 * table used to translate CS:RIP resides in emulated MMIO.
	 */
	if (likely(insn_len))
		return X86EMUL_CONTINUE;

	/*
	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
	 *
	 * Errata:
	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
	 * possible that CPU microcode implementing DecodeAssist will fail to
	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
	 * be '0'. This happens because microcode reads CS:RIP using a _data_
	 * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
	 * gives up and does not fill the instruction bytes buffer.
	 *
	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
	 * GuestIntrBytes field of the VMCB.
	 *
	 * This does _not_ mean that the erratum has been encountered, as the
	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
	 * #PF, e.g. if the guest attempt to execute from emulated MMIO and
	 * encountered a reserved/not-present #PF.
	 *
	 * To hit the erratum, the following conditions must be true:
	 * 1. CR4.SMAP=1 (obviously).
	 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
	 *    have been hit as the guest would have encountered a SMEP
	 *    violation #PF, not a #NPF.
	 * 3. The #NPF is not due to a code fetch, in which case failure to
	 *    retrieve the instruction bytes is legitimate (see above).
	 *
	 * In addition, don't apply the erratum workaround if the #NPF occurred
	 * while translating guest page tables (see below).
	 */
	error_code = svm->vmcb->control.exit_info_1;
	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
		goto resume_guest;

	smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
	smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
	is_user = svm_get_cpl(vcpu) == 3;
	if (smap && (!smep || is_user)) {
		pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");

		/*
		 * If the fault occurred in userspace, arbitrarily inject #GP
		 * to avoid killing the guest and to hopefully avoid confusing
		 * the guest kernel too much, e.g. injecting #PF would not be
		 * coherent with respect to the guest's page tables. Request
		 * triple fault if the fault occurred in the kernel as there's
		 * no fault that KVM can inject without confusing the guest.
		 * In practice, the triple fault is moot as no sane SEV kernel
		 * will execute from user memory while also running with SMAP=1.
		 */
		if (is_user)
			kvm_inject_gp(vcpu, 0);
		else
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return X86EMUL_PROPAGATE_FAULT;
	}

resume_guest:
	/*
	 * If the erratum was not hit, simply resume the guest and let it fault
	 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
	 * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
	 * userspace will kill the guest, and letting the emulator read garbage
	 * will yield random behavior and potentially corrupt the guest.
	 *
	 * Simply resuming the guest is technically not a violation of the SEV
	 * architecture. AMD's APM states that all code fetches and page table
	 * accesses for SEV guest are encrypted, regardless of the C-Bit. The
	 * APM also states that encrypted accesses to MMIO are "ignored", but
	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
	 * the guest spin is technically "ignoring" the access.
	 */
	return X86EMUL_RETRY_INSTR;
}
5062
svm_apic_init_signal_blocked(struct kvm_vcpu * vcpu)5063 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
5064 {
5065 struct vcpu_svm *svm = to_svm(vcpu);
5066
5067 return !gif_set(svm);
5068 }
5069
static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
	/* SEV-ES guests need SEV-specific SIPI handling; others use common code. */
	if (sev_es_guest(vcpu->kvm))
		sev_vcpu_deliver_sipi_vector(vcpu, vector);
	else
		kvm_vcpu_deliver_sipi_vector(vcpu, vector);
}
5077
static void svm_vm_destroy(struct kvm *kvm)
{
	/* Tear down per-VM AVIC and SEV state, then the SRSO VM bookkeeping. */
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);
	svm_srso_vm_destroy();
}
5085
svm_vm_init(struct kvm * kvm)5086 static int svm_vm_init(struct kvm *kvm)
5087 {
5088 int type = kvm->arch.vm_type;
5089
5090 if (type != KVM_X86_DEFAULT_VM &&
5091 type != KVM_X86_SW_PROTECTED_VM) {
5092 kvm->arch.has_protected_state =
5093 (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
5094 to_kvm_sev_info(kvm)->need_init = true;
5095
5096 kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
5097 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
5098 }
5099
5100 if (!pause_filter_count || !pause_filter_thresh)
5101 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
5102
5103 if (enable_apicv) {
5104 int ret = avic_vm_init(kvm);
5105 if (ret)
5106 return ret;
5107 }
5108
5109 svm_srso_vm_init();
5110 return 0;
5111 }
5112
svm_alloc_apic_backing_page(struct kvm_vcpu * vcpu)5113 static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
5114 {
5115 struct page *page = snp_safe_alloc_page();
5116
5117 if (!page)
5118 return NULL;
5119
5120 return page_address(page);
5121 }
5122
/*
 * SVM implementation of the vendor-agnostic kvm_x86_ops vtable.  Entries
 * left unset or NULL-ed here (and in svm_hardware_setup()) fall back to
 * common-code behavior.
 */
struct kvm_x86_ops svm_x86_ops __initdata = {
	.name = KBUILD_MODNAME,

	.check_processor_compatibility = svm_check_processor_compat,

	.hardware_unsetup = svm_hardware_unsetup,
	.enable_virtualization_cpu = svm_enable_virtualization_cpu,
	.disable_virtualization_cpu = svm_disable_virtualization_cpu,
	.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
	.has_emulated_msr = svm_has_emulated_msr,

	.vcpu_precreate = svm_vcpu_precreate,
	.vcpu_create = svm_vcpu_create,
	.vcpu_free = svm_vcpu_free,
	.vcpu_reset = svm_vcpu_reset,

	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = avic_vcpu_blocking,
	.vcpu_unblocking = avic_vcpu_unblocking,

	.update_exception_bitmap = svm_update_exception_bitmap,
	.get_feature_msr = svm_get_feature_msr,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cpl_no_cache = svm_get_cpl,
	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
	.is_valid_cr0 = svm_is_valid_cr0,
	.set_cr0 = svm_set_cr0,
	.post_set_cr3 = sev_post_set_cr3,
	.is_valid_cr4 = svm_is_valid_cr4,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,
	.get_if_flag = svm_get_if_flag,

	.flush_tlb_all = svm_flush_tlb_all,
	.flush_tlb_current = svm_flush_tlb_current,
	.flush_tlb_gva = svm_flush_tlb_gva,
	.flush_tlb_guest = svm_flush_tlb_guest,

	.vcpu_pre_run = svm_vcpu_pre_run,
	.vcpu_run = svm_vcpu_run,
	.handle_exit = svm_handle_exit,
	.skip_emulated_instruction = svm_skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.inject_irq = svm_inject_irq,
	.inject_nmi = svm_inject_nmi,
	.is_vnmi_pending = svm_is_vnmi_pending,
	.set_vnmi_pending = svm_set_vnmi_pending,
	.inject_exception = svm_inject_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = svm_enable_nmi_window,
	.enable_irq_window = svm_enable_irq_window,
	.update_cr8_intercept = svm_update_cr8_intercept,

	/* AVIC (virtual APIC) hooks. */
	.x2apic_icr_is_split = true,
	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
	.apicv_post_state_restore = avic_apicv_post_state_restore,
	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,

	.get_exit_info = svm_get_exit_info,
	.get_entry_info = svm_get_entry_info,

	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
	.write_tsc_offset = svm_write_tsc_offset,
	.write_tsc_multiplier = svm_write_tsc_multiplier,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.nested_ops = &svm_nested_ops,

	.deliver_interrupt = svm_deliver_interrupt,
	.pi_update_irte = avic_pi_update_irte,
	.setup_mce = svm_setup_mce,

#ifdef CONFIG_KVM_SMM
	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
	.leave_smm = svm_leave_smm,
	.enable_smi_window = svm_enable_smi_window,
#endif

#ifdef CONFIG_KVM_AMD_SEV
	.dev_get_attr = sev_dev_get_attr,
	.mem_enc_ioctl = sev_mem_enc_ioctl,
	.mem_enc_register_region = sev_mem_enc_register_region,
	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
	.guest_memory_reclaimed = sev_guest_memory_reclaimed,

	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
#endif
	.check_emulate_instruction = svm_check_emulate_instruction,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,

	.recalc_intercepts = svm_recalc_intercepts,
	.complete_emulated_msr = svm_complete_emulated_msr,

	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
	.alloc_apic_backing_page = svm_alloc_apic_backing_page,

	.gmem_prepare = sev_gmem_prepare,
	.gmem_invalidate = sev_gmem_invalidate,
	.gmem_max_mapping_level = sev_gmem_max_mapping_level,
};
5264
5265 /*
5266 * The default MMIO mask is a single bit (excluding the present bit),
5267 * which could conflict with the memory encryption bit. Check for
5268 * memory encryption support and override the default MMIO mask if
5269 * memory encryption is enabled.
5270 */
svm_adjust_mmio_mask(void)5271 static __init void svm_adjust_mmio_mask(void)
5272 {
5273 unsigned int enc_bit, mask_bit;
5274 u64 msr, mask;
5275
5276 /* If there is no memory encryption support, use existing mask */
5277 if (cpuid_eax(0x80000000) < 0x8000001f)
5278 return;
5279
5280 /* If memory encryption is not enabled, use existing mask */
5281 rdmsrq(MSR_AMD64_SYSCFG, msr);
5282 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
5283 return;
5284
5285 enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
5286 mask_bit = boot_cpu_data.x86_phys_bits;
5287
5288 /* Increment the mask bit if it is the same as the encryption bit */
5289 if (enc_bit == mask_bit)
5290 mask_bit++;
5291
5292 /*
5293 * If the mask bit location is below 52, then some bits above the
5294 * physical addressing limit will always be reserved, so use the
5295 * rsvd_bits() function to generate the mask. This mask, along with
5296 * the present bit, will be used to generate a page fault with
5297 * PFER.RSV = 1.
5298 *
5299 * If the mask bit location is 52 (or above), then clear the mask.
5300 */
5301 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
5302
5303 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
5304 }
5305
/*
 * Configure the CPUID capabilities KVM advertises to guests on SVM hosts,
 * based on host features and module parameters (nested, nrips, npt_enabled,
 * tsc_scaling, vls, lbrv, vgif, vnmi, enable_pmu).  Runs once at setup,
 * after the module params have been sanitized by svm_hardware_setup().
 */
static __init void svm_set_cpu_caps(void)
{
	kvm_initialize_cpu_caps();

	kvm_caps.supported_perf_cap = 0;

	kvm_cpu_cap_clear(X86_FEATURE_IBT);

	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
		kvm_cpu_cap_set(X86_FEATURE_SVM);
		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);

		/*
		 * KVM currently flushes TLBs on *every* nested SVM transition,
		 * and so for all intents and purposes KVM supports flushing by
		 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
		 */
		kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);

		if (nrips)
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);

		if (tsc_scaling)
			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);

		if (vls)
			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
		if (lbrv)
			kvm_cpu_cap_set(X86_FEATURE_LBRV);

		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);

		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

		if (vgif)
			kvm_cpu_cap_set(X86_FEATURE_VGIF);

		if (vnmi)
			kvm_cpu_cap_set(X86_FEATURE_VNMI);

		/* Nested VM can receive #VMEXIT instead of triggering #GP */
		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
	}

	if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
		kvm_caps.has_bus_lock_exit = true;

	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	if (enable_pmu) {
		/*
		 * Enumerate support for PERFCTR_CORE if and only if KVM has
		 * access to enough counters to virtualize "core" support,
		 * otherwise limit vPMU support to the legacy number of counters.
		 */
		if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
			kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
							  kvm_pmu_cap.num_counters_gp);
		else
			kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);

		/* PERFMON_V2 requires version 2 of the PMU and PERFCTR_CORE. */
		if (kvm_pmu_cap.version != 2 ||
		    !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
			kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
	}

	/* CPUID 0x8000001F (SME/SEV features) */
	sev_set_cpu_caps();

	/*
	 * Clear capabilities that are automatically configured by common code,
	 * but that require explicit SVM support (that isn't yet implemented).
	 */
	kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
	kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM);

	kvm_setup_xss_caps();
	kvm_finalize_cpu_caps();
}
5394
/*
 * One-time module-load hardware setup: validate host features, sanitize the
 * module parameters against hardware support, allocate global resources
 * (IOPM, per-CPU data) and configure the MMU/SEV/AVIC subsystems.  Order
 * matters throughout, see the inline comments.  Returns 0 or -errno.
 */
static __init int svm_hardware_setup(void)
{
	void *iopm_va;
	int cpu, r;

	/*
	 * NX is required for shadow paging and for NPT if the NX huge pages
	 * mitigation is enabled.
	 */
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}
	kvm_enable_efer_bits(EFER_NX);

	/* MPX (BNDREGS/BNDCSR) is never exposed on SVM hosts. */
	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
				     XFEATURE_MASK_BNDCSR);

	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

	if (tsc_scaling) {
		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
			tsc_scaling = false;
		} else {
			pr_info("TSC scaling supported\n");
			kvm_caps.has_tsc_control = true;
		}
	}
	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
	kvm_caps.tsc_scaling_ratio_frac_bits = 32;

	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

	if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
		kvm_enable_efer_bits(EFER_AUTOIBRS);

	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

	if (nested) {
		pr_info("Nested Virtualization enabled\n");
		kvm_enable_efer_bits(EFER_SVME);
		if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ))
			kvm_enable_efer_bits(EFER_LMSLE);

		r = nested_svm_init_msrpm_merge_offsets();
		if (r)
			return r;
	}

	/*
	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
	 * NPT isn't supported if the host is using 2-level paging since host
	 * CR4 is unchanged on VMRUN.
	 */
	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
		npt_enabled = false;

	if (!boot_cpu_has(X86_FEATURE_NPT))
		npt_enabled = false;

	/* Force VM NPT level equal to the host's paging level */
	kvm_configure_mmu(npt_enabled, get_npt_level(),
			  get_npt_level(), PG_LEVEL_1G);
	pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));

	/*
	 * It seems that on AMD processors PTE's accessed bit is
	 * being set by the CPU hardware before the NPF vmexit.
	 * This is not expected behaviour and our tests fail because
	 * of it.
	 * A workaround here is to disable support for
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
	 * In this case userspace can know if there is support using
	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
	 * it
	 * If future AMD CPU models change the behaviour described above,
	 * this variable can be changed accordingly
	 */
	allow_smaller_maxphyaddr = !npt_enabled;

	/* Setup shadow_me_value and shadow_me_mask */
	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);

	svm_adjust_mmio_mask();

	nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);

	if (lbrv) {
		if (!boot_cpu_has(X86_FEATURE_LBRV))
			lbrv = false;
		else
			pr_info("LBR virtualization supported\n");
	}

	iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
	if (!iopm_va)
		return -ENOMEM;

	/* The IOPM is referenced by physical address, with the SME C-bit. */
	iopm_base = __sme_set(__pa(iopm_va));

	/*
	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
	 * may be modified by svm_adjust_mmio_mask()), as well as nrips.
	 */
	sev_hardware_setup();

	svm_hv_hardware_setup();

	enable_apicv = avic_hardware_setup();
	if (!enable_apicv) {
		/* Without AVIC, the blocking/inhibit hooks serve no purpose. */
		enable_ipiv = false;
		svm_x86_ops.vcpu_blocking = NULL;
		svm_x86_ops.vcpu_unblocking = NULL;
		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
	}

	if (vls) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
		svm_gp_erratum_intercept = false;

	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

	/* vNMI depends on vGIF and on hardware support. */
	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
	if (vnmi)
		pr_info("Virtual NMI enabled\n");

	if (!vnmi) {
		svm_x86_ops.is_vnmi_pending = NULL;
		svm_x86_ops.set_vnmi_pending = NULL;
	}

	if (!enable_pmu)
		pr_info("PMU virtualization is disabled\n");

	/* Must run after all module params above have been finalized. */
	svm_set_cpu_caps();

	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;

	for_each_possible_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err;
	}

	return 0;

err:
	svm_hardware_unsetup();
	return r;
}
5566
5567
/* Init-time ops handed to kvm_x86_vendor_init() (see svm_init()). */
static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.hardware_setup = svm_hardware_setup,

	.runtime_ops = &svm_x86_ops,
	.pmu_ops = &amd_pmu_ops,
};
5574
/* Teardown shared by module exit and the svm_init() error path. */
static void __svm_exit(void)
{
	kvm_x86_vendor_exit();
}
5579
/*
 * Module entry point: verify SVM support, perform vendor init, then do the
 * common KVM init.  Returns 0 or -errno; on failure all vendor state is
 * unwound via __svm_exit().
 */
static int __init svm_init(void)
{
	int r;

	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);

	__unused_size_checks();

	if (!kvm_is_svm_supported())
		return -EOPNOTSUPP;

	r = kvm_x86_vendor_init(&svm_init_ops);
	if (r)
		return r;

	/*
	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
	 * exposed to userspace!
	 */
	r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
		     THIS_MODULE);
	if (r)
		goto err_kvm_init;

	return 0;

err_kvm_init:
	__svm_exit();
	return r;
}
5610
/* Module exit: tear down common KVM first, then vendor state. */
static void __exit svm_exit(void)
{
	kvm_exit();
	__svm_exit();
}
5616
5617 module_init(svm_init)
5618 module_exit(svm_exit)
5619