#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "smm.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/reboot.h>
#include <asm/fpu/api.h>

#include <trace/events/ipi.h>

#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);

#define X2APIC_MSR(x)	(APIC_BASE_MSR + (x >> 4))

static const struct svm_direct_access_msrs {
	u32 index;	/* Index of the MSR */
	bool always;	/* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR,				.always = true  },
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
	{ .index = MSR_IA32_PRED_CMD,			.always = false },
	{ .index = MSR_IA32_FLUSH_CMD,			.always = false },
	{ .index = MSR_IA32_DEBUGCTLMSR,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_IA32_XSS,			.always = false },
	{ .index = MSR_EFER,				.always = false },
	{ .index = MSR_IA32_CR_PAT,			.always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
	{ .index = MSR_TSC_AUX,				.always = false },
	{ .index = X2APIC_MSR(APIC_ID),			.always = false },
	{ .index = X2APIC_MSR(APIC_LVR),		.always = false },
	{ .index = X2APIC_MSR(APIC_TASKPRI),		.always = false },
	{ .index = X2APIC_MSR(APIC_ARBPRI),		.always = false },
	{ .index = X2APIC_MSR(APIC_PROCPRI),		.always = false },
	{ .index = X2APIC_MSR(APIC_EOI),		.always = false },
	{ .index = X2APIC_MSR(APIC_RRR),		.always = false },
	{ .index = X2APIC_MSR(APIC_LDR),		.always = false },
	{ .index = X2APIC_MSR(APIC_DFR),		.always = false },
	{ .index = X2APIC_MSR(APIC_SPIV),		.always = false },
	{ .index = X2APIC_MSR(APIC_ISR),		.always = false },
	{ .index = X2APIC_MSR(APIC_TMR),		.always = false },
	{ .index = X2APIC_MSR(APIC_IRR),		.always = false },
	{ .index = X2APIC_MSR(APIC_ESR),		.always = false },
	{ .index = X2APIC_MSR(APIC_ICR),		.always = false },
	{ .index = X2APIC_MSR(APIC_ICR2),		.always = false },

	/*
	 * Note:
	 * AMD does not virtualize APIC TSC-deadline timer mode, but it is
	 * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
	 * the AVIC hardware would generate GP fault. Therefore, always
	 * intercept the MSR 0x832, and do not setup direct_access_msr.
	 */
	{ .index = X2APIC_MSR(APIC_LVTTHMR),		.always = false },
	{ .index = X2APIC_MSR(APIC_LVTPC),		.always = false },
	{ .index = X2APIC_MSR(APIC_LVT0),		.always = false },
	{ .index = X2APIC_MSR(APIC_LVT1),		.always = false },
	{ .index = X2APIC_MSR(APIC_LVTERR),		.always = false },
	{ .index = X2APIC_MSR(APIC_TMICT),		.always = false },
	{ .index = X2APIC_MSR(APIC_TMCCT),		.always = false },
	{ .index = X2APIC_MSR(APIC_TDCR),		.always = false },
	{ .index = MSR_INVALID,				.always = false },
};
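
/*
 * Worked example (editorial, not in the original source; assumes
 * APIC_BASE_MSR == 0x800 as defined in <asm/apicdef.h>): X2APIC_MSR()
 * converts an xAPIC MMIO register offset into its x2APIC MSR index by
 * shifting out the low 4 bits and adding APIC_BASE_MSR.  E.g. the TPR lives
 * at xAPIC offset 0x80, so X2APIC_MSR(APIC_TASKPRI) = 0x800 + (0x80 >> 4)
 * = 0x808, the architectural x2APIC TPR MSR.
 */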

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *	upper bound on the amount of time a guest is allowed to execute in a
 *	pause loop. In this mode, a 16-bit pause filter threshold field is
 *	added in the VMCB. The threshold value is a cycle count that is used
 *	to reset the pause counter. As with simple pause filtering, VMRUN
 *	loads the pause count value from VMCB into an internal counter. Then,
 *	on each pause instruction the hardware checks the elapsed number of
 *	cycles since the most recent pause instruction against the pause
 *	filter threshold. If the elapsed cycle count is greater than the pause
 *	filter threshold, then the internal pause count is reloaded from the
 *	VMCB and execution continues. If the elapsed cycle count is less than
 *	the pause filter threshold, then the internal pause count is
 *	decremented. If the count value is less than zero and PAUSE intercept
 *	is enabled, a #VMEXIT is triggered. If advanced pause filtering is
 *	supported and the pause filter threshold field is set to zero, the
 *	filter will operate in the simpler, count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
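
/*
 * Worked example of the defaults above (editorial; the exact values are
 * assumed from the KVM headers, e.g. KVM_SVM_DEFAULT_PLE_WINDOW == 3000 and
 * KVM_DEFAULT_PLE_WINDOW_GROW == 2): each PAUSE vmexit grows the per-vCPU
 * window 3000 -> 6000 -> 12000 -> ... capped at pause_filter_count_max,
 * while the default shrink factor of 0 simply resets the window back to
 * pause_filter_count, as noted in the comments above.
 */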
/*
 * Use nested page tables by default. Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, 0444);

/* enable/disable Next RIP Save */
int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
int lbrv = true;
module_param(lbrv, int, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
 * enable / disable AVIC.  Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

bool vnmi = true;
module_param(vnmi, bool, 0444);

static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

DEFINE_PER_CPU(struct svm_cpu_data, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}
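
/*
 * Worked example (editorial, assuming MSR_STAR == 0xc0000081): the MSR falls
 * in the second range (base 0xc0000000, i == 1), so
 *	offset  = (0xc0000081 - 0xc0000000) / 4 = 0x20;  byte offset in range
 *	offset += 1 * MSRS_RANGE_SIZE           = 0x820; byte offset in bitmap
 *	return    0x820 / 4                     = 0x208; u32 offset
 * i.e. the read/write intercept bits for MSR_STAR live in u32 number 0x208 of
 * the MSR permission map, with two bits (read + write) per MSR.
 */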

static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(vcpu);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				vcpu->arch.efer = old_efer;
				return ret;
			}

			/*
			 * Never intercept #GP for SEV guests, KVM can't
			 * decrypt guest memory to workaround the erratum.
			 */
			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}
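
/*
 * Note (editorial, derived from the helpers above): the VMCB exposes a single
 * interrupt-shadow bit in int_state, so SVM cannot distinguish an STI shadow
 * from a MOV SS shadow; KVM therefore reports and sets both shadow flavors
 * together whenever that bit is set.
 */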

static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
					   bool commit_side_effects)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long old_rflags;

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (unlikely(!commit_side_effects))
			old_rflags = svm->vmcb->save.rflags;

		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;

		if (unlikely(!commit_side_effects))
			svm->vmcb->save.rflags = old_rflags;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	if (likely(commit_side_effects))
		svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	return __svm_skip_emulated_instruction(vcpu, true);
}

static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
{
	unsigned long rip, old_rip = kvm_rip_read(vcpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Due to architectural shortcomings, the CPU doesn't always provide
	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
	 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
	 * the instruction even if NextRIP is supported to acquire the next
	 * RIP so that it can be shoved into the NextRIP field, otherwise
	 * hardware will fail to advance guest RIP during event injection.
	 * Drop the exception/interrupt if emulation fails and effectively
	 * retry the instruction, it's the least awful option. If NRIPS is
	 * in use, the skip must not commit any side effects such as clearing
	 * the interrupt shadow or RFLAGS.RF.
	 */
	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
		return -EIO;

	rip = kvm_rip_read(vcpu);

	/*
	 * Save the injection information, even when using next_rip, as the
	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
	 * doesn't complete due to a VM-Exit occurring while the CPU is
	 * vectoring the event. Decoding the instruction isn't guaranteed to
	 * work as there may be no backing instruction, e.g. if the event is
	 * being injected by L1 for L2, or if the guest is patching INT3 into
	 * a different instruction.
	 */
	svm->soft_int_injected = true;
	svm->soft_int_csbase = svm->vmcb->save.cs.base;
	svm->soft_int_old_rip = old_rip;
	svm->soft_int_next_rip = rip;

	if (nrips)
		kvm_rip_write(vcpu, old_rip);

	if (static_cpu_has(X86_FEATURE_NRIPS))
		svm->vmcb->control.next_rip = rip;

	return 0;
}
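
/*
 * Reference for the field written in svm_inject_exception() below (editorial;
 * layout per AMD APM Vol 2, "Event Injection"): EVENTINJ bits 7:0 hold the
 * vector, bits 10:8 the type (SVM_EVTINJ_TYPE_EXEPT for exceptions), bit 11
 * indicates a valid error code and bit 31 marks the whole field valid; the
 * error code itself goes in the separate event_inj_err field.
 */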
static void svm_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	struct vcpu_svm *svm = to_svm(vcpu);

	kvm_deliver_exception_payload(vcpu, ex);

	if (kvm_exception_is_soft(ex->vector) &&
	    svm_update_soft_interrupt_rip(vcpu))
		return;

	svm->vmcb->control.event_inj = ex->vector
		| SVM_EVTINJ_VALID
		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = ex->error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static bool __kvm_is_svm_supported(void)
{
	int cpu = smp_processor_id();
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON) {
		pr_err("CPU %d isn't AMD or Hygon\n", cpu);
		return false;
	}

	if (!cpu_has(c, X86_FEATURE_SVM)) {
		pr_err("SVM not supported by CPU %d\n", cpu);
		return false;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return false;
	}

	return true;
}

static bool kvm_is_svm_supported(void)
{
	bool supported;

	migrate_disable();
	supported = __kvm_is_svm_supported();
	migrate_enable();

	return supported;
}

static int svm_check_processor_compat(void)
{
	if (!__kvm_is_svm_supported())
		return -EIO;

	return 0;
}

static void __svm_write_tsc_multiplier(u64 multiplier)
{
	if (multiplier == __this_cpu_read(current_tsc_ratio))
		return;

	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
	__this_cpu_write(current_tsc_ratio, multiplier);
}

static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
	return &sd->save_area->host_sev_es_save;
}

static inline void kvm_cpu_svm_disable(void)
{
	uint64_t efer;

	wrmsrl(MSR_VM_HSAVE_PA, 0);
	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME) {
		/*
		 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
		 * NMI aren't blocked.
588 */ 589 stgi(); 590 wrmsrl(MSR_EFER, efer & ~EFER_SVME); 591 } 592 } 593 svm_emergency_disable_virtualization_cpu(void)594 static void svm_emergency_disable_virtualization_cpu(void) 595 { 596 kvm_rebooting = true; 597 598 kvm_cpu_svm_disable(); 599 } 600 svm_disable_virtualization_cpu(void)601 static void svm_disable_virtualization_cpu(void) 602 { 603 /* Make sure we clean up behind us */ 604 if (tsc_scaling) 605 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 606 607 kvm_cpu_svm_disable(); 608 609 amd_pmu_disable_virt(); 610 611 if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) 612 msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); 613 } 614 svm_enable_virtualization_cpu(void)615 static int svm_enable_virtualization_cpu(void) 616 { 617 618 struct svm_cpu_data *sd; 619 uint64_t efer; 620 int me = raw_smp_processor_id(); 621 622 rdmsrl(MSR_EFER, efer); 623 if (efer & EFER_SVME) 624 return -EBUSY; 625 626 sd = per_cpu_ptr(&svm_data, me); 627 sd->asid_generation = 1; 628 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 629 sd->next_asid = sd->max_asid + 1; 630 sd->min_asid = max_sev_asid + 1; 631 632 wrmsrl(MSR_EFER, efer | EFER_SVME); 633 634 wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); 635 636 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 637 /* 638 * Set the default value, even if we don't use TSC scaling 639 * to avoid having stale value in the msr 640 */ 641 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 642 } 643 644 645 /* 646 * Get OSVW bits. 647 * 648 * Note that it is possible to have a system with mixed processor 649 * revisions and therefore different OSVW bits. If bits are not the same 650 * on different processors then choose the worst case (i.e. if erratum 651 * is present on one processor and not on another then assume that the 652 * erratum is present everywhere). 653 */ 654 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { 655 uint64_t len, status = 0; 656 int err; 657 658 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); 659 if (!err) 660 status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, 661 &err); 662 663 if (err) 664 osvw_status = osvw_len = 0; 665 else { 666 if (len < osvw_len) 667 osvw_len = len; 668 osvw_status |= status; 669 osvw_status &= (1ULL << osvw_len) - 1; 670 } 671 } else 672 osvw_status = osvw_len = 0; 673 674 svm_init_erratum_383(); 675 676 amd_pmu_enable_virt(); 677 678 /* 679 * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type 680 * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. 681 * Since Linux does not change the value of TSC_AUX once set, prime the 682 * TSC_AUX field now to avoid a RDMSR on every vCPU run. 
683 */ 684 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { 685 u32 __maybe_unused msr_hi; 686 687 rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); 688 } 689 690 if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) 691 msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); 692 693 return 0; 694 } 695 svm_cpu_uninit(int cpu)696 static void svm_cpu_uninit(int cpu) 697 { 698 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 699 700 if (!sd->save_area) 701 return; 702 703 kfree(sd->sev_vmcbs); 704 __free_page(__sme_pa_to_page(sd->save_area_pa)); 705 sd->save_area_pa = 0; 706 sd->save_area = NULL; 707 } 708 svm_cpu_init(int cpu)709 static int svm_cpu_init(int cpu) 710 { 711 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 712 struct page *save_area_page; 713 int ret = -ENOMEM; 714 715 memset(sd, 0, sizeof(struct svm_cpu_data)); 716 save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 717 if (!save_area_page) 718 return ret; 719 720 ret = sev_cpu_init(sd); 721 if (ret) 722 goto free_save_area; 723 724 sd->save_area = page_address(save_area_page); 725 sd->save_area_pa = __sme_page_pa(save_area_page); 726 return 0; 727 728 free_save_area: 729 __free_page(save_area_page); 730 return ret; 731 732 } 733 set_dr_intercepts(struct vcpu_svm * svm)734 static void set_dr_intercepts(struct vcpu_svm *svm) 735 { 736 struct vmcb *vmcb = svm->vmcb01.ptr; 737 738 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); 739 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); 740 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); 741 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); 742 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); 743 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); 744 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); 745 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); 746 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); 747 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); 748 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); 749 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); 750 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); 751 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); 752 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 753 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); 754 755 recalc_intercepts(svm); 756 } 757 clr_dr_intercepts(struct vcpu_svm * svm)758 static void clr_dr_intercepts(struct vcpu_svm *svm) 759 { 760 struct vmcb *vmcb = svm->vmcb01.ptr; 761 762 vmcb->control.intercepts[INTERCEPT_DR] = 0; 763 764 recalc_intercepts(svm); 765 } 766 direct_access_msr_slot(u32 msr)767 static int direct_access_msr_slot(u32 msr) 768 { 769 u32 i; 770 771 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) 772 if (direct_access_msrs[i].index == msr) 773 return i; 774 775 return -ENOENT; 776 } 777 set_shadow_msr_intercept(struct kvm_vcpu * vcpu,u32 msr,int read,int write)778 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, 779 int write) 780 { 781 struct vcpu_svm *svm = to_svm(vcpu); 782 int slot = direct_access_msr_slot(msr); 783 784 if (slot == -ENOENT) 785 return; 786 787 /* Set the shadow bitmaps to the desired intercept states */ 788 if (read) 789 set_bit(slot, svm->shadow_msr_intercept.read); 790 else 791 clear_bit(slot, svm->shadow_msr_intercept.read); 792 793 if (write) 794 set_bit(slot, svm->shadow_msr_intercept.write); 795 else 796 clear_bit(slot, svm->shadow_msr_intercept.write); 
797 } 798 valid_msr_intercept(u32 index)799 static bool valid_msr_intercept(u32 index) 800 { 801 return direct_access_msr_slot(index) != -ENOENT; 802 } 803 msr_write_intercepted(struct kvm_vcpu * vcpu,u32 msr)804 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) 805 { 806 u8 bit_write; 807 unsigned long tmp; 808 u32 offset; 809 u32 *msrpm; 810 811 /* 812 * For non-nested case: 813 * If the L01 MSR bitmap does not intercept the MSR, then we need to 814 * save it. 815 * 816 * For nested case: 817 * If the L02 MSR bitmap does not intercept the MSR, then we need to 818 * save it. 819 */ 820 msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: 821 to_svm(vcpu)->msrpm; 822 823 offset = svm_msrpm_offset(msr); 824 bit_write = 2 * (msr & 0x0f) + 1; 825 tmp = msrpm[offset]; 826 827 BUG_ON(offset == MSR_INVALID); 828 829 return test_bit(bit_write, &tmp); 830 } 831 set_msr_interception_bitmap(struct kvm_vcpu * vcpu,u32 * msrpm,u32 msr,int read,int write)832 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, 833 u32 msr, int read, int write) 834 { 835 struct vcpu_svm *svm = to_svm(vcpu); 836 u8 bit_read, bit_write; 837 unsigned long tmp; 838 u32 offset; 839 840 /* 841 * If this warning triggers extend the direct_access_msrs list at the 842 * beginning of the file 843 */ 844 WARN_ON(!valid_msr_intercept(msr)); 845 846 /* Enforce non allowed MSRs to trap */ 847 if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 848 read = 0; 849 850 if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 851 write = 0; 852 853 offset = svm_msrpm_offset(msr); 854 bit_read = 2 * (msr & 0x0f); 855 bit_write = 2 * (msr & 0x0f) + 1; 856 tmp = msrpm[offset]; 857 858 BUG_ON(offset == MSR_INVALID); 859 860 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); 861 write ? 
clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); 862 863 msrpm[offset] = tmp; 864 865 svm_hv_vmcb_dirty_nested_enlightenments(vcpu); 866 svm->nested.force_msr_bitmap_recalc = true; 867 } 868 set_msr_interception(struct kvm_vcpu * vcpu,u32 * msrpm,u32 msr,int read,int write)869 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, 870 int read, int write) 871 { 872 set_shadow_msr_intercept(vcpu, msr, read, write); 873 set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); 874 } 875 svm_vcpu_alloc_msrpm(void)876 u32 *svm_vcpu_alloc_msrpm(void) 877 { 878 unsigned int order = get_order(MSRPM_SIZE); 879 struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order); 880 u32 *msrpm; 881 882 if (!pages) 883 return NULL; 884 885 msrpm = page_address(pages); 886 memset(msrpm, 0xff, PAGE_SIZE * (1 << order)); 887 888 return msrpm; 889 } 890 svm_vcpu_init_msrpm(struct kvm_vcpu * vcpu,u32 * msrpm)891 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) 892 { 893 int i; 894 895 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 896 if (!direct_access_msrs[i].always) 897 continue; 898 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); 899 } 900 } 901 svm_set_x2apic_msr_interception(struct vcpu_svm * svm,bool intercept)902 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) 903 { 904 int i; 905 906 if (intercept == svm->x2avic_msrs_intercepted) 907 return; 908 909 if (!x2avic_enabled) 910 return; 911 912 for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { 913 int index = direct_access_msrs[i].index; 914 915 if ((index < APIC_BASE_MSR) || 916 (index > APIC_BASE_MSR + 0xff)) 917 continue; 918 set_msr_interception(&svm->vcpu, svm->msrpm, index, 919 !intercept, !intercept); 920 } 921 922 svm->x2avic_msrs_intercepted = intercept; 923 } 924 svm_vcpu_free_msrpm(u32 * msrpm)925 void svm_vcpu_free_msrpm(u32 *msrpm) 926 { 927 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); 928 } 929 svm_msr_filter_changed(struct kvm_vcpu * vcpu)930 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) 931 { 932 struct vcpu_svm *svm = to_svm(vcpu); 933 u32 i; 934 935 /* 936 * Set intercept permissions for all direct access MSRs again. They 937 * will automatically get filtered through the MSR filter, so we are 938 * back in sync after this. 939 */ 940 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 941 u32 msr = direct_access_msrs[i].index; 942 u32 read = test_bit(i, svm->shadow_msr_intercept.read); 943 u32 write = test_bit(i, svm->shadow_msr_intercept.write); 944 945 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); 946 } 947 } 948 add_msr_offset(u32 offset)949 static void add_msr_offset(u32 offset) 950 { 951 int i; 952 953 for (i = 0; i < MSRPM_OFFSETS; ++i) { 954 955 /* Offset already in list? */ 956 if (msrpm_offsets[i] == offset) 957 return; 958 959 /* Slot used by another offset? */ 960 if (msrpm_offsets[i] != MSR_INVALID) 961 continue; 962 963 /* Add offset to list */ 964 msrpm_offsets[i] = offset; 965 966 return; 967 } 968 969 /* 970 * If this BUG triggers the msrpm_offsets table has an overflow. Just 971 * increase MSRPM_OFFSETS in this case. 
972 */ 973 BUG(); 974 } 975 init_msrpm_offsets(void)976 static void init_msrpm_offsets(void) 977 { 978 int i; 979 980 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); 981 982 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { 983 u32 offset; 984 985 offset = svm_msrpm_offset(direct_access_msrs[i].index); 986 BUG_ON(offset == MSR_INVALID); 987 988 add_msr_offset(offset); 989 } 990 } 991 svm_copy_lbrs(struct vmcb * to_vmcb,struct vmcb * from_vmcb)992 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) 993 { 994 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; 995 to_vmcb->save.br_from = from_vmcb->save.br_from; 996 to_vmcb->save.br_to = from_vmcb->save.br_to; 997 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; 998 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; 999 1000 vmcb_mark_dirty(to_vmcb, VMCB_LBR); 1001 } 1002 svm_enable_lbrv(struct kvm_vcpu * vcpu)1003 void svm_enable_lbrv(struct kvm_vcpu *vcpu) 1004 { 1005 struct vcpu_svm *svm = to_svm(vcpu); 1006 1007 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 1008 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); 1009 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); 1010 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); 1011 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); 1012 1013 if (sev_es_guest(vcpu->kvm)) 1014 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); 1015 1016 /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ 1017 if (is_guest_mode(vcpu)) 1018 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); 1019 } 1020 svm_disable_lbrv(struct kvm_vcpu * vcpu)1021 static void svm_disable_lbrv(struct kvm_vcpu *vcpu) 1022 { 1023 struct vcpu_svm *svm = to_svm(vcpu); 1024 1025 KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); 1026 1027 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 1028 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); 1029 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); 1030 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); 1031 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); 1032 1033 /* 1034 * Move the LBR msrs back to the vmcb01 to avoid copying them 1035 * on nested guest entries. 1036 */ 1037 if (is_guest_mode(vcpu)) 1038 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); 1039 } 1040 svm_get_lbr_vmcb(struct vcpu_svm * svm)1041 static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) 1042 { 1043 /* 1044 * If LBR virtualization is disabled, the LBR MSRs are always kept in 1045 * vmcb01. If LBR virtualization is enabled and L1 is running VMs of 1046 * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. 1047 */ 1048 return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? 
svm->vmcb : 1049 svm->vmcb01.ptr; 1050 } 1051 svm_update_lbrv(struct kvm_vcpu * vcpu)1052 void svm_update_lbrv(struct kvm_vcpu *vcpu) 1053 { 1054 struct vcpu_svm *svm = to_svm(vcpu); 1055 bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; 1056 bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || 1057 (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 1058 (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); 1059 1060 if (enable_lbrv == current_enable_lbrv) 1061 return; 1062 1063 if (enable_lbrv) 1064 svm_enable_lbrv(vcpu); 1065 else 1066 svm_disable_lbrv(vcpu); 1067 } 1068 disable_nmi_singlestep(struct vcpu_svm * svm)1069 void disable_nmi_singlestep(struct vcpu_svm *svm) 1070 { 1071 svm->nmi_singlestep = false; 1072 1073 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { 1074 /* Clear our flags if they were not set by the guest */ 1075 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 1076 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; 1077 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 1078 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; 1079 } 1080 } 1081 grow_ple_window(struct kvm_vcpu * vcpu)1082 static void grow_ple_window(struct kvm_vcpu *vcpu) 1083 { 1084 struct vcpu_svm *svm = to_svm(vcpu); 1085 struct vmcb_control_area *control = &svm->vmcb->control; 1086 int old = control->pause_filter_count; 1087 1088 if (kvm_pause_in_guest(vcpu->kvm)) 1089 return; 1090 1091 control->pause_filter_count = __grow_ple_window(old, 1092 pause_filter_count, 1093 pause_filter_count_grow, 1094 pause_filter_count_max); 1095 1096 if (control->pause_filter_count != old) { 1097 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1098 trace_kvm_ple_window_update(vcpu->vcpu_id, 1099 control->pause_filter_count, old); 1100 } 1101 } 1102 shrink_ple_window(struct kvm_vcpu * vcpu)1103 static void shrink_ple_window(struct kvm_vcpu *vcpu) 1104 { 1105 struct vcpu_svm *svm = to_svm(vcpu); 1106 struct vmcb_control_area *control = &svm->vmcb->control; 1107 int old = control->pause_filter_count; 1108 1109 if (kvm_pause_in_guest(vcpu->kvm)) 1110 return; 1111 1112 control->pause_filter_count = 1113 __shrink_ple_window(old, 1114 pause_filter_count, 1115 pause_filter_count_shrink, 1116 pause_filter_count); 1117 if (control->pause_filter_count != old) { 1118 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1119 trace_kvm_ple_window_update(vcpu->vcpu_id, 1120 control->pause_filter_count, old); 1121 } 1122 } 1123 svm_hardware_unsetup(void)1124 static void svm_hardware_unsetup(void) 1125 { 1126 int cpu; 1127 1128 sev_hardware_unsetup(); 1129 1130 for_each_possible_cpu(cpu) 1131 svm_cpu_uninit(cpu); 1132 1133 __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); 1134 iopm_base = 0; 1135 } 1136 init_seg(struct vmcb_seg * seg)1137 static void init_seg(struct vmcb_seg *seg) 1138 { 1139 seg->selector = 0; 1140 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 1141 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 1142 seg->limit = 0xffff; 1143 seg->base = 0; 1144 } 1145 init_sys_seg(struct vmcb_seg * seg,uint32_t type)1146 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) 1147 { 1148 seg->selector = 0; 1149 seg->attrib = SVM_SELECTOR_P_MASK | type; 1150 seg->limit = 0xffff; 1151 seg->base = 0; 1152 } 1153 svm_get_l2_tsc_offset(struct kvm_vcpu * vcpu)1154 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1155 { 1156 struct vcpu_svm *svm = to_svm(vcpu); 1157 1158 return svm->nested.ctl.tsc_offset; 1159 } 1160 
svm_get_l2_tsc_multiplier(struct kvm_vcpu * vcpu)1161 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1162 { 1163 struct vcpu_svm *svm = to_svm(vcpu); 1164 1165 return svm->tsc_ratio_msr; 1166 } 1167 svm_write_tsc_offset(struct kvm_vcpu * vcpu)1168 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) 1169 { 1170 struct vcpu_svm *svm = to_svm(vcpu); 1171 1172 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; 1173 svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; 1174 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1175 } 1176 svm_write_tsc_multiplier(struct kvm_vcpu * vcpu)1177 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1178 { 1179 preempt_disable(); 1180 if (to_svm(vcpu)->guest_state_loaded) 1181 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); 1182 preempt_enable(); 1183 } 1184 1185 /* Evaluate instruction intercepts that depend on guest CPUID features. */ svm_recalc_instruction_intercepts(struct kvm_vcpu * vcpu,struct vcpu_svm * svm)1186 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, 1187 struct vcpu_svm *svm) 1188 { 1189 /* 1190 * Intercept INVPCID if shadow paging is enabled to sync/free shadow 1191 * roots, or if INVPCID is disabled in the guest to inject #UD. 1192 */ 1193 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { 1194 if (!npt_enabled || 1195 !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) 1196 svm_set_intercept(svm, INTERCEPT_INVPCID); 1197 else 1198 svm_clr_intercept(svm, INTERCEPT_INVPCID); 1199 } 1200 1201 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { 1202 if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) 1203 svm_clr_intercept(svm, INTERCEPT_RDTSCP); 1204 else 1205 svm_set_intercept(svm, INTERCEPT_RDTSCP); 1206 } 1207 } 1208 init_vmcb_after_set_cpuid(struct kvm_vcpu * vcpu)1209 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) 1210 { 1211 struct vcpu_svm *svm = to_svm(vcpu); 1212 1213 if (guest_cpuid_is_intel_compatible(vcpu)) { 1214 /* 1215 * We must intercept SYSENTER_EIP and SYSENTER_ESP 1216 * accesses because the processor only stores 32 bits. 1217 * For the same reason we cannot use virtual VMLOAD/VMSAVE. 1218 */ 1219 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1220 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1221 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1222 1223 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); 1224 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); 1225 } else { 1226 /* 1227 * If hardware supports Virtual VMLOAD VMSAVE then enable it 1228 * in VMCB and clear intercepts to avoid #VMEXIT. 
1229 */ 1230 if (vls) { 1231 svm_clr_intercept(svm, INTERCEPT_VMLOAD); 1232 svm_clr_intercept(svm, INTERCEPT_VMSAVE); 1233 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1234 } 1235 /* No need to intercept these MSRs */ 1236 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); 1237 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); 1238 } 1239 } 1240 init_vmcb(struct kvm_vcpu * vcpu)1241 static void init_vmcb(struct kvm_vcpu *vcpu) 1242 { 1243 struct vcpu_svm *svm = to_svm(vcpu); 1244 struct vmcb *vmcb = svm->vmcb01.ptr; 1245 struct vmcb_control_area *control = &vmcb->control; 1246 struct vmcb_save_area *save = &vmcb->save; 1247 1248 svm_set_intercept(svm, INTERCEPT_CR0_READ); 1249 svm_set_intercept(svm, INTERCEPT_CR3_READ); 1250 svm_set_intercept(svm, INTERCEPT_CR4_READ); 1251 svm_set_intercept(svm, INTERCEPT_CR0_WRITE); 1252 svm_set_intercept(svm, INTERCEPT_CR3_WRITE); 1253 svm_set_intercept(svm, INTERCEPT_CR4_WRITE); 1254 if (!kvm_vcpu_apicv_active(vcpu)) 1255 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 1256 1257 set_dr_intercepts(svm); 1258 1259 set_exception_intercept(svm, PF_VECTOR); 1260 set_exception_intercept(svm, UD_VECTOR); 1261 set_exception_intercept(svm, MC_VECTOR); 1262 set_exception_intercept(svm, AC_VECTOR); 1263 set_exception_intercept(svm, DB_VECTOR); 1264 /* 1265 * Guest access to VMware backdoor ports could legitimately 1266 * trigger #GP because of TSS I/O permission bitmap. 1267 * We intercept those #GP and allow access to them anyway 1268 * as VMware does. 1269 */ 1270 if (enable_vmware_backdoor) 1271 set_exception_intercept(svm, GP_VECTOR); 1272 1273 svm_set_intercept(svm, INTERCEPT_INTR); 1274 svm_set_intercept(svm, INTERCEPT_NMI); 1275 1276 if (intercept_smi) 1277 svm_set_intercept(svm, INTERCEPT_SMI); 1278 1279 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); 1280 svm_set_intercept(svm, INTERCEPT_RDPMC); 1281 svm_set_intercept(svm, INTERCEPT_CPUID); 1282 svm_set_intercept(svm, INTERCEPT_INVD); 1283 svm_set_intercept(svm, INTERCEPT_INVLPG); 1284 svm_set_intercept(svm, INTERCEPT_INVLPGA); 1285 svm_set_intercept(svm, INTERCEPT_IOIO_PROT); 1286 svm_set_intercept(svm, INTERCEPT_MSR_PROT); 1287 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH); 1288 svm_set_intercept(svm, INTERCEPT_SHUTDOWN); 1289 svm_set_intercept(svm, INTERCEPT_VMRUN); 1290 svm_set_intercept(svm, INTERCEPT_VMMCALL); 1291 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1292 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1293 svm_set_intercept(svm, INTERCEPT_STGI); 1294 svm_set_intercept(svm, INTERCEPT_CLGI); 1295 svm_set_intercept(svm, INTERCEPT_SKINIT); 1296 svm_set_intercept(svm, INTERCEPT_WBINVD); 1297 svm_set_intercept(svm, INTERCEPT_XSETBV); 1298 svm_set_intercept(svm, INTERCEPT_RDPRU); 1299 svm_set_intercept(svm, INTERCEPT_RSM); 1300 1301 if (!kvm_mwait_in_guest(vcpu->kvm)) { 1302 svm_set_intercept(svm, INTERCEPT_MONITOR); 1303 svm_set_intercept(svm, INTERCEPT_MWAIT); 1304 } 1305 1306 if (!kvm_hlt_in_guest(vcpu->kvm)) { 1307 if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) 1308 svm_set_intercept(svm, INTERCEPT_IDLE_HLT); 1309 else 1310 svm_set_intercept(svm, INTERCEPT_HLT); 1311 } 1312 1313 control->iopm_base_pa = iopm_base; 1314 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1315 control->int_ctl = V_INTR_MASKING_MASK; 1316 1317 init_seg(&save->es); 1318 init_seg(&save->ss); 1319 init_seg(&save->ds); 1320 init_seg(&save->fs); 1321 init_seg(&save->gs); 1322 1323 save->cs.selector = 0xf000; 1324 save->cs.base = 0xffff0000; 1325 /* Executable/Readable 
Code Segment */ 1326 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1327 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1328 save->cs.limit = 0xffff; 1329 1330 save->gdtr.base = 0; 1331 save->gdtr.limit = 0xffff; 1332 save->idtr.base = 0; 1333 save->idtr.limit = 0xffff; 1334 1335 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1336 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1337 1338 if (npt_enabled) { 1339 /* Setup VMCB for Nested Paging */ 1340 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; 1341 svm_clr_intercept(svm, INTERCEPT_INVLPG); 1342 clr_exception_intercept(svm, PF_VECTOR); 1343 svm_clr_intercept(svm, INTERCEPT_CR3_READ); 1344 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); 1345 save->g_pat = vcpu->arch.pat; 1346 save->cr3 = 0; 1347 } 1348 svm->current_vmcb->asid_generation = 0; 1349 svm->asid = 0; 1350 1351 svm->nested.vmcb12_gpa = INVALID_GPA; 1352 svm->nested.last_vmcb12_gpa = INVALID_GPA; 1353 1354 if (!kvm_pause_in_guest(vcpu->kvm)) { 1355 control->pause_filter_count = pause_filter_count; 1356 if (pause_filter_thresh) 1357 control->pause_filter_thresh = pause_filter_thresh; 1358 svm_set_intercept(svm, INTERCEPT_PAUSE); 1359 } else { 1360 svm_clr_intercept(svm, INTERCEPT_PAUSE); 1361 } 1362 1363 svm_recalc_instruction_intercepts(vcpu, svm); 1364 1365 /* 1366 * If the host supports V_SPEC_CTRL then disable the interception 1367 * of MSR_IA32_SPEC_CTRL. 1368 */ 1369 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 1370 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 1371 1372 if (kvm_vcpu_apicv_active(vcpu)) 1373 avic_init_vmcb(svm, vmcb); 1374 1375 if (vnmi) 1376 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; 1377 1378 if (vgif) { 1379 svm_clr_intercept(svm, INTERCEPT_STGI); 1380 svm_clr_intercept(svm, INTERCEPT_CLGI); 1381 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1382 } 1383 1384 if (sev_guest(vcpu->kvm)) 1385 sev_init_vmcb(svm); 1386 1387 svm_hv_init_vmcb(vmcb); 1388 init_vmcb_after_set_cpuid(vcpu); 1389 1390 vmcb_mark_all_dirty(vmcb); 1391 1392 enable_gif(svm); 1393 } 1394 __svm_vcpu_reset(struct kvm_vcpu * vcpu)1395 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) 1396 { 1397 struct vcpu_svm *svm = to_svm(vcpu); 1398 1399 svm_vcpu_init_msrpm(vcpu, svm->msrpm); 1400 1401 svm_init_osvw(vcpu); 1402 1403 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 1404 vcpu->arch.microcode_version = 0x01000065; 1405 svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; 1406 1407 svm->nmi_masked = false; 1408 svm->awaiting_iret_completion = false; 1409 1410 if (sev_es_guest(vcpu->kvm)) 1411 sev_es_vcpu_reset(svm); 1412 } 1413 svm_vcpu_reset(struct kvm_vcpu * vcpu,bool init_event)1414 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 1415 { 1416 struct vcpu_svm *svm = to_svm(vcpu); 1417 1418 svm->spec_ctrl = 0; 1419 svm->virt_spec_ctrl = 0; 1420 1421 if (init_event) 1422 sev_snp_init_protected_guest_state(vcpu); 1423 1424 init_vmcb(vcpu); 1425 1426 if (!init_event) 1427 __svm_vcpu_reset(vcpu); 1428 } 1429 svm_switch_vmcb(struct vcpu_svm * svm,struct kvm_vmcb_info * target_vmcb)1430 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) 1431 { 1432 svm->current_vmcb = target_vmcb; 1433 svm->vmcb = target_vmcb->ptr; 1434 } 1435 svm_vcpu_create(struct kvm_vcpu * vcpu)1436 static int svm_vcpu_create(struct kvm_vcpu *vcpu) 1437 { 1438 struct vcpu_svm *svm; 1439 struct page *vmcb01_page; 1440 struct page *vmsa_page = NULL; 1441 int err; 1442 1443 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); 
1444 svm = to_svm(vcpu); 1445 1446 err = -ENOMEM; 1447 vmcb01_page = snp_safe_alloc_page(); 1448 if (!vmcb01_page) 1449 goto out; 1450 1451 if (sev_es_guest(vcpu->kvm)) { 1452 /* 1453 * SEV-ES guests require a separate VMSA page used to contain 1454 * the encrypted register state of the guest. 1455 */ 1456 vmsa_page = snp_safe_alloc_page(); 1457 if (!vmsa_page) 1458 goto error_free_vmcb_page; 1459 } 1460 1461 err = avic_init_vcpu(svm); 1462 if (err) 1463 goto error_free_vmsa_page; 1464 1465 svm->msrpm = svm_vcpu_alloc_msrpm(); 1466 if (!svm->msrpm) { 1467 err = -ENOMEM; 1468 goto error_free_vmsa_page; 1469 } 1470 1471 svm->x2avic_msrs_intercepted = true; 1472 1473 svm->vmcb01.ptr = page_address(vmcb01_page); 1474 svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); 1475 svm_switch_vmcb(svm, &svm->vmcb01); 1476 1477 if (vmsa_page) 1478 svm->sev_es.vmsa = page_address(vmsa_page); 1479 1480 svm->guest_state_loaded = false; 1481 1482 return 0; 1483 1484 error_free_vmsa_page: 1485 if (vmsa_page) 1486 __free_page(vmsa_page); 1487 error_free_vmcb_page: 1488 __free_page(vmcb01_page); 1489 out: 1490 return err; 1491 } 1492 svm_clear_current_vmcb(struct vmcb * vmcb)1493 static void svm_clear_current_vmcb(struct vmcb *vmcb) 1494 { 1495 int i; 1496 1497 for_each_online_cpu(i) 1498 cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); 1499 } 1500 svm_vcpu_free(struct kvm_vcpu * vcpu)1501 static void svm_vcpu_free(struct kvm_vcpu *vcpu) 1502 { 1503 struct vcpu_svm *svm = to_svm(vcpu); 1504 1505 /* 1506 * The vmcb page can be recycled, causing a false negative in 1507 * svm_vcpu_load(). So, ensure that no logical CPU has this 1508 * vmcb page recorded as its current vmcb. 1509 */ 1510 svm_clear_current_vmcb(svm->vmcb); 1511 1512 svm_leave_nested(vcpu); 1513 svm_free_nested(svm); 1514 1515 sev_free_vcpu(vcpu); 1516 1517 __free_page(__sme_pa_to_page(svm->vmcb01.pa)); 1518 __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); 1519 } 1520 svm_prepare_switch_to_guest(struct kvm_vcpu * vcpu)1521 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1522 { 1523 struct vcpu_svm *svm = to_svm(vcpu); 1524 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 1525 1526 if (sev_es_guest(vcpu->kvm)) 1527 sev_es_unmap_ghcb(svm); 1528 1529 if (svm->guest_state_loaded) 1530 return; 1531 1532 /* 1533 * Save additional host state that will be restored on VMEXIT (sev-es) 1534 * or subsequent vmload of host save area. 1535 */ 1536 vmsave(sd->save_area_pa); 1537 if (sev_es_guest(vcpu->kvm)) 1538 sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); 1539 1540 if (tsc_scaling) 1541 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); 1542 1543 /* 1544 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 1545 * available. The user return MSR support is not required in this case 1546 * because TSC_AUX is restored on #VMEXIT from the host save area 1547 * (which has been initialized in svm_enable_virtualization_cpu()). 
1548 */ 1549 if (likely(tsc_aux_uret_slot >= 0) && 1550 (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) 1551 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); 1552 1553 svm->guest_state_loaded = true; 1554 } 1555 svm_prepare_host_switch(struct kvm_vcpu * vcpu)1556 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) 1557 { 1558 to_svm(vcpu)->guest_state_loaded = false; 1559 } 1560 svm_vcpu_load(struct kvm_vcpu * vcpu,int cpu)1561 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1562 { 1563 struct vcpu_svm *svm = to_svm(vcpu); 1564 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 1565 1566 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1567 shrink_ple_window(vcpu); 1568 1569 if (sd->current_vmcb != svm->vmcb) { 1570 sd->current_vmcb = svm->vmcb; 1571 1572 if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && 1573 static_branch_likely(&switch_vcpu_ibpb)) 1574 indirect_branch_prediction_barrier(); 1575 } 1576 if (kvm_vcpu_apicv_active(vcpu)) 1577 avic_vcpu_load(vcpu, cpu); 1578 } 1579 svm_vcpu_put(struct kvm_vcpu * vcpu)1580 static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1581 { 1582 if (kvm_vcpu_apicv_active(vcpu)) 1583 avic_vcpu_put(vcpu); 1584 1585 svm_prepare_host_switch(vcpu); 1586 1587 ++vcpu->stat.host_state_reload; 1588 } 1589 svm_get_rflags(struct kvm_vcpu * vcpu)1590 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1591 { 1592 struct vcpu_svm *svm = to_svm(vcpu); 1593 unsigned long rflags = svm->vmcb->save.rflags; 1594 1595 if (svm->nmi_singlestep) { 1596 /* Hide our flags if they were not set by the guest */ 1597 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 1598 rflags &= ~X86_EFLAGS_TF; 1599 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 1600 rflags &= ~X86_EFLAGS_RF; 1601 } 1602 return rflags; 1603 } 1604 svm_set_rflags(struct kvm_vcpu * vcpu,unsigned long rflags)1605 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1606 { 1607 if (to_svm(vcpu)->nmi_singlestep) 1608 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 1609 1610 /* 1611 * Any change of EFLAGS.VM is accompanied by a reload of SS 1612 * (caused by either a task switch or an inter-privilege IRET), 1613 * so we do not need to update the CPL here. 1614 */ 1615 to_svm(vcpu)->vmcb->save.rflags = rflags; 1616 } 1617 svm_get_if_flag(struct kvm_vcpu * vcpu)1618 static bool svm_get_if_flag(struct kvm_vcpu *vcpu) 1619 { 1620 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1621 1622 return sev_es_guest(vcpu->kvm) 1623 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK 1624 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; 1625 } 1626 svm_cache_reg(struct kvm_vcpu * vcpu,enum kvm_reg reg)1627 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1628 { 1629 kvm_register_mark_available(vcpu, reg); 1630 1631 switch (reg) { 1632 case VCPU_EXREG_PDPTR: 1633 /* 1634 * When !npt_enabled, mmu->pdptrs[] is already available since 1635 * it is always updated per SDM when moving to CRs. 
1636 */ 1637 if (npt_enabled) 1638 load_pdptrs(vcpu, kvm_read_cr3(vcpu)); 1639 break; 1640 default: 1641 KVM_BUG_ON(1, vcpu->kvm); 1642 } 1643 } 1644 svm_set_vintr(struct vcpu_svm * svm)1645 static void svm_set_vintr(struct vcpu_svm *svm) 1646 { 1647 struct vmcb_control_area *control; 1648 1649 /* 1650 * The following fields are ignored when AVIC is enabled 1651 */ 1652 WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); 1653 1654 svm_set_intercept(svm, INTERCEPT_VINTR); 1655 1656 /* 1657 * Recalculating intercepts may have cleared the VINTR intercept. If 1658 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF 1659 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. 1660 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as 1661 * interrupts will never be unblocked while L2 is running. 1662 */ 1663 if (!svm_is_intercept(svm, INTERCEPT_VINTR)) 1664 return; 1665 1666 /* 1667 * This is just a dummy VINTR to actually cause a vmexit to happen. 1668 * Actual injection of virtual interrupts happens through EVENTINJ. 1669 */ 1670 control = &svm->vmcb->control; 1671 control->int_vector = 0x0; 1672 control->int_ctl &= ~V_INTR_PRIO_MASK; 1673 control->int_ctl |= V_IRQ_MASK | 1674 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 1675 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 1676 } 1677 svm_clear_vintr(struct vcpu_svm * svm)1678 static void svm_clear_vintr(struct vcpu_svm *svm) 1679 { 1680 svm_clr_intercept(svm, INTERCEPT_VINTR); 1681 1682 /* Drop int_ctl fields related to VINTR injection. */ 1683 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; 1684 if (is_guest_mode(&svm->vcpu)) { 1685 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; 1686 1687 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != 1688 (svm->nested.ctl.int_ctl & V_TPR_MASK)); 1689 1690 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & 1691 V_IRQ_INJECTION_BITS_MASK; 1692 1693 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; 1694 } 1695 1696 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 1697 } 1698 svm_seg(struct kvm_vcpu * vcpu,int seg)1699 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1700 { 1701 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 1702 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save; 1703 1704 switch (seg) { 1705 case VCPU_SREG_CS: return &save->cs; 1706 case VCPU_SREG_DS: return &save->ds; 1707 case VCPU_SREG_ES: return &save->es; 1708 case VCPU_SREG_FS: return &save01->fs; 1709 case VCPU_SREG_GS: return &save01->gs; 1710 case VCPU_SREG_SS: return &save->ss; 1711 case VCPU_SREG_TR: return &save01->tr; 1712 case VCPU_SREG_LDTR: return &save01->ldtr; 1713 } 1714 BUG(); 1715 return NULL; 1716 } 1717 svm_get_segment_base(struct kvm_vcpu * vcpu,int seg)1718 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) 1719 { 1720 struct vmcb_seg *s = svm_seg(vcpu, seg); 1721 1722 return s->base; 1723 } 1724 svm_get_segment(struct kvm_vcpu * vcpu,struct kvm_segment * var,int seg)1725 static void svm_get_segment(struct kvm_vcpu *vcpu, 1726 struct kvm_segment *var, int seg) 1727 { 1728 struct vmcb_seg *s = svm_seg(vcpu, seg); 1729 1730 var->base = s->base; 1731 var->limit = s->limit; 1732 var->selector = s->selector; 1733 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; 1734 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; 1735 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; 1736 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; 1737 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; 
1738 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 1739 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1740 1741 /* 1742 * AMD CPUs circa 2014 track the G bit for all segments except CS. 1743 * However, the SVM spec states that the G bit is not observed by the 1744 * CPU, and some VMware virtual CPUs drop the G bit for all segments. 1745 * So let's synthesize a legal G bit for all segments, this helps 1746 * running KVM nested. It also helps cross-vendor migration, because 1747 * Intel's vmentry has a check on the 'G' bit. 1748 */ 1749 var->g = s->limit > 0xfffff; 1750 1751 /* 1752 * AMD's VMCB does not have an explicit unusable field, so emulate it 1753 * for cross vendor migration purposes by "not present" 1754 */ 1755 var->unusable = !var->present; 1756 1757 switch (seg) { 1758 case VCPU_SREG_TR: 1759 /* 1760 * Work around a bug where the busy flag in the tr selector 1761 * isn't exposed 1762 */ 1763 var->type |= 0x2; 1764 break; 1765 case VCPU_SREG_DS: 1766 case VCPU_SREG_ES: 1767 case VCPU_SREG_FS: 1768 case VCPU_SREG_GS: 1769 /* 1770 * The accessed bit must always be set in the segment 1771 * descriptor cache, although it can be cleared in the 1772 * descriptor, the cached bit always remains at 1. Since 1773 * Intel has a check on this, set it here to support 1774 * cross-vendor migration. 1775 */ 1776 if (!var->unusable) 1777 var->type |= 0x1; 1778 break; 1779 case VCPU_SREG_SS: 1780 /* 1781 * On AMD CPUs sometimes the DB bit in the segment 1782 * descriptor is left as 1, although the whole segment has 1783 * been made unusable. Clear it here to pass an Intel VMX 1784 * entry check when cross vendor migrating. 1785 */ 1786 if (var->unusable) 1787 var->db = 0; 1788 /* This is symmetric with svm_set_segment() */ 1789 var->dpl = to_svm(vcpu)->vmcb->save.cpl; 1790 break; 1791 } 1792 } 1793 svm_get_cpl(struct kvm_vcpu * vcpu)1794 static int svm_get_cpl(struct kvm_vcpu *vcpu) 1795 { 1796 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 1797 1798 return save->cpl; 1799 } 1800 svm_get_cs_db_l_bits(struct kvm_vcpu * vcpu,int * db,int * l)1801 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 1802 { 1803 struct kvm_segment cs; 1804 1805 svm_get_segment(vcpu, &cs, VCPU_SREG_CS); 1806 *db = cs.db; 1807 *l = cs.l; 1808 } 1809 svm_get_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1810 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1811 { 1812 struct vcpu_svm *svm = to_svm(vcpu); 1813 1814 dt->size = svm->vmcb->save.idtr.limit; 1815 dt->address = svm->vmcb->save.idtr.base; 1816 } 1817 svm_set_idt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1818 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1819 { 1820 struct vcpu_svm *svm = to_svm(vcpu); 1821 1822 svm->vmcb->save.idtr.limit = dt->size; 1823 svm->vmcb->save.idtr.base = dt->address ; 1824 vmcb_mark_dirty(svm->vmcb, VMCB_DT); 1825 } 1826 svm_get_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1827 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1828 { 1829 struct vcpu_svm *svm = to_svm(vcpu); 1830 1831 dt->size = svm->vmcb->save.gdtr.limit; 1832 dt->address = svm->vmcb->save.gdtr.base; 1833 } 1834 svm_set_gdt(struct kvm_vcpu * vcpu,struct desc_ptr * dt)1835 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1836 { 1837 struct vcpu_svm *svm = to_svm(vcpu); 1838 1839 svm->vmcb->save.gdtr.limit = dt->size; 1840 svm->vmcb->save.gdtr.base = dt->address ; 1841 vmcb_mark_dirty(svm->vmcb, VMCB_DT); 1842 } 1843 sev_post_set_cr3(struct 
kvm_vcpu * vcpu,unsigned long cr3)1844 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1845 { 1846 struct vcpu_svm *svm = to_svm(vcpu); 1847 1848 /* 1849 * For guests that don't set guest_state_protected, the cr3 update is 1850 * handled via kvm_mmu_load() while entering the guest. For guests 1851 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to 1852 * VMCB save area now, since the save area will become the initial 1853 * contents of the VMSA, and future VMCB save area updates won't be 1854 * seen. 1855 */ 1856 if (sev_es_guest(vcpu->kvm)) { 1857 svm->vmcb->save.cr3 = cr3; 1858 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1859 } 1860 } 1861 svm_is_valid_cr0(struct kvm_vcpu * vcpu,unsigned long cr0)1862 static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1863 { 1864 return true; 1865 } 1866 svm_set_cr0(struct kvm_vcpu * vcpu,unsigned long cr0)1867 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1868 { 1869 struct vcpu_svm *svm = to_svm(vcpu); 1870 u64 hcr0 = cr0; 1871 bool old_paging = is_paging(vcpu); 1872 1873 #ifdef CONFIG_X86_64 1874 if (vcpu->arch.efer & EFER_LME) { 1875 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1876 vcpu->arch.efer |= EFER_LMA; 1877 if (!vcpu->arch.guest_state_protected) 1878 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1879 } 1880 1881 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1882 vcpu->arch.efer &= ~EFER_LMA; 1883 if (!vcpu->arch.guest_state_protected) 1884 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1885 } 1886 } 1887 #endif 1888 vcpu->arch.cr0 = cr0; 1889 1890 if (!npt_enabled) { 1891 hcr0 |= X86_CR0_PG | X86_CR0_WP; 1892 if (old_paging != is_paging(vcpu)) 1893 svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); 1894 } 1895 1896 /* 1897 * re-enable caching here because the QEMU bios 1898 * does not do it - this results in some delay at 1899 * reboot 1900 */ 1901 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 1902 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1903 1904 svm->vmcb->save.cr0 = hcr0; 1905 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1906 1907 /* 1908 * SEV-ES guests must always keep the CR intercepts cleared. CR 1909 * tracking is done using the CR write traps. 1910 */ 1911 if (sev_es_guest(vcpu->kvm)) 1912 return; 1913 1914 if (hcr0 == cr0) { 1915 /* Selective CR0 write remains on. 
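		 * INTERCEPT_SELECTIVE_CR0 stays set, so guest writes that
		 * change CR0 bits other than TS or MP still cause a #VMEXIT.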
*/ 1916 svm_clr_intercept(svm, INTERCEPT_CR0_READ); 1917 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); 1918 } else { 1919 svm_set_intercept(svm, INTERCEPT_CR0_READ); 1920 svm_set_intercept(svm, INTERCEPT_CR0_WRITE); 1921 } 1922 } 1923 svm_is_valid_cr4(struct kvm_vcpu * vcpu,unsigned long cr4)1924 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1925 { 1926 return true; 1927 } 1928 svm_set_cr4(struct kvm_vcpu * vcpu,unsigned long cr4)1929 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1930 { 1931 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; 1932 unsigned long old_cr4 = vcpu->arch.cr4; 1933 1934 vcpu->arch.cr4 = cr4; 1935 if (!npt_enabled) { 1936 cr4 |= X86_CR4_PAE; 1937 1938 if (!is_paging(vcpu)) 1939 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 1940 } 1941 cr4 |= host_cr4_mce; 1942 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1943 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 1944 1945 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 1946 vcpu->arch.cpuid_dynamic_bits_dirty = true; 1947 } 1948 svm_set_segment(struct kvm_vcpu * vcpu,struct kvm_segment * var,int seg)1949 static void svm_set_segment(struct kvm_vcpu *vcpu, 1950 struct kvm_segment *var, int seg) 1951 { 1952 struct vcpu_svm *svm = to_svm(vcpu); 1953 struct vmcb_seg *s = svm_seg(vcpu, seg); 1954 1955 s->base = var->base; 1956 s->limit = var->limit; 1957 s->selector = var->selector; 1958 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); 1959 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; 1960 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; 1961 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; 1962 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; 1963 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; 1964 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 1965 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1966 1967 /* 1968 * This is always accurate, except if SYSRET returned to a segment 1969 * with SS.DPL != 3. Intel does not have this quirk, and always 1970 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it 1971 * would entail passing the CPL to userspace and back. 
1972 */ 1973 if (seg == VCPU_SREG_SS) 1974 /* This is symmetric with svm_get_segment() */ 1975 svm->vmcb->save.cpl = (var->dpl & 3); 1976 1977 vmcb_mark_dirty(svm->vmcb, VMCB_SEG); 1978 } 1979 svm_update_exception_bitmap(struct kvm_vcpu * vcpu)1980 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) 1981 { 1982 struct vcpu_svm *svm = to_svm(vcpu); 1983 1984 clr_exception_intercept(svm, BP_VECTOR); 1985 1986 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1987 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1988 set_exception_intercept(svm, BP_VECTOR); 1989 } 1990 } 1991 new_asid(struct vcpu_svm * svm,struct svm_cpu_data * sd)1992 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1993 { 1994 if (sd->next_asid > sd->max_asid) { 1995 ++sd->asid_generation; 1996 sd->next_asid = sd->min_asid; 1997 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1998 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 1999 } 2000 2001 svm->current_vmcb->asid_generation = sd->asid_generation; 2002 svm->asid = sd->next_asid++; 2003 } 2004 svm_set_dr6(struct kvm_vcpu * vcpu,unsigned long value)2005 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) 2006 { 2007 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 2008 2009 if (vcpu->arch.guest_state_protected) 2010 return; 2011 2012 if (unlikely(value != vmcb->save.dr6)) { 2013 vmcb->save.dr6 = value; 2014 vmcb_mark_dirty(vmcb, VMCB_DR); 2015 } 2016 } 2017 svm_sync_dirty_debug_regs(struct kvm_vcpu * vcpu)2018 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 2019 { 2020 struct vcpu_svm *svm = to_svm(vcpu); 2021 2022 if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) 2023 return; 2024 2025 get_debugreg(vcpu->arch.db[0], 0); 2026 get_debugreg(vcpu->arch.db[1], 1); 2027 get_debugreg(vcpu->arch.db[2], 2); 2028 get_debugreg(vcpu->arch.db[3], 3); 2029 /* 2030 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, 2031 * because db_interception might need it. We can do it before vmentry. 2032 */ 2033 vcpu->arch.dr6 = svm->vmcb->save.dr6; 2034 vcpu->arch.dr7 = svm->vmcb->save.dr7; 2035 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 2036 set_dr_intercepts(svm); 2037 } 2038 svm_set_dr7(struct kvm_vcpu * vcpu,unsigned long value)2039 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 2040 { 2041 struct vcpu_svm *svm = to_svm(vcpu); 2042 2043 if (vcpu->arch.guest_state_protected) 2044 return; 2045 2046 svm->vmcb->save.dr7 = value; 2047 vmcb_mark_dirty(svm->vmcb, VMCB_DR); 2048 } 2049 pf_interception(struct kvm_vcpu * vcpu)2050 static int pf_interception(struct kvm_vcpu *vcpu) 2051 { 2052 struct vcpu_svm *svm = to_svm(vcpu); 2053 2054 u64 fault_address = svm->vmcb->control.exit_info_2; 2055 u64 error_code = svm->vmcb->control.exit_info_1; 2056 2057 return kvm_handle_page_fault(vcpu, error_code, fault_address, 2058 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 2059 svm->vmcb->control.insn_bytes : NULL, 2060 svm->vmcb->control.insn_len); 2061 } 2062 npf_interception(struct kvm_vcpu * vcpu)2063 static int npf_interception(struct kvm_vcpu *vcpu) 2064 { 2065 struct vcpu_svm *svm = to_svm(vcpu); 2066 int rc; 2067 2068 u64 fault_address = svm->vmcb->control.exit_info_2; 2069 u64 error_code = svm->vmcb->control.exit_info_1; 2070 2071 /* 2072 * WARN if hardware generates a fault with an error code that collides 2073 * with KVM-defined sythentic flags. Clear the flags and continue on, 2074 * i.e. don't terminate the VM, as KVM can't possibly be relying on a 2075 * flag that KVM doesn't know about. 
2076 */ 2077 if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) 2078 error_code &= ~PFERR_SYNTHETIC_MASK; 2079 2080 if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) 2081 error_code |= PFERR_PRIVATE_ACCESS; 2082 2083 trace_kvm_page_fault(vcpu, fault_address, error_code); 2084 rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, 2085 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 2086 svm->vmcb->control.insn_bytes : NULL, 2087 svm->vmcb->control.insn_len); 2088 2089 if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) 2090 sev_handle_rmp_fault(vcpu, fault_address, error_code); 2091 2092 return rc; 2093 } 2094 db_interception(struct kvm_vcpu * vcpu)2095 static int db_interception(struct kvm_vcpu *vcpu) 2096 { 2097 struct kvm_run *kvm_run = vcpu->run; 2098 struct vcpu_svm *svm = to_svm(vcpu); 2099 2100 if (!(vcpu->guest_debug & 2101 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 2102 !svm->nmi_singlestep) { 2103 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; 2104 kvm_queue_exception_p(vcpu, DB_VECTOR, payload); 2105 return 1; 2106 } 2107 2108 if (svm->nmi_singlestep) { 2109 disable_nmi_singlestep(svm); 2110 /* Make sure we check for pending NMIs upon entry */ 2111 kvm_make_request(KVM_REQ_EVENT, vcpu); 2112 } 2113 2114 if (vcpu->guest_debug & 2115 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { 2116 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2117 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; 2118 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; 2119 kvm_run->debug.arch.pc = 2120 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2121 kvm_run->debug.arch.exception = DB_VECTOR; 2122 return 0; 2123 } 2124 2125 return 1; 2126 } 2127 bp_interception(struct kvm_vcpu * vcpu)2128 static int bp_interception(struct kvm_vcpu *vcpu) 2129 { 2130 struct vcpu_svm *svm = to_svm(vcpu); 2131 struct kvm_run *kvm_run = vcpu->run; 2132 2133 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2134 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2135 kvm_run->debug.arch.exception = BP_VECTOR; 2136 return 0; 2137 } 2138 ud_interception(struct kvm_vcpu * vcpu)2139 static int ud_interception(struct kvm_vcpu *vcpu) 2140 { 2141 return handle_ud(vcpu); 2142 } 2143 ac_interception(struct kvm_vcpu * vcpu)2144 static int ac_interception(struct kvm_vcpu *vcpu) 2145 { 2146 kvm_queue_exception_e(vcpu, AC_VECTOR, 0); 2147 return 1; 2148 } 2149 is_erratum_383(void)2150 static bool is_erratum_383(void) 2151 { 2152 int err, i; 2153 u64 value; 2154 2155 if (!erratum_383_found) 2156 return false; 2157 2158 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); 2159 if (err) 2160 return false; 2161 2162 /* Bit 62 may or may not be set for this mce */ 2163 value &= ~(1ULL << 62); 2164 2165 if (value != 0xb600000000010015ULL) 2166 return false; 2167 2168 /* Clear MCi_STATUS registers */ 2169 for (i = 0; i < 6; ++i) 2170 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); 2171 2172 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); 2173 if (!err) { 2174 u32 low, high; 2175 2176 value &= ~(1ULL << 2); 2177 low = lower_32_bits(value); 2178 high = upper_32_bits(value); 2179 2180 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); 2181 } 2182 2183 /* Flush tlb to evict multi-match entries */ 2184 __flush_tlb_all(); 2185 2186 return true; 2187 } 2188 svm_handle_mce(struct kvm_vcpu * vcpu)2189 static void svm_handle_mce(struct kvm_vcpu *vcpu) 2190 { 2191 if (is_erratum_383()) { 2192 /* 2193 * Erratum 383 triggered. Guest state is corrupt so kill the 2194 * guest. 
2195 */ 2196 pr_err("Guest triggered AMD Erratum 383\n"); 2197 2198 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2199 2200 return; 2201 } 2202 2203 /* 2204 * On an #MC intercept the MCE handler is not called automatically in 2205 * the host. So do it by hand here. 2206 */ 2207 kvm_machine_check(); 2208 } 2209 mc_interception(struct kvm_vcpu * vcpu)2210 static int mc_interception(struct kvm_vcpu *vcpu) 2211 { 2212 return 1; 2213 } 2214 shutdown_interception(struct kvm_vcpu * vcpu)2215 static int shutdown_interception(struct kvm_vcpu *vcpu) 2216 { 2217 struct kvm_run *kvm_run = vcpu->run; 2218 struct vcpu_svm *svm = to_svm(vcpu); 2219 2220 2221 /* 2222 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put 2223 * the VMCB in a known good state. Unfortuately, KVM doesn't have 2224 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking 2225 * userspace. At a platform view, INIT is acceptable behavior as 2226 * there exist bare metal platforms that automatically INIT the CPU 2227 * in response to shutdown. 2228 * 2229 * The VM save area for SEV-ES guests has already been encrypted so it 2230 * cannot be reinitialized, i.e. synthesizing INIT is futile. 2231 */ 2232 if (!sev_es_guest(vcpu->kvm)) { 2233 clear_page(svm->vmcb); 2234 kvm_vcpu_reset(vcpu, true); 2235 } 2236 2237 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2238 return 0; 2239 } 2240 io_interception(struct kvm_vcpu * vcpu)2241 static int io_interception(struct kvm_vcpu *vcpu) 2242 { 2243 struct vcpu_svm *svm = to_svm(vcpu); 2244 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2245 int size, in, string; 2246 unsigned port; 2247 2248 ++vcpu->stat.io_exits; 2249 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2250 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2251 port = io_info >> 16; 2252 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2253 2254 if (string) { 2255 if (sev_es_guest(vcpu->kvm)) 2256 return sev_es_string_io(svm, size, port, in); 2257 else 2258 return kvm_emulate_instruction(vcpu, 0); 2259 } 2260 2261 svm->next_rip = svm->vmcb->control.exit_info_2; 2262 2263 return kvm_fast_pio(vcpu, size, port, in); 2264 } 2265 nmi_interception(struct kvm_vcpu * vcpu)2266 static int nmi_interception(struct kvm_vcpu *vcpu) 2267 { 2268 return 1; 2269 } 2270 smi_interception(struct kvm_vcpu * vcpu)2271 static int smi_interception(struct kvm_vcpu *vcpu) 2272 { 2273 return 1; 2274 } 2275 intr_interception(struct kvm_vcpu * vcpu)2276 static int intr_interception(struct kvm_vcpu *vcpu) 2277 { 2278 ++vcpu->stat.irq_exits; 2279 return 1; 2280 } 2281 vmload_vmsave_interception(struct kvm_vcpu * vcpu,bool vmload)2282 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 2283 { 2284 struct vcpu_svm *svm = to_svm(vcpu); 2285 struct vmcb *vmcb12; 2286 struct kvm_host_map map; 2287 int ret; 2288 2289 if (nested_svm_check_permissions(vcpu)) 2290 return 1; 2291 2292 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 2293 if (ret) { 2294 if (ret == -EINVAL) 2295 kvm_inject_gp(vcpu, 0); 2296 return 1; 2297 } 2298 2299 vmcb12 = map.hva; 2300 2301 ret = kvm_skip_emulated_instruction(vcpu); 2302 2303 if (vmload) { 2304 svm_copy_vmloadsave_state(svm->vmcb, vmcb12); 2305 svm->sysenter_eip_hi = 0; 2306 svm->sysenter_esp_hi = 0; 2307 } else { 2308 svm_copy_vmloadsave_state(vmcb12, svm->vmcb); 2309 } 2310 2311 kvm_vcpu_unmap(vcpu, &map); 2312 2313 return ret; 2314 } 2315 vmload_interception(struct kvm_vcpu * vcpu)2316 static int vmload_interception(struct kvm_vcpu *vcpu) 2317 { 
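	/*
	 * Emulated VMLOAD: the shared helper copies the VMLOAD/VMSAVE-managed
	 * state (FS/GS/TR/LDTR, KERNEL_GS_BASE, the SYSCALL and SYSENTER MSRs)
	 * from the VMCB addressed by guest RAX into the current VMCB.
	 */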
2318 return vmload_vmsave_interception(vcpu, true); 2319 } 2320 vmsave_interception(struct kvm_vcpu * vcpu)2321 static int vmsave_interception(struct kvm_vcpu *vcpu) 2322 { 2323 return vmload_vmsave_interception(vcpu, false); 2324 } 2325 vmrun_interception(struct kvm_vcpu * vcpu)2326 static int vmrun_interception(struct kvm_vcpu *vcpu) 2327 { 2328 if (nested_svm_check_permissions(vcpu)) 2329 return 1; 2330 2331 return nested_svm_vmrun(vcpu); 2332 } 2333 2334 enum { 2335 NONE_SVM_INSTR, 2336 SVM_INSTR_VMRUN, 2337 SVM_INSTR_VMLOAD, 2338 SVM_INSTR_VMSAVE, 2339 }; 2340 2341 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ svm_instr_opcode(struct kvm_vcpu * vcpu)2342 static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2343 { 2344 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2345 2346 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2347 return NONE_SVM_INSTR; 2348 2349 switch (ctxt->modrm) { 2350 case 0xd8: /* VMRUN */ 2351 return SVM_INSTR_VMRUN; 2352 case 0xda: /* VMLOAD */ 2353 return SVM_INSTR_VMLOAD; 2354 case 0xdb: /* VMSAVE */ 2355 return SVM_INSTR_VMSAVE; 2356 default: 2357 break; 2358 } 2359 2360 return NONE_SVM_INSTR; 2361 } 2362 emulate_svm_instr(struct kvm_vcpu * vcpu,int opcode)2363 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) 2364 { 2365 const int guest_mode_exit_codes[] = { 2366 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, 2367 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, 2368 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, 2369 }; 2370 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { 2371 [SVM_INSTR_VMRUN] = vmrun_interception, 2372 [SVM_INSTR_VMLOAD] = vmload_interception, 2373 [SVM_INSTR_VMSAVE] = vmsave_interception, 2374 }; 2375 struct vcpu_svm *svm = to_svm(vcpu); 2376 int ret; 2377 2378 if (is_guest_mode(vcpu)) { 2379 /* Returns '1' or -errno on failure, '0' on success. */ 2380 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); 2381 if (ret) 2382 return ret; 2383 return 1; 2384 } 2385 return svm_instr_handlers[opcode](vcpu); 2386 } 2387 2388 /* 2389 * #GP handling code. Note that #GP can be triggered under the following two 2390 * cases: 2391 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on 2392 * some AMD CPUs when EAX of these instructions are in the reserved memory 2393 * regions (e.g. SMM memory on host). 2394 * 2) VMware backdoor 2395 */ gp_interception(struct kvm_vcpu * vcpu)2396 static int gp_interception(struct kvm_vcpu *vcpu) 2397 { 2398 struct vcpu_svm *svm = to_svm(vcpu); 2399 u32 error_code = svm->vmcb->control.exit_info_1; 2400 int opcode; 2401 2402 /* Both #GP cases have zero error_code */ 2403 if (error_code) 2404 goto reinject; 2405 2406 /* Decode the instruction for usage later */ 2407 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) 2408 goto reinject; 2409 2410 opcode = svm_instr_opcode(vcpu); 2411 2412 if (opcode == NONE_SVM_INSTR) { 2413 if (!enable_vmware_backdoor) 2414 goto reinject; 2415 2416 /* 2417 * VMware backdoor emulation on #GP interception only handles 2418 * IN{S}, OUT{S}, and RDPMC. 
2419 */ 2420 if (!is_guest_mode(vcpu)) 2421 return kvm_emulate_instruction(vcpu, 2422 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2423 } else { 2424 /* All SVM instructions expect page aligned RAX */ 2425 if (svm->vmcb->save.rax & ~PAGE_MASK) 2426 goto reinject; 2427 2428 return emulate_svm_instr(vcpu, opcode); 2429 } 2430 2431 reinject: 2432 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2433 return 1; 2434 } 2435 svm_set_gif(struct vcpu_svm * svm,bool value)2436 void svm_set_gif(struct vcpu_svm *svm, bool value) 2437 { 2438 if (value) { 2439 /* 2440 * If VGIF is enabled, the STGI intercept is only added to 2441 * detect the opening of the SMI/NMI window; remove it now. 2442 * Likewise, clear the VINTR intercept, we will set it 2443 * again while processing KVM_REQ_EVENT if needed. 2444 */ 2445 if (vgif) 2446 svm_clr_intercept(svm, INTERCEPT_STGI); 2447 if (svm_is_intercept(svm, INTERCEPT_VINTR)) 2448 svm_clear_vintr(svm); 2449 2450 enable_gif(svm); 2451 if (svm->vcpu.arch.smi_pending || 2452 svm->vcpu.arch.nmi_pending || 2453 kvm_cpu_has_injectable_intr(&svm->vcpu) || 2454 kvm_apic_has_pending_init_or_sipi(&svm->vcpu)) 2455 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 2456 } else { 2457 disable_gif(svm); 2458 2459 /* 2460 * After a CLGI no interrupts should come. But if vGIF is 2461 * in use, we still rely on the VINTR intercept (rather than 2462 * STGI) to detect an open interrupt window. 2463 */ 2464 if (!vgif) 2465 svm_clear_vintr(svm); 2466 } 2467 } 2468 stgi_interception(struct kvm_vcpu * vcpu)2469 static int stgi_interception(struct kvm_vcpu *vcpu) 2470 { 2471 int ret; 2472 2473 if (nested_svm_check_permissions(vcpu)) 2474 return 1; 2475 2476 ret = kvm_skip_emulated_instruction(vcpu); 2477 svm_set_gif(to_svm(vcpu), true); 2478 return ret; 2479 } 2480 clgi_interception(struct kvm_vcpu * vcpu)2481 static int clgi_interception(struct kvm_vcpu *vcpu) 2482 { 2483 int ret; 2484 2485 if (nested_svm_check_permissions(vcpu)) 2486 return 1; 2487 2488 ret = kvm_skip_emulated_instruction(vcpu); 2489 svm_set_gif(to_svm(vcpu), false); 2490 return ret; 2491 } 2492 invlpga_interception(struct kvm_vcpu * vcpu)2493 static int invlpga_interception(struct kvm_vcpu *vcpu) 2494 { 2495 gva_t gva = kvm_rax_read(vcpu); 2496 u32 asid = kvm_rcx_read(vcpu); 2497 2498 /* FIXME: Handle an address size prefix. */ 2499 if (!is_long_mode(vcpu)) 2500 gva = (u32)gva; 2501 2502 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); 2503 2504 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
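	 * The ASID operand is ignored; the address is invalidated in the
	 * current context.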
*/ 2505 kvm_mmu_invlpg(vcpu, gva); 2506 2507 return kvm_skip_emulated_instruction(vcpu); 2508 } 2509 skinit_interception(struct kvm_vcpu * vcpu)2510 static int skinit_interception(struct kvm_vcpu *vcpu) 2511 { 2512 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); 2513 2514 kvm_queue_exception(vcpu, UD_VECTOR); 2515 return 1; 2516 } 2517 task_switch_interception(struct kvm_vcpu * vcpu)2518 static int task_switch_interception(struct kvm_vcpu *vcpu) 2519 { 2520 struct vcpu_svm *svm = to_svm(vcpu); 2521 u16 tss_selector; 2522 int reason; 2523 int int_type = svm->vmcb->control.exit_int_info & 2524 SVM_EXITINTINFO_TYPE_MASK; 2525 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 2526 uint32_t type = 2527 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2528 uint32_t idt_v = 2529 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2530 bool has_error_code = false; 2531 u32 error_code = 0; 2532 2533 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2534 2535 if (svm->vmcb->control.exit_info_2 & 2536 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 2537 reason = TASK_SWITCH_IRET; 2538 else if (svm->vmcb->control.exit_info_2 & 2539 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 2540 reason = TASK_SWITCH_JMP; 2541 else if (idt_v) 2542 reason = TASK_SWITCH_GATE; 2543 else 2544 reason = TASK_SWITCH_CALL; 2545 2546 if (reason == TASK_SWITCH_GATE) { 2547 switch (type) { 2548 case SVM_EXITINTINFO_TYPE_NMI: 2549 vcpu->arch.nmi_injected = false; 2550 break; 2551 case SVM_EXITINTINFO_TYPE_EXEPT: 2552 if (svm->vmcb->control.exit_info_2 & 2553 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 2554 has_error_code = true; 2555 error_code = 2556 (u32)svm->vmcb->control.exit_info_2; 2557 } 2558 kvm_clear_exception_queue(vcpu); 2559 break; 2560 case SVM_EXITINTINFO_TYPE_INTR: 2561 case SVM_EXITINTINFO_TYPE_SOFT: 2562 kvm_clear_interrupt_queue(vcpu); 2563 break; 2564 default: 2565 break; 2566 } 2567 } 2568 2569 if (reason != TASK_SWITCH_GATE || 2570 int_type == SVM_EXITINTINFO_TYPE_SOFT || 2571 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 2572 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 2573 if (!svm_skip_emulated_instruction(vcpu)) 2574 return 0; 2575 } 2576 2577 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 2578 int_vec = -1; 2579 2580 return kvm_task_switch(vcpu, tss_selector, int_vec, reason, 2581 has_error_code, error_code); 2582 } 2583 svm_clr_iret_intercept(struct vcpu_svm * svm)2584 static void svm_clr_iret_intercept(struct vcpu_svm *svm) 2585 { 2586 if (!sev_es_guest(svm->vcpu.kvm)) 2587 svm_clr_intercept(svm, INTERCEPT_IRET); 2588 } 2589 svm_set_iret_intercept(struct vcpu_svm * svm)2590 static void svm_set_iret_intercept(struct vcpu_svm *svm) 2591 { 2592 if (!sev_es_guest(svm->vcpu.kvm)) 2593 svm_set_intercept(svm, INTERCEPT_IRET); 2594 } 2595 iret_interception(struct kvm_vcpu * vcpu)2596 static int iret_interception(struct kvm_vcpu *vcpu) 2597 { 2598 struct vcpu_svm *svm = to_svm(vcpu); 2599 2600 WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); 2601 2602 ++vcpu->stat.nmi_window_exits; 2603 svm->awaiting_iret_completion = true; 2604 2605 svm_clr_iret_intercept(svm); 2606 svm->nmi_iret_rip = kvm_rip_read(vcpu); 2607 2608 kvm_make_request(KVM_REQ_EVENT, vcpu); 2609 return 1; 2610 } 2611 invlpg_interception(struct kvm_vcpu * vcpu)2612 static int invlpg_interception(struct kvm_vcpu *vcpu) 2613 { 2614 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2615 return kvm_emulate_instruction(vcpu, 0); 2616 2617 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); 2618 
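	/*
	 * Decode assists provided the linear address in EXITINFO1 and the
	 * flush was done above, so only RIP needs to be advanced.
	 */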
return kvm_skip_emulated_instruction(vcpu); 2619 } 2620 emulate_on_interception(struct kvm_vcpu * vcpu)2621 static int emulate_on_interception(struct kvm_vcpu *vcpu) 2622 { 2623 return kvm_emulate_instruction(vcpu, 0); 2624 } 2625 rsm_interception(struct kvm_vcpu * vcpu)2626 static int rsm_interception(struct kvm_vcpu *vcpu) 2627 { 2628 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); 2629 } 2630 check_selective_cr0_intercepted(struct kvm_vcpu * vcpu,unsigned long val)2631 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, 2632 unsigned long val) 2633 { 2634 struct vcpu_svm *svm = to_svm(vcpu); 2635 unsigned long cr0 = vcpu->arch.cr0; 2636 bool ret = false; 2637 2638 if (!is_guest_mode(vcpu) || 2639 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) 2640 return false; 2641 2642 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 2643 val &= ~SVM_CR0_SELECTIVE_MASK; 2644 2645 if (cr0 ^ val) { 2646 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 2647 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 2648 } 2649 2650 return ret; 2651 } 2652 2653 #define CR_VALID (1ULL << 63) 2654 cr_interception(struct kvm_vcpu * vcpu)2655 static int cr_interception(struct kvm_vcpu *vcpu) 2656 { 2657 struct vcpu_svm *svm = to_svm(vcpu); 2658 int reg, cr; 2659 unsigned long val; 2660 int err; 2661 2662 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2663 return emulate_on_interception(vcpu); 2664 2665 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 2666 return emulate_on_interception(vcpu); 2667 2668 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2669 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 2670 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 2671 else 2672 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2673 2674 err = 0; 2675 if (cr >= 16) { /* mov to cr */ 2676 cr -= 16; 2677 val = kvm_register_read(vcpu, reg); 2678 trace_kvm_cr_write(cr, val); 2679 switch (cr) { 2680 case 0: 2681 if (!check_selective_cr0_intercepted(vcpu, val)) 2682 err = kvm_set_cr0(vcpu, val); 2683 else 2684 return 1; 2685 2686 break; 2687 case 3: 2688 err = kvm_set_cr3(vcpu, val); 2689 break; 2690 case 4: 2691 err = kvm_set_cr4(vcpu, val); 2692 break; 2693 case 8: 2694 err = kvm_set_cr8(vcpu, val); 2695 break; 2696 default: 2697 WARN(1, "unhandled write to CR%d", cr); 2698 kvm_queue_exception(vcpu, UD_VECTOR); 2699 return 1; 2700 } 2701 } else { /* mov from cr */ 2702 switch (cr) { 2703 case 0: 2704 val = kvm_read_cr0(vcpu); 2705 break; 2706 case 2: 2707 val = vcpu->arch.cr2; 2708 break; 2709 case 3: 2710 val = kvm_read_cr3(vcpu); 2711 break; 2712 case 4: 2713 val = kvm_read_cr4(vcpu); 2714 break; 2715 case 8: 2716 val = kvm_get_cr8(vcpu); 2717 break; 2718 default: 2719 WARN(1, "unhandled read from CR%d", cr); 2720 kvm_queue_exception(vcpu, UD_VECTOR); 2721 return 1; 2722 } 2723 kvm_register_write(vcpu, reg, val); 2724 trace_kvm_cr_read(cr, val); 2725 } 2726 return kvm_complete_insn_gp(vcpu, err); 2727 } 2728 cr_trap(struct kvm_vcpu * vcpu)2729 static int cr_trap(struct kvm_vcpu *vcpu) 2730 { 2731 struct vcpu_svm *svm = to_svm(vcpu); 2732 unsigned long old_value, new_value; 2733 unsigned int cr; 2734 int ret = 0; 2735 2736 new_value = (unsigned long)svm->vmcb->control.exit_info_1; 2737 2738 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; 2739 switch (cr) { 2740 case 0: 2741 old_value = kvm_read_cr0(vcpu); 2742 svm_set_cr0(vcpu, new_value); 2743 2744 kvm_post_set_cr0(vcpu, old_value, new_value); 2745 break; 2746 case 4: 
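		/*
		 * Handled like the CR0 trap above: record the old value,
		 * install the new value from EXITINFO1, then run the common
		 * kvm_post_set_cr4() processing.
		 */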
2747 old_value = kvm_read_cr4(vcpu); 2748 svm_set_cr4(vcpu, new_value); 2749 2750 kvm_post_set_cr4(vcpu, old_value, new_value); 2751 break; 2752 case 8: 2753 ret = kvm_set_cr8(vcpu, new_value); 2754 break; 2755 default: 2756 WARN(1, "unhandled CR%d write trap", cr); 2757 kvm_queue_exception(vcpu, UD_VECTOR); 2758 return 1; 2759 } 2760 2761 return kvm_complete_insn_gp(vcpu, ret); 2762 } 2763 dr_interception(struct kvm_vcpu * vcpu)2764 static int dr_interception(struct kvm_vcpu *vcpu) 2765 { 2766 struct vcpu_svm *svm = to_svm(vcpu); 2767 int reg, dr; 2768 int err = 0; 2769 2770 /* 2771 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT 2772 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. 2773 */ 2774 if (sev_es_guest(vcpu->kvm)) 2775 return 1; 2776 2777 if (vcpu->guest_debug == 0) { 2778 /* 2779 * No more DR vmexits; force a reload of the debug registers 2780 * and reenter on this instruction. The next vmexit will 2781 * retrieve the full state of the debug registers. 2782 */ 2783 clr_dr_intercepts(svm); 2784 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 2785 return 1; 2786 } 2787 2788 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 2789 return emulate_on_interception(vcpu); 2790 2791 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2792 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2793 if (dr >= 16) { /* mov to DRn */ 2794 dr -= 16; 2795 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 2796 } else { 2797 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 2798 } 2799 2800 return kvm_complete_insn_gp(vcpu, err); 2801 } 2802 cr8_write_interception(struct kvm_vcpu * vcpu)2803 static int cr8_write_interception(struct kvm_vcpu *vcpu) 2804 { 2805 int r; 2806 2807 u8 cr8_prev = kvm_get_cr8(vcpu); 2808 /* instruction emulation calls kvm_set_cr8() */ 2809 r = cr_interception(vcpu); 2810 if (lapic_in_kernel(vcpu)) 2811 return r; 2812 if (cr8_prev <= kvm_get_cr8(vcpu)) 2813 return r; 2814 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 2815 return 0; 2816 } 2817 efer_trap(struct kvm_vcpu * vcpu)2818 static int efer_trap(struct kvm_vcpu *vcpu) 2819 { 2820 struct msr_data msr_info; 2821 int ret; 2822 2823 /* 2824 * Clear the EFER_SVME bit from EFER. The SVM code always sets this 2825 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against 2826 * whether the guest has X86_FEATURE_SVM - this avoids a failure if 2827 * the guest doesn't have X86_FEATURE_SVM. 
2828 */ 2829 msr_info.host_initiated = false; 2830 msr_info.index = MSR_EFER; 2831 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; 2832 ret = kvm_set_msr_common(vcpu, &msr_info); 2833 2834 return kvm_complete_insn_gp(vcpu, ret); 2835 } 2836 svm_get_feature_msr(u32 msr,u64 * data)2837 static int svm_get_feature_msr(u32 msr, u64 *data) 2838 { 2839 *data = 0; 2840 2841 switch (msr) { 2842 case MSR_AMD64_DE_CFG: 2843 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2844 *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2845 break; 2846 default: 2847 return KVM_MSR_RET_UNSUPPORTED; 2848 } 2849 2850 return 0; 2851 } 2852 2853 static bool sev_es_prevent_msr_access(struct kvm_vcpu * vcpu,struct msr_data * msr_info)2854 sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2855 { 2856 return sev_es_guest(vcpu->kvm) && 2857 vcpu->arch.guest_state_protected && 2858 svm_msrpm_offset(msr_info->index) != MSR_INVALID && 2859 !msr_write_intercepted(vcpu, msr_info->index); 2860 } 2861 svm_get_msr(struct kvm_vcpu * vcpu,struct msr_data * msr_info)2862 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2863 { 2864 struct vcpu_svm *svm = to_svm(vcpu); 2865 2866 if (sev_es_prevent_msr_access(vcpu, msr_info)) { 2867 msr_info->data = 0; 2868 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; 2869 } 2870 2871 switch (msr_info->index) { 2872 case MSR_AMD64_TSC_RATIO: 2873 if (!msr_info->host_initiated && 2874 !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) 2875 return 1; 2876 msr_info->data = svm->tsc_ratio_msr; 2877 break; 2878 case MSR_STAR: 2879 msr_info->data = svm->vmcb01.ptr->save.star; 2880 break; 2881 #ifdef CONFIG_X86_64 2882 case MSR_LSTAR: 2883 msr_info->data = svm->vmcb01.ptr->save.lstar; 2884 break; 2885 case MSR_CSTAR: 2886 msr_info->data = svm->vmcb01.ptr->save.cstar; 2887 break; 2888 case MSR_GS_BASE: 2889 msr_info->data = svm->vmcb01.ptr->save.gs.base; 2890 break; 2891 case MSR_FS_BASE: 2892 msr_info->data = svm->vmcb01.ptr->save.fs.base; 2893 break; 2894 case MSR_KERNEL_GS_BASE: 2895 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; 2896 break; 2897 case MSR_SYSCALL_MASK: 2898 msr_info->data = svm->vmcb01.ptr->save.sfmask; 2899 break; 2900 #endif 2901 case MSR_IA32_SYSENTER_CS: 2902 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; 2903 break; 2904 case MSR_IA32_SYSENTER_EIP: 2905 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; 2906 if (guest_cpuid_is_intel_compatible(vcpu)) 2907 msr_info->data |= (u64)svm->sysenter_eip_hi << 32; 2908 break; 2909 case MSR_IA32_SYSENTER_ESP: 2910 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2911 if (guest_cpuid_is_intel_compatible(vcpu)) 2912 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2913 break; 2914 case MSR_TSC_AUX: 2915 msr_info->data = svm->tsc_aux; 2916 break; 2917 case MSR_IA32_DEBUGCTLMSR: 2918 msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; 2919 break; 2920 case MSR_IA32_LASTBRANCHFROMIP: 2921 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; 2922 break; 2923 case MSR_IA32_LASTBRANCHTOIP: 2924 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; 2925 break; 2926 case MSR_IA32_LASTINTFROMIP: 2927 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; 2928 break; 2929 case MSR_IA32_LASTINTTOIP: 2930 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; 2931 break; 2932 case MSR_VM_HSAVE_PA: 2933 msr_info->data = svm->nested.hsave_msr; 2934 break; 2935 case MSR_VM_CR: 2936 msr_info->data = svm->nested.vm_cr_msr; 2937 break; 2938 case 
MSR_IA32_SPEC_CTRL: 2939 if (!msr_info->host_initiated && 2940 !guest_has_spec_ctrl_msr(vcpu)) 2941 return 1; 2942 2943 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2944 msr_info->data = svm->vmcb->save.spec_ctrl; 2945 else 2946 msr_info->data = svm->spec_ctrl; 2947 break; 2948 case MSR_AMD64_VIRT_SPEC_CTRL: 2949 if (!msr_info->host_initiated && 2950 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2951 return 1; 2952 2953 msr_info->data = svm->virt_spec_ctrl; 2954 break; 2955 case MSR_F15H_IC_CFG: { 2956 2957 int family, model; 2958 2959 family = guest_cpuid_family(vcpu); 2960 model = guest_cpuid_model(vcpu); 2961 2962 if (family < 0 || model < 0) 2963 return kvm_get_msr_common(vcpu, msr_info); 2964 2965 msr_info->data = 0; 2966 2967 if (family == 0x15 && 2968 (model >= 0x2 && model < 0x20)) 2969 msr_info->data = 0x1E; 2970 } 2971 break; 2972 case MSR_AMD64_DE_CFG: 2973 msr_info->data = svm->msr_decfg; 2974 break; 2975 default: 2976 return kvm_get_msr_common(vcpu, msr_info); 2977 } 2978 return 0; 2979 } 2980 svm_complete_emulated_msr(struct kvm_vcpu * vcpu,int err)2981 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2982 { 2983 struct vcpu_svm *svm = to_svm(vcpu); 2984 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2985 return kvm_complete_insn_gp(vcpu, err); 2986 2987 svm_vmgexit_inject_exception(svm, X86_TRAP_GP); 2988 return 1; 2989 } 2990 svm_set_vm_cr(struct kvm_vcpu * vcpu,u64 data)2991 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 2992 { 2993 struct vcpu_svm *svm = to_svm(vcpu); 2994 int svm_dis, chg_mask; 2995 2996 if (data & ~SVM_VM_CR_VALID_MASK) 2997 return 1; 2998 2999 chg_mask = SVM_VM_CR_VALID_MASK; 3000 3001 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 3002 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 3003 3004 svm->nested.vm_cr_msr &= ~chg_mask; 3005 svm->nested.vm_cr_msr |= (data & chg_mask); 3006 3007 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 3008 3009 /* check for svm_disable while efer.svme is set */ 3010 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 3011 return 1; 3012 3013 return 0; 3014 } 3015 svm_set_msr(struct kvm_vcpu * vcpu,struct msr_data * msr)3016 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 3017 { 3018 struct vcpu_svm *svm = to_svm(vcpu); 3019 int ret = 0; 3020 3021 u32 ecx = msr->index; 3022 u64 data = msr->data; 3023 3024 if (sev_es_prevent_msr_access(vcpu, msr)) 3025 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; 3026 3027 switch (ecx) { 3028 case MSR_AMD64_TSC_RATIO: 3029 3030 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { 3031 3032 if (!msr->host_initiated) 3033 return 1; 3034 /* 3035 * In case TSC scaling is not enabled, always 3036 * leave this MSR at the default value. 3037 * 3038 * Due to bug in qemu 6.2.0, it would try to set 3039 * this msr to 0 if tsc scaling is not enabled. 3040 * Ignore this value as well. 
3041 */ 3042 if (data != 0 && data != svm->tsc_ratio_msr) 3043 return 1; 3044 break; 3045 } 3046 3047 if (data & SVM_TSC_RATIO_RSVD) 3048 return 1; 3049 3050 svm->tsc_ratio_msr = data; 3051 3052 if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && 3053 is_guest_mode(vcpu)) 3054 nested_svm_update_tsc_ratio_msr(vcpu); 3055 3056 break; 3057 case MSR_IA32_CR_PAT: 3058 ret = kvm_set_msr_common(vcpu, msr); 3059 if (ret) 3060 break; 3061 3062 svm->vmcb01.ptr->save.g_pat = data; 3063 if (is_guest_mode(vcpu)) 3064 nested_vmcb02_compute_g_pat(svm); 3065 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 3066 break; 3067 case MSR_IA32_SPEC_CTRL: 3068 if (!msr->host_initiated && 3069 !guest_has_spec_ctrl_msr(vcpu)) 3070 return 1; 3071 3072 if (kvm_spec_ctrl_test_value(data)) 3073 return 1; 3074 3075 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 3076 svm->vmcb->save.spec_ctrl = data; 3077 else 3078 svm->spec_ctrl = data; 3079 if (!data) 3080 break; 3081 3082 /* 3083 * For non-nested: 3084 * When it's written (to non-zero) for the first time, pass 3085 * it through. 3086 * 3087 * For nested: 3088 * The handling of the MSR bitmap for L2 guests is done in 3089 * nested_svm_vmrun_msrpm. 3090 * We update the L1 MSR bit as well since it will end up 3091 * touching the MSR anyway now. 3092 */ 3093 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 3094 break; 3095 case MSR_AMD64_VIRT_SPEC_CTRL: 3096 if (!msr->host_initiated && 3097 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) 3098 return 1; 3099 3100 if (data & ~SPEC_CTRL_SSBD) 3101 return 1; 3102 3103 svm->virt_spec_ctrl = data; 3104 break; 3105 case MSR_STAR: 3106 svm->vmcb01.ptr->save.star = data; 3107 break; 3108 #ifdef CONFIG_X86_64 3109 case MSR_LSTAR: 3110 svm->vmcb01.ptr->save.lstar = data; 3111 break; 3112 case MSR_CSTAR: 3113 svm->vmcb01.ptr->save.cstar = data; 3114 break; 3115 case MSR_GS_BASE: 3116 svm->vmcb01.ptr->save.gs.base = data; 3117 break; 3118 case MSR_FS_BASE: 3119 svm->vmcb01.ptr->save.fs.base = data; 3120 break; 3121 case MSR_KERNEL_GS_BASE: 3122 svm->vmcb01.ptr->save.kernel_gs_base = data; 3123 break; 3124 case MSR_SYSCALL_MASK: 3125 svm->vmcb01.ptr->save.sfmask = data; 3126 break; 3127 #endif 3128 case MSR_IA32_SYSENTER_CS: 3129 svm->vmcb01.ptr->save.sysenter_cs = data; 3130 break; 3131 case MSR_IA32_SYSENTER_EIP: 3132 svm->vmcb01.ptr->save.sysenter_eip = (u32)data; 3133 /* 3134 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs 3135 * when we spoof an Intel vendor ID (for cross vendor migration). 3136 * In this case we use this intercept to track the high 3137 * 32 bit part of these msrs to support Intel's 3138 * implementation of SYSENTER/SYSEXIT. 3139 */ 3140 svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; 3141 break; 3142 case MSR_IA32_SYSENTER_ESP: 3143 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 3144 svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; 3145 break; 3146 case MSR_TSC_AUX: 3147 /* 3148 * TSC_AUX is always virtualized for SEV-ES guests when the 3149 * feature is available. The user return MSR support is not 3150 * required in this case because TSC_AUX is restored on #VMEXIT 3151 * from the host save area (which has been initialized in 3152 * svm_enable_virtualization_cpu()). 3153 */ 3154 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3155 break; 3156 3157 /* 3158 * TSC_AUX is usually changed only during boot and never read 3159 * directly. 
Intercept TSC_AUX instead of exposing it to the 3160 * guest via direct_access_msrs, and switch it via user return. 3161 */ 3162 preempt_disable(); 3163 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 3164 preempt_enable(); 3165 if (ret) 3166 break; 3167 3168 svm->tsc_aux = data; 3169 break; 3170 case MSR_IA32_DEBUGCTLMSR: 3171 if (!lbrv) { 3172 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3173 break; 3174 } 3175 3176 /* 3177 * AMD changed the architectural behavior of bits 5:2. On CPUs 3178 * without BusLockTrap, bits 5:2 control "external pins", but 3179 * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap 3180 * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed 3181 * the guest to set bits 5:2 despite not actually virtualizing 3182 * Performance-Monitoring/Breakpoint external pins. Drop bits 3183 * 5:2 for backwards compatibility. 3184 */ 3185 data &= ~GENMASK(5, 2); 3186 3187 /* 3188 * Suppress BTF as KVM doesn't virtualize BTF, but there's no 3189 * way to communicate lack of support to the guest. 3190 */ 3191 if (data & DEBUGCTLMSR_BTF) { 3192 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 3193 data &= ~DEBUGCTLMSR_BTF; 3194 } 3195 3196 if (data & DEBUGCTL_RESERVED_BITS) 3197 return 1; 3198 3199 svm_get_lbr_vmcb(svm)->save.dbgctl = data; 3200 svm_update_lbrv(vcpu); 3201 break; 3202 case MSR_VM_HSAVE_PA: 3203 /* 3204 * Old kernels did not validate the value written to 3205 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid 3206 * value to allow live migrating buggy or malicious guests 3207 * originating from those kernels. 3208 */ 3209 if (!msr->host_initiated && !page_address_valid(vcpu, data)) 3210 return 1; 3211 3212 svm->nested.hsave_msr = data & PAGE_MASK; 3213 break; 3214 case MSR_VM_CR: 3215 return svm_set_vm_cr(vcpu, data); 3216 case MSR_VM_IGNNE: 3217 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3218 break; 3219 case MSR_AMD64_DE_CFG: { 3220 u64 supported_de_cfg; 3221 3222 if (svm_get_feature_msr(ecx, &supported_de_cfg)) 3223 return 1; 3224 3225 if (data & ~supported_de_cfg) 3226 return 1; 3227 3228 svm->msr_decfg = data; 3229 break; 3230 } 3231 default: 3232 return kvm_set_msr_common(vcpu, msr); 3233 } 3234 return ret; 3235 } 3236 msr_interception(struct kvm_vcpu * vcpu)3237 static int msr_interception(struct kvm_vcpu *vcpu) 3238 { 3239 if (to_svm(vcpu)->vmcb->control.exit_info_1) 3240 return kvm_emulate_wrmsr(vcpu); 3241 else 3242 return kvm_emulate_rdmsr(vcpu); 3243 } 3244 interrupt_window_interception(struct kvm_vcpu * vcpu)3245 static int interrupt_window_interception(struct kvm_vcpu *vcpu) 3246 { 3247 kvm_make_request(KVM_REQ_EVENT, vcpu); 3248 svm_clear_vintr(to_svm(vcpu)); 3249 3250 /* 3251 * If not running nested, for AVIC, the only reason to end up here is ExtINTs. 3252 * In this case AVIC was temporarily disabled for 3253 * requesting the IRQ window and we have to re-enable it. 3254 * 3255 * If running nested, still remove the VM wide AVIC inhibit to 3256 * support case in which the interrupt window was requested when the 3257 * vCPU was not running nested. 3258 3259 * All vCPUs which run still run nested, will remain to have their 3260 * AVIC still inhibited due to per-cpu AVIC inhibition. 
3261 */ 3262 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3263 3264 ++vcpu->stat.irq_window_exits; 3265 return 1; 3266 } 3267 pause_interception(struct kvm_vcpu * vcpu)3268 static int pause_interception(struct kvm_vcpu *vcpu) 3269 { 3270 bool in_kernel; 3271 /* 3272 * CPL is not made available for an SEV-ES guest, therefore 3273 * vcpu->arch.preempted_in_kernel can never be true. Just 3274 * set in_kernel to false as well. 3275 */ 3276 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 3277 3278 grow_ple_window(vcpu); 3279 3280 kvm_vcpu_on_spin(vcpu, in_kernel); 3281 return kvm_skip_emulated_instruction(vcpu); 3282 } 3283 invpcid_interception(struct kvm_vcpu * vcpu)3284 static int invpcid_interception(struct kvm_vcpu *vcpu) 3285 { 3286 struct vcpu_svm *svm = to_svm(vcpu); 3287 unsigned long type; 3288 gva_t gva; 3289 3290 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 3291 kvm_queue_exception(vcpu, UD_VECTOR); 3292 return 1; 3293 } 3294 3295 /* 3296 * For an INVPCID intercept: 3297 * EXITINFO1 provides the linear address of the memory operand. 3298 * EXITINFO2 provides the contents of the register operand. 3299 */ 3300 type = svm->vmcb->control.exit_info_2; 3301 gva = svm->vmcb->control.exit_info_1; 3302 3303 /* 3304 * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the 3305 * stack segment is used. The intercept takes priority over all 3306 * #GP checks except CPL>0, but somehow still generates a linear 3307 * address? The APM is sorely lacking. 3308 */ 3309 if (is_noncanonical_address(gva, vcpu, 0)) { 3310 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3311 return 1; 3312 } 3313 3314 return kvm_handle_invpcid(vcpu, type, gva); 3315 } 3316 3317 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 3318 [SVM_EXIT_READ_CR0] = cr_interception, 3319 [SVM_EXIT_READ_CR3] = cr_interception, 3320 [SVM_EXIT_READ_CR4] = cr_interception, 3321 [SVM_EXIT_READ_CR8] = cr_interception, 3322 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 3323 [SVM_EXIT_WRITE_CR0] = cr_interception, 3324 [SVM_EXIT_WRITE_CR3] = cr_interception, 3325 [SVM_EXIT_WRITE_CR4] = cr_interception, 3326 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3327 [SVM_EXIT_READ_DR0] = dr_interception, 3328 [SVM_EXIT_READ_DR1] = dr_interception, 3329 [SVM_EXIT_READ_DR2] = dr_interception, 3330 [SVM_EXIT_READ_DR3] = dr_interception, 3331 [SVM_EXIT_READ_DR4] = dr_interception, 3332 [SVM_EXIT_READ_DR5] = dr_interception, 3333 [SVM_EXIT_READ_DR6] = dr_interception, 3334 [SVM_EXIT_READ_DR7] = dr_interception, 3335 [SVM_EXIT_WRITE_DR0] = dr_interception, 3336 [SVM_EXIT_WRITE_DR1] = dr_interception, 3337 [SVM_EXIT_WRITE_DR2] = dr_interception, 3338 [SVM_EXIT_WRITE_DR3] = dr_interception, 3339 [SVM_EXIT_WRITE_DR4] = dr_interception, 3340 [SVM_EXIT_WRITE_DR5] = dr_interception, 3341 [SVM_EXIT_WRITE_DR6] = dr_interception, 3342 [SVM_EXIT_WRITE_DR7] = dr_interception, 3343 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3344 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3345 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3346 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3347 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3348 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3349 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 3350 [SVM_EXIT_INTR] = intr_interception, 3351 [SVM_EXIT_NMI] = nmi_interception, 3352 [SVM_EXIT_SMI] = smi_interception, 3353 [SVM_EXIT_VINTR] = interrupt_window_interception, 3354 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, 3355 
[SVM_EXIT_CPUID] = kvm_emulate_cpuid, 3356 [SVM_EXIT_IRET] = iret_interception, 3357 [SVM_EXIT_INVD] = kvm_emulate_invd, 3358 [SVM_EXIT_PAUSE] = pause_interception, 3359 [SVM_EXIT_HLT] = kvm_emulate_halt, 3360 [SVM_EXIT_INVLPG] = invlpg_interception, 3361 [SVM_EXIT_INVLPGA] = invlpga_interception, 3362 [SVM_EXIT_IOIO] = io_interception, 3363 [SVM_EXIT_MSR] = msr_interception, 3364 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3365 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3366 [SVM_EXIT_VMRUN] = vmrun_interception, 3367 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3368 [SVM_EXIT_VMLOAD] = vmload_interception, 3369 [SVM_EXIT_VMSAVE] = vmsave_interception, 3370 [SVM_EXIT_STGI] = stgi_interception, 3371 [SVM_EXIT_CLGI] = clgi_interception, 3372 [SVM_EXIT_SKINIT] = skinit_interception, 3373 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3374 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3375 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3376 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, 3377 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, 3378 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, 3379 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, 3380 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, 3381 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, 3382 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, 3383 [SVM_EXIT_INVPCID] = invpcid_interception, 3384 [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, 3385 [SVM_EXIT_NPF] = npf_interception, 3386 [SVM_EXIT_RSM] = rsm_interception, 3387 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3388 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 3389 #ifdef CONFIG_KVM_AMD_SEV 3390 [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, 3391 #endif 3392 }; 3393 dump_vmcb(struct kvm_vcpu * vcpu)3394 static void dump_vmcb(struct kvm_vcpu *vcpu) 3395 { 3396 struct vcpu_svm *svm = to_svm(vcpu); 3397 struct vmcb_control_area *control = &svm->vmcb->control; 3398 struct vmcb_save_area *save = &svm->vmcb->save; 3399 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3400 3401 if (!dump_invalid_vmcb) { 3402 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3403 return; 3404 } 3405 3406 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", 3407 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3408 pr_err("VMCB Control Area:\n"); 3409 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3410 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); 3411 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); 3412 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); 3413 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); 3414 pr_err("%-20s%08x %08x\n", "intercepts:", 3415 control->intercepts[INTERCEPT_WORD3], 3416 control->intercepts[INTERCEPT_WORD4]); 3417 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 3418 pr_err("%-20s%d\n", "pause filter threshold:", 3419 control->pause_filter_thresh); 3420 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 3421 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 3422 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 3423 pr_err("%-20s%d\n", "asid:", control->asid); 3424 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 3425 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 3426 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 3427 pr_err("%-20s%08x\n", "int_state:", control->int_state); 3428 pr_err("%-20s%08x\n", "exit_code:", 
control->exit_code); 3429 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 3430 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3431 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3432 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3433 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3434 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3435 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3436 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3437 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3438 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3439 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3440 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3441 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3442 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3443 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3444 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3445 pr_err("VMCB State Save Area:\n"); 3446 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3447 "es:", 3448 save->es.selector, save->es.attrib, 3449 save->es.limit, save->es.base); 3450 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3451 "cs:", 3452 save->cs.selector, save->cs.attrib, 3453 save->cs.limit, save->cs.base); 3454 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3455 "ss:", 3456 save->ss.selector, save->ss.attrib, 3457 save->ss.limit, save->ss.base); 3458 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3459 "ds:", 3460 save->ds.selector, save->ds.attrib, 3461 save->ds.limit, save->ds.base); 3462 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3463 "fs:", 3464 save01->fs.selector, save01->fs.attrib, 3465 save01->fs.limit, save01->fs.base); 3466 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3467 "gs:", 3468 save01->gs.selector, save01->gs.attrib, 3469 save01->gs.limit, save01->gs.base); 3470 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3471 "gdtr:", 3472 save->gdtr.selector, save->gdtr.attrib, 3473 save->gdtr.limit, save->gdtr.base); 3474 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3475 "ldtr:", 3476 save01->ldtr.selector, save01->ldtr.attrib, 3477 save01->ldtr.limit, save01->ldtr.base); 3478 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3479 "idtr:", 3480 save->idtr.selector, save->idtr.attrib, 3481 save->idtr.limit, save->idtr.base); 3482 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3483 "tr:", 3484 save01->tr.selector, save01->tr.attrib, 3485 save01->tr.limit, save01->tr.base); 3486 pr_err("vmpl: %d cpl: %d efer: %016llx\n", 3487 save->vmpl, save->cpl, save->efer); 3488 pr_err("%-15s %016llx %-13s %016llx\n", 3489 "cr0:", save->cr0, "cr2:", save->cr2); 3490 pr_err("%-15s %016llx %-13s %016llx\n", 3491 "cr3:", save->cr3, "cr4:", save->cr4); 3492 pr_err("%-15s %016llx %-13s %016llx\n", 3493 "dr6:", save->dr6, "dr7:", save->dr7); 3494 pr_err("%-15s %016llx %-13s %016llx\n", 3495 "rip:", save->rip, "rflags:", save->rflags); 3496 pr_err("%-15s %016llx %-13s %016llx\n", 3497 "rsp:", save->rsp, "rax:", save->rax); 3498 pr_err("%-15s %016llx %-13s %016llx\n", 3499 "star:", save01->star, "lstar:", save01->lstar); 3500 pr_err("%-15s %016llx %-13s %016llx\n", 3501 "cstar:", save01->cstar, "sfmask:", save01->sfmask); 3502 pr_err("%-15s %016llx %-13s %016llx\n", 3503 "kernel_gs_base:", save01->kernel_gs_base, 3504 "sysenter_cs:", 
save01->sysenter_cs); 3505 pr_err("%-15s %016llx %-13s %016llx\n", 3506 "sysenter_esp:", save01->sysenter_esp, 3507 "sysenter_eip:", save01->sysenter_eip); 3508 pr_err("%-15s %016llx %-13s %016llx\n", 3509 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 3510 pr_err("%-15s %016llx %-13s %016llx\n", 3511 "br_from:", save->br_from, "br_to:", save->br_to); 3512 pr_err("%-15s %016llx %-13s %016llx\n", 3513 "excp_from:", save->last_excp_from, 3514 "excp_to:", save->last_excp_to); 3515 } 3516 svm_check_exit_valid(u64 exit_code)3517 static bool svm_check_exit_valid(u64 exit_code) 3518 { 3519 return (exit_code < ARRAY_SIZE(svm_exit_handlers) && 3520 svm_exit_handlers[exit_code]); 3521 } 3522 svm_handle_invalid_exit(struct kvm_vcpu * vcpu,u64 exit_code)3523 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3524 { 3525 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3526 dump_vmcb(vcpu); 3527 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3528 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3529 vcpu->run->internal.ndata = 2; 3530 vcpu->run->internal.data[0] = exit_code; 3531 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3532 return 0; 3533 } 3534 svm_invoke_exit_handler(struct kvm_vcpu * vcpu,u64 exit_code)3535 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) 3536 { 3537 if (!svm_check_exit_valid(exit_code)) 3538 return svm_handle_invalid_exit(vcpu, exit_code); 3539 3540 #ifdef CONFIG_MITIGATION_RETPOLINE 3541 if (exit_code == SVM_EXIT_MSR) 3542 return msr_interception(vcpu); 3543 else if (exit_code == SVM_EXIT_VINTR) 3544 return interrupt_window_interception(vcpu); 3545 else if (exit_code == SVM_EXIT_INTR) 3546 return intr_interception(vcpu); 3547 else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) 3548 return kvm_emulate_halt(vcpu); 3549 else if (exit_code == SVM_EXIT_NPF) 3550 return npf_interception(vcpu); 3551 #endif 3552 return svm_exit_handlers[exit_code](vcpu); 3553 } 3554 svm_get_exit_info(struct kvm_vcpu * vcpu,u32 * reason,u64 * info1,u64 * info2,u32 * intr_info,u32 * error_code)3555 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 3556 u64 *info1, u64 *info2, 3557 u32 *intr_info, u32 *error_code) 3558 { 3559 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3560 3561 *reason = control->exit_code; 3562 *info1 = control->exit_info_1; 3563 *info2 = control->exit_info_2; 3564 *intr_info = control->exit_int_info; 3565 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3566 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3567 *error_code = control->exit_int_info_err; 3568 else 3569 *error_code = 0; 3570 } 3571 svm_get_entry_info(struct kvm_vcpu * vcpu,u32 * intr_info,u32 * error_code)3572 static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, 3573 u32 *error_code) 3574 { 3575 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3576 3577 *intr_info = control->event_inj; 3578 3579 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3580 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3581 *error_code = control->event_inj_err; 3582 else 3583 *error_code = 0; 3584 3585 } 3586 svm_handle_exit(struct kvm_vcpu * vcpu,fastpath_t exit_fastpath)3587 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 3588 { 3589 struct vcpu_svm *svm = to_svm(vcpu); 3590 struct kvm_run *kvm_run = vcpu->run; 3591 u32 exit_code = svm->vmcb->control.exit_code; 3592 3593 /* SEV-ES guests must use the CR write traps to track CR registers. 
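	 * (once the VMSA is encrypted, the values cannot be read back from
	 * the save area).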
*/ 3594 if (!sev_es_guest(vcpu->kvm)) { 3595 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3596 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3597 if (npt_enabled) 3598 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3599 } 3600 3601 if (is_guest_mode(vcpu)) { 3602 int vmexit; 3603 3604 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); 3605 3606 vmexit = nested_svm_exit_special(svm); 3607 3608 if (vmexit == NESTED_EXIT_CONTINUE) 3609 vmexit = nested_svm_exit_handled(svm); 3610 3611 if (vmexit == NESTED_EXIT_DONE) 3612 return 1; 3613 } 3614 3615 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 3616 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3617 kvm_run->fail_entry.hardware_entry_failure_reason 3618 = svm->vmcb->control.exit_code; 3619 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 3620 dump_vmcb(vcpu); 3621 return 0; 3622 } 3623 3624 if (exit_fastpath != EXIT_FASTPATH_NONE) 3625 return 1; 3626 3627 return svm_invoke_exit_handler(vcpu, exit_code); 3628 } 3629 pre_svm_run(struct kvm_vcpu * vcpu)3630 static int pre_svm_run(struct kvm_vcpu *vcpu) 3631 { 3632 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 3633 struct vcpu_svm *svm = to_svm(vcpu); 3634 3635 /* 3636 * If the previous vmrun of the vmcb occurred on a different physical 3637 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's 3638 * vmcb clean bits are per logical CPU, as are KVM's asid assignments. 3639 */ 3640 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { 3641 svm->current_vmcb->asid_generation = 0; 3642 vmcb_mark_all_dirty(svm->vmcb); 3643 svm->current_vmcb->cpu = vcpu->cpu; 3644 } 3645 3646 if (sev_guest(vcpu->kvm)) 3647 return pre_sev_run(svm, vcpu->cpu); 3648 3649 /* FIXME: handle wraparound of asid_generation */ 3650 if (svm->current_vmcb->asid_generation != sd->asid_generation) 3651 new_asid(svm, sd); 3652 3653 return 0; 3654 } 3655 svm_inject_nmi(struct kvm_vcpu * vcpu)3656 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 3657 { 3658 struct vcpu_svm *svm = to_svm(vcpu); 3659 3660 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3661 3662 if (svm->nmi_l1_to_l2) 3663 return; 3664 3665 /* 3666 * No need to manually track NMI masking when vNMI is enabled, hardware 3667 * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the 3668 * case where software directly injects an NMI. 3669 */ 3670 if (!is_vnmi_enabled(svm)) { 3671 svm->nmi_masked = true; 3672 svm_set_iret_intercept(svm); 3673 } 3674 ++vcpu->stat.nmi_injections; 3675 } 3676 svm_is_vnmi_pending(struct kvm_vcpu * vcpu)3677 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) 3678 { 3679 struct vcpu_svm *svm = to_svm(vcpu); 3680 3681 if (!is_vnmi_enabled(svm)) 3682 return false; 3683 3684 return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK); 3685 } 3686 svm_set_vnmi_pending(struct kvm_vcpu * vcpu)3687 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) 3688 { 3689 struct vcpu_svm *svm = to_svm(vcpu); 3690 3691 if (!is_vnmi_enabled(svm)) 3692 return false; 3693 3694 if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) 3695 return false; 3696 3697 svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; 3698 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 3699 3700 /* 3701 * Because the pending NMI is serviced by hardware, KVM can't know when 3702 * the NMI is "injected", but for all intents and purposes, passing the 3703 * NMI off to hardware counts as injection. 
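	 * That is also why nmi_injections is bumped below even though nothing
	 * is written to event_inj in the vNMI case.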
3704 */ 3705 ++vcpu->stat.nmi_injections; 3706 3707 return true; 3708 } 3709 svm_inject_irq(struct kvm_vcpu * vcpu,bool reinjected)3710 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 3711 { 3712 struct vcpu_svm *svm = to_svm(vcpu); 3713 u32 type; 3714 3715 if (vcpu->arch.interrupt.soft) { 3716 if (svm_update_soft_interrupt_rip(vcpu)) 3717 return; 3718 3719 type = SVM_EVTINJ_TYPE_SOFT; 3720 } else { 3721 type = SVM_EVTINJ_TYPE_INTR; 3722 } 3723 3724 trace_kvm_inj_virq(vcpu->arch.interrupt.nr, 3725 vcpu->arch.interrupt.soft, reinjected); 3726 ++vcpu->stat.irq_injections; 3727 3728 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 3729 SVM_EVTINJ_VALID | type; 3730 } 3731 svm_complete_interrupt_delivery(struct kvm_vcpu * vcpu,int delivery_mode,int trig_mode,int vector)3732 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, 3733 int trig_mode, int vector) 3734 { 3735 /* 3736 * apic->apicv_active must be read after vcpu->mode. 3737 * Pairs with smp_store_release in vcpu_enter_guest. 3738 */ 3739 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); 3740 3741 /* Note, this is called iff the local APIC is in-kernel. */ 3742 if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { 3743 /* Process the interrupt via kvm_check_and_inject_events(). */ 3744 kvm_make_request(KVM_REQ_EVENT, vcpu); 3745 kvm_vcpu_kick(vcpu); 3746 return; 3747 } 3748 3749 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 3750 if (in_guest_mode) { 3751 /* 3752 * Signal the doorbell to tell hardware to inject the IRQ. If 3753 * the vCPU exits the guest before the doorbell chimes, hardware 3754 * will automatically process AVIC interrupts at the next VMRUN. 3755 */ 3756 avic_ring_doorbell(vcpu); 3757 } else { 3758 /* 3759 * Wake the vCPU if it was blocking. KVM will then detect the 3760 * pending IRQ when checking if the vCPU has a wake event. 3761 */ 3762 kvm_vcpu_wake_up(vcpu); 3763 } 3764 } 3765 svm_deliver_interrupt(struct kvm_lapic * apic,int delivery_mode,int trig_mode,int vector)3766 static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 3767 int trig_mode, int vector) 3768 { 3769 kvm_lapic_set_irr(vector, apic); 3770 3771 /* 3772 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in 3773 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before 3774 * the read of guest_mode. This guarantees that either VMRUN will see 3775 * and process the new vIRR entry, or that svm_complete_interrupt_delivery 3776 * will signal the doorbell if the CPU has already entered the guest. 3777 */ 3778 smp_mb__after_atomic(); 3779 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); 3780 } 3781 svm_update_cr8_intercept(struct kvm_vcpu * vcpu,int tpr,int irr)3782 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 3783 { 3784 struct vcpu_svm *svm = to_svm(vcpu); 3785 3786 /* 3787 * SEV-ES guests must always keep the CR intercepts cleared. CR 3788 * tracking is done using the CR write traps. 
3789 */ 3790 if (sev_es_guest(vcpu->kvm)) 3791 return; 3792 3793 if (nested_svm_virtualize_tpr(vcpu)) 3794 return; 3795 3796 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 3797 3798 if (irr == -1) 3799 return; 3800 3801 if (tpr >= irr) 3802 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 3803 } 3804 svm_get_nmi_mask(struct kvm_vcpu * vcpu)3805 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 3806 { 3807 struct vcpu_svm *svm = to_svm(vcpu); 3808 3809 if (is_vnmi_enabled(svm)) 3810 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; 3811 else 3812 return svm->nmi_masked; 3813 } 3814 svm_set_nmi_mask(struct kvm_vcpu * vcpu,bool masked)3815 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 3816 { 3817 struct vcpu_svm *svm = to_svm(vcpu); 3818 3819 if (is_vnmi_enabled(svm)) { 3820 if (masked) 3821 svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; 3822 else 3823 svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; 3824 3825 } else { 3826 svm->nmi_masked = masked; 3827 if (masked) 3828 svm_set_iret_intercept(svm); 3829 else 3830 svm_clr_iret_intercept(svm); 3831 } 3832 } 3833 svm_nmi_blocked(struct kvm_vcpu * vcpu)3834 bool svm_nmi_blocked(struct kvm_vcpu *vcpu) 3835 { 3836 struct vcpu_svm *svm = to_svm(vcpu); 3837 struct vmcb *vmcb = svm->vmcb; 3838 3839 if (!gif_set(svm)) 3840 return true; 3841 3842 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3843 return false; 3844 3845 if (svm_get_nmi_mask(vcpu)) 3846 return true; 3847 3848 return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; 3849 } 3850 svm_nmi_allowed(struct kvm_vcpu * vcpu,bool for_injection)3851 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3852 { 3853 struct vcpu_svm *svm = to_svm(vcpu); 3854 if (svm->nested.nested_run_pending) 3855 return -EBUSY; 3856 3857 if (svm_nmi_blocked(vcpu)) 3858 return 0; 3859 3860 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 3861 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3862 return -EBUSY; 3863 return 1; 3864 } 3865 svm_interrupt_blocked(struct kvm_vcpu * vcpu)3866 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) 3867 { 3868 struct vcpu_svm *svm = to_svm(vcpu); 3869 struct vmcb *vmcb = svm->vmcb; 3870 3871 if (!gif_set(svm)) 3872 return true; 3873 3874 if (is_guest_mode(vcpu)) { 3875 /* As long as interrupts are being delivered... */ 3876 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3877 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) 3878 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3879 return true; 3880 3881 /* ... vmexits aren't blocked by the interrupt shadow */ 3882 if (nested_exit_on_intr(svm)) 3883 return false; 3884 } else { 3885 if (!svm_get_if_flag(vcpu)) 3886 return true; 3887 } 3888 3889 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); 3890 } 3891 svm_interrupt_allowed(struct kvm_vcpu * vcpu,bool for_injection)3892 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3893 { 3894 struct vcpu_svm *svm = to_svm(vcpu); 3895 3896 if (svm->nested.nested_run_pending) 3897 return -EBUSY; 3898 3899 if (svm_interrupt_blocked(vcpu)) 3900 return 0; 3901 3902 /* 3903 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 3904 * e.g. if the IRQ arrived asynchronously after checking nested events. 
3905 */ 3906 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) 3907 return -EBUSY; 3908 3909 return 1; 3910 } 3911 svm_enable_irq_window(struct kvm_vcpu * vcpu)3912 static void svm_enable_irq_window(struct kvm_vcpu *vcpu) 3913 { 3914 struct vcpu_svm *svm = to_svm(vcpu); 3915 3916 /* 3917 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 3918 * 1, because that's a separate STGI/VMRUN intercept. The next time we 3919 * get that intercept, this function will be called again though and 3920 * we'll get the vintr intercept. However, if the vGIF feature is 3921 * enabled, the STGI interception will not occur. Enable the irq 3922 * window under the assumption that the hardware will set the GIF. 3923 */ 3924 if (vgif || gif_set(svm)) { 3925 /* 3926 * IRQ window is not needed when AVIC is enabled, 3927 * unless we have pending ExtINT since it cannot be injected 3928 * via AVIC. In such case, KVM needs to temporarily disable AVIC, 3929 * and fallback to injecting IRQ via V_IRQ. 3930 * 3931 * If running nested, AVIC is already locally inhibited 3932 * on this vCPU, therefore there is no need to request 3933 * the VM wide AVIC inhibition. 3934 */ 3935 if (!is_guest_mode(vcpu)) 3936 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3937 3938 svm_set_vintr(svm); 3939 } 3940 } 3941 svm_enable_nmi_window(struct kvm_vcpu * vcpu)3942 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) 3943 { 3944 struct vcpu_svm *svm = to_svm(vcpu); 3945 3946 /* 3947 * If NMIs are outright masked, i.e. the vCPU is already handling an 3948 * NMI, and KVM has not yet intercepted an IRET, then there is nothing 3949 * more to do at this time as KVM has already enabled IRET intercepts. 3950 * If KVM has already intercepted IRET, then single-step over the IRET, 3951 * as NMIs aren't architecturally unmasked until the IRET completes. 3952 * 3953 * If vNMI is enabled, KVM should never request an NMI window if NMIs 3954 * are masked, as KVM allows at most one to-be-injected NMI and one 3955 * pending NMI. If two NMIs arrive simultaneously, KVM will inject one 3956 * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are 3957 * unmasked. KVM _will_ request an NMI window in some situations, e.g. 3958 * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately 3959 * inject the NMI. In those situations, KVM needs to single-step over 3960 * the STI shadow or intercept STGI. 3961 */ 3962 if (svm_get_nmi_mask(vcpu)) { 3963 WARN_ON_ONCE(is_vnmi_enabled(svm)); 3964 3965 if (!svm->awaiting_iret_completion) 3966 return; /* IRET will cause a vm exit */ 3967 } 3968 3969 /* 3970 * SEV-ES guests are responsible for signaling when a vCPU is ready to 3971 * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. 3972 * KVM can't intercept and single-step IRET to detect when NMIs are 3973 * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. 3974 * 3975 * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware 3976 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not 3977 * supported NAEs in the GHCB protocol. 3978 */ 3979 if (sev_es_guest(vcpu->kvm)) 3980 return; 3981 3982 if (!gif_set(svm)) { 3983 if (vgif) 3984 svm_set_intercept(svm, INTERCEPT_STGI); 3985 return; /* STGI will cause a vm exit */ 3986 } 3987 3988 /* 3989 * Something prevents NMI from been injected. 
Single step over possible 3990 * problem (IRET or exception injection or interrupt shadow) 3991 */ 3992 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 3993 svm->nmi_singlestep = true; 3994 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3995 } 3996 svm_flush_tlb_asid(struct kvm_vcpu * vcpu)3997 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) 3998 { 3999 struct vcpu_svm *svm = to_svm(vcpu); 4000 4001 /* 4002 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. 4003 * A TLB flush for the current ASID flushes both "host" and "guest" TLB 4004 * entries, and thus is a superset of Hyper-V's fine grained flushing. 4005 */ 4006 kvm_hv_vcpu_purge_flush_tlb(vcpu); 4007 4008 /* 4009 * Flush only the current ASID even if the TLB flush was invoked via 4010 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all 4011 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and 4012 * unconditionally does a TLB flush on both nested VM-Enter and nested 4013 * VM-Exit (via kvm_mmu_reset_context()). 4014 */ 4015 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 4016 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 4017 else 4018 svm->current_vmcb->asid_generation--; 4019 } 4020 svm_flush_tlb_current(struct kvm_vcpu * vcpu)4021 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 4022 { 4023 hpa_t root_tdp = vcpu->arch.mmu->root.hpa; 4024 4025 /* 4026 * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly 4027 * flush the NPT mappings via hypercall as flushing the ASID only 4028 * affects virtual to physical mappings, it does not invalidate guest 4029 * physical to host physical mappings. 4030 */ 4031 if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) 4032 hyperv_flush_guest_mapping(root_tdp); 4033 4034 svm_flush_tlb_asid(vcpu); 4035 } 4036 svm_flush_tlb_all(struct kvm_vcpu * vcpu)4037 static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) 4038 { 4039 /* 4040 * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB 4041 * flushes should be routed to hv_flush_remote_tlbs() without requesting 4042 * a "regular" remote flush. Reaching this point means either there's 4043 * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of 4044 * which might be fatal to the guest. Yell, but try to recover. 
4045 */ 4046 if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) 4047 hv_flush_remote_tlbs(vcpu->kvm); 4048 4049 svm_flush_tlb_asid(vcpu); 4050 } 4051 svm_flush_tlb_gva(struct kvm_vcpu * vcpu,gva_t gva)4052 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 4053 { 4054 struct vcpu_svm *svm = to_svm(vcpu); 4055 4056 invlpga(gva, svm->vmcb->control.asid); 4057 } 4058 sync_cr8_to_lapic(struct kvm_vcpu * vcpu)4059 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 4060 { 4061 struct vcpu_svm *svm = to_svm(vcpu); 4062 4063 if (nested_svm_virtualize_tpr(vcpu)) 4064 return; 4065 4066 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { 4067 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 4068 kvm_set_cr8(vcpu, cr8); 4069 } 4070 } 4071 sync_lapic_to_cr8(struct kvm_vcpu * vcpu)4072 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 4073 { 4074 struct vcpu_svm *svm = to_svm(vcpu); 4075 u64 cr8; 4076 4077 if (nested_svm_virtualize_tpr(vcpu) || 4078 kvm_vcpu_apicv_active(vcpu)) 4079 return; 4080 4081 cr8 = kvm_get_cr8(vcpu); 4082 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 4083 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 4084 } 4085 svm_complete_soft_interrupt(struct kvm_vcpu * vcpu,u8 vector,int type)4086 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, 4087 int type) 4088 { 4089 bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); 4090 bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); 4091 struct vcpu_svm *svm = to_svm(vcpu); 4092 4093 /* 4094 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's 4095 * associated with the original soft exception/interrupt. next_rip is 4096 * cleared on all exits that can occur while vectoring an event, so KVM 4097 * needs to manually set next_rip for re-injection. Unlike the !nrips 4098 * case below, this needs to be done if and only if KVM is re-injecting 4099 * the same event, i.e. if the event is a soft exception/interrupt, 4100 * otherwise next_rip is unused on VMRUN. 4101 */ 4102 if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && 4103 kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) 4104 svm->vmcb->control.next_rip = svm->soft_int_next_rip; 4105 /* 4106 * If NRIPS isn't enabled, KVM must manually advance RIP prior to 4107 * injecting the soft exception/interrupt. That advancement needs to 4108 * be unwound if vectoring didn't complete. Note, the new event may 4109 * not be the injected event, e.g. if KVM injected an INTn, the INTn 4110 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will 4111 * be the reported vectored event, but RIP still needs to be unwound. 4112 */ 4113 else if (!nrips && (is_soft || is_exception) && 4114 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) 4115 kvm_rip_write(vcpu, svm->soft_int_old_rip); 4116 } 4117 svm_complete_interrupts(struct kvm_vcpu * vcpu)4118 static void svm_complete_interrupts(struct kvm_vcpu *vcpu) 4119 { 4120 struct vcpu_svm *svm = to_svm(vcpu); 4121 u8 vector; 4122 int type; 4123 u32 exitintinfo = svm->vmcb->control.exit_int_info; 4124 bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; 4125 bool soft_int_injected = svm->soft_int_injected; 4126 4127 svm->nmi_l1_to_l2 = false; 4128 svm->soft_int_injected = false; 4129 4130 /* 4131 * If we've made progress since setting awaiting_iret_completion, we've 4132 * executed an IRET and can allow NMI injection. 
4133 */ 4134 if (svm->awaiting_iret_completion && 4135 kvm_rip_read(vcpu) != svm->nmi_iret_rip) { 4136 svm->awaiting_iret_completion = false; 4137 svm->nmi_masked = false; 4138 kvm_make_request(KVM_REQ_EVENT, vcpu); 4139 } 4140 4141 vcpu->arch.nmi_injected = false; 4142 kvm_clear_exception_queue(vcpu); 4143 kvm_clear_interrupt_queue(vcpu); 4144 4145 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 4146 return; 4147 4148 kvm_make_request(KVM_REQ_EVENT, vcpu); 4149 4150 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 4151 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 4152 4153 if (soft_int_injected) 4154 svm_complete_soft_interrupt(vcpu, vector, type); 4155 4156 switch (type) { 4157 case SVM_EXITINTINFO_TYPE_NMI: 4158 vcpu->arch.nmi_injected = true; 4159 svm->nmi_l1_to_l2 = nmi_l1_to_l2; 4160 break; 4161 case SVM_EXITINTINFO_TYPE_EXEPT: { 4162 u32 error_code = 0; 4163 4164 /* 4165 * Never re-inject a #VC exception. 4166 */ 4167 if (vector == X86_TRAP_VC) 4168 break; 4169 4170 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) 4171 error_code = svm->vmcb->control.exit_int_info_err; 4172 4173 kvm_requeue_exception(vcpu, vector, 4174 exitintinfo & SVM_EXITINTINFO_VALID_ERR, 4175 error_code); 4176 break; 4177 } 4178 case SVM_EXITINTINFO_TYPE_INTR: 4179 kvm_queue_interrupt(vcpu, vector, false); 4180 break; 4181 case SVM_EXITINTINFO_TYPE_SOFT: 4182 kvm_queue_interrupt(vcpu, vector, true); 4183 break; 4184 default: 4185 break; 4186 } 4187 4188 } 4189 svm_cancel_injection(struct kvm_vcpu * vcpu)4190 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 4191 { 4192 struct vcpu_svm *svm = to_svm(vcpu); 4193 struct vmcb_control_area *control = &svm->vmcb->control; 4194 4195 control->exit_int_info = control->event_inj; 4196 control->exit_int_info_err = control->event_inj_err; 4197 control->event_inj = 0; 4198 svm_complete_interrupts(vcpu); 4199 } 4200 svm_vcpu_pre_run(struct kvm_vcpu * vcpu)4201 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 4202 { 4203 if (to_kvm_sev_info(vcpu->kvm)->need_init) 4204 return -EINVAL; 4205 4206 return 1; 4207 } 4208 svm_exit_handlers_fastpath(struct kvm_vcpu * vcpu)4209 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4210 { 4211 struct vcpu_svm *svm = to_svm(vcpu); 4212 4213 if (is_guest_mode(vcpu)) 4214 return EXIT_FASTPATH_NONE; 4215 4216 switch (svm->vmcb->control.exit_code) { 4217 case SVM_EXIT_MSR: 4218 if (!svm->vmcb->control.exit_info_1) 4219 break; 4220 return handle_fastpath_set_msr_irqoff(vcpu); 4221 case SVM_EXIT_HLT: 4222 return handle_fastpath_hlt(vcpu); 4223 default: 4224 break; 4225 } 4226 4227 return EXIT_FASTPATH_NONE; 4228 } 4229 svm_vcpu_enter_exit(struct kvm_vcpu * vcpu,bool spec_ctrl_intercepted)4230 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) 4231 { 4232 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 4233 struct vcpu_svm *svm = to_svm(vcpu); 4234 4235 guest_state_enter_irqoff(); 4236 4237 /* 4238 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of 4239 * VMRUN controls whether or not physical IRQs are masked (KVM always 4240 * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the 4241 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow 4242 * into guest state if delivery of an event during VMRUN triggers a 4243 * #VMEXIT, and the guest_state transitions already tell lockdep that 4244 * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of 4245 * this path, so IRQs aren't actually unmasked while running host code. 
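	 * In other words, it is GIF (cleared by the clgi() in svm_vcpu_run()),
	 * not RFLAGS.IF, that actually keeps host IRQs at bay across VMRUN.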
4246 */ 4247 raw_local_irq_enable(); 4248 4249 amd_clear_divider(); 4250 4251 if (sev_es_guest(vcpu->kvm)) 4252 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, 4253 sev_es_host_save_area(sd)); 4254 else 4255 __svm_vcpu_run(svm, spec_ctrl_intercepted); 4256 4257 raw_local_irq_disable(); 4258 4259 guest_state_exit_irqoff(); 4260 } 4261 svm_vcpu_run(struct kvm_vcpu * vcpu,bool force_immediate_exit)4262 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, 4263 bool force_immediate_exit) 4264 { 4265 struct vcpu_svm *svm = to_svm(vcpu); 4266 bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); 4267 4268 trace_kvm_entry(vcpu, force_immediate_exit); 4269 4270 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4271 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4272 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4273 4274 /* 4275 * Disable singlestep if we're injecting an interrupt/exception. 4276 * We don't want our modified rflags to be pushed on the stack where 4277 * we might not be able to easily reset them if we disabled NMI 4278 * singlestep later. 4279 */ 4280 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { 4281 /* 4282 * Event injection happens before external interrupts cause a 4283 * vmexit and interrupts are disabled here, so smp_send_reschedule 4284 * is enough to force an immediate vmexit. 4285 */ 4286 disable_nmi_singlestep(svm); 4287 force_immediate_exit = true; 4288 } 4289 4290 if (force_immediate_exit) 4291 smp_send_reschedule(vcpu->cpu); 4292 4293 if (pre_svm_run(vcpu)) { 4294 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 4295 vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; 4296 vcpu->run->fail_entry.cpu = vcpu->cpu; 4297 return EXIT_FASTPATH_EXIT_USERSPACE; 4298 } 4299 4300 sync_lapic_to_cr8(vcpu); 4301 4302 if (unlikely(svm->asid != svm->vmcb->control.asid)) { 4303 svm->vmcb->control.asid = svm->asid; 4304 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 4305 } 4306 svm->vmcb->save.cr2 = vcpu->arch.cr2; 4307 4308 svm_hv_update_vp_id(svm->vmcb, vcpu); 4309 4310 /* 4311 * Run with all-zero DR6 unless needed, so that we can get the exact cause 4312 * of a #DB. 4313 */ 4314 if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) 4315 svm_set_dr6(vcpu, DR6_ACTIVE_LOW); 4316 4317 clgi(); 4318 kvm_load_guest_xsave_state(vcpu); 4319 4320 /* 4321 * Hardware only context switches DEBUGCTL if LBR virtualization is 4322 * enabled. Manually load DEBUGCTL if necessary (and restore it after 4323 * VM-Exit), as running with the host's DEBUGCTL can negatively affect 4324 * guest state and can even be fatal, e.g. due to Bus Lock Detect. 4325 */ 4326 if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4327 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4328 update_debugctlmsr(svm->vmcb->save.dbgctl); 4329 4330 kvm_wait_lapic_expire(vcpu); 4331 4332 /* 4333 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 4334 * it's non-zero. Since vmentry is serialising on affected CPUs, there 4335 * is no need to worry about the conditional branch over the wrmsr 4336 * being speculatively taken. 
4337 */ 4338 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 4339 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); 4340 4341 svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); 4342 4343 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 4344 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); 4345 4346 if (!sev_es_guest(vcpu->kvm)) { 4347 vcpu->arch.cr2 = svm->vmcb->save.cr2; 4348 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 4349 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 4350 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 4351 } 4352 vcpu->arch.regs_dirty = 0; 4353 4354 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 4355 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 4356 4357 if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4358 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4359 update_debugctlmsr(vcpu->arch.host_debugctl); 4360 4361 kvm_load_host_xsave_state(vcpu); 4362 stgi(); 4363 4364 /* Any pending NMI will happen here */ 4365 4366 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 4367 kvm_after_interrupt(vcpu); 4368 4369 sync_cr8_to_lapic(vcpu); 4370 4371 svm->next_rip = 0; 4372 if (is_guest_mode(vcpu)) { 4373 nested_sync_control_from_vmcb02(svm); 4374 4375 /* Track VMRUNs that have made past consistency checking */ 4376 if (svm->nested.nested_run_pending && 4377 svm->vmcb->control.exit_code != SVM_EXIT_ERR) 4378 ++vcpu->stat.nested_run; 4379 4380 svm->nested.nested_run_pending = 0; 4381 } 4382 4383 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 4384 vmcb_mark_all_clean(svm->vmcb); 4385 4386 /* if exit due to PF check for async PF */ 4387 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 4388 vcpu->arch.apf.host_apf_flags = 4389 kvm_read_and_reset_apf_flags(); 4390 4391 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 4392 4393 /* 4394 * We need to handle MC intercepts here before the vcpu has a chance to 4395 * change the physical cpu 4396 */ 4397 if (unlikely(svm->vmcb->control.exit_code == 4398 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 4399 svm_handle_mce(vcpu); 4400 4401 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4402 4403 svm_complete_interrupts(vcpu); 4404 4405 return svm_exit_handlers_fastpath(vcpu); 4406 } 4407 svm_load_mmu_pgd(struct kvm_vcpu * vcpu,hpa_t root_hpa,int root_level)4408 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 4409 int root_level) 4410 { 4411 struct vcpu_svm *svm = to_svm(vcpu); 4412 unsigned long cr3; 4413 4414 if (npt_enabled) { 4415 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); 4416 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 4417 4418 hv_track_root_tdp(vcpu, root_hpa); 4419 4420 cr3 = vcpu->arch.cr3; 4421 } else if (root_level >= PT64_ROOT_4LEVEL) { 4422 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); 4423 } else { 4424 /* PCID in the guest should be impossible with a 32-bit MMU. */ 4425 WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); 4426 cr3 = root_hpa; 4427 } 4428 4429 svm->vmcb->save.cr3 = cr3; 4430 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 4431 } 4432 4433 static void svm_patch_hypercall(struct kvm_vcpu * vcpu,unsigned char * hypercall)4434 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 4435 { 4436 /* 4437 * Patch in the VMMCALL instruction: 4438 */ 4439 hypercall[0] = 0x0f; 4440 hypercall[1] = 0x01; 4441 hypercall[2] = 0xd9; 4442 } 4443 4444 /* 4445 * The kvm parameter can be NULL (module initialization, or invocation before 4446 * VM creation). Be sure to check the kvm parameter before using it. 
4447 */ svm_has_emulated_msr(struct kvm * kvm,u32 index)4448 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) 4449 { 4450 switch (index) { 4451 case MSR_IA32_MCG_EXT_CTL: 4452 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 4453 return false; 4454 case MSR_IA32_SMBASE: 4455 if (!IS_ENABLED(CONFIG_KVM_SMM)) 4456 return false; 4457 /* SEV-ES guests do not support SMM, so report false */ 4458 if (kvm && sev_es_guest(kvm)) 4459 return false; 4460 break; 4461 default: 4462 break; 4463 } 4464 4465 return true; 4466 } 4467 svm_vcpu_after_set_cpuid(struct kvm_vcpu * vcpu)4468 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 4469 { 4470 struct vcpu_svm *svm = to_svm(vcpu); 4471 4472 /* 4473 * SVM doesn't provide a way to disable just XSAVES in the guest, KVM 4474 * can only disable all variants of by disallowing CR4.OSXSAVE from 4475 * being set. As a result, if the host has XSAVE and XSAVES, and the 4476 * guest has XSAVE enabled, the guest can execute XSAVES without 4477 * faulting. Treat XSAVES as enabled in this case regardless of 4478 * whether it's advertised to the guest so that KVM context switches 4479 * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give 4480 * the guest read/write access to the host's XSS. 4481 */ 4482 guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, 4483 boot_cpu_has(X86_FEATURE_XSAVES) && 4484 guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); 4485 4486 /* 4487 * Intercept VMLOAD if the vCPU model is Intel in order to emulate that 4488 * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing 4489 * SVM on Intel is bonkers and extremely unlikely to work). 4490 */ 4491 if (guest_cpuid_is_intel_compatible(vcpu)) 4492 guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); 4493 4494 svm_recalc_instruction_intercepts(vcpu, svm); 4495 4496 if (boot_cpu_has(X86_FEATURE_IBPB)) 4497 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 4498 !!guest_has_pred_cmd_msr(vcpu)); 4499 4500 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 4501 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, 4502 !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 4503 4504 if (sev_guest(vcpu->kvm)) 4505 sev_vcpu_after_set_cpuid(svm); 4506 4507 init_vmcb_after_set_cpuid(vcpu); 4508 } 4509 svm_has_wbinvd_exit(void)4510 static bool svm_has_wbinvd_exit(void) 4511 { 4512 return true; 4513 } 4514 4515 #define PRE_EX(exit) { .exit_code = (exit), \ 4516 .stage = X86_ICPT_PRE_EXCEPT, } 4517 #define POST_EX(exit) { .exit_code = (exit), \ 4518 .stage = X86_ICPT_POST_EXCEPT, } 4519 #define POST_MEM(exit) { .exit_code = (exit), \ 4520 .stage = X86_ICPT_POST_MEMACCESS, } 4521 4522 static const struct __x86_intercept { 4523 u32 exit_code; 4524 enum x86_intercept_stage stage; 4525 } x86_intercept_map[] = { 4526 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), 4527 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), 4528 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), 4529 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), 4530 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), 4531 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), 4532 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), 4533 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), 4534 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), 4535 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), 4536 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), 4537 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), 4538 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), 4539 
[x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), 4540 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), 4541 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), 4542 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), 4543 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), 4544 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), 4545 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), 4546 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), 4547 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), 4548 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), 4549 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), 4550 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), 4551 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), 4552 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), 4553 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), 4554 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), 4555 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), 4556 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), 4557 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), 4558 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), 4559 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), 4560 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), 4561 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), 4562 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), 4563 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), 4564 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), 4565 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), 4566 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), 4567 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), 4568 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), 4569 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), 4570 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), 4571 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), 4572 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), 4573 }; 4574 4575 #undef PRE_EX 4576 #undef POST_EX 4577 #undef POST_MEM 4578 svm_check_intercept(struct kvm_vcpu * vcpu,struct x86_instruction_info * info,enum x86_intercept_stage stage,struct x86_exception * exception)4579 static int svm_check_intercept(struct kvm_vcpu *vcpu, 4580 struct x86_instruction_info *info, 4581 enum x86_intercept_stage stage, 4582 struct x86_exception *exception) 4583 { 4584 struct vcpu_svm *svm = to_svm(vcpu); 4585 int vmexit, ret = X86EMUL_CONTINUE; 4586 struct __x86_intercept icpt_info; 4587 struct vmcb *vmcb = svm->vmcb; 4588 4589 if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) 4590 goto out; 4591 4592 icpt_info = x86_intercept_map[info->intercept]; 4593 4594 if (stage != icpt_info.stage) 4595 goto out; 4596 4597 switch (icpt_info.exit_code) { 4598 case SVM_EXIT_READ_CR0: 4599 if (info->intercept == x86_intercept_cr_read) 4600 icpt_info.exit_code += info->modrm_reg; 4601 break; 4602 case SVM_EXIT_WRITE_CR0: { 4603 unsigned long cr0, val; 4604 4605 if (info->intercept == x86_intercept_cr_write) 4606 icpt_info.exit_code += info->modrm_reg; 4607 4608 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || 4609 info->intercept == x86_intercept_clts) 4610 break; 4611 4612 if (!(vmcb12_is_intercept(&svm->nested.ctl, 4613 INTERCEPT_SELECTIVE_CR0))) 4614 break; 4615 4616 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; 4617 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; 4618 4619 if (info->intercept == x86_intercept_lmsw) { 4620 cr0 &= 0xfUL; 4621 val &= 0xfUL; 4622 /* lmsw can't clear PE - catch this here */ 4623 if (cr0 & X86_CR0_PE) 4624 val |= X86_CR0_PE; 4625 } 4626 4627 if (cr0 ^ val) 4628 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; 4629 4630 break; 4631 } 4632 case 
SVM_EXIT_READ_DR0: 4633 case SVM_EXIT_WRITE_DR0: 4634 icpt_info.exit_code += info->modrm_reg; 4635 break; 4636 case SVM_EXIT_MSR: 4637 if (info->intercept == x86_intercept_wrmsr) 4638 vmcb->control.exit_info_1 = 1; 4639 else 4640 vmcb->control.exit_info_1 = 0; 4641 break; 4642 case SVM_EXIT_PAUSE: 4643 /* 4644 * We get this for NOP only, but pause 4645 * is rep not, check this here 4646 */ 4647 if (info->rep_prefix != REPE_PREFIX) 4648 goto out; 4649 break; 4650 case SVM_EXIT_IOIO: { 4651 u64 exit_info; 4652 u32 bytes; 4653 4654 if (info->intercept == x86_intercept_in || 4655 info->intercept == x86_intercept_ins) { 4656 exit_info = ((info->src_val & 0xffff) << 16) | 4657 SVM_IOIO_TYPE_MASK; 4658 bytes = info->dst_bytes; 4659 } else { 4660 exit_info = (info->dst_val & 0xffff) << 16; 4661 bytes = info->src_bytes; 4662 } 4663 4664 if (info->intercept == x86_intercept_outs || 4665 info->intercept == x86_intercept_ins) 4666 exit_info |= SVM_IOIO_STR_MASK; 4667 4668 if (info->rep_prefix) 4669 exit_info |= SVM_IOIO_REP_MASK; 4670 4671 bytes = min(bytes, 4u); 4672 4673 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; 4674 4675 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); 4676 4677 vmcb->control.exit_info_1 = exit_info; 4678 vmcb->control.exit_info_2 = info->next_rip; 4679 4680 break; 4681 } 4682 default: 4683 break; 4684 } 4685 4686 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ 4687 if (static_cpu_has(X86_FEATURE_NRIPS)) 4688 vmcb->control.next_rip = info->next_rip; 4689 vmcb->control.exit_code = icpt_info.exit_code; 4690 vmexit = nested_svm_exit_handled(svm); 4691 4692 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED 4693 : X86EMUL_CONTINUE; 4694 4695 out: 4696 return ret; 4697 } 4698 svm_handle_exit_irqoff(struct kvm_vcpu * vcpu)4699 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 4700 { 4701 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) 4702 vcpu->arch.at_instruction_boundary = true; 4703 } 4704 svm_setup_mce(struct kvm_vcpu * vcpu)4705 static void svm_setup_mce(struct kvm_vcpu *vcpu) 4706 { 4707 /* [63:9] are reserved. */ 4708 vcpu->arch.mcg_cap &= 0x1ff; 4709 } 4710 4711 #ifdef CONFIG_KVM_SMM svm_smi_blocked(struct kvm_vcpu * vcpu)4712 bool svm_smi_blocked(struct kvm_vcpu *vcpu) 4713 { 4714 struct vcpu_svm *svm = to_svm(vcpu); 4715 4716 /* Per APM Vol.2 15.22.2 "Response to SMI" */ 4717 if (!gif_set(svm)) 4718 return true; 4719 4720 return is_smm(vcpu); 4721 } 4722 svm_smi_allowed(struct kvm_vcpu * vcpu,bool for_injection)4723 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4724 { 4725 struct vcpu_svm *svm = to_svm(vcpu); 4726 if (svm->nested.nested_run_pending) 4727 return -EBUSY; 4728 4729 if (svm_smi_blocked(vcpu)) 4730 return 0; 4731 4732 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ 4733 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) 4734 return -EBUSY; 4735 4736 return 1; 4737 } 4738 svm_enter_smm(struct kvm_vcpu * vcpu,union kvm_smram * smram)4739 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 4740 { 4741 struct vcpu_svm *svm = to_svm(vcpu); 4742 struct kvm_host_map map_save; 4743 int ret; 4744 4745 if (!is_guest_mode(vcpu)) 4746 return 0; 4747 4748 /* 4749 * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is 4750 * responsible for ensuring nested SVM and SMIs are mutually exclusive. 
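	 * Without long mode there is no 64-bit SMRAM layout in which to stash
	 * the SVM state, hence the early return below for vCPUs lacking
	 * X86_FEATURE_LM.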
4751 */ 4752 4753 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 4754 return 1; 4755 4756 smram->smram64.svm_guest_flag = 1; 4757 smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; 4758 4759 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4760 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4761 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4762 4763 ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); 4764 if (ret) 4765 return ret; 4766 4767 /* 4768 * KVM uses VMCB01 to store L1 host state while L2 runs but 4769 * VMCB01 is going to be used during SMM and thus the state will 4770 * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save 4771 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the 4772 * format of the area is identical to guest save area offsetted 4773 * by 0x400 (matches the offset of 'struct vmcb_save_area' 4774 * within 'struct vmcb'). Note: HSAVE area may also be used by 4775 * L1 hypervisor to save additional host context (e.g. KVM does 4776 * that, see svm_prepare_switch_to_guest()) which must be 4777 * preserved. 4778 */ 4779 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) 4780 return 1; 4781 4782 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); 4783 4784 svm_copy_vmrun_state(map_save.hva + 0x400, 4785 &svm->vmcb01.ptr->save); 4786 4787 kvm_vcpu_unmap(vcpu, &map_save); 4788 return 0; 4789 } 4790 svm_leave_smm(struct kvm_vcpu * vcpu,const union kvm_smram * smram)4791 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 4792 { 4793 struct vcpu_svm *svm = to_svm(vcpu); 4794 struct kvm_host_map map, map_save; 4795 struct vmcb *vmcb12; 4796 int ret; 4797 4798 const struct kvm_smram_state_64 *smram64 = &smram->smram64; 4799 4800 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 4801 return 0; 4802 4803 /* Non-zero if SMI arrived while vCPU was in guest mode. */ 4804 if (!smram64->svm_guest_flag) 4805 return 0; 4806 4807 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) 4808 return 1; 4809 4810 if (!(smram64->efer & EFER_SVME)) 4811 return 1; 4812 4813 if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) 4814 return 1; 4815 4816 ret = 1; 4817 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) 4818 goto unmap_map; 4819 4820 if (svm_allocate_nested(svm)) 4821 goto unmap_save; 4822 4823 /* 4824 * Restore L1 host state from L1 HSAVE area as VMCB01 was 4825 * used during SMM (see svm_enter_smm()) 4826 */ 4827 4828 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); 4829 4830 /* 4831 * Enter the nested guest now 4832 */ 4833 4834 vmcb_mark_all_dirty(svm->vmcb01.ptr); 4835 4836 vmcb12 = map.hva; 4837 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); 4838 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); 4839 ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false); 4840 4841 if (ret) 4842 goto unmap_save; 4843 4844 svm->nested.nested_run_pending = 1; 4845 4846 unmap_save: 4847 kvm_vcpu_unmap(vcpu, &map_save); 4848 unmap_map: 4849 kvm_vcpu_unmap(vcpu, &map); 4850 return ret; 4851 } 4852 svm_enable_smi_window(struct kvm_vcpu * vcpu)4853 static void svm_enable_smi_window(struct kvm_vcpu *vcpu) 4854 { 4855 struct vcpu_svm *svm = to_svm(vcpu); 4856 4857 if (!gif_set(svm)) { 4858 if (vgif) 4859 svm_set_intercept(svm, INTERCEPT_STGI); 4860 /* STGI will cause a vm exit */ 4861 } else { 4862 /* We must be in SMM; RSM will cause a vmexit anyway. 
		 */
4863 	}
4864 }
4865 #endif
4866 
4867 static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4868 					 void *insn, int insn_len)
4869 {
4870 	struct vcpu_svm *svm = to_svm(vcpu);
4871 	bool smep, smap, is_user;
4872 	u64 error_code;
4873 
4874 	/* Check that emulation is possible during event vectoring */
4875 	if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
4876 	    !kvm_can_emulate_event_vectoring(emul_type))
4877 		return X86EMUL_UNHANDLEABLE_VECTORING;
4878 
4879 	/* Emulation is always possible when KVM has access to all guest state. */
4880 	if (!sev_guest(vcpu->kvm))
4881 		return X86EMUL_CONTINUE;
4882 
4883 	/* #UD and #GP should never be intercepted for SEV guests. */
4884 	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4885 				  EMULTYPE_TRAP_UD_FORCED |
4886 				  EMULTYPE_VMWARE_GP));
4887 
4888 	/*
4889 	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4890 	 * to guest register state.
4891 	 */
4892 	if (sev_es_guest(vcpu->kvm))
4893 		return X86EMUL_RETRY_INSTR;
4894 
4895 	/*
4896 	 * Emulation is possible if the instruction is already decoded, e.g.
4897 	 * when completing I/O after returning from userspace.
4898 	 */
4899 	if (emul_type & EMULTYPE_NO_DECODE)
4900 		return X86EMUL_CONTINUE;
4901 
4902 	/*
4903 	 * Emulation is possible for SEV guests if and only if a prefilled
4904 	 * buffer containing the bytes of the intercepted instruction is
4905 	 * available. SEV guest memory is encrypted with a guest specific key
4906 	 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4907 	 * decode garbage.
4908 	 *
4909 	 * If KVM is NOT trying to simply skip an instruction, inject #UD if
4910 	 * KVM reached this point without an instruction buffer. In practice,
4911 	 * this path should never be hit by a well-behaved guest, e.g. KVM
4912 	 * doesn't intercept #UD or #GP for SEV guests, but this path is still
4913 	 * theoretically reachable, e.g. via unaccelerated fault-like AVIC
4914 	 * access, and needs to be handled by KVM to avoid putting the guest
4915 	 * into an infinite loop. Injecting #UD is somewhat arbitrary, but
4916 	 * it's the least awful option given lack of insight into the guest.
4917 	 *
4918 	 * If KVM is trying to skip an instruction, simply resume the guest.
4919 	 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
4920 	 * will attempt to re-inject the INT3/INTO and skip the instruction.
4921 	 * In that scenario, retrying the INT3/INTO and hoping the guest will
4922 	 * make forward progress is the only option that has a chance of
4923 	 * success (and in practice it will work the vast majority of the time).
4924 	 */
4925 	if (unlikely(!insn)) {
4926 		if (emul_type & EMULTYPE_SKIP)
4927 			return X86EMUL_UNHANDLEABLE;
4928 
4929 		kvm_queue_exception(vcpu, UD_VECTOR);
4930 		return X86EMUL_PROPAGATE_FAULT;
4931 	}
4932 
4933 	/*
4934 	 * Emulate for SEV guests if the insn buffer is not empty. The buffer
4935 	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4936 	 * the faulting instruction because the code fetch itself faulted, e.g.
4937 	 * the guest attempted to fetch from emulated MMIO or a guest page
4938 	 * table used to translate CS:RIP resides in emulated MMIO.
4939 	 */
4940 	if (likely(insn_len))
4941 		return X86EMUL_CONTINUE;
4942 
4943 	/*
4944 	 * Detect and work around Errata 1096 Fam_17h_00_0Fh.
4945 	 *
4946 	 * Errata:
4947 	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4948 	 * possible that CPU microcode implementing DecodeAssist will fail to
4949 	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4950 	 * be '0'. This happens because microcode reads CS:RIP using a _data_
4951 	 * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
4952 	 * gives up and does not fill the instruction bytes buffer.
4953 	 *
4954 	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4955 	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4956 	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4957 	 * GuestIntrBytes field of the VMCB.
4958 	 *
4959 	 * This does _not_ mean that the erratum has been encountered, as the
4960 	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4961 	 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4962 	 * encountered a reserved/not-present #PF.
4963 	 *
4964 	 * To hit the erratum, the following conditions must be true:
4965 	 *    1. CR4.SMAP=1 (obviously).
4966 	 *    2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
4967 	 *       have been hit as the guest would have encountered a SMEP
4968 	 *       violation #PF, not a #NPF.
4969 	 *    3. The #NPF is not due to a code fetch, in which case failure to
4970 	 *       retrieve the instruction bytes is legitimate (see above).
4971 	 *
4972 	 * In addition, don't apply the erratum workaround if the #NPF occurred
4973 	 * while translating guest page tables (see below).
4974 	 */
4975 	error_code = svm->vmcb->control.exit_info_1;
4976 	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4977 		goto resume_guest;
4978 
4979 	smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
4980 	smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
4981 	is_user = svm_get_cpl(vcpu) == 3;
4982 	if (smap && (!smep || is_user)) {
4983 		pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4984 
4985 		/*
4986 		 * If the fault occurred in userspace, arbitrarily inject #GP
4987 		 * to avoid killing the guest and to hopefully avoid confusing
4988 		 * the guest kernel too much, e.g. injecting #PF would not be
4989 		 * coherent with respect to the guest's page tables. Request
4990 		 * triple fault if the fault occurred in the kernel as there's
4991 		 * no fault that KVM can inject without confusing the guest.
4992 		 * In practice, the triple fault is moot as no sane SEV kernel
4993 		 * will execute from user memory while also running with SMAP=1.
4994 		 */
4995 		if (is_user)
4996 			kvm_inject_gp(vcpu, 0);
4997 		else
4998 			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4999 		return X86EMUL_PROPAGATE_FAULT;
5000 	}
5001 
5002 resume_guest:
5003 	/*
5004 	 * If the erratum was not hit, simply resume the guest and let it fault
5005 	 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
5006 	 * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
5007 	 * userspace will kill the guest, and letting the emulator read garbage
5008 	 * will yield random behavior and potentially corrupt the guest.
5009 	 *
5010 	 * Simply resuming the guest is technically not a violation of the SEV
5011 	 * architecture. AMD's APM states that all code fetches and page table
5012 	 * accesses for SEV guests are encrypted, regardless of the C-Bit. The
5013 	 * APM also states that encrypted accesses to MMIO are "ignored", but
5014 	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
5015 	 * the guest spin is technically "ignoring" the access.
5016 */ 5017 return X86EMUL_RETRY_INSTR; 5018 } 5019 svm_apic_init_signal_blocked(struct kvm_vcpu * vcpu)5020 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 5021 { 5022 struct vcpu_svm *svm = to_svm(vcpu); 5023 5024 return !gif_set(svm); 5025 } 5026 svm_vcpu_deliver_sipi_vector(struct kvm_vcpu * vcpu,u8 vector)5027 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 5028 { 5029 if (!sev_es_guest(vcpu->kvm)) 5030 return kvm_vcpu_deliver_sipi_vector(vcpu, vector); 5031 5032 sev_vcpu_deliver_sipi_vector(vcpu, vector); 5033 } 5034 svm_vm_destroy(struct kvm * kvm)5035 static void svm_vm_destroy(struct kvm *kvm) 5036 { 5037 avic_vm_destroy(kvm); 5038 sev_vm_destroy(kvm); 5039 } 5040 svm_vm_init(struct kvm * kvm)5041 static int svm_vm_init(struct kvm *kvm) 5042 { 5043 int type = kvm->arch.vm_type; 5044 5045 if (type != KVM_X86_DEFAULT_VM && 5046 type != KVM_X86_SW_PROTECTED_VM) { 5047 kvm->arch.has_protected_state = 5048 (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); 5049 to_kvm_sev_info(kvm)->need_init = true; 5050 5051 kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); 5052 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 5053 } 5054 5055 if (!pause_filter_count || !pause_filter_thresh) 5056 kvm->arch.pause_in_guest = true; 5057 5058 if (enable_apicv) { 5059 int ret = avic_vm_init(kvm); 5060 if (ret) 5061 return ret; 5062 } 5063 5064 return 0; 5065 } 5066 svm_alloc_apic_backing_page(struct kvm_vcpu * vcpu)5067 static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) 5068 { 5069 struct page *page = snp_safe_alloc_page(); 5070 5071 if (!page) 5072 return NULL; 5073 5074 return page_address(page); 5075 } 5076 5077 static struct kvm_x86_ops svm_x86_ops __initdata = { 5078 .name = KBUILD_MODNAME, 5079 5080 .check_processor_compatibility = svm_check_processor_compat, 5081 5082 .hardware_unsetup = svm_hardware_unsetup, 5083 .enable_virtualization_cpu = svm_enable_virtualization_cpu, 5084 .disable_virtualization_cpu = svm_disable_virtualization_cpu, 5085 .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, 5086 .has_emulated_msr = svm_has_emulated_msr, 5087 5088 .vcpu_create = svm_vcpu_create, 5089 .vcpu_free = svm_vcpu_free, 5090 .vcpu_reset = svm_vcpu_reset, 5091 5092 .vm_size = sizeof(struct kvm_svm), 5093 .vm_init = svm_vm_init, 5094 .vm_destroy = svm_vm_destroy, 5095 5096 .prepare_switch_to_guest = svm_prepare_switch_to_guest, 5097 .vcpu_load = svm_vcpu_load, 5098 .vcpu_put = svm_vcpu_put, 5099 .vcpu_blocking = avic_vcpu_blocking, 5100 .vcpu_unblocking = avic_vcpu_unblocking, 5101 5102 .update_exception_bitmap = svm_update_exception_bitmap, 5103 .get_feature_msr = svm_get_feature_msr, 5104 .get_msr = svm_get_msr, 5105 .set_msr = svm_set_msr, 5106 .get_segment_base = svm_get_segment_base, 5107 .get_segment = svm_get_segment, 5108 .set_segment = svm_set_segment, 5109 .get_cpl = svm_get_cpl, 5110 .get_cpl_no_cache = svm_get_cpl, 5111 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 5112 .is_valid_cr0 = svm_is_valid_cr0, 5113 .set_cr0 = svm_set_cr0, 5114 .post_set_cr3 = sev_post_set_cr3, 5115 .is_valid_cr4 = svm_is_valid_cr4, 5116 .set_cr4 = svm_set_cr4, 5117 .set_efer = svm_set_efer, 5118 .get_idt = svm_get_idt, 5119 .set_idt = svm_set_idt, 5120 .get_gdt = svm_get_gdt, 5121 .set_gdt = svm_set_gdt, 5122 .set_dr6 = svm_set_dr6, 5123 .set_dr7 = svm_set_dr7, 5124 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, 5125 .cache_reg = svm_cache_reg, 5126 .get_rflags = svm_get_rflags, 5127 .set_rflags = svm_set_rflags, 
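	/*
	 * Note: common x86 code does not call these function pointers
	 * directly; they are consumed through the kvm_x86_ops static-call
	 * wrappers generated in x86, roughly along the lines of
	 *
	 *	if (kvm_x86_call(get_if_flag)(vcpu))
	 *		...
	 *
	 * The exact wrapper name is illustrative only and depends on the
	 * kernel version.
	 */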
5128 .get_if_flag = svm_get_if_flag, 5129 5130 .flush_tlb_all = svm_flush_tlb_all, 5131 .flush_tlb_current = svm_flush_tlb_current, 5132 .flush_tlb_gva = svm_flush_tlb_gva, 5133 .flush_tlb_guest = svm_flush_tlb_asid, 5134 5135 .vcpu_pre_run = svm_vcpu_pre_run, 5136 .vcpu_run = svm_vcpu_run, 5137 .handle_exit = svm_handle_exit, 5138 .skip_emulated_instruction = svm_skip_emulated_instruction, 5139 .update_emulated_instruction = NULL, 5140 .set_interrupt_shadow = svm_set_interrupt_shadow, 5141 .get_interrupt_shadow = svm_get_interrupt_shadow, 5142 .patch_hypercall = svm_patch_hypercall, 5143 .inject_irq = svm_inject_irq, 5144 .inject_nmi = svm_inject_nmi, 5145 .is_vnmi_pending = svm_is_vnmi_pending, 5146 .set_vnmi_pending = svm_set_vnmi_pending, 5147 .inject_exception = svm_inject_exception, 5148 .cancel_injection = svm_cancel_injection, 5149 .interrupt_allowed = svm_interrupt_allowed, 5150 .nmi_allowed = svm_nmi_allowed, 5151 .get_nmi_mask = svm_get_nmi_mask, 5152 .set_nmi_mask = svm_set_nmi_mask, 5153 .enable_nmi_window = svm_enable_nmi_window, 5154 .enable_irq_window = svm_enable_irq_window, 5155 .update_cr8_intercept = svm_update_cr8_intercept, 5156 5157 .x2apic_icr_is_split = true, 5158 .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, 5159 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 5160 .apicv_post_state_restore = avic_apicv_post_state_restore, 5161 .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, 5162 5163 .get_exit_info = svm_get_exit_info, 5164 .get_entry_info = svm_get_entry_info, 5165 5166 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, 5167 5168 .has_wbinvd_exit = svm_has_wbinvd_exit, 5169 5170 .get_l2_tsc_offset = svm_get_l2_tsc_offset, 5171 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, 5172 .write_tsc_offset = svm_write_tsc_offset, 5173 .write_tsc_multiplier = svm_write_tsc_multiplier, 5174 5175 .load_mmu_pgd = svm_load_mmu_pgd, 5176 5177 .check_intercept = svm_check_intercept, 5178 .handle_exit_irqoff = svm_handle_exit_irqoff, 5179 5180 .nested_ops = &svm_nested_ops, 5181 5182 .deliver_interrupt = svm_deliver_interrupt, 5183 .pi_update_irte = avic_pi_update_irte, 5184 .setup_mce = svm_setup_mce, 5185 5186 #ifdef CONFIG_KVM_SMM 5187 .smi_allowed = svm_smi_allowed, 5188 .enter_smm = svm_enter_smm, 5189 .leave_smm = svm_leave_smm, 5190 .enable_smi_window = svm_enable_smi_window, 5191 #endif 5192 5193 #ifdef CONFIG_KVM_AMD_SEV 5194 .dev_get_attr = sev_dev_get_attr, 5195 .mem_enc_ioctl = sev_mem_enc_ioctl, 5196 .mem_enc_register_region = sev_mem_enc_register_region, 5197 .mem_enc_unregister_region = sev_mem_enc_unregister_region, 5198 .guest_memory_reclaimed = sev_guest_memory_reclaimed, 5199 5200 .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, 5201 .vm_move_enc_context_from = sev_vm_move_enc_context_from, 5202 #endif 5203 .check_emulate_instruction = svm_check_emulate_instruction, 5204 5205 .apic_init_signal_blocked = svm_apic_init_signal_blocked, 5206 5207 .msr_filter_changed = svm_msr_filter_changed, 5208 .complete_emulated_msr = svm_complete_emulated_msr, 5209 5210 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, 5211 .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, 5212 .alloc_apic_backing_page = svm_alloc_apic_backing_page, 5213 5214 .gmem_prepare = sev_gmem_prepare, 5215 .gmem_invalidate = sev_gmem_invalidate, 5216 .private_max_mapping_level = sev_private_max_mapping_level, 5217 }; 5218 5219 /* 5220 * The default MMIO mask is a single bit (excluding the present bit), 5221 * which could conflict with 
 * which could conflict with the memory encryption bit. Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
	unsigned int enc_bit, mask_bit;
	u64 msr, mask;

	/* If there is no memory encryption support, use existing mask */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return;

	/* If memory encryption is not enabled, use existing mask */
	rdmsrl(MSR_AMD64_SYSCFG, msr);
	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
		return;

	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
	mask_bit = boot_cpu_data.x86_phys_bits;

	/* Increment the mask bit if it is the same as the encryption bit */
	if (enc_bit == mask_bit)
		mask_bit++;

	/*
	 * If the mask bit location is below 52, then some bits above the
	 * physical addressing limit will always be reserved, so use the
	 * rsvd_bits() function to generate the mask. This mask, along with
	 * the present bit, will be used to generate a page fault with
	 * PFER.RSV = 1.
	 *
	 * If the mask bit location is 52 (or above), then clear the mask.
	 */
	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}

static __init void svm_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

	kvm_caps.supported_perf_cap = 0;
	kvm_caps.supported_xss = 0;

	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
		kvm_cpu_cap_set(X86_FEATURE_SVM);
		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);

		/*
		 * KVM currently flushes TLBs on *every* nested SVM transition,
		 * and so for all intents and purposes KVM supports flushing by
		 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
		 */
		kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);

		if (nrips)
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);

		if (tsc_scaling)
			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);

		if (vls)
			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
		if (lbrv)
			kvm_cpu_cap_set(X86_FEATURE_LBRV);

		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);

		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

		if (vgif)
			kvm_cpu_cap_set(X86_FEATURE_VGIF);

		if (vnmi)
			kvm_cpu_cap_set(X86_FEATURE_VNMI);

		/* Nested VM can receive #VMEXIT instead of triggering #GP */
		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
	}

	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	if (enable_pmu) {
		/*
		 * Enumerate support for PERFCTR_CORE if and only if KVM has
		 * access to enough counters to virtualize "core" support,
		 * otherwise limit vPMU support to the legacy number of counters.
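		 *
		 * For example, if perf reports fewer than the six core
		 * counters (AMD64_NUM_COUNTERS_CORE), the guest is presumably
		 * limited to at most the four legacy counters
		 * (AMD64_NUM_COUNTERS) by the min() below.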
		 */
		if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
			kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
							  kvm_pmu_cap.num_counters_gp);
		else
			kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);

		if (kvm_pmu_cap.version != 2 ||
		    !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
			kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
	}

	/* CPUID 0x8000001F (SME/SEV features) */
	sev_set_cpu_caps();

	/* Don't advertise Bus Lock Detect to guest if SVM support is absent */
	kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
}

static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
	void *iopm_va;
	int r;
	unsigned int order = get_order(IOPM_SIZE);

	/*
	 * NX is required for shadow paging and for NPT if the NX huge pages
	 * mitigation is enabled.
	 */
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}
	kvm_enable_efer_bits(EFER_NX);

	iopm_pages = alloc_pages(GFP_KERNEL, order);

	if (!iopm_pages)
		return -ENOMEM;

	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
	iopm_base = __sme_page_pa(iopm_pages);

	init_msrpm_offsets();

	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
				     XFEATURE_MASK_BNDCSR);

	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

	if (tsc_scaling) {
		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
			tsc_scaling = false;
		} else {
			pr_info("TSC scaling supported\n");
			kvm_caps.has_tsc_control = true;
		}
	}
	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
	kvm_caps.tsc_scaling_ratio_frac_bits = 32;

	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

	if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
		kvm_enable_efer_bits(EFER_AUTOIBRS);

	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

	if (nested) {
		pr_info("Nested Virtualization enabled\n");
		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
	}

	/*
	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
	 * NPT isn't supported if the host is using 2-level paging since host
	 * CR4 is unchanged on VMRUN.
	 */
	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
		npt_enabled = false;

	if (!boot_cpu_has(X86_FEATURE_NPT))
		npt_enabled = false;

	/* Force VM NPT level equal to the host's paging level */
	kvm_configure_mmu(npt_enabled, get_npt_level(),
			  get_npt_level(), PG_LEVEL_1G);
	pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled));

	/* Setup shadow_me_value and shadow_me_mask */
	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);

	svm_adjust_mmio_mask();

	nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);

	if (lbrv) {
		if (!boot_cpu_has(X86_FEATURE_LBRV))
			lbrv = false;
		else
			pr_info("LBR virtualization supported\n");
	}
	/*
	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
	 * may be modified by svm_adjust_mmio_mask()), as well as nrips.
	 */
	sev_hardware_setup();

	svm_hv_hardware_setup();

	for_each_possible_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err;
	}

	enable_apicv = avic = avic && avic_hardware_setup();

	if (!enable_apicv) {
		svm_x86_ops.vcpu_blocking = NULL;
		svm_x86_ops.vcpu_unblocking = NULL;
		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
	} else if (!x2avic_enabled) {
		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
	}

	if (vls) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
		svm_gp_erratum_intercept = false;

	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
	if (vnmi)
		pr_info("Virtual NMI enabled\n");

	if (!vnmi) {
		svm_x86_ops.is_vnmi_pending = NULL;
		svm_x86_ops.set_vnmi_pending = NULL;
	}

	if (!enable_pmu)
		pr_info("PMU virtualization is disabled\n");

	svm_set_cpu_caps();

	/*
	 * It seems that on AMD processors the PTE's accessed bit is
	 * set by the CPU hardware before the NPF vmexit.  This is not
	 * expected behaviour and our tests fail because of it.
	 * A workaround here is to disable support for
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
	 * In this case userspace can know if there is support using
	 * the KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to
	 * handle it.
	 * If future AMD CPU models change the behaviour described above,
	 * this variable can be changed accordingly.
	 */
	allow_smaller_maxphyaddr = !npt_enabled;

	return 0;

err:
	svm_hardware_unsetup();
	return r;
}

static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.hardware_setup = svm_hardware_setup,

	.runtime_ops = &svm_x86_ops,
	.pmu_ops = &amd_pmu_ops,
};

static void __svm_exit(void)
{
	kvm_x86_vendor_exit();
}

static int __init svm_init(void)
{
	int r;

	__unused_size_checks();

	if (!kvm_is_svm_supported())
		return -EOPNOTSUPP;

	r = kvm_x86_vendor_init(&svm_init_ops);
	if (r)
		return r;

	/*
	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
	 * exposed to userspace!
	 */
	r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
		     THIS_MODULE);
	if (r)
		goto err_kvm_init;

	return 0;

err_kvm_init:
	__svm_exit();
	return r;
}

static void __exit svm_exit(void)
{
	kvm_exit();
	__svm_exit();
}

module_init(svm_init)
module_exit(svm_exit)