1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 2 3 #include <linux/kvm_host.h> 4 5 #include "irq.h" 6 #include "mmu.h" 7 #include "kvm_cache_regs.h" 8 #include "x86.h" 9 #include "smm.h" 10 #include "cpuid.h" 11 #include "pmu.h" 12 13 #include <linux/module.h> 14 #include <linux/mod_devicetable.h> 15 #include <linux/kernel.h> 16 #include <linux/vmalloc.h> 17 #include <linux/highmem.h> 18 #include <linux/amd-iommu.h> 19 #include <linux/sched.h> 20 #include <linux/trace_events.h> 21 #include <linux/slab.h> 22 #include <linux/hashtable.h> 23 #include <linux/objtool.h> 24 #include <linux/psp-sev.h> 25 #include <linux/file.h> 26 #include <linux/pagemap.h> 27 #include <linux/swap.h> 28 #include <linux/rwsem.h> 29 #include <linux/cc_platform.h> 30 #include <linux/smp.h> 31 #include <linux/string_choices.h> 32 #include <linux/mutex.h> 33 34 #include <asm/apic.h> 35 #include <asm/msr.h> 36 #include <asm/perf_event.h> 37 #include <asm/tlbflush.h> 38 #include <asm/desc.h> 39 #include <asm/debugreg.h> 40 #include <asm/kvm_para.h> 41 #include <asm/irq_remapping.h> 42 #include <asm/spec-ctrl.h> 43 #include <asm/cpu_device_id.h> 44 #include <asm/traps.h> 45 #include <asm/reboot.h> 46 #include <asm/fpu/api.h> 47 48 #include <trace/events/ipi.h> 49 50 #include "trace.h" 51 52 #include "svm.h" 53 #include "svm_ops.h" 54 55 #include "kvm_onhyperv.h" 56 #include "svm_onhyperv.h" 57 58 MODULE_AUTHOR("Qumranet"); 59 MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions"); 60 MODULE_LICENSE("GPL"); 61 62 #ifdef MODULE 63 static const struct x86_cpu_id svm_cpu_id[] = { 64 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL), 65 {} 66 }; 67 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); 68 #endif 69 70 #define SEG_TYPE_LDT 2 71 #define SEG_TYPE_BUSY_TSS16 3 72 73 static bool erratum_383_found __read_mostly; 74 75 /* 76 * Set osvw_len to higher value when updated Revision Guides 77 * are published and we know what the new status bits are 78 */ 79 static uint64_t osvw_len = 4, osvw_status; 80 81 static DEFINE_PER_CPU(u64, current_tsc_ratio); 82 83 /* 84 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 85 * pause_filter_count: On processors that support Pause filtering(indicated 86 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter 87 * count value. On VMRUN this value is loaded into an internal counter. 88 * Each time a pause instruction is executed, this counter is decremented 89 * until it reaches zero at which time a #VMEXIT is generated if pause 90 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause 91 * Intercept Filtering for more details. 92 * This also indicate if ple logic enabled. 93 * 94 * pause_filter_thresh: In addition, some processor families support advanced 95 * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on 96 * the amount of time a guest is allowed to execute in a pause loop. 97 * In this mode, a 16-bit pause filter threshold field is added in the 98 * VMCB. The threshold value is a cycle count that is used to reset the 99 * pause counter. As with simple pause filtering, VMRUN loads the pause 100 * count value from VMCB into an internal counter. Then, on each pause 101 * instruction the hardware checks the elapsed number of cycles since 102 * the most recent pause instruction against the pause filter threshold. 103 * If the elapsed cycle count is greater than the pause filter threshold, 104 * then the internal pause count is reloaded from the VMCB and execution 105 * continues. If the elapsed cycle count is less than the pause filter 106 * threshold, then the internal pause count is decremented. If the count 107 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is 108 * triggered. If advanced pause filtering is supported and pause filter 109 * threshold field is set to zero, the filter will operate in the simpler, 110 * count only mode. 111 */ 112 113 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP; 114 module_param(pause_filter_thresh, ushort, 0444); 115 116 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW; 117 module_param(pause_filter_count, ushort, 0444); 118 119 /* Default doubles per-vcpu window every exit. */ 120 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 121 module_param(pause_filter_count_grow, ushort, 0444); 122 123 /* Default resets per-vcpu window every exit to pause_filter_count. */ 124 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 125 module_param(pause_filter_count_shrink, ushort, 0444); 126 127 /* Default is to compute the maximum so we can never overflow. */ 128 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; 129 module_param(pause_filter_count_max, ushort, 0444); 130 131 /* 132 * Use nested page tables by default. Note, NPT may get forced off by 133 * svm_hardware_setup() if it's unsupported by hardware or the host kernel. 134 */ 135 bool npt_enabled = true; 136 module_param_named(npt, npt_enabled, bool, 0444); 137 138 /* allow nested virtualization in KVM/SVM */ 139 static int nested = true; 140 module_param(nested, int, 0444); 141 142 /* enable/disable Next RIP Save */ 143 int nrips = true; 144 module_param(nrips, int, 0444); 145 146 /* enable/disable Virtual VMLOAD VMSAVE */ 147 static int vls = true; 148 module_param(vls, int, 0444); 149 150 /* enable/disable Virtual GIF */ 151 int vgif = true; 152 module_param(vgif, int, 0444); 153 154 /* enable/disable LBR virtualization */ 155 int lbrv = true; 156 module_param(lbrv, int, 0444); 157 158 static int tsc_scaling = true; 159 module_param(tsc_scaling, int, 0444); 160 161 /* 162 * enable / disable AVIC. Because the defaults differ for APICv 163 * support between VMX and SVM we cannot use module_param_named. 164 */ 165 static bool avic; 166 module_param(avic, bool, 0444); 167 module_param(enable_ipiv, bool, 0444); 168 169 module_param(enable_device_posted_irqs, bool, 0444); 170 171 bool __read_mostly dump_invalid_vmcb; 172 module_param(dump_invalid_vmcb, bool, 0644); 173 174 175 bool intercept_smi = true; 176 module_param(intercept_smi, bool, 0444); 177 178 bool vnmi = true; 179 module_param(vnmi, bool, 0444); 180 181 static bool svm_gp_erratum_intercept = true; 182 183 static u8 rsm_ins_bytes[] = "\x0f\xaa"; 184 185 static unsigned long iopm_base; 186 187 DEFINE_PER_CPU(struct svm_cpu_data, svm_data); 188 189 static DEFINE_MUTEX(vmcb_dump_mutex); 190 191 /* 192 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via 193 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. 194 * 195 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to 196 * defer the restoration of TSC_AUX until the CPU returns to userspace. 197 */ 198 static int tsc_aux_uret_slot __read_mostly = -1; 199 200 static int get_npt_level(void) 201 { 202 #ifdef CONFIG_X86_64 203 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; 204 #else 205 return PT32E_ROOT_LEVEL; 206 #endif 207 } 208 209 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 210 { 211 struct vcpu_svm *svm = to_svm(vcpu); 212 u64 old_efer = vcpu->arch.efer; 213 vcpu->arch.efer = efer; 214 215 if (!npt_enabled) { 216 /* Shadow paging assumes NX to be available. */ 217 efer |= EFER_NX; 218 219 if (!(efer & EFER_LMA)) 220 efer &= ~EFER_LME; 221 } 222 223 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { 224 if (!(efer & EFER_SVME)) { 225 svm_leave_nested(vcpu); 226 svm_set_gif(svm, true); 227 /* #GP intercept is still needed for vmware backdoor */ 228 if (!enable_vmware_backdoor) 229 clr_exception_intercept(svm, GP_VECTOR); 230 231 /* 232 * Free the nested guest state, unless we are in SMM. 233 * In this case we will return to the nested guest 234 * as soon as we leave SMM. 235 */ 236 if (!is_smm(vcpu)) 237 svm_free_nested(svm); 238 239 } else { 240 int ret = svm_allocate_nested(svm); 241 242 if (ret) { 243 vcpu->arch.efer = old_efer; 244 return ret; 245 } 246 247 /* 248 * Never intercept #GP for SEV guests, KVM can't 249 * decrypt guest memory to workaround the erratum. 250 */ 251 if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) 252 set_exception_intercept(svm, GP_VECTOR); 253 } 254 } 255 256 svm->vmcb->save.efer = efer | EFER_SVME; 257 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 258 return 0; 259 } 260 261 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) 262 { 263 struct vcpu_svm *svm = to_svm(vcpu); 264 u32 ret = 0; 265 266 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 267 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; 268 return ret; 269 } 270 271 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 272 { 273 struct vcpu_svm *svm = to_svm(vcpu); 274 275 if (mask == 0) 276 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 277 else 278 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; 279 280 } 281 282 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, 283 bool commit_side_effects) 284 { 285 struct vcpu_svm *svm = to_svm(vcpu); 286 unsigned long old_rflags; 287 288 /* 289 * SEV-ES does not expose the next RIP. The RIP update is controlled by 290 * the type of exit and the #VC handler in the guest. 291 */ 292 if (sev_es_guest(vcpu->kvm)) 293 goto done; 294 295 if (nrips && svm->vmcb->control.next_rip != 0) { 296 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); 297 svm->next_rip = svm->vmcb->control.next_rip; 298 } 299 300 if (!svm->next_rip) { 301 if (unlikely(!commit_side_effects)) 302 old_rflags = svm->vmcb->save.rflags; 303 304 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 305 return 0; 306 307 if (unlikely(!commit_side_effects)) 308 svm->vmcb->save.rflags = old_rflags; 309 } else { 310 kvm_rip_write(vcpu, svm->next_rip); 311 } 312 313 done: 314 if (likely(commit_side_effects)) 315 svm_set_interrupt_shadow(vcpu, 0); 316 317 return 1; 318 } 319 320 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) 321 { 322 return __svm_skip_emulated_instruction(vcpu, true); 323 } 324 325 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) 326 { 327 unsigned long rip, old_rip = kvm_rip_read(vcpu); 328 struct vcpu_svm *svm = to_svm(vcpu); 329 330 /* 331 * Due to architectural shortcomings, the CPU doesn't always provide 332 * NextRIP, e.g. if KVM intercepted an exception that occurred while 333 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip 334 * the instruction even if NextRIP is supported to acquire the next 335 * RIP so that it can be shoved into the NextRIP field, otherwise 336 * hardware will fail to advance guest RIP during event injection. 337 * Drop the exception/interrupt if emulation fails and effectively 338 * retry the instruction, it's the least awful option. If NRIPS is 339 * in use, the skip must not commit any side effects such as clearing 340 * the interrupt shadow or RFLAGS.RF. 341 */ 342 if (!__svm_skip_emulated_instruction(vcpu, !nrips)) 343 return -EIO; 344 345 rip = kvm_rip_read(vcpu); 346 347 /* 348 * Save the injection information, even when using next_rip, as the 349 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection 350 * doesn't complete due to a VM-Exit occurring while the CPU is 351 * vectoring the event. Decoding the instruction isn't guaranteed to 352 * work as there may be no backing instruction, e.g. if the event is 353 * being injected by L1 for L2, or if the guest is patching INT3 into 354 * a different instruction. 355 */ 356 svm->soft_int_injected = true; 357 svm->soft_int_csbase = svm->vmcb->save.cs.base; 358 svm->soft_int_old_rip = old_rip; 359 svm->soft_int_next_rip = rip; 360 361 if (nrips) 362 kvm_rip_write(vcpu, old_rip); 363 364 if (static_cpu_has(X86_FEATURE_NRIPS)) 365 svm->vmcb->control.next_rip = rip; 366 367 return 0; 368 } 369 370 static void svm_inject_exception(struct kvm_vcpu *vcpu) 371 { 372 struct kvm_queued_exception *ex = &vcpu->arch.exception; 373 struct vcpu_svm *svm = to_svm(vcpu); 374 375 kvm_deliver_exception_payload(vcpu, ex); 376 377 if (kvm_exception_is_soft(ex->vector) && 378 svm_update_soft_interrupt_rip(vcpu)) 379 return; 380 381 svm->vmcb->control.event_inj = ex->vector 382 | SVM_EVTINJ_VALID 383 | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0) 384 | SVM_EVTINJ_TYPE_EXEPT; 385 svm->vmcb->control.event_inj_err = ex->error_code; 386 } 387 388 static void svm_init_erratum_383(void) 389 { 390 u64 val; 391 392 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) 393 return; 394 395 /* Use _safe variants to not break nested virtualization */ 396 if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) 397 return; 398 399 val |= (1ULL << 47); 400 401 native_write_msr_safe(MSR_AMD64_DC_CFG, val); 402 403 erratum_383_found = true; 404 } 405 406 static void svm_init_osvw(struct kvm_vcpu *vcpu) 407 { 408 /* 409 * Guests should see errata 400 and 415 as fixed (assuming that 410 * HLT and IO instructions are intercepted). 411 */ 412 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; 413 vcpu->arch.osvw.status = osvw_status & ~(6ULL); 414 415 /* 416 * By increasing VCPU's osvw.length to 3 we are telling the guest that 417 * all osvw.status bits inside that length, including bit 0 (which is 418 * reserved for erratum 298), are valid. However, if host processor's 419 * osvw_len is 0 then osvw_status[0] carries no information. We need to 420 * be conservative here and therefore we tell the guest that erratum 298 421 * is present (because we really don't know). 422 */ 423 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) 424 vcpu->arch.osvw.status |= 1; 425 } 426 427 static bool __kvm_is_svm_supported(void) 428 { 429 int cpu = smp_processor_id(); 430 struct cpuinfo_x86 *c = &cpu_data(cpu); 431 432 if (c->x86_vendor != X86_VENDOR_AMD && 433 c->x86_vendor != X86_VENDOR_HYGON) { 434 pr_err("CPU %d isn't AMD or Hygon\n", cpu); 435 return false; 436 } 437 438 if (!cpu_has(c, X86_FEATURE_SVM)) { 439 pr_err("SVM not supported by CPU %d\n", cpu); 440 return false; 441 } 442 443 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { 444 pr_info("KVM is unsupported when running as an SEV guest\n"); 445 return false; 446 } 447 448 return true; 449 } 450 451 static bool kvm_is_svm_supported(void) 452 { 453 bool supported; 454 455 migrate_disable(); 456 supported = __kvm_is_svm_supported(); 457 migrate_enable(); 458 459 return supported; 460 } 461 462 static int svm_check_processor_compat(void) 463 { 464 if (!__kvm_is_svm_supported()) 465 return -EIO; 466 467 return 0; 468 } 469 470 static void __svm_write_tsc_multiplier(u64 multiplier) 471 { 472 if (multiplier == __this_cpu_read(current_tsc_ratio)) 473 return; 474 475 wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); 476 __this_cpu_write(current_tsc_ratio, multiplier); 477 } 478 479 static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) 480 { 481 return &sd->save_area->host_sev_es_save; 482 } 483 484 static inline void kvm_cpu_svm_disable(void) 485 { 486 uint64_t efer; 487 488 wrmsrq(MSR_VM_HSAVE_PA, 0); 489 rdmsrq(MSR_EFER, efer); 490 if (efer & EFER_SVME) { 491 /* 492 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and 493 * NMI aren't blocked. 494 */ 495 stgi(); 496 wrmsrq(MSR_EFER, efer & ~EFER_SVME); 497 } 498 } 499 500 static void svm_emergency_disable_virtualization_cpu(void) 501 { 502 kvm_rebooting = true; 503 504 kvm_cpu_svm_disable(); 505 } 506 507 static void svm_disable_virtualization_cpu(void) 508 { 509 /* Make sure we clean up behind us */ 510 if (tsc_scaling) 511 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 512 513 kvm_cpu_svm_disable(); 514 515 amd_pmu_disable_virt(); 516 } 517 518 static int svm_enable_virtualization_cpu(void) 519 { 520 521 struct svm_cpu_data *sd; 522 uint64_t efer; 523 int me = raw_smp_processor_id(); 524 525 rdmsrq(MSR_EFER, efer); 526 if (efer & EFER_SVME) 527 return -EBUSY; 528 529 sd = per_cpu_ptr(&svm_data, me); 530 sd->asid_generation = 1; 531 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 532 sd->next_asid = sd->max_asid + 1; 533 sd->min_asid = max_sev_asid + 1; 534 535 wrmsrq(MSR_EFER, efer | EFER_SVME); 536 537 wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); 538 539 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 540 /* 541 * Set the default value, even if we don't use TSC scaling 542 * to avoid having stale value in the msr 543 */ 544 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 545 } 546 547 548 /* 549 * Get OSVW bits. 550 * 551 * Note that it is possible to have a system with mixed processor 552 * revisions and therefore different OSVW bits. If bits are not the same 553 * on different processors then choose the worst case (i.e. if erratum 554 * is present on one processor and not on another then assume that the 555 * erratum is present everywhere). 556 */ 557 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { 558 u64 len, status = 0; 559 int err; 560 561 err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); 562 if (!err) 563 err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); 564 565 if (err) 566 osvw_status = osvw_len = 0; 567 else { 568 if (len < osvw_len) 569 osvw_len = len; 570 osvw_status |= status; 571 osvw_status &= (1ULL << osvw_len) - 1; 572 } 573 } else 574 osvw_status = osvw_len = 0; 575 576 svm_init_erratum_383(); 577 578 amd_pmu_enable_virt(); 579 580 /* 581 * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type 582 * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests. 583 * Since Linux does not change the value of TSC_AUX once set, prime the 584 * TSC_AUX field now to avoid a RDMSR on every vCPU run. 585 */ 586 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { 587 u32 __maybe_unused msr_hi; 588 589 rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); 590 } 591 592 return 0; 593 } 594 595 static void svm_cpu_uninit(int cpu) 596 { 597 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 598 599 if (!sd->save_area) 600 return; 601 602 kfree(sd->sev_vmcbs); 603 __free_page(__sme_pa_to_page(sd->save_area_pa)); 604 sd->save_area_pa = 0; 605 sd->save_area = NULL; 606 } 607 608 static int svm_cpu_init(int cpu) 609 { 610 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 611 struct page *save_area_page; 612 int ret = -ENOMEM; 613 614 memset(sd, 0, sizeof(struct svm_cpu_data)); 615 save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 616 if (!save_area_page) 617 return ret; 618 619 ret = sev_cpu_init(sd); 620 if (ret) 621 goto free_save_area; 622 623 sd->save_area = page_address(save_area_page); 624 sd->save_area_pa = __sme_page_pa(save_area_page); 625 return 0; 626 627 free_save_area: 628 __free_page(save_area_page); 629 return ret; 630 631 } 632 633 static void set_dr_intercepts(struct vcpu_svm *svm) 634 { 635 struct vmcb *vmcb = svm->vmcb01.ptr; 636 637 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); 638 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); 639 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); 640 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); 641 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); 642 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); 643 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); 644 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); 645 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); 646 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); 647 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); 648 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); 649 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); 650 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); 651 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 652 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); 653 654 recalc_intercepts(svm); 655 } 656 657 static void clr_dr_intercepts(struct vcpu_svm *svm) 658 { 659 struct vmcb *vmcb = svm->vmcb01.ptr; 660 661 vmcb->control.intercepts[INTERCEPT_DR] = 0; 662 663 recalc_intercepts(svm); 664 } 665 666 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) 667 { 668 /* 669 * For non-nested case: 670 * If the L01 MSR bitmap does not intercept the MSR, then we need to 671 * save it. 672 * 673 * For nested case: 674 * If the L02 MSR bitmap does not intercept the MSR, then we need to 675 * save it. 676 */ 677 void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : 678 to_svm(vcpu)->msrpm; 679 680 return svm_test_msr_bitmap_write(msrpm, msr); 681 } 682 683 void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) 684 { 685 struct vcpu_svm *svm = to_svm(vcpu); 686 void *msrpm = svm->msrpm; 687 688 /* Don't disable interception for MSRs userspace wants to handle. */ 689 if (type & MSR_TYPE_R) { 690 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 691 svm_clear_msr_bitmap_read(msrpm, msr); 692 else 693 svm_set_msr_bitmap_read(msrpm, msr); 694 } 695 696 if (type & MSR_TYPE_W) { 697 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 698 svm_clear_msr_bitmap_write(msrpm, msr); 699 else 700 svm_set_msr_bitmap_write(msrpm, msr); 701 } 702 703 svm_hv_vmcb_dirty_nested_enlightenments(vcpu); 704 svm->nested.force_msr_bitmap_recalc = true; 705 } 706 707 void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) 708 { 709 unsigned int order = get_order(size); 710 struct page *pages = alloc_pages(gfp_mask, order); 711 void *pm; 712 713 if (!pages) 714 return NULL; 715 716 /* 717 * Set all bits in the permissions map so that all MSR and I/O accesses 718 * are intercepted by default. 719 */ 720 pm = page_address(pages); 721 memset(pm, 0xff, PAGE_SIZE * (1 << order)); 722 723 return pm; 724 } 725 726 static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) 727 { 728 bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); 729 730 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); 731 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); 732 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); 733 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); 734 735 if (sev_es_guest(vcpu->kvm)) 736 svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); 737 } 738 739 void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) 740 { 741 static const u32 x2avic_passthrough_msrs[] = { 742 X2APIC_MSR(APIC_ID), 743 X2APIC_MSR(APIC_LVR), 744 X2APIC_MSR(APIC_TASKPRI), 745 X2APIC_MSR(APIC_ARBPRI), 746 X2APIC_MSR(APIC_PROCPRI), 747 X2APIC_MSR(APIC_EOI), 748 X2APIC_MSR(APIC_RRR), 749 X2APIC_MSR(APIC_LDR), 750 X2APIC_MSR(APIC_DFR), 751 X2APIC_MSR(APIC_SPIV), 752 X2APIC_MSR(APIC_ISR), 753 X2APIC_MSR(APIC_TMR), 754 X2APIC_MSR(APIC_IRR), 755 X2APIC_MSR(APIC_ESR), 756 X2APIC_MSR(APIC_ICR), 757 X2APIC_MSR(APIC_ICR2), 758 759 /* 760 * Note! Always intercept LVTT, as TSC-deadline timer mode 761 * isn't virtualized by hardware, and the CPU will generate a 762 * #GP instead of a #VMEXIT. 763 */ 764 X2APIC_MSR(APIC_LVTTHMR), 765 X2APIC_MSR(APIC_LVTPC), 766 X2APIC_MSR(APIC_LVT0), 767 X2APIC_MSR(APIC_LVT1), 768 X2APIC_MSR(APIC_LVTERR), 769 X2APIC_MSR(APIC_TMICT), 770 X2APIC_MSR(APIC_TMCCT), 771 X2APIC_MSR(APIC_TDCR), 772 }; 773 int i; 774 775 if (intercept == svm->x2avic_msrs_intercepted) 776 return; 777 778 if (!x2avic_enabled) 779 return; 780 781 for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++) 782 svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i], 783 MSR_TYPE_RW, intercept); 784 785 svm->x2avic_msrs_intercepted = intercept; 786 } 787 788 void svm_vcpu_free_msrpm(void *msrpm) 789 { 790 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); 791 } 792 793 static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 794 { 795 struct vcpu_svm *svm = to_svm(vcpu); 796 797 svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); 798 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 799 800 #ifdef CONFIG_X86_64 801 svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 802 svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 803 svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 804 svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); 805 svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); 806 svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); 807 #endif 808 809 if (lbrv) 810 svm_recalc_lbr_msr_intercepts(vcpu); 811 812 if (cpu_feature_enabled(X86_FEATURE_IBPB)) 813 svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 814 !guest_has_pred_cmd_msr(vcpu)); 815 816 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 817 svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 818 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 819 820 /* 821 * Disable interception of SPEC_CTRL if KVM doesn't need to manually 822 * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if 823 * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively 824 * using SPEC_CTRL. 825 */ 826 if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) 827 svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 828 !guest_has_spec_ctrl_msr(vcpu)); 829 else 830 svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 831 !svm->spec_ctrl); 832 833 /* 834 * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, 835 * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits. 836 */ 837 svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, 838 guest_cpuid_is_intel_compatible(vcpu)); 839 svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, 840 guest_cpuid_is_intel_compatible(vcpu)); 841 842 if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 843 svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 844 svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 845 } 846 847 if (sev_es_guest(vcpu->kvm)) 848 sev_es_recalc_msr_intercepts(vcpu); 849 850 /* 851 * x2APIC intercepts are modified on-demand and cannot be filtered by 852 * userspace. 853 */ 854 } 855 856 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) 857 { 858 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; 859 to_vmcb->save.br_from = from_vmcb->save.br_from; 860 to_vmcb->save.br_to = from_vmcb->save.br_to; 861 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; 862 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; 863 864 vmcb_mark_dirty(to_vmcb, VMCB_LBR); 865 } 866 867 void svm_enable_lbrv(struct kvm_vcpu *vcpu) 868 { 869 struct vcpu_svm *svm = to_svm(vcpu); 870 871 svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 872 svm_recalc_lbr_msr_intercepts(vcpu); 873 874 /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ 875 if (is_guest_mode(vcpu)) 876 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); 877 } 878 879 static void svm_disable_lbrv(struct kvm_vcpu *vcpu) 880 { 881 struct vcpu_svm *svm = to_svm(vcpu); 882 883 KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); 884 svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 885 svm_recalc_lbr_msr_intercepts(vcpu); 886 887 /* 888 * Move the LBR msrs back to the vmcb01 to avoid copying them 889 * on nested guest entries. 890 */ 891 if (is_guest_mode(vcpu)) 892 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); 893 } 894 895 static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) 896 { 897 /* 898 * If LBR virtualization is disabled, the LBR MSRs are always kept in 899 * vmcb01. If LBR virtualization is enabled and L1 is running VMs of 900 * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. 901 */ 902 return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : 903 svm->vmcb01.ptr; 904 } 905 906 void svm_update_lbrv(struct kvm_vcpu *vcpu) 907 { 908 struct vcpu_svm *svm = to_svm(vcpu); 909 bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; 910 bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || 911 (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 912 (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); 913 914 if (enable_lbrv == current_enable_lbrv) 915 return; 916 917 if (enable_lbrv) 918 svm_enable_lbrv(vcpu); 919 else 920 svm_disable_lbrv(vcpu); 921 } 922 923 void disable_nmi_singlestep(struct vcpu_svm *svm) 924 { 925 svm->nmi_singlestep = false; 926 927 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { 928 /* Clear our flags if they were not set by the guest */ 929 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 930 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; 931 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 932 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; 933 } 934 } 935 936 static void grow_ple_window(struct kvm_vcpu *vcpu) 937 { 938 struct vcpu_svm *svm = to_svm(vcpu); 939 struct vmcb_control_area *control = &svm->vmcb->control; 940 int old = control->pause_filter_count; 941 942 if (kvm_pause_in_guest(vcpu->kvm)) 943 return; 944 945 control->pause_filter_count = __grow_ple_window(old, 946 pause_filter_count, 947 pause_filter_count_grow, 948 pause_filter_count_max); 949 950 if (control->pause_filter_count != old) { 951 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 952 trace_kvm_ple_window_update(vcpu->vcpu_id, 953 control->pause_filter_count, old); 954 } 955 } 956 957 static void shrink_ple_window(struct kvm_vcpu *vcpu) 958 { 959 struct vcpu_svm *svm = to_svm(vcpu); 960 struct vmcb_control_area *control = &svm->vmcb->control; 961 int old = control->pause_filter_count; 962 963 if (kvm_pause_in_guest(vcpu->kvm)) 964 return; 965 966 control->pause_filter_count = 967 __shrink_ple_window(old, 968 pause_filter_count, 969 pause_filter_count_shrink, 970 pause_filter_count); 971 if (control->pause_filter_count != old) { 972 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 973 trace_kvm_ple_window_update(vcpu->vcpu_id, 974 control->pause_filter_count, old); 975 } 976 } 977 978 static void svm_hardware_unsetup(void) 979 { 980 int cpu; 981 982 sev_hardware_unsetup(); 983 984 for_each_possible_cpu(cpu) 985 svm_cpu_uninit(cpu); 986 987 __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); 988 iopm_base = 0; 989 } 990 991 static void init_seg(struct vmcb_seg *seg) 992 { 993 seg->selector = 0; 994 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 995 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 996 seg->limit = 0xffff; 997 seg->base = 0; 998 } 999 1000 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) 1001 { 1002 seg->selector = 0; 1003 seg->attrib = SVM_SELECTOR_P_MASK | type; 1004 seg->limit = 0xffff; 1005 seg->base = 0; 1006 } 1007 1008 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1009 { 1010 struct vcpu_svm *svm = to_svm(vcpu); 1011 1012 return svm->nested.ctl.tsc_offset; 1013 } 1014 1015 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1016 { 1017 struct vcpu_svm *svm = to_svm(vcpu); 1018 1019 return svm->tsc_ratio_msr; 1020 } 1021 1022 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) 1023 { 1024 struct vcpu_svm *svm = to_svm(vcpu); 1025 1026 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; 1027 svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; 1028 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1029 } 1030 1031 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1032 { 1033 preempt_disable(); 1034 if (to_svm(vcpu)->guest_state_loaded) 1035 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); 1036 preempt_enable(); 1037 } 1038 1039 /* Evaluate instruction intercepts that depend on guest CPUID features. */ 1040 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 1041 { 1042 struct vcpu_svm *svm = to_svm(vcpu); 1043 1044 /* 1045 * Intercept INVPCID if shadow paging is enabled to sync/free shadow 1046 * roots, or if INVPCID is disabled in the guest to inject #UD. 1047 */ 1048 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { 1049 if (!npt_enabled || 1050 !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) 1051 svm_set_intercept(svm, INTERCEPT_INVPCID); 1052 else 1053 svm_clr_intercept(svm, INTERCEPT_INVPCID); 1054 } 1055 1056 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { 1057 if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) 1058 svm_clr_intercept(svm, INTERCEPT_RDTSCP); 1059 else 1060 svm_set_intercept(svm, INTERCEPT_RDTSCP); 1061 } 1062 1063 if (guest_cpuid_is_intel_compatible(vcpu)) { 1064 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1065 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1066 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1067 } else { 1068 /* 1069 * If hardware supports Virtual VMLOAD VMSAVE then enable it 1070 * in VMCB and clear intercepts to avoid #VMEXIT. 1071 */ 1072 if (vls) { 1073 svm_clr_intercept(svm, INTERCEPT_VMLOAD); 1074 svm_clr_intercept(svm, INTERCEPT_VMSAVE); 1075 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1076 } 1077 } 1078 } 1079 1080 static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu) 1081 { 1082 svm_recalc_instruction_intercepts(vcpu); 1083 svm_recalc_msr_intercepts(vcpu); 1084 } 1085 1086 static void init_vmcb(struct kvm_vcpu *vcpu) 1087 { 1088 struct vcpu_svm *svm = to_svm(vcpu); 1089 struct vmcb *vmcb = svm->vmcb01.ptr; 1090 struct vmcb_control_area *control = &vmcb->control; 1091 struct vmcb_save_area *save = &vmcb->save; 1092 1093 svm_set_intercept(svm, INTERCEPT_CR0_READ); 1094 svm_set_intercept(svm, INTERCEPT_CR3_READ); 1095 svm_set_intercept(svm, INTERCEPT_CR4_READ); 1096 svm_set_intercept(svm, INTERCEPT_CR0_WRITE); 1097 svm_set_intercept(svm, INTERCEPT_CR3_WRITE); 1098 svm_set_intercept(svm, INTERCEPT_CR4_WRITE); 1099 if (!kvm_vcpu_apicv_active(vcpu)) 1100 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 1101 1102 set_dr_intercepts(svm); 1103 1104 set_exception_intercept(svm, PF_VECTOR); 1105 set_exception_intercept(svm, UD_VECTOR); 1106 set_exception_intercept(svm, MC_VECTOR); 1107 set_exception_intercept(svm, AC_VECTOR); 1108 set_exception_intercept(svm, DB_VECTOR); 1109 /* 1110 * Guest access to VMware backdoor ports could legitimately 1111 * trigger #GP because of TSS I/O permission bitmap. 1112 * We intercept those #GP and allow access to them anyway 1113 * as VMware does. 1114 */ 1115 if (enable_vmware_backdoor) 1116 set_exception_intercept(svm, GP_VECTOR); 1117 1118 svm_set_intercept(svm, INTERCEPT_INTR); 1119 svm_set_intercept(svm, INTERCEPT_NMI); 1120 1121 if (intercept_smi) 1122 svm_set_intercept(svm, INTERCEPT_SMI); 1123 1124 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); 1125 svm_set_intercept(svm, INTERCEPT_RDPMC); 1126 svm_set_intercept(svm, INTERCEPT_CPUID); 1127 svm_set_intercept(svm, INTERCEPT_INVD); 1128 svm_set_intercept(svm, INTERCEPT_INVLPG); 1129 svm_set_intercept(svm, INTERCEPT_INVLPGA); 1130 svm_set_intercept(svm, INTERCEPT_IOIO_PROT); 1131 svm_set_intercept(svm, INTERCEPT_MSR_PROT); 1132 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH); 1133 svm_set_intercept(svm, INTERCEPT_SHUTDOWN); 1134 svm_set_intercept(svm, INTERCEPT_VMRUN); 1135 svm_set_intercept(svm, INTERCEPT_VMMCALL); 1136 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1137 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1138 svm_set_intercept(svm, INTERCEPT_STGI); 1139 svm_set_intercept(svm, INTERCEPT_CLGI); 1140 svm_set_intercept(svm, INTERCEPT_SKINIT); 1141 svm_set_intercept(svm, INTERCEPT_WBINVD); 1142 svm_set_intercept(svm, INTERCEPT_XSETBV); 1143 svm_set_intercept(svm, INTERCEPT_RDPRU); 1144 svm_set_intercept(svm, INTERCEPT_RSM); 1145 1146 if (!kvm_mwait_in_guest(vcpu->kvm)) { 1147 svm_set_intercept(svm, INTERCEPT_MONITOR); 1148 svm_set_intercept(svm, INTERCEPT_MWAIT); 1149 } 1150 1151 if (!kvm_hlt_in_guest(vcpu->kvm)) { 1152 if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) 1153 svm_set_intercept(svm, INTERCEPT_IDLE_HLT); 1154 else 1155 svm_set_intercept(svm, INTERCEPT_HLT); 1156 } 1157 1158 control->iopm_base_pa = iopm_base; 1159 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1160 control->int_ctl = V_INTR_MASKING_MASK; 1161 1162 init_seg(&save->es); 1163 init_seg(&save->ss); 1164 init_seg(&save->ds); 1165 init_seg(&save->fs); 1166 init_seg(&save->gs); 1167 1168 save->cs.selector = 0xf000; 1169 save->cs.base = 0xffff0000; 1170 /* Executable/Readable Code Segment */ 1171 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1172 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1173 save->cs.limit = 0xffff; 1174 1175 save->gdtr.base = 0; 1176 save->gdtr.limit = 0xffff; 1177 save->idtr.base = 0; 1178 save->idtr.limit = 0xffff; 1179 1180 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1181 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1182 1183 if (npt_enabled) { 1184 /* Setup VMCB for Nested Paging */ 1185 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; 1186 svm_clr_intercept(svm, INTERCEPT_INVLPG); 1187 clr_exception_intercept(svm, PF_VECTOR); 1188 svm_clr_intercept(svm, INTERCEPT_CR3_READ); 1189 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); 1190 save->g_pat = vcpu->arch.pat; 1191 save->cr3 = 0; 1192 } 1193 svm->current_vmcb->asid_generation = 0; 1194 svm->asid = 0; 1195 1196 svm->nested.vmcb12_gpa = INVALID_GPA; 1197 svm->nested.last_vmcb12_gpa = INVALID_GPA; 1198 1199 if (!kvm_pause_in_guest(vcpu->kvm)) { 1200 control->pause_filter_count = pause_filter_count; 1201 if (pause_filter_thresh) 1202 control->pause_filter_thresh = pause_filter_thresh; 1203 svm_set_intercept(svm, INTERCEPT_PAUSE); 1204 } else { 1205 svm_clr_intercept(svm, INTERCEPT_PAUSE); 1206 } 1207 1208 if (kvm_vcpu_apicv_active(vcpu)) 1209 avic_init_vmcb(svm, vmcb); 1210 1211 if (vnmi) 1212 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; 1213 1214 if (vgif) { 1215 svm_clr_intercept(svm, INTERCEPT_STGI); 1216 svm_clr_intercept(svm, INTERCEPT_CLGI); 1217 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1218 } 1219 1220 if (vcpu->kvm->arch.bus_lock_detection_enabled) 1221 svm_set_intercept(svm, INTERCEPT_BUSLOCK); 1222 1223 if (sev_guest(vcpu->kvm)) 1224 sev_init_vmcb(svm); 1225 1226 svm_hv_init_vmcb(vmcb); 1227 1228 svm_recalc_intercepts_after_set_cpuid(vcpu); 1229 1230 vmcb_mark_all_dirty(vmcb); 1231 1232 enable_gif(svm); 1233 } 1234 1235 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) 1236 { 1237 struct vcpu_svm *svm = to_svm(vcpu); 1238 1239 svm_init_osvw(vcpu); 1240 1241 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 1242 vcpu->arch.microcode_version = 0x01000065; 1243 svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; 1244 1245 svm->nmi_masked = false; 1246 svm->awaiting_iret_completion = false; 1247 1248 if (sev_es_guest(vcpu->kvm)) 1249 sev_es_vcpu_reset(svm); 1250 } 1251 1252 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 1253 { 1254 struct vcpu_svm *svm = to_svm(vcpu); 1255 1256 svm->spec_ctrl = 0; 1257 svm->virt_spec_ctrl = 0; 1258 1259 if (init_event) 1260 sev_snp_init_protected_guest_state(vcpu); 1261 1262 init_vmcb(vcpu); 1263 1264 if (!init_event) 1265 __svm_vcpu_reset(vcpu); 1266 } 1267 1268 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) 1269 { 1270 svm->current_vmcb = target_vmcb; 1271 svm->vmcb = target_vmcb->ptr; 1272 } 1273 1274 static int svm_vcpu_create(struct kvm_vcpu *vcpu) 1275 { 1276 struct vcpu_svm *svm; 1277 struct page *vmcb01_page; 1278 struct page *vmsa_page = NULL; 1279 int err; 1280 1281 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); 1282 svm = to_svm(vcpu); 1283 1284 err = -ENOMEM; 1285 vmcb01_page = snp_safe_alloc_page(); 1286 if (!vmcb01_page) 1287 goto out; 1288 1289 if (sev_es_guest(vcpu->kvm)) { 1290 /* 1291 * SEV-ES guests require a separate VMSA page used to contain 1292 * the encrypted register state of the guest. 1293 */ 1294 vmsa_page = snp_safe_alloc_page(); 1295 if (!vmsa_page) 1296 goto error_free_vmcb_page; 1297 } 1298 1299 err = avic_init_vcpu(svm); 1300 if (err) 1301 goto error_free_vmsa_page; 1302 1303 svm->msrpm = svm_vcpu_alloc_msrpm(); 1304 if (!svm->msrpm) { 1305 err = -ENOMEM; 1306 goto error_free_vmsa_page; 1307 } 1308 1309 svm->x2avic_msrs_intercepted = true; 1310 1311 svm->vmcb01.ptr = page_address(vmcb01_page); 1312 svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); 1313 svm_switch_vmcb(svm, &svm->vmcb01); 1314 1315 if (vmsa_page) 1316 svm->sev_es.vmsa = page_address(vmsa_page); 1317 1318 svm->guest_state_loaded = false; 1319 1320 return 0; 1321 1322 error_free_vmsa_page: 1323 if (vmsa_page) 1324 __free_page(vmsa_page); 1325 error_free_vmcb_page: 1326 __free_page(vmcb01_page); 1327 out: 1328 return err; 1329 } 1330 1331 static void svm_vcpu_free(struct kvm_vcpu *vcpu) 1332 { 1333 struct vcpu_svm *svm = to_svm(vcpu); 1334 1335 WARN_ON_ONCE(!list_empty(&svm->ir_list)); 1336 1337 svm_leave_nested(vcpu); 1338 svm_free_nested(svm); 1339 1340 sev_free_vcpu(vcpu); 1341 1342 __free_page(__sme_pa_to_page(svm->vmcb01.pa)); 1343 svm_vcpu_free_msrpm(svm->msrpm); 1344 } 1345 1346 #ifdef CONFIG_CPU_MITIGATIONS 1347 static DEFINE_SPINLOCK(srso_lock); 1348 static atomic_t srso_nr_vms; 1349 1350 static void svm_srso_clear_bp_spec_reduce(void *ign) 1351 { 1352 struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); 1353 1354 if (!sd->bp_spec_reduce_set) 1355 return; 1356 1357 msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); 1358 sd->bp_spec_reduce_set = false; 1359 } 1360 1361 static void svm_srso_vm_destroy(void) 1362 { 1363 if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) 1364 return; 1365 1366 if (atomic_dec_return(&srso_nr_vms)) 1367 return; 1368 1369 guard(spinlock)(&srso_lock); 1370 1371 /* 1372 * Verify a new VM didn't come along, acquire the lock, and increment 1373 * the count before this task acquired the lock. 1374 */ 1375 if (atomic_read(&srso_nr_vms)) 1376 return; 1377 1378 on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); 1379 } 1380 1381 static void svm_srso_vm_init(void) 1382 { 1383 if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) 1384 return; 1385 1386 /* 1387 * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 1388 * transition, i.e. destroying the last VM, is fully complete, e.g. so 1389 * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. 1390 */ 1391 if (atomic_inc_not_zero(&srso_nr_vms)) 1392 return; 1393 1394 guard(spinlock)(&srso_lock); 1395 1396 atomic_inc(&srso_nr_vms); 1397 } 1398 #else 1399 static void svm_srso_vm_init(void) { } 1400 static void svm_srso_vm_destroy(void) { } 1401 #endif 1402 1403 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1404 { 1405 struct vcpu_svm *svm = to_svm(vcpu); 1406 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 1407 1408 if (sev_es_guest(vcpu->kvm)) 1409 sev_es_unmap_ghcb(svm); 1410 1411 if (svm->guest_state_loaded) 1412 return; 1413 1414 /* 1415 * Save additional host state that will be restored on VMEXIT (sev-es) 1416 * or subsequent vmload of host save area. 1417 */ 1418 vmsave(sd->save_area_pa); 1419 if (sev_es_guest(vcpu->kvm)) 1420 sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); 1421 1422 if (tsc_scaling) 1423 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); 1424 1425 /* 1426 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 1427 * available. The user return MSR support is not required in this case 1428 * because TSC_AUX is restored on #VMEXIT from the host save area 1429 * (which has been initialized in svm_enable_virtualization_cpu()). 1430 */ 1431 if (likely(tsc_aux_uret_slot >= 0) && 1432 (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) 1433 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); 1434 1435 if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && 1436 !sd->bp_spec_reduce_set) { 1437 sd->bp_spec_reduce_set = true; 1438 msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); 1439 } 1440 svm->guest_state_loaded = true; 1441 } 1442 1443 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) 1444 { 1445 to_svm(vcpu)->guest_state_loaded = false; 1446 } 1447 1448 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1449 { 1450 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1451 shrink_ple_window(vcpu); 1452 1453 if (kvm_vcpu_apicv_active(vcpu)) 1454 avic_vcpu_load(vcpu, cpu); 1455 } 1456 1457 static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1458 { 1459 if (kvm_vcpu_apicv_active(vcpu)) 1460 avic_vcpu_put(vcpu); 1461 1462 svm_prepare_host_switch(vcpu); 1463 1464 ++vcpu->stat.host_state_reload; 1465 } 1466 1467 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1468 { 1469 struct vcpu_svm *svm = to_svm(vcpu); 1470 unsigned long rflags = svm->vmcb->save.rflags; 1471 1472 if (svm->nmi_singlestep) { 1473 /* Hide our flags if they were not set by the guest */ 1474 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 1475 rflags &= ~X86_EFLAGS_TF; 1476 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 1477 rflags &= ~X86_EFLAGS_RF; 1478 } 1479 return rflags; 1480 } 1481 1482 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1483 { 1484 if (to_svm(vcpu)->nmi_singlestep) 1485 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 1486 1487 /* 1488 * Any change of EFLAGS.VM is accompanied by a reload of SS 1489 * (caused by either a task switch or an inter-privilege IRET), 1490 * so we do not need to update the CPL here. 1491 */ 1492 to_svm(vcpu)->vmcb->save.rflags = rflags; 1493 } 1494 1495 static bool svm_get_if_flag(struct kvm_vcpu *vcpu) 1496 { 1497 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1498 1499 return sev_es_guest(vcpu->kvm) 1500 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK 1501 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; 1502 } 1503 1504 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1505 { 1506 kvm_register_mark_available(vcpu, reg); 1507 1508 switch (reg) { 1509 case VCPU_EXREG_PDPTR: 1510 /* 1511 * When !npt_enabled, mmu->pdptrs[] is already available since 1512 * it is always updated per SDM when moving to CRs. 1513 */ 1514 if (npt_enabled) 1515 load_pdptrs(vcpu, kvm_read_cr3(vcpu)); 1516 break; 1517 default: 1518 KVM_BUG_ON(1, vcpu->kvm); 1519 } 1520 } 1521 1522 static void svm_set_vintr(struct vcpu_svm *svm) 1523 { 1524 struct vmcb_control_area *control; 1525 1526 /* 1527 * The following fields are ignored when AVIC is enabled 1528 */ 1529 WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); 1530 1531 svm_set_intercept(svm, INTERCEPT_VINTR); 1532 1533 /* 1534 * Recalculating intercepts may have cleared the VINTR intercept. If 1535 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF 1536 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. 1537 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as 1538 * interrupts will never be unblocked while L2 is running. 1539 */ 1540 if (!svm_is_intercept(svm, INTERCEPT_VINTR)) 1541 return; 1542 1543 /* 1544 * This is just a dummy VINTR to actually cause a vmexit to happen. 1545 * Actual injection of virtual interrupts happens through EVENTINJ. 1546 */ 1547 control = &svm->vmcb->control; 1548 control->int_vector = 0x0; 1549 control->int_ctl &= ~V_INTR_PRIO_MASK; 1550 control->int_ctl |= V_IRQ_MASK | 1551 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 1552 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 1553 } 1554 1555 static void svm_clear_vintr(struct vcpu_svm *svm) 1556 { 1557 svm_clr_intercept(svm, INTERCEPT_VINTR); 1558 1559 /* Drop int_ctl fields related to VINTR injection. */ 1560 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; 1561 if (is_guest_mode(&svm->vcpu)) { 1562 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; 1563 1564 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != 1565 (svm->nested.ctl.int_ctl & V_TPR_MASK)); 1566 1567 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & 1568 V_IRQ_INJECTION_BITS_MASK; 1569 1570 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; 1571 } 1572 1573 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 1574 } 1575 1576 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1577 { 1578 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 1579 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save; 1580 1581 switch (seg) { 1582 case VCPU_SREG_CS: return &save->cs; 1583 case VCPU_SREG_DS: return &save->ds; 1584 case VCPU_SREG_ES: return &save->es; 1585 case VCPU_SREG_FS: return &save01->fs; 1586 case VCPU_SREG_GS: return &save01->gs; 1587 case VCPU_SREG_SS: return &save->ss; 1588 case VCPU_SREG_TR: return &save01->tr; 1589 case VCPU_SREG_LDTR: return &save01->ldtr; 1590 } 1591 BUG(); 1592 return NULL; 1593 } 1594 1595 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) 1596 { 1597 struct vmcb_seg *s = svm_seg(vcpu, seg); 1598 1599 return s->base; 1600 } 1601 1602 static void svm_get_segment(struct kvm_vcpu *vcpu, 1603 struct kvm_segment *var, int seg) 1604 { 1605 struct vmcb_seg *s = svm_seg(vcpu, seg); 1606 1607 var->base = s->base; 1608 var->limit = s->limit; 1609 var->selector = s->selector; 1610 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; 1611 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; 1612 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; 1613 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; 1614 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; 1615 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 1616 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1617 1618 /* 1619 * AMD CPUs circa 2014 track the G bit for all segments except CS. 1620 * However, the SVM spec states that the G bit is not observed by the 1621 * CPU, and some VMware virtual CPUs drop the G bit for all segments. 1622 * So let's synthesize a legal G bit for all segments, this helps 1623 * running KVM nested. It also helps cross-vendor migration, because 1624 * Intel's vmentry has a check on the 'G' bit. 1625 */ 1626 var->g = s->limit > 0xfffff; 1627 1628 /* 1629 * AMD's VMCB does not have an explicit unusable field, so emulate it 1630 * for cross vendor migration purposes by "not present" 1631 */ 1632 var->unusable = !var->present; 1633 1634 switch (seg) { 1635 case VCPU_SREG_TR: 1636 /* 1637 * Work around a bug where the busy flag in the tr selector 1638 * isn't exposed 1639 */ 1640 var->type |= 0x2; 1641 break; 1642 case VCPU_SREG_DS: 1643 case VCPU_SREG_ES: 1644 case VCPU_SREG_FS: 1645 case VCPU_SREG_GS: 1646 /* 1647 * The accessed bit must always be set in the segment 1648 * descriptor cache, although it can be cleared in the 1649 * descriptor, the cached bit always remains at 1. Since 1650 * Intel has a check on this, set it here to support 1651 * cross-vendor migration. 1652 */ 1653 if (!var->unusable) 1654 var->type |= 0x1; 1655 break; 1656 case VCPU_SREG_SS: 1657 /* 1658 * On AMD CPUs sometimes the DB bit in the segment 1659 * descriptor is left as 1, although the whole segment has 1660 * been made unusable. Clear it here to pass an Intel VMX 1661 * entry check when cross vendor migrating. 1662 */ 1663 if (var->unusable) 1664 var->db = 0; 1665 /* This is symmetric with svm_set_segment() */ 1666 var->dpl = to_svm(vcpu)->vmcb->save.cpl; 1667 break; 1668 } 1669 } 1670 1671 static int svm_get_cpl(struct kvm_vcpu *vcpu) 1672 { 1673 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 1674 1675 return save->cpl; 1676 } 1677 1678 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 1679 { 1680 struct kvm_segment cs; 1681 1682 svm_get_segment(vcpu, &cs, VCPU_SREG_CS); 1683 *db = cs.db; 1684 *l = cs.l; 1685 } 1686 1687 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1688 { 1689 struct vcpu_svm *svm = to_svm(vcpu); 1690 1691 dt->size = svm->vmcb->save.idtr.limit; 1692 dt->address = svm->vmcb->save.idtr.base; 1693 } 1694 1695 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1696 { 1697 struct vcpu_svm *svm = to_svm(vcpu); 1698 1699 svm->vmcb->save.idtr.limit = dt->size; 1700 svm->vmcb->save.idtr.base = dt->address ; 1701 vmcb_mark_dirty(svm->vmcb, VMCB_DT); 1702 } 1703 1704 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1705 { 1706 struct vcpu_svm *svm = to_svm(vcpu); 1707 1708 dt->size = svm->vmcb->save.gdtr.limit; 1709 dt->address = svm->vmcb->save.gdtr.base; 1710 } 1711 1712 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1713 { 1714 struct vcpu_svm *svm = to_svm(vcpu); 1715 1716 svm->vmcb->save.gdtr.limit = dt->size; 1717 svm->vmcb->save.gdtr.base = dt->address ; 1718 vmcb_mark_dirty(svm->vmcb, VMCB_DT); 1719 } 1720 1721 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1722 { 1723 struct vcpu_svm *svm = to_svm(vcpu); 1724 1725 /* 1726 * For guests that don't set guest_state_protected, the cr3 update is 1727 * handled via kvm_mmu_load() while entering the guest. For guests 1728 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to 1729 * VMCB save area now, since the save area will become the initial 1730 * contents of the VMSA, and future VMCB save area updates won't be 1731 * seen. 1732 */ 1733 if (sev_es_guest(vcpu->kvm)) { 1734 svm->vmcb->save.cr3 = cr3; 1735 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1736 } 1737 } 1738 1739 static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1740 { 1741 return true; 1742 } 1743 1744 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1745 { 1746 struct vcpu_svm *svm = to_svm(vcpu); 1747 u64 hcr0 = cr0; 1748 bool old_paging = is_paging(vcpu); 1749 1750 #ifdef CONFIG_X86_64 1751 if (vcpu->arch.efer & EFER_LME) { 1752 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1753 vcpu->arch.efer |= EFER_LMA; 1754 if (!vcpu->arch.guest_state_protected) 1755 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1756 } 1757 1758 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1759 vcpu->arch.efer &= ~EFER_LMA; 1760 if (!vcpu->arch.guest_state_protected) 1761 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1762 } 1763 } 1764 #endif 1765 vcpu->arch.cr0 = cr0; 1766 1767 if (!npt_enabled) { 1768 hcr0 |= X86_CR0_PG | X86_CR0_WP; 1769 if (old_paging != is_paging(vcpu)) 1770 svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); 1771 } 1772 1773 /* 1774 * re-enable caching here because the QEMU bios 1775 * does not do it - this results in some delay at 1776 * reboot 1777 */ 1778 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 1779 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1780 1781 svm->vmcb->save.cr0 = hcr0; 1782 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1783 1784 /* 1785 * SEV-ES guests must always keep the CR intercepts cleared. CR 1786 * tracking is done using the CR write traps. 1787 */ 1788 if (sev_es_guest(vcpu->kvm)) 1789 return; 1790 1791 if (hcr0 == cr0) { 1792 /* Selective CR0 write remains on. */ 1793 svm_clr_intercept(svm, INTERCEPT_CR0_READ); 1794 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); 1795 } else { 1796 svm_set_intercept(svm, INTERCEPT_CR0_READ); 1797 svm_set_intercept(svm, INTERCEPT_CR0_WRITE); 1798 } 1799 } 1800 1801 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1802 { 1803 return true; 1804 } 1805 1806 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1807 { 1808 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; 1809 unsigned long old_cr4 = vcpu->arch.cr4; 1810 1811 vcpu->arch.cr4 = cr4; 1812 if (!npt_enabled) { 1813 cr4 |= X86_CR4_PAE; 1814 1815 if (!is_paging(vcpu)) 1816 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 1817 } 1818 cr4 |= host_cr4_mce; 1819 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1820 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 1821 1822 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 1823 vcpu->arch.cpuid_dynamic_bits_dirty = true; 1824 } 1825 1826 static void svm_set_segment(struct kvm_vcpu *vcpu, 1827 struct kvm_segment *var, int seg) 1828 { 1829 struct vcpu_svm *svm = to_svm(vcpu); 1830 struct vmcb_seg *s = svm_seg(vcpu, seg); 1831 1832 s->base = var->base; 1833 s->limit = var->limit; 1834 s->selector = var->selector; 1835 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); 1836 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; 1837 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; 1838 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; 1839 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; 1840 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; 1841 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 1842 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1843 1844 /* 1845 * This is always accurate, except if SYSRET returned to a segment 1846 * with SS.DPL != 3. Intel does not have this quirk, and always 1847 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it 1848 * would entail passing the CPL to userspace and back. 1849 */ 1850 if (seg == VCPU_SREG_SS) 1851 /* This is symmetric with svm_get_segment() */ 1852 svm->vmcb->save.cpl = (var->dpl & 3); 1853 1854 vmcb_mark_dirty(svm->vmcb, VMCB_SEG); 1855 } 1856 1857 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) 1858 { 1859 struct vcpu_svm *svm = to_svm(vcpu); 1860 1861 clr_exception_intercept(svm, BP_VECTOR); 1862 1863 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1864 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1865 set_exception_intercept(svm, BP_VECTOR); 1866 } 1867 } 1868 1869 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1870 { 1871 if (sd->next_asid > sd->max_asid) { 1872 ++sd->asid_generation; 1873 sd->next_asid = sd->min_asid; 1874 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1875 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 1876 } 1877 1878 svm->current_vmcb->asid_generation = sd->asid_generation; 1879 svm->asid = sd->next_asid++; 1880 } 1881 1882 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) 1883 { 1884 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1885 1886 if (vcpu->arch.guest_state_protected) 1887 return; 1888 1889 if (unlikely(value != vmcb->save.dr6)) { 1890 vmcb->save.dr6 = value; 1891 vmcb_mark_dirty(vmcb, VMCB_DR); 1892 } 1893 } 1894 1895 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 1896 { 1897 struct vcpu_svm *svm = to_svm(vcpu); 1898 1899 if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) 1900 return; 1901 1902 get_debugreg(vcpu->arch.db[0], 0); 1903 get_debugreg(vcpu->arch.db[1], 1); 1904 get_debugreg(vcpu->arch.db[2], 2); 1905 get_debugreg(vcpu->arch.db[3], 3); 1906 /* 1907 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, 1908 * because db_interception might need it. We can do it before vmentry. 1909 */ 1910 vcpu->arch.dr6 = svm->vmcb->save.dr6; 1911 vcpu->arch.dr7 = svm->vmcb->save.dr7; 1912 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 1913 set_dr_intercepts(svm); 1914 } 1915 1916 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1917 { 1918 struct vcpu_svm *svm = to_svm(vcpu); 1919 1920 if (vcpu->arch.guest_state_protected) 1921 return; 1922 1923 svm->vmcb->save.dr7 = value; 1924 vmcb_mark_dirty(svm->vmcb, VMCB_DR); 1925 } 1926 1927 static int pf_interception(struct kvm_vcpu *vcpu) 1928 { 1929 struct vcpu_svm *svm = to_svm(vcpu); 1930 1931 u64 fault_address = svm->vmcb->control.exit_info_2; 1932 u64 error_code = svm->vmcb->control.exit_info_1; 1933 1934 return kvm_handle_page_fault(vcpu, error_code, fault_address, 1935 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 1936 svm->vmcb->control.insn_bytes : NULL, 1937 svm->vmcb->control.insn_len); 1938 } 1939 1940 static int npf_interception(struct kvm_vcpu *vcpu) 1941 { 1942 struct vcpu_svm *svm = to_svm(vcpu); 1943 int rc; 1944 1945 u64 fault_address = svm->vmcb->control.exit_info_2; 1946 u64 error_code = svm->vmcb->control.exit_info_1; 1947 1948 /* 1949 * WARN if hardware generates a fault with an error code that collides 1950 * with KVM-defined sythentic flags. Clear the flags and continue on, 1951 * i.e. don't terminate the VM, as KVM can't possibly be relying on a 1952 * flag that KVM doesn't know about. 1953 */ 1954 if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) 1955 error_code &= ~PFERR_SYNTHETIC_MASK; 1956 1957 if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) 1958 error_code |= PFERR_PRIVATE_ACCESS; 1959 1960 trace_kvm_page_fault(vcpu, fault_address, error_code); 1961 rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, 1962 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 1963 svm->vmcb->control.insn_bytes : NULL, 1964 svm->vmcb->control.insn_len); 1965 1966 if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) 1967 sev_handle_rmp_fault(vcpu, fault_address, error_code); 1968 1969 return rc; 1970 } 1971 1972 static int db_interception(struct kvm_vcpu *vcpu) 1973 { 1974 struct kvm_run *kvm_run = vcpu->run; 1975 struct vcpu_svm *svm = to_svm(vcpu); 1976 1977 if (!(vcpu->guest_debug & 1978 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1979 !svm->nmi_singlestep) { 1980 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; 1981 kvm_queue_exception_p(vcpu, DB_VECTOR, payload); 1982 return 1; 1983 } 1984 1985 if (svm->nmi_singlestep) { 1986 disable_nmi_singlestep(svm); 1987 /* Make sure we check for pending NMIs upon entry */ 1988 kvm_make_request(KVM_REQ_EVENT, vcpu); 1989 } 1990 1991 if (vcpu->guest_debug & 1992 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { 1993 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1994 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; 1995 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; 1996 kvm_run->debug.arch.pc = 1997 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1998 kvm_run->debug.arch.exception = DB_VECTOR; 1999 return 0; 2000 } 2001 2002 return 1; 2003 } 2004 2005 static int bp_interception(struct kvm_vcpu *vcpu) 2006 { 2007 struct vcpu_svm *svm = to_svm(vcpu); 2008 struct kvm_run *kvm_run = vcpu->run; 2009 2010 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2011 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2012 kvm_run->debug.arch.exception = BP_VECTOR; 2013 return 0; 2014 } 2015 2016 static int ud_interception(struct kvm_vcpu *vcpu) 2017 { 2018 return handle_ud(vcpu); 2019 } 2020 2021 static int ac_interception(struct kvm_vcpu *vcpu) 2022 { 2023 kvm_queue_exception_e(vcpu, AC_VECTOR, 0); 2024 return 1; 2025 } 2026 2027 static bool is_erratum_383(void) 2028 { 2029 int i; 2030 u64 value; 2031 2032 if (!erratum_383_found) 2033 return false; 2034 2035 if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) 2036 return false; 2037 2038 /* Bit 62 may or may not be set for this mce */ 2039 value &= ~(1ULL << 62); 2040 2041 if (value != 0xb600000000010015ULL) 2042 return false; 2043 2044 /* Clear MCi_STATUS registers */ 2045 for (i = 0; i < 6; ++i) 2046 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); 2047 2048 if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { 2049 value &= ~(1ULL << 2); 2050 native_write_msr_safe(MSR_IA32_MCG_STATUS, value); 2051 } 2052 2053 /* Flush tlb to evict multi-match entries */ 2054 __flush_tlb_all(); 2055 2056 return true; 2057 } 2058 2059 static void svm_handle_mce(struct kvm_vcpu *vcpu) 2060 { 2061 if (is_erratum_383()) { 2062 /* 2063 * Erratum 383 triggered. Guest state is corrupt so kill the 2064 * guest. 2065 */ 2066 pr_err("Guest triggered AMD Erratum 383\n"); 2067 2068 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2069 2070 return; 2071 } 2072 2073 /* 2074 * On an #MC intercept the MCE handler is not called automatically in 2075 * the host. So do it by hand here. 2076 */ 2077 kvm_machine_check(); 2078 } 2079 2080 static int mc_interception(struct kvm_vcpu *vcpu) 2081 { 2082 return 1; 2083 } 2084 2085 static int shutdown_interception(struct kvm_vcpu *vcpu) 2086 { 2087 struct kvm_run *kvm_run = vcpu->run; 2088 struct vcpu_svm *svm = to_svm(vcpu); 2089 2090 2091 /* 2092 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put 2093 * the VMCB in a known good state. Unfortuately, KVM doesn't have 2094 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking 2095 * userspace. At a platform view, INIT is acceptable behavior as 2096 * there exist bare metal platforms that automatically INIT the CPU 2097 * in response to shutdown. 2098 * 2099 * The VM save area for SEV-ES guests has already been encrypted so it 2100 * cannot be reinitialized, i.e. synthesizing INIT is futile. 2101 */ 2102 if (!sev_es_guest(vcpu->kvm)) { 2103 clear_page(svm->vmcb); 2104 #ifdef CONFIG_KVM_SMM 2105 if (is_smm(vcpu)) 2106 kvm_smm_changed(vcpu, false); 2107 #endif 2108 kvm_vcpu_reset(vcpu, true); 2109 } 2110 2111 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2112 return 0; 2113 } 2114 2115 static int io_interception(struct kvm_vcpu *vcpu) 2116 { 2117 struct vcpu_svm *svm = to_svm(vcpu); 2118 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2119 int size, in, string; 2120 unsigned port; 2121 2122 ++vcpu->stat.io_exits; 2123 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2124 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2125 port = io_info >> 16; 2126 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2127 2128 if (string) { 2129 if (sev_es_guest(vcpu->kvm)) 2130 return sev_es_string_io(svm, size, port, in); 2131 else 2132 return kvm_emulate_instruction(vcpu, 0); 2133 } 2134 2135 svm->next_rip = svm->vmcb->control.exit_info_2; 2136 2137 return kvm_fast_pio(vcpu, size, port, in); 2138 } 2139 2140 static int nmi_interception(struct kvm_vcpu *vcpu) 2141 { 2142 return 1; 2143 } 2144 2145 static int smi_interception(struct kvm_vcpu *vcpu) 2146 { 2147 return 1; 2148 } 2149 2150 static int intr_interception(struct kvm_vcpu *vcpu) 2151 { 2152 ++vcpu->stat.irq_exits; 2153 return 1; 2154 } 2155 2156 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 2157 { 2158 struct vcpu_svm *svm = to_svm(vcpu); 2159 struct vmcb *vmcb12; 2160 struct kvm_host_map map; 2161 int ret; 2162 2163 if (nested_svm_check_permissions(vcpu)) 2164 return 1; 2165 2166 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 2167 if (ret) { 2168 if (ret == -EINVAL) 2169 kvm_inject_gp(vcpu, 0); 2170 return 1; 2171 } 2172 2173 vmcb12 = map.hva; 2174 2175 ret = kvm_skip_emulated_instruction(vcpu); 2176 2177 if (vmload) { 2178 svm_copy_vmloadsave_state(svm->vmcb, vmcb12); 2179 svm->sysenter_eip_hi = 0; 2180 svm->sysenter_esp_hi = 0; 2181 } else { 2182 svm_copy_vmloadsave_state(vmcb12, svm->vmcb); 2183 } 2184 2185 kvm_vcpu_unmap(vcpu, &map); 2186 2187 return ret; 2188 } 2189 2190 static int vmload_interception(struct kvm_vcpu *vcpu) 2191 { 2192 return vmload_vmsave_interception(vcpu, true); 2193 } 2194 2195 static int vmsave_interception(struct kvm_vcpu *vcpu) 2196 { 2197 return vmload_vmsave_interception(vcpu, false); 2198 } 2199 2200 static int vmrun_interception(struct kvm_vcpu *vcpu) 2201 { 2202 if (nested_svm_check_permissions(vcpu)) 2203 return 1; 2204 2205 return nested_svm_vmrun(vcpu); 2206 } 2207 2208 enum { 2209 NONE_SVM_INSTR, 2210 SVM_INSTR_VMRUN, 2211 SVM_INSTR_VMLOAD, 2212 SVM_INSTR_VMSAVE, 2213 }; 2214 2215 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ 2216 static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2217 { 2218 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2219 2220 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2221 return NONE_SVM_INSTR; 2222 2223 switch (ctxt->modrm) { 2224 case 0xd8: /* VMRUN */ 2225 return SVM_INSTR_VMRUN; 2226 case 0xda: /* VMLOAD */ 2227 return SVM_INSTR_VMLOAD; 2228 case 0xdb: /* VMSAVE */ 2229 return SVM_INSTR_VMSAVE; 2230 default: 2231 break; 2232 } 2233 2234 return NONE_SVM_INSTR; 2235 } 2236 2237 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) 2238 { 2239 const int guest_mode_exit_codes[] = { 2240 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, 2241 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, 2242 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, 2243 }; 2244 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { 2245 [SVM_INSTR_VMRUN] = vmrun_interception, 2246 [SVM_INSTR_VMLOAD] = vmload_interception, 2247 [SVM_INSTR_VMSAVE] = vmsave_interception, 2248 }; 2249 struct vcpu_svm *svm = to_svm(vcpu); 2250 int ret; 2251 2252 if (is_guest_mode(vcpu)) { 2253 /* Returns '1' or -errno on failure, '0' on success. */ 2254 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); 2255 if (ret) 2256 return ret; 2257 return 1; 2258 } 2259 return svm_instr_handlers[opcode](vcpu); 2260 } 2261 2262 /* 2263 * #GP handling code. Note that #GP can be triggered under the following two 2264 * cases: 2265 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on 2266 * some AMD CPUs when EAX of these instructions are in the reserved memory 2267 * regions (e.g. SMM memory on host). 2268 * 2) VMware backdoor 2269 */ 2270 static int gp_interception(struct kvm_vcpu *vcpu) 2271 { 2272 struct vcpu_svm *svm = to_svm(vcpu); 2273 u32 error_code = svm->vmcb->control.exit_info_1; 2274 int opcode; 2275 2276 /* Both #GP cases have zero error_code */ 2277 if (error_code) 2278 goto reinject; 2279 2280 /* Decode the instruction for usage later */ 2281 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) 2282 goto reinject; 2283 2284 opcode = svm_instr_opcode(vcpu); 2285 2286 if (opcode == NONE_SVM_INSTR) { 2287 if (!enable_vmware_backdoor) 2288 goto reinject; 2289 2290 /* 2291 * VMware backdoor emulation on #GP interception only handles 2292 * IN{S}, OUT{S}, and RDPMC. 2293 */ 2294 if (!is_guest_mode(vcpu)) 2295 return kvm_emulate_instruction(vcpu, 2296 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2297 } else { 2298 /* All SVM instructions expect page aligned RAX */ 2299 if (svm->vmcb->save.rax & ~PAGE_MASK) 2300 goto reinject; 2301 2302 return emulate_svm_instr(vcpu, opcode); 2303 } 2304 2305 reinject: 2306 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2307 return 1; 2308 } 2309 2310 void svm_set_gif(struct vcpu_svm *svm, bool value) 2311 { 2312 if (value) { 2313 /* 2314 * If VGIF is enabled, the STGI intercept is only added to 2315 * detect the opening of the SMI/NMI window; remove it now. 2316 * Likewise, clear the VINTR intercept, we will set it 2317 * again while processing KVM_REQ_EVENT if needed. 2318 */ 2319 if (vgif) 2320 svm_clr_intercept(svm, INTERCEPT_STGI); 2321 if (svm_is_intercept(svm, INTERCEPT_VINTR)) 2322 svm_clear_vintr(svm); 2323 2324 enable_gif(svm); 2325 if (svm->vcpu.arch.smi_pending || 2326 svm->vcpu.arch.nmi_pending || 2327 kvm_cpu_has_injectable_intr(&svm->vcpu) || 2328 kvm_apic_has_pending_init_or_sipi(&svm->vcpu)) 2329 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 2330 } else { 2331 disable_gif(svm); 2332 2333 /* 2334 * After a CLGI no interrupts should come. But if vGIF is 2335 * in use, we still rely on the VINTR intercept (rather than 2336 * STGI) to detect an open interrupt window. 2337 */ 2338 if (!vgif) 2339 svm_clear_vintr(svm); 2340 } 2341 } 2342 2343 static int stgi_interception(struct kvm_vcpu *vcpu) 2344 { 2345 int ret; 2346 2347 if (nested_svm_check_permissions(vcpu)) 2348 return 1; 2349 2350 ret = kvm_skip_emulated_instruction(vcpu); 2351 svm_set_gif(to_svm(vcpu), true); 2352 return ret; 2353 } 2354 2355 static int clgi_interception(struct kvm_vcpu *vcpu) 2356 { 2357 int ret; 2358 2359 if (nested_svm_check_permissions(vcpu)) 2360 return 1; 2361 2362 ret = kvm_skip_emulated_instruction(vcpu); 2363 svm_set_gif(to_svm(vcpu), false); 2364 return ret; 2365 } 2366 2367 static int invlpga_interception(struct kvm_vcpu *vcpu) 2368 { 2369 gva_t gva = kvm_rax_read(vcpu); 2370 u32 asid = kvm_rcx_read(vcpu); 2371 2372 /* FIXME: Handle an address size prefix. */ 2373 if (!is_long_mode(vcpu)) 2374 gva = (u32)gva; 2375 2376 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); 2377 2378 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 2379 kvm_mmu_invlpg(vcpu, gva); 2380 2381 return kvm_skip_emulated_instruction(vcpu); 2382 } 2383 2384 static int skinit_interception(struct kvm_vcpu *vcpu) 2385 { 2386 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); 2387 2388 kvm_queue_exception(vcpu, UD_VECTOR); 2389 return 1; 2390 } 2391 2392 static int task_switch_interception(struct kvm_vcpu *vcpu) 2393 { 2394 struct vcpu_svm *svm = to_svm(vcpu); 2395 u16 tss_selector; 2396 int reason; 2397 int int_type = svm->vmcb->control.exit_int_info & 2398 SVM_EXITINTINFO_TYPE_MASK; 2399 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 2400 uint32_t type = 2401 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2402 uint32_t idt_v = 2403 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2404 bool has_error_code = false; 2405 u32 error_code = 0; 2406 2407 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2408 2409 if (svm->vmcb->control.exit_info_2 & 2410 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 2411 reason = TASK_SWITCH_IRET; 2412 else if (svm->vmcb->control.exit_info_2 & 2413 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 2414 reason = TASK_SWITCH_JMP; 2415 else if (idt_v) 2416 reason = TASK_SWITCH_GATE; 2417 else 2418 reason = TASK_SWITCH_CALL; 2419 2420 if (reason == TASK_SWITCH_GATE) { 2421 switch (type) { 2422 case SVM_EXITINTINFO_TYPE_NMI: 2423 vcpu->arch.nmi_injected = false; 2424 break; 2425 case SVM_EXITINTINFO_TYPE_EXEPT: 2426 if (svm->vmcb->control.exit_info_2 & 2427 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 2428 has_error_code = true; 2429 error_code = 2430 (u32)svm->vmcb->control.exit_info_2; 2431 } 2432 kvm_clear_exception_queue(vcpu); 2433 break; 2434 case SVM_EXITINTINFO_TYPE_INTR: 2435 case SVM_EXITINTINFO_TYPE_SOFT: 2436 kvm_clear_interrupt_queue(vcpu); 2437 break; 2438 default: 2439 break; 2440 } 2441 } 2442 2443 if (reason != TASK_SWITCH_GATE || 2444 int_type == SVM_EXITINTINFO_TYPE_SOFT || 2445 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 2446 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 2447 if (!svm_skip_emulated_instruction(vcpu)) 2448 return 0; 2449 } 2450 2451 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 2452 int_vec = -1; 2453 2454 return kvm_task_switch(vcpu, tss_selector, int_vec, reason, 2455 has_error_code, error_code); 2456 } 2457 2458 static void svm_clr_iret_intercept(struct vcpu_svm *svm) 2459 { 2460 if (!sev_es_guest(svm->vcpu.kvm)) 2461 svm_clr_intercept(svm, INTERCEPT_IRET); 2462 } 2463 2464 static void svm_set_iret_intercept(struct vcpu_svm *svm) 2465 { 2466 if (!sev_es_guest(svm->vcpu.kvm)) 2467 svm_set_intercept(svm, INTERCEPT_IRET); 2468 } 2469 2470 static int iret_interception(struct kvm_vcpu *vcpu) 2471 { 2472 struct vcpu_svm *svm = to_svm(vcpu); 2473 2474 WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); 2475 2476 ++vcpu->stat.nmi_window_exits; 2477 svm->awaiting_iret_completion = true; 2478 2479 svm_clr_iret_intercept(svm); 2480 svm->nmi_iret_rip = kvm_rip_read(vcpu); 2481 2482 kvm_make_request(KVM_REQ_EVENT, vcpu); 2483 return 1; 2484 } 2485 2486 static int invlpg_interception(struct kvm_vcpu *vcpu) 2487 { 2488 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2489 return kvm_emulate_instruction(vcpu, 0); 2490 2491 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); 2492 return kvm_skip_emulated_instruction(vcpu); 2493 } 2494 2495 static int emulate_on_interception(struct kvm_vcpu *vcpu) 2496 { 2497 return kvm_emulate_instruction(vcpu, 0); 2498 } 2499 2500 static int rsm_interception(struct kvm_vcpu *vcpu) 2501 { 2502 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); 2503 } 2504 2505 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, 2506 unsigned long val) 2507 { 2508 struct vcpu_svm *svm = to_svm(vcpu); 2509 unsigned long cr0 = vcpu->arch.cr0; 2510 bool ret = false; 2511 2512 if (!is_guest_mode(vcpu) || 2513 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) 2514 return false; 2515 2516 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 2517 val &= ~SVM_CR0_SELECTIVE_MASK; 2518 2519 if (cr0 ^ val) { 2520 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 2521 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 2522 } 2523 2524 return ret; 2525 } 2526 2527 #define CR_VALID (1ULL << 63) 2528 2529 static int cr_interception(struct kvm_vcpu *vcpu) 2530 { 2531 struct vcpu_svm *svm = to_svm(vcpu); 2532 int reg, cr; 2533 unsigned long val; 2534 int err; 2535 2536 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2537 return emulate_on_interception(vcpu); 2538 2539 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 2540 return emulate_on_interception(vcpu); 2541 2542 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2543 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 2544 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 2545 else 2546 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2547 2548 err = 0; 2549 if (cr >= 16) { /* mov to cr */ 2550 cr -= 16; 2551 val = kvm_register_read(vcpu, reg); 2552 trace_kvm_cr_write(cr, val); 2553 switch (cr) { 2554 case 0: 2555 if (!check_selective_cr0_intercepted(vcpu, val)) 2556 err = kvm_set_cr0(vcpu, val); 2557 else 2558 return 1; 2559 2560 break; 2561 case 3: 2562 err = kvm_set_cr3(vcpu, val); 2563 break; 2564 case 4: 2565 err = kvm_set_cr4(vcpu, val); 2566 break; 2567 case 8: 2568 err = kvm_set_cr8(vcpu, val); 2569 break; 2570 default: 2571 WARN(1, "unhandled write to CR%d", cr); 2572 kvm_queue_exception(vcpu, UD_VECTOR); 2573 return 1; 2574 } 2575 } else { /* mov from cr */ 2576 switch (cr) { 2577 case 0: 2578 val = kvm_read_cr0(vcpu); 2579 break; 2580 case 2: 2581 val = vcpu->arch.cr2; 2582 break; 2583 case 3: 2584 val = kvm_read_cr3(vcpu); 2585 break; 2586 case 4: 2587 val = kvm_read_cr4(vcpu); 2588 break; 2589 case 8: 2590 val = kvm_get_cr8(vcpu); 2591 break; 2592 default: 2593 WARN(1, "unhandled read from CR%d", cr); 2594 kvm_queue_exception(vcpu, UD_VECTOR); 2595 return 1; 2596 } 2597 kvm_register_write(vcpu, reg, val); 2598 trace_kvm_cr_read(cr, val); 2599 } 2600 return kvm_complete_insn_gp(vcpu, err); 2601 } 2602 2603 static int cr_trap(struct kvm_vcpu *vcpu) 2604 { 2605 struct vcpu_svm *svm = to_svm(vcpu); 2606 unsigned long old_value, new_value; 2607 unsigned int cr; 2608 int ret = 0; 2609 2610 new_value = (unsigned long)svm->vmcb->control.exit_info_1; 2611 2612 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; 2613 switch (cr) { 2614 case 0: 2615 old_value = kvm_read_cr0(vcpu); 2616 svm_set_cr0(vcpu, new_value); 2617 2618 kvm_post_set_cr0(vcpu, old_value, new_value); 2619 break; 2620 case 4: 2621 old_value = kvm_read_cr4(vcpu); 2622 svm_set_cr4(vcpu, new_value); 2623 2624 kvm_post_set_cr4(vcpu, old_value, new_value); 2625 break; 2626 case 8: 2627 ret = kvm_set_cr8(vcpu, new_value); 2628 break; 2629 default: 2630 WARN(1, "unhandled CR%d write trap", cr); 2631 kvm_queue_exception(vcpu, UD_VECTOR); 2632 return 1; 2633 } 2634 2635 return kvm_complete_insn_gp(vcpu, ret); 2636 } 2637 2638 static int dr_interception(struct kvm_vcpu *vcpu) 2639 { 2640 struct vcpu_svm *svm = to_svm(vcpu); 2641 int reg, dr; 2642 int err = 0; 2643 2644 /* 2645 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT 2646 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. 2647 */ 2648 if (sev_es_guest(vcpu->kvm)) 2649 return 1; 2650 2651 if (vcpu->guest_debug == 0) { 2652 /* 2653 * No more DR vmexits; force a reload of the debug registers 2654 * and reenter on this instruction. The next vmexit will 2655 * retrieve the full state of the debug registers. 2656 */ 2657 clr_dr_intercepts(svm); 2658 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 2659 return 1; 2660 } 2661 2662 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 2663 return emulate_on_interception(vcpu); 2664 2665 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2666 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2667 if (dr >= 16) { /* mov to DRn */ 2668 dr -= 16; 2669 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 2670 } else { 2671 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 2672 } 2673 2674 return kvm_complete_insn_gp(vcpu, err); 2675 } 2676 2677 static int cr8_write_interception(struct kvm_vcpu *vcpu) 2678 { 2679 int r; 2680 2681 u8 cr8_prev = kvm_get_cr8(vcpu); 2682 /* instruction emulation calls kvm_set_cr8() */ 2683 r = cr_interception(vcpu); 2684 if (lapic_in_kernel(vcpu)) 2685 return r; 2686 if (cr8_prev <= kvm_get_cr8(vcpu)) 2687 return r; 2688 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 2689 return 0; 2690 } 2691 2692 static int efer_trap(struct kvm_vcpu *vcpu) 2693 { 2694 struct msr_data msr_info; 2695 int ret; 2696 2697 /* 2698 * Clear the EFER_SVME bit from EFER. The SVM code always sets this 2699 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against 2700 * whether the guest has X86_FEATURE_SVM - this avoids a failure if 2701 * the guest doesn't have X86_FEATURE_SVM. 2702 */ 2703 msr_info.host_initiated = false; 2704 msr_info.index = MSR_EFER; 2705 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; 2706 ret = kvm_set_msr_common(vcpu, &msr_info); 2707 2708 return kvm_complete_insn_gp(vcpu, ret); 2709 } 2710 2711 static int svm_get_feature_msr(u32 msr, u64 *data) 2712 { 2713 *data = 0; 2714 2715 switch (msr) { 2716 case MSR_AMD64_DE_CFG: 2717 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2718 *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2719 break; 2720 default: 2721 return KVM_MSR_RET_UNSUPPORTED; 2722 } 2723 2724 return 0; 2725 } 2726 2727 static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, 2728 struct msr_data *msr_info) 2729 { 2730 return sev_es_guest(vcpu->kvm) && 2731 vcpu->arch.guest_state_protected && 2732 !msr_write_intercepted(vcpu, msr_info->index); 2733 } 2734 2735 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2736 { 2737 struct vcpu_svm *svm = to_svm(vcpu); 2738 2739 if (sev_es_prevent_msr_access(vcpu, msr_info)) { 2740 msr_info->data = 0; 2741 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; 2742 } 2743 2744 switch (msr_info->index) { 2745 case MSR_AMD64_TSC_RATIO: 2746 if (!msr_info->host_initiated && 2747 !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) 2748 return 1; 2749 msr_info->data = svm->tsc_ratio_msr; 2750 break; 2751 case MSR_STAR: 2752 msr_info->data = svm->vmcb01.ptr->save.star; 2753 break; 2754 #ifdef CONFIG_X86_64 2755 case MSR_LSTAR: 2756 msr_info->data = svm->vmcb01.ptr->save.lstar; 2757 break; 2758 case MSR_CSTAR: 2759 msr_info->data = svm->vmcb01.ptr->save.cstar; 2760 break; 2761 case MSR_GS_BASE: 2762 msr_info->data = svm->vmcb01.ptr->save.gs.base; 2763 break; 2764 case MSR_FS_BASE: 2765 msr_info->data = svm->vmcb01.ptr->save.fs.base; 2766 break; 2767 case MSR_KERNEL_GS_BASE: 2768 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; 2769 break; 2770 case MSR_SYSCALL_MASK: 2771 msr_info->data = svm->vmcb01.ptr->save.sfmask; 2772 break; 2773 #endif 2774 case MSR_IA32_SYSENTER_CS: 2775 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; 2776 break; 2777 case MSR_IA32_SYSENTER_EIP: 2778 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; 2779 if (guest_cpuid_is_intel_compatible(vcpu)) 2780 msr_info->data |= (u64)svm->sysenter_eip_hi << 32; 2781 break; 2782 case MSR_IA32_SYSENTER_ESP: 2783 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2784 if (guest_cpuid_is_intel_compatible(vcpu)) 2785 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2786 break; 2787 case MSR_TSC_AUX: 2788 msr_info->data = svm->tsc_aux; 2789 break; 2790 case MSR_IA32_DEBUGCTLMSR: 2791 msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; 2792 break; 2793 case MSR_IA32_LASTBRANCHFROMIP: 2794 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; 2795 break; 2796 case MSR_IA32_LASTBRANCHTOIP: 2797 msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; 2798 break; 2799 case MSR_IA32_LASTINTFROMIP: 2800 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; 2801 break; 2802 case MSR_IA32_LASTINTTOIP: 2803 msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; 2804 break; 2805 case MSR_VM_HSAVE_PA: 2806 msr_info->data = svm->nested.hsave_msr; 2807 break; 2808 case MSR_VM_CR: 2809 msr_info->data = svm->nested.vm_cr_msr; 2810 break; 2811 case MSR_IA32_SPEC_CTRL: 2812 if (!msr_info->host_initiated && 2813 !guest_has_spec_ctrl_msr(vcpu)) 2814 return 1; 2815 2816 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2817 msr_info->data = svm->vmcb->save.spec_ctrl; 2818 else 2819 msr_info->data = svm->spec_ctrl; 2820 break; 2821 case MSR_AMD64_VIRT_SPEC_CTRL: 2822 if (!msr_info->host_initiated && 2823 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2824 return 1; 2825 2826 msr_info->data = svm->virt_spec_ctrl; 2827 break; 2828 case MSR_F15H_IC_CFG: { 2829 2830 int family, model; 2831 2832 family = guest_cpuid_family(vcpu); 2833 model = guest_cpuid_model(vcpu); 2834 2835 if (family < 0 || model < 0) 2836 return kvm_get_msr_common(vcpu, msr_info); 2837 2838 msr_info->data = 0; 2839 2840 if (family == 0x15 && 2841 (model >= 0x2 && model < 0x20)) 2842 msr_info->data = 0x1E; 2843 } 2844 break; 2845 case MSR_AMD64_DE_CFG: 2846 msr_info->data = svm->msr_decfg; 2847 break; 2848 default: 2849 return kvm_get_msr_common(vcpu, msr_info); 2850 } 2851 return 0; 2852 } 2853 2854 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2855 { 2856 struct vcpu_svm *svm = to_svm(vcpu); 2857 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2858 return kvm_complete_insn_gp(vcpu, err); 2859 2860 svm_vmgexit_inject_exception(svm, X86_TRAP_GP); 2861 return 1; 2862 } 2863 2864 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 2865 { 2866 struct vcpu_svm *svm = to_svm(vcpu); 2867 int svm_dis, chg_mask; 2868 2869 if (data & ~SVM_VM_CR_VALID_MASK) 2870 return 1; 2871 2872 chg_mask = SVM_VM_CR_VALID_MASK; 2873 2874 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 2875 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 2876 2877 svm->nested.vm_cr_msr &= ~chg_mask; 2878 svm->nested.vm_cr_msr |= (data & chg_mask); 2879 2880 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 2881 2882 /* check for svm_disable while efer.svme is set */ 2883 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 2884 return 1; 2885 2886 return 0; 2887 } 2888 2889 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2890 { 2891 struct vcpu_svm *svm = to_svm(vcpu); 2892 int ret = 0; 2893 2894 u32 ecx = msr->index; 2895 u64 data = msr->data; 2896 2897 if (sev_es_prevent_msr_access(vcpu, msr)) 2898 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; 2899 2900 switch (ecx) { 2901 case MSR_AMD64_TSC_RATIO: 2902 2903 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { 2904 2905 if (!msr->host_initiated) 2906 return 1; 2907 /* 2908 * In case TSC scaling is not enabled, always 2909 * leave this MSR at the default value. 2910 * 2911 * Due to bug in qemu 6.2.0, it would try to set 2912 * this msr to 0 if tsc scaling is not enabled. 2913 * Ignore this value as well. 2914 */ 2915 if (data != 0 && data != svm->tsc_ratio_msr) 2916 return 1; 2917 break; 2918 } 2919 2920 if (data & SVM_TSC_RATIO_RSVD) 2921 return 1; 2922 2923 svm->tsc_ratio_msr = data; 2924 2925 if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && 2926 is_guest_mode(vcpu)) 2927 nested_svm_update_tsc_ratio_msr(vcpu); 2928 2929 break; 2930 case MSR_IA32_CR_PAT: 2931 ret = kvm_set_msr_common(vcpu, msr); 2932 if (ret) 2933 break; 2934 2935 svm->vmcb01.ptr->save.g_pat = data; 2936 if (is_guest_mode(vcpu)) 2937 nested_vmcb02_compute_g_pat(svm); 2938 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 2939 break; 2940 case MSR_IA32_SPEC_CTRL: 2941 if (!msr->host_initiated && 2942 !guest_has_spec_ctrl_msr(vcpu)) 2943 return 1; 2944 2945 if (kvm_spec_ctrl_test_value(data)) 2946 return 1; 2947 2948 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2949 svm->vmcb->save.spec_ctrl = data; 2950 else 2951 svm->spec_ctrl = data; 2952 if (!data) 2953 break; 2954 2955 /* 2956 * For non-nested: 2957 * When it's written (to non-zero) for the first time, pass 2958 * it through. 2959 * 2960 * For nested: 2961 * The handling of the MSR bitmap for L2 guests is done in 2962 * nested_svm_merge_msrpm(). 2963 * We update the L1 MSR bit as well since it will end up 2964 * touching the MSR anyway now. 2965 */ 2966 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 2967 break; 2968 case MSR_AMD64_VIRT_SPEC_CTRL: 2969 if (!msr->host_initiated && 2970 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2971 return 1; 2972 2973 if (data & ~SPEC_CTRL_SSBD) 2974 return 1; 2975 2976 svm->virt_spec_ctrl = data; 2977 break; 2978 case MSR_STAR: 2979 svm->vmcb01.ptr->save.star = data; 2980 break; 2981 #ifdef CONFIG_X86_64 2982 case MSR_LSTAR: 2983 svm->vmcb01.ptr->save.lstar = data; 2984 break; 2985 case MSR_CSTAR: 2986 svm->vmcb01.ptr->save.cstar = data; 2987 break; 2988 case MSR_GS_BASE: 2989 svm->vmcb01.ptr->save.gs.base = data; 2990 break; 2991 case MSR_FS_BASE: 2992 svm->vmcb01.ptr->save.fs.base = data; 2993 break; 2994 case MSR_KERNEL_GS_BASE: 2995 svm->vmcb01.ptr->save.kernel_gs_base = data; 2996 break; 2997 case MSR_SYSCALL_MASK: 2998 svm->vmcb01.ptr->save.sfmask = data; 2999 break; 3000 #endif 3001 case MSR_IA32_SYSENTER_CS: 3002 svm->vmcb01.ptr->save.sysenter_cs = data; 3003 break; 3004 case MSR_IA32_SYSENTER_EIP: 3005 svm->vmcb01.ptr->save.sysenter_eip = (u32)data; 3006 /* 3007 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs 3008 * when we spoof an Intel vendor ID (for cross vendor migration). 3009 * In this case we use this intercept to track the high 3010 * 32 bit part of these msrs to support Intel's 3011 * implementation of SYSENTER/SYSEXIT. 3012 */ 3013 svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; 3014 break; 3015 case MSR_IA32_SYSENTER_ESP: 3016 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 3017 svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; 3018 break; 3019 case MSR_TSC_AUX: 3020 /* 3021 * TSC_AUX is always virtualized for SEV-ES guests when the 3022 * feature is available. The user return MSR support is not 3023 * required in this case because TSC_AUX is restored on #VMEXIT 3024 * from the host save area (which has been initialized in 3025 * svm_enable_virtualization_cpu()). 3026 */ 3027 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3028 break; 3029 3030 /* 3031 * TSC_AUX is usually changed only during boot and never read 3032 * directly. Intercept TSC_AUX and switch it via user return. 3033 */ 3034 preempt_disable(); 3035 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 3036 preempt_enable(); 3037 if (ret) 3038 break; 3039 3040 svm->tsc_aux = data; 3041 break; 3042 case MSR_IA32_DEBUGCTLMSR: 3043 if (!lbrv) { 3044 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3045 break; 3046 } 3047 3048 /* 3049 * Suppress BTF as KVM doesn't virtualize BTF, but there's no 3050 * way to communicate lack of support to the guest. 3051 */ 3052 if (data & DEBUGCTLMSR_BTF) { 3053 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 3054 data &= ~DEBUGCTLMSR_BTF; 3055 } 3056 3057 if (data & DEBUGCTL_RESERVED_BITS) 3058 return 1; 3059 3060 svm_get_lbr_vmcb(svm)->save.dbgctl = data; 3061 svm_update_lbrv(vcpu); 3062 break; 3063 case MSR_VM_HSAVE_PA: 3064 /* 3065 * Old kernels did not validate the value written to 3066 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid 3067 * value to allow live migrating buggy or malicious guests 3068 * originating from those kernels. 3069 */ 3070 if (!msr->host_initiated && !page_address_valid(vcpu, data)) 3071 return 1; 3072 3073 svm->nested.hsave_msr = data & PAGE_MASK; 3074 break; 3075 case MSR_VM_CR: 3076 return svm_set_vm_cr(vcpu, data); 3077 case MSR_VM_IGNNE: 3078 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3079 break; 3080 case MSR_AMD64_DE_CFG: { 3081 u64 supported_de_cfg; 3082 3083 if (svm_get_feature_msr(ecx, &supported_de_cfg)) 3084 return 1; 3085 3086 if (data & ~supported_de_cfg) 3087 return 1; 3088 3089 svm->msr_decfg = data; 3090 break; 3091 } 3092 default: 3093 return kvm_set_msr_common(vcpu, msr); 3094 } 3095 return ret; 3096 } 3097 3098 static int msr_interception(struct kvm_vcpu *vcpu) 3099 { 3100 if (to_svm(vcpu)->vmcb->control.exit_info_1) 3101 return kvm_emulate_wrmsr(vcpu); 3102 else 3103 return kvm_emulate_rdmsr(vcpu); 3104 } 3105 3106 static int interrupt_window_interception(struct kvm_vcpu *vcpu) 3107 { 3108 kvm_make_request(KVM_REQ_EVENT, vcpu); 3109 svm_clear_vintr(to_svm(vcpu)); 3110 3111 /* 3112 * If not running nested, for AVIC, the only reason to end up here is ExtINTs. 3113 * In this case AVIC was temporarily disabled for 3114 * requesting the IRQ window and we have to re-enable it. 3115 * 3116 * If running nested, still remove the VM wide AVIC inhibit to 3117 * support case in which the interrupt window was requested when the 3118 * vCPU was not running nested. 3119 3120 * All vCPUs which run still run nested, will remain to have their 3121 * AVIC still inhibited due to per-cpu AVIC inhibition. 3122 */ 3123 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3124 3125 ++vcpu->stat.irq_window_exits; 3126 return 1; 3127 } 3128 3129 static int pause_interception(struct kvm_vcpu *vcpu) 3130 { 3131 bool in_kernel; 3132 /* 3133 * CPL is not made available for an SEV-ES guest, therefore 3134 * vcpu->arch.preempted_in_kernel can never be true. Just 3135 * set in_kernel to false as well. 3136 */ 3137 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 3138 3139 grow_ple_window(vcpu); 3140 3141 kvm_vcpu_on_spin(vcpu, in_kernel); 3142 return kvm_skip_emulated_instruction(vcpu); 3143 } 3144 3145 static int invpcid_interception(struct kvm_vcpu *vcpu) 3146 { 3147 struct vcpu_svm *svm = to_svm(vcpu); 3148 unsigned long type; 3149 gva_t gva; 3150 3151 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 3152 kvm_queue_exception(vcpu, UD_VECTOR); 3153 return 1; 3154 } 3155 3156 /* 3157 * For an INVPCID intercept: 3158 * EXITINFO1 provides the linear address of the memory operand. 3159 * EXITINFO2 provides the contents of the register operand. 3160 */ 3161 type = svm->vmcb->control.exit_info_2; 3162 gva = svm->vmcb->control.exit_info_1; 3163 3164 /* 3165 * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the 3166 * stack segment is used. The intercept takes priority over all 3167 * #GP checks except CPL>0, but somehow still generates a linear 3168 * address? The APM is sorely lacking. 3169 */ 3170 if (is_noncanonical_address(gva, vcpu, 0)) { 3171 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3172 return 1; 3173 } 3174 3175 return kvm_handle_invpcid(vcpu, type, gva); 3176 } 3177 3178 static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) 3179 { 3180 struct vcpu_svm *svm = to_svm(vcpu); 3181 3182 /* 3183 * If userspace has NOT changed RIP, then KVM's ABI is to let the guest 3184 * execute the bus-locking instruction. Set the bus lock counter to '1' 3185 * to effectively step past the bus lock. 3186 */ 3187 if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) 3188 svm->vmcb->control.bus_lock_counter = 1; 3189 3190 return 1; 3191 } 3192 3193 static int bus_lock_exit(struct kvm_vcpu *vcpu) 3194 { 3195 struct vcpu_svm *svm = to_svm(vcpu); 3196 3197 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 3198 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 3199 3200 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); 3201 vcpu->arch.complete_userspace_io = complete_userspace_buslock; 3202 3203 if (is_guest_mode(vcpu)) 3204 svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; 3205 3206 return 0; 3207 } 3208 3209 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 3210 [SVM_EXIT_READ_CR0] = cr_interception, 3211 [SVM_EXIT_READ_CR3] = cr_interception, 3212 [SVM_EXIT_READ_CR4] = cr_interception, 3213 [SVM_EXIT_READ_CR8] = cr_interception, 3214 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 3215 [SVM_EXIT_WRITE_CR0] = cr_interception, 3216 [SVM_EXIT_WRITE_CR3] = cr_interception, 3217 [SVM_EXIT_WRITE_CR4] = cr_interception, 3218 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3219 [SVM_EXIT_READ_DR0] = dr_interception, 3220 [SVM_EXIT_READ_DR1] = dr_interception, 3221 [SVM_EXIT_READ_DR2] = dr_interception, 3222 [SVM_EXIT_READ_DR3] = dr_interception, 3223 [SVM_EXIT_READ_DR4] = dr_interception, 3224 [SVM_EXIT_READ_DR5] = dr_interception, 3225 [SVM_EXIT_READ_DR6] = dr_interception, 3226 [SVM_EXIT_READ_DR7] = dr_interception, 3227 [SVM_EXIT_WRITE_DR0] = dr_interception, 3228 [SVM_EXIT_WRITE_DR1] = dr_interception, 3229 [SVM_EXIT_WRITE_DR2] = dr_interception, 3230 [SVM_EXIT_WRITE_DR3] = dr_interception, 3231 [SVM_EXIT_WRITE_DR4] = dr_interception, 3232 [SVM_EXIT_WRITE_DR5] = dr_interception, 3233 [SVM_EXIT_WRITE_DR6] = dr_interception, 3234 [SVM_EXIT_WRITE_DR7] = dr_interception, 3235 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3236 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3237 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3238 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3239 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3240 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3241 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 3242 [SVM_EXIT_INTR] = intr_interception, 3243 [SVM_EXIT_NMI] = nmi_interception, 3244 [SVM_EXIT_SMI] = smi_interception, 3245 [SVM_EXIT_VINTR] = interrupt_window_interception, 3246 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, 3247 [SVM_EXIT_CPUID] = kvm_emulate_cpuid, 3248 [SVM_EXIT_IRET] = iret_interception, 3249 [SVM_EXIT_INVD] = kvm_emulate_invd, 3250 [SVM_EXIT_PAUSE] = pause_interception, 3251 [SVM_EXIT_HLT] = kvm_emulate_halt, 3252 [SVM_EXIT_INVLPG] = invlpg_interception, 3253 [SVM_EXIT_INVLPGA] = invlpga_interception, 3254 [SVM_EXIT_IOIO] = io_interception, 3255 [SVM_EXIT_MSR] = msr_interception, 3256 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3257 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3258 [SVM_EXIT_VMRUN] = vmrun_interception, 3259 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3260 [SVM_EXIT_VMLOAD] = vmload_interception, 3261 [SVM_EXIT_VMSAVE] = vmsave_interception, 3262 [SVM_EXIT_STGI] = stgi_interception, 3263 [SVM_EXIT_CLGI] = clgi_interception, 3264 [SVM_EXIT_SKINIT] = skinit_interception, 3265 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3266 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3267 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3268 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, 3269 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, 3270 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, 3271 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, 3272 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, 3273 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, 3274 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, 3275 [SVM_EXIT_INVPCID] = invpcid_interception, 3276 [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, 3277 [SVM_EXIT_NPF] = npf_interception, 3278 [SVM_EXIT_BUS_LOCK] = bus_lock_exit, 3279 [SVM_EXIT_RSM] = rsm_interception, 3280 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3281 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 3282 #ifdef CONFIG_KVM_AMD_SEV 3283 [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, 3284 #endif 3285 }; 3286 3287 static void dump_vmcb(struct kvm_vcpu *vcpu) 3288 { 3289 struct vcpu_svm *svm = to_svm(vcpu); 3290 struct vmcb_control_area *control = &svm->vmcb->control; 3291 struct vmcb_save_area *save = &svm->vmcb->save; 3292 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3293 char *vm_type; 3294 3295 if (!dump_invalid_vmcb) { 3296 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3297 return; 3298 } 3299 3300 guard(mutex)(&vmcb_dump_mutex); 3301 3302 vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : 3303 sev_es_guest(vcpu->kvm) ? "SEV-ES" : 3304 sev_guest(vcpu->kvm) ? "SEV" : "SVM"; 3305 3306 pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", 3307 vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3308 pr_err("VMCB Control Area:\n"); 3309 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3310 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); 3311 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); 3312 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); 3313 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); 3314 pr_err("%-20s%08x %08x\n", "intercepts:", 3315 control->intercepts[INTERCEPT_WORD3], 3316 control->intercepts[INTERCEPT_WORD4]); 3317 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 3318 pr_err("%-20s%d\n", "pause filter threshold:", 3319 control->pause_filter_thresh); 3320 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 3321 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 3322 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 3323 pr_err("%-20s%d\n", "asid:", control->asid); 3324 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 3325 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 3326 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 3327 pr_err("%-20s%08x\n", "int_state:", control->int_state); 3328 pr_err("%-20s%08x\n", "exit_code:", control->exit_code); 3329 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 3330 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3331 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3332 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3333 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3334 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3335 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3336 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3337 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3338 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3339 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3340 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3341 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3342 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3343 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3344 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3345 pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); 3346 pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); 3347 3348 if (sev_es_guest(vcpu->kvm)) { 3349 save = sev_decrypt_vmsa(vcpu); 3350 if (!save) 3351 goto no_vmsa; 3352 3353 save01 = save; 3354 } 3355 3356 pr_err("VMCB State Save Area:\n"); 3357 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3358 "es:", 3359 save->es.selector, save->es.attrib, 3360 save->es.limit, save->es.base); 3361 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3362 "cs:", 3363 save->cs.selector, save->cs.attrib, 3364 save->cs.limit, save->cs.base); 3365 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3366 "ss:", 3367 save->ss.selector, save->ss.attrib, 3368 save->ss.limit, save->ss.base); 3369 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3370 "ds:", 3371 save->ds.selector, save->ds.attrib, 3372 save->ds.limit, save->ds.base); 3373 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3374 "fs:", 3375 save01->fs.selector, save01->fs.attrib, 3376 save01->fs.limit, save01->fs.base); 3377 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3378 "gs:", 3379 save01->gs.selector, save01->gs.attrib, 3380 save01->gs.limit, save01->gs.base); 3381 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3382 "gdtr:", 3383 save->gdtr.selector, save->gdtr.attrib, 3384 save->gdtr.limit, save->gdtr.base); 3385 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3386 "ldtr:", 3387 save01->ldtr.selector, save01->ldtr.attrib, 3388 save01->ldtr.limit, save01->ldtr.base); 3389 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3390 "idtr:", 3391 save->idtr.selector, save->idtr.attrib, 3392 save->idtr.limit, save->idtr.base); 3393 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3394 "tr:", 3395 save01->tr.selector, save01->tr.attrib, 3396 save01->tr.limit, save01->tr.base); 3397 pr_err("vmpl: %d cpl: %d efer: %016llx\n", 3398 save->vmpl, save->cpl, save->efer); 3399 pr_err("%-15s %016llx %-13s %016llx\n", 3400 "cr0:", save->cr0, "cr2:", save->cr2); 3401 pr_err("%-15s %016llx %-13s %016llx\n", 3402 "cr3:", save->cr3, "cr4:", save->cr4); 3403 pr_err("%-15s %016llx %-13s %016llx\n", 3404 "dr6:", save->dr6, "dr7:", save->dr7); 3405 pr_err("%-15s %016llx %-13s %016llx\n", 3406 "rip:", save->rip, "rflags:", save->rflags); 3407 pr_err("%-15s %016llx %-13s %016llx\n", 3408 "rsp:", save->rsp, "rax:", save->rax); 3409 pr_err("%-15s %016llx %-13s %016llx\n", 3410 "star:", save01->star, "lstar:", save01->lstar); 3411 pr_err("%-15s %016llx %-13s %016llx\n", 3412 "cstar:", save01->cstar, "sfmask:", save01->sfmask); 3413 pr_err("%-15s %016llx %-13s %016llx\n", 3414 "kernel_gs_base:", save01->kernel_gs_base, 3415 "sysenter_cs:", save01->sysenter_cs); 3416 pr_err("%-15s %016llx %-13s %016llx\n", 3417 "sysenter_esp:", save01->sysenter_esp, 3418 "sysenter_eip:", save01->sysenter_eip); 3419 pr_err("%-15s %016llx %-13s %016llx\n", 3420 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 3421 pr_err("%-15s %016llx %-13s %016llx\n", 3422 "br_from:", save->br_from, "br_to:", save->br_to); 3423 pr_err("%-15s %016llx %-13s %016llx\n", 3424 "excp_from:", save->last_excp_from, 3425 "excp_to:", save->last_excp_to); 3426 3427 if (sev_es_guest(vcpu->kvm)) { 3428 struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; 3429 3430 pr_err("%-15s %016llx\n", 3431 "sev_features", vmsa->sev_features); 3432 3433 pr_err("%-15s %016llx %-13s %016llx\n", 3434 "rax:", vmsa->rax, "rbx:", vmsa->rbx); 3435 pr_err("%-15s %016llx %-13s %016llx\n", 3436 "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); 3437 pr_err("%-15s %016llx %-13s %016llx\n", 3438 "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); 3439 pr_err("%-15s %016llx %-13s %016llx\n", 3440 "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); 3441 pr_err("%-15s %016llx %-13s %016llx\n", 3442 "r8:", vmsa->r8, "r9:", vmsa->r9); 3443 pr_err("%-15s %016llx %-13s %016llx\n", 3444 "r10:", vmsa->r10, "r11:", vmsa->r11); 3445 pr_err("%-15s %016llx %-13s %016llx\n", 3446 "r12:", vmsa->r12, "r13:", vmsa->r13); 3447 pr_err("%-15s %016llx %-13s %016llx\n", 3448 "r14:", vmsa->r14, "r15:", vmsa->r15); 3449 pr_err("%-15s %016llx %-13s %016llx\n", 3450 "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); 3451 } else { 3452 pr_err("%-15s %016llx %-13s %016lx\n", 3453 "rax:", save->rax, "rbx:", 3454 vcpu->arch.regs[VCPU_REGS_RBX]); 3455 pr_err("%-15s %016lx %-13s %016lx\n", 3456 "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], 3457 "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); 3458 pr_err("%-15s %016lx %-13s %016lx\n", 3459 "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], 3460 "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); 3461 pr_err("%-15s %016lx %-13s %016llx\n", 3462 "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], 3463 "rsp:", save->rsp); 3464 #ifdef CONFIG_X86_64 3465 pr_err("%-15s %016lx %-13s %016lx\n", 3466 "r8:", vcpu->arch.regs[VCPU_REGS_R8], 3467 "r9:", vcpu->arch.regs[VCPU_REGS_R9]); 3468 pr_err("%-15s %016lx %-13s %016lx\n", 3469 "r10:", vcpu->arch.regs[VCPU_REGS_R10], 3470 "r11:", vcpu->arch.regs[VCPU_REGS_R11]); 3471 pr_err("%-15s %016lx %-13s %016lx\n", 3472 "r12:", vcpu->arch.regs[VCPU_REGS_R12], 3473 "r13:", vcpu->arch.regs[VCPU_REGS_R13]); 3474 pr_err("%-15s %016lx %-13s %016lx\n", 3475 "r14:", vcpu->arch.regs[VCPU_REGS_R14], 3476 "r15:", vcpu->arch.regs[VCPU_REGS_R15]); 3477 #endif 3478 } 3479 3480 no_vmsa: 3481 if (sev_es_guest(vcpu->kvm)) 3482 sev_free_decrypted_vmsa(vcpu, save); 3483 } 3484 3485 static bool svm_check_exit_valid(u64 exit_code) 3486 { 3487 return (exit_code < ARRAY_SIZE(svm_exit_handlers) && 3488 svm_exit_handlers[exit_code]); 3489 } 3490 3491 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3492 { 3493 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3494 dump_vmcb(vcpu); 3495 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3496 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3497 vcpu->run->internal.ndata = 2; 3498 vcpu->run->internal.data[0] = exit_code; 3499 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3500 return 0; 3501 } 3502 3503 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) 3504 { 3505 if (!svm_check_exit_valid(exit_code)) 3506 return svm_handle_invalid_exit(vcpu, exit_code); 3507 3508 #ifdef CONFIG_MITIGATION_RETPOLINE 3509 if (exit_code == SVM_EXIT_MSR) 3510 return msr_interception(vcpu); 3511 else if (exit_code == SVM_EXIT_VINTR) 3512 return interrupt_window_interception(vcpu); 3513 else if (exit_code == SVM_EXIT_INTR) 3514 return intr_interception(vcpu); 3515 else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) 3516 return kvm_emulate_halt(vcpu); 3517 else if (exit_code == SVM_EXIT_NPF) 3518 return npf_interception(vcpu); 3519 #ifdef CONFIG_KVM_AMD_SEV 3520 else if (exit_code == SVM_EXIT_VMGEXIT) 3521 return sev_handle_vmgexit(vcpu); 3522 #endif 3523 #endif 3524 return svm_exit_handlers[exit_code](vcpu); 3525 } 3526 3527 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 3528 u64 *info1, u64 *info2, 3529 u32 *intr_info, u32 *error_code) 3530 { 3531 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3532 3533 *reason = control->exit_code; 3534 *info1 = control->exit_info_1; 3535 *info2 = control->exit_info_2; 3536 *intr_info = control->exit_int_info; 3537 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3538 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3539 *error_code = control->exit_int_info_err; 3540 else 3541 *error_code = 0; 3542 } 3543 3544 static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, 3545 u32 *error_code) 3546 { 3547 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3548 3549 *intr_info = control->event_inj; 3550 3551 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3552 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3553 *error_code = control->event_inj_err; 3554 else 3555 *error_code = 0; 3556 3557 } 3558 3559 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 3560 { 3561 struct vcpu_svm *svm = to_svm(vcpu); 3562 struct kvm_run *kvm_run = vcpu->run; 3563 u32 exit_code = svm->vmcb->control.exit_code; 3564 3565 /* SEV-ES guests must use the CR write traps to track CR registers. */ 3566 if (!sev_es_guest(vcpu->kvm)) { 3567 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3568 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3569 if (npt_enabled) 3570 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3571 } 3572 3573 if (is_guest_mode(vcpu)) { 3574 int vmexit; 3575 3576 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); 3577 3578 vmexit = nested_svm_exit_special(svm); 3579 3580 if (vmexit == NESTED_EXIT_CONTINUE) 3581 vmexit = nested_svm_exit_handled(svm); 3582 3583 if (vmexit == NESTED_EXIT_DONE) 3584 return 1; 3585 } 3586 3587 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 3588 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3589 kvm_run->fail_entry.hardware_entry_failure_reason 3590 = svm->vmcb->control.exit_code; 3591 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 3592 dump_vmcb(vcpu); 3593 return 0; 3594 } 3595 3596 if (exit_fastpath != EXIT_FASTPATH_NONE) 3597 return 1; 3598 3599 return svm_invoke_exit_handler(vcpu, exit_code); 3600 } 3601 3602 static int pre_svm_run(struct kvm_vcpu *vcpu) 3603 { 3604 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 3605 struct vcpu_svm *svm = to_svm(vcpu); 3606 3607 /* 3608 * If the previous vmrun of the vmcb occurred on a different physical 3609 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's 3610 * vmcb clean bits are per logical CPU, as are KVM's asid assignments. 3611 */ 3612 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { 3613 svm->current_vmcb->asid_generation = 0; 3614 vmcb_mark_all_dirty(svm->vmcb); 3615 svm->current_vmcb->cpu = vcpu->cpu; 3616 } 3617 3618 if (sev_guest(vcpu->kvm)) 3619 return pre_sev_run(svm, vcpu->cpu); 3620 3621 /* FIXME: handle wraparound of asid_generation */ 3622 if (svm->current_vmcb->asid_generation != sd->asid_generation) 3623 new_asid(svm, sd); 3624 3625 return 0; 3626 } 3627 3628 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 3629 { 3630 struct vcpu_svm *svm = to_svm(vcpu); 3631 3632 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3633 3634 if (svm->nmi_l1_to_l2) 3635 return; 3636 3637 /* 3638 * No need to manually track NMI masking when vNMI is enabled, hardware 3639 * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the 3640 * case where software directly injects an NMI. 3641 */ 3642 if (!is_vnmi_enabled(svm)) { 3643 svm->nmi_masked = true; 3644 svm_set_iret_intercept(svm); 3645 } 3646 ++vcpu->stat.nmi_injections; 3647 } 3648 3649 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) 3650 { 3651 struct vcpu_svm *svm = to_svm(vcpu); 3652 3653 if (!is_vnmi_enabled(svm)) 3654 return false; 3655 3656 return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK); 3657 } 3658 3659 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) 3660 { 3661 struct vcpu_svm *svm = to_svm(vcpu); 3662 3663 if (!is_vnmi_enabled(svm)) 3664 return false; 3665 3666 if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) 3667 return false; 3668 3669 svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; 3670 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 3671 3672 /* 3673 * Because the pending NMI is serviced by hardware, KVM can't know when 3674 * the NMI is "injected", but for all intents and purposes, passing the 3675 * NMI off to hardware counts as injection. 3676 */ 3677 ++vcpu->stat.nmi_injections; 3678 3679 return true; 3680 } 3681 3682 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 3683 { 3684 struct vcpu_svm *svm = to_svm(vcpu); 3685 u32 type; 3686 3687 if (vcpu->arch.interrupt.soft) { 3688 if (svm_update_soft_interrupt_rip(vcpu)) 3689 return; 3690 3691 type = SVM_EVTINJ_TYPE_SOFT; 3692 } else { 3693 type = SVM_EVTINJ_TYPE_INTR; 3694 } 3695 3696 trace_kvm_inj_virq(vcpu->arch.interrupt.nr, 3697 vcpu->arch.interrupt.soft, reinjected); 3698 ++vcpu->stat.irq_injections; 3699 3700 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 3701 SVM_EVTINJ_VALID | type; 3702 } 3703 3704 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, 3705 int trig_mode, int vector) 3706 { 3707 /* 3708 * apic->apicv_active must be read after vcpu->mode. 3709 * Pairs with smp_store_release in vcpu_enter_guest. 3710 */ 3711 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); 3712 3713 /* Note, this is called iff the local APIC is in-kernel. */ 3714 if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { 3715 /* Process the interrupt via kvm_check_and_inject_events(). */ 3716 kvm_make_request(KVM_REQ_EVENT, vcpu); 3717 kvm_vcpu_kick(vcpu); 3718 return; 3719 } 3720 3721 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 3722 if (in_guest_mode) { 3723 /* 3724 * Signal the doorbell to tell hardware to inject the IRQ. If 3725 * the vCPU exits the guest before the doorbell chimes, hardware 3726 * will automatically process AVIC interrupts at the next VMRUN. 3727 */ 3728 avic_ring_doorbell(vcpu); 3729 } else { 3730 /* 3731 * Wake the vCPU if it was blocking. KVM will then detect the 3732 * pending IRQ when checking if the vCPU has a wake event. 3733 */ 3734 kvm_vcpu_wake_up(vcpu); 3735 } 3736 } 3737 3738 static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 3739 int trig_mode, int vector) 3740 { 3741 kvm_lapic_set_irr(vector, apic); 3742 3743 /* 3744 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in 3745 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before 3746 * the read of guest_mode. This guarantees that either VMRUN will see 3747 * and process the new vIRR entry, or that svm_complete_interrupt_delivery 3748 * will signal the doorbell if the CPU has already entered the guest. 3749 */ 3750 smp_mb__after_atomic(); 3751 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); 3752 } 3753 3754 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 3755 { 3756 struct vcpu_svm *svm = to_svm(vcpu); 3757 3758 /* 3759 * SEV-ES guests must always keep the CR intercepts cleared. CR 3760 * tracking is done using the CR write traps. 3761 */ 3762 if (sev_es_guest(vcpu->kvm)) 3763 return; 3764 3765 if (nested_svm_virtualize_tpr(vcpu)) 3766 return; 3767 3768 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 3769 3770 if (irr == -1) 3771 return; 3772 3773 if (tpr >= irr) 3774 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 3775 } 3776 3777 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 3778 { 3779 struct vcpu_svm *svm = to_svm(vcpu); 3780 3781 if (is_vnmi_enabled(svm)) 3782 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; 3783 else 3784 return svm->nmi_masked; 3785 } 3786 3787 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 3788 { 3789 struct vcpu_svm *svm = to_svm(vcpu); 3790 3791 if (is_vnmi_enabled(svm)) { 3792 if (masked) 3793 svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; 3794 else 3795 svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; 3796 3797 } else { 3798 svm->nmi_masked = masked; 3799 if (masked) 3800 svm_set_iret_intercept(svm); 3801 else 3802 svm_clr_iret_intercept(svm); 3803 } 3804 } 3805 3806 bool svm_nmi_blocked(struct kvm_vcpu *vcpu) 3807 { 3808 struct vcpu_svm *svm = to_svm(vcpu); 3809 struct vmcb *vmcb = svm->vmcb; 3810 3811 if (!gif_set(svm)) 3812 return true; 3813 3814 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3815 return false; 3816 3817 if (svm_get_nmi_mask(vcpu)) 3818 return true; 3819 3820 return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; 3821 } 3822 3823 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3824 { 3825 struct vcpu_svm *svm = to_svm(vcpu); 3826 if (svm->nested.nested_run_pending) 3827 return -EBUSY; 3828 3829 if (svm_nmi_blocked(vcpu)) 3830 return 0; 3831 3832 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 3833 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3834 return -EBUSY; 3835 return 1; 3836 } 3837 3838 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) 3839 { 3840 struct vcpu_svm *svm = to_svm(vcpu); 3841 struct vmcb *vmcb = svm->vmcb; 3842 3843 if (!gif_set(svm)) 3844 return true; 3845 3846 if (is_guest_mode(vcpu)) { 3847 /* As long as interrupts are being delivered... */ 3848 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3849 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) 3850 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3851 return true; 3852 3853 /* ... vmexits aren't blocked by the interrupt shadow */ 3854 if (nested_exit_on_intr(svm)) 3855 return false; 3856 } else { 3857 if (!svm_get_if_flag(vcpu)) 3858 return true; 3859 } 3860 3861 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); 3862 } 3863 3864 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3865 { 3866 struct vcpu_svm *svm = to_svm(vcpu); 3867 3868 if (svm->nested.nested_run_pending) 3869 return -EBUSY; 3870 3871 if (svm_interrupt_blocked(vcpu)) 3872 return 0; 3873 3874 /* 3875 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 3876 * e.g. if the IRQ arrived asynchronously after checking nested events. 3877 */ 3878 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) 3879 return -EBUSY; 3880 3881 return 1; 3882 } 3883 3884 static void svm_enable_irq_window(struct kvm_vcpu *vcpu) 3885 { 3886 struct vcpu_svm *svm = to_svm(vcpu); 3887 3888 /* 3889 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 3890 * 1, because that's a separate STGI/VMRUN intercept. The next time we 3891 * get that intercept, this function will be called again though and 3892 * we'll get the vintr intercept. However, if the vGIF feature is 3893 * enabled, the STGI interception will not occur. Enable the irq 3894 * window under the assumption that the hardware will set the GIF. 3895 */ 3896 if (vgif || gif_set(svm)) { 3897 /* 3898 * IRQ window is not needed when AVIC is enabled, 3899 * unless we have pending ExtINT since it cannot be injected 3900 * via AVIC. In such case, KVM needs to temporarily disable AVIC, 3901 * and fallback to injecting IRQ via V_IRQ. 3902 * 3903 * If running nested, AVIC is already locally inhibited 3904 * on this vCPU, therefore there is no need to request 3905 * the VM wide AVIC inhibition. 3906 */ 3907 if (!is_guest_mode(vcpu)) 3908 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3909 3910 svm_set_vintr(svm); 3911 } 3912 } 3913 3914 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) 3915 { 3916 struct vcpu_svm *svm = to_svm(vcpu); 3917 3918 /* 3919 * If NMIs are outright masked, i.e. the vCPU is already handling an 3920 * NMI, and KVM has not yet intercepted an IRET, then there is nothing 3921 * more to do at this time as KVM has already enabled IRET intercepts. 3922 * If KVM has already intercepted IRET, then single-step over the IRET, 3923 * as NMIs aren't architecturally unmasked until the IRET completes. 3924 * 3925 * If vNMI is enabled, KVM should never request an NMI window if NMIs 3926 * are masked, as KVM allows at most one to-be-injected NMI and one 3927 * pending NMI. If two NMIs arrive simultaneously, KVM will inject one 3928 * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are 3929 * unmasked. KVM _will_ request an NMI window in some situations, e.g. 3930 * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately 3931 * inject the NMI. In those situations, KVM needs to single-step over 3932 * the STI shadow or intercept STGI. 3933 */ 3934 if (svm_get_nmi_mask(vcpu)) { 3935 WARN_ON_ONCE(is_vnmi_enabled(svm)); 3936 3937 if (!svm->awaiting_iret_completion) 3938 return; /* IRET will cause a vm exit */ 3939 } 3940 3941 /* 3942 * SEV-ES guests are responsible for signaling when a vCPU is ready to 3943 * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. 3944 * KVM can't intercept and single-step IRET to detect when NMIs are 3945 * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. 3946 * 3947 * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware 3948 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not 3949 * supported NAEs in the GHCB protocol. 3950 */ 3951 if (sev_es_guest(vcpu->kvm)) 3952 return; 3953 3954 if (!gif_set(svm)) { 3955 if (vgif) 3956 svm_set_intercept(svm, INTERCEPT_STGI); 3957 return; /* STGI will cause a vm exit */ 3958 } 3959 3960 /* 3961 * Something prevents NMI from been injected. Single step over possible 3962 * problem (IRET or exception injection or interrupt shadow) 3963 */ 3964 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 3965 svm->nmi_singlestep = true; 3966 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3967 } 3968 3969 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) 3970 { 3971 struct vcpu_svm *svm = to_svm(vcpu); 3972 3973 /* 3974 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. 3975 * A TLB flush for the current ASID flushes both "host" and "guest" TLB 3976 * entries, and thus is a superset of Hyper-V's fine grained flushing. 3977 */ 3978 kvm_hv_vcpu_purge_flush_tlb(vcpu); 3979 3980 /* 3981 * Flush only the current ASID even if the TLB flush was invoked via 3982 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all 3983 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and 3984 * unconditionally does a TLB flush on both nested VM-Enter and nested 3985 * VM-Exit (via kvm_mmu_reset_context()). 3986 */ 3987 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 3988 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3989 else 3990 svm->current_vmcb->asid_generation--; 3991 } 3992 3993 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 3994 { 3995 hpa_t root_tdp = vcpu->arch.mmu->root.hpa; 3996 3997 /* 3998 * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly 3999 * flush the NPT mappings via hypercall as flushing the ASID only 4000 * affects virtual to physical mappings, it does not invalidate guest 4001 * physical to host physical mappings. 4002 */ 4003 if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) 4004 hyperv_flush_guest_mapping(root_tdp); 4005 4006 svm_flush_tlb_asid(vcpu); 4007 } 4008 4009 static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) 4010 { 4011 /* 4012 * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB 4013 * flushes should be routed to hv_flush_remote_tlbs() without requesting 4014 * a "regular" remote flush. Reaching this point means either there's 4015 * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of 4016 * which might be fatal to the guest. Yell, but try to recover. 4017 */ 4018 if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) 4019 hv_flush_remote_tlbs(vcpu->kvm); 4020 4021 svm_flush_tlb_asid(vcpu); 4022 } 4023 4024 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 4025 { 4026 struct vcpu_svm *svm = to_svm(vcpu); 4027 4028 invlpga(gva, svm->vmcb->control.asid); 4029 } 4030 4031 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 4032 { 4033 struct vcpu_svm *svm = to_svm(vcpu); 4034 4035 if (nested_svm_virtualize_tpr(vcpu)) 4036 return; 4037 4038 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { 4039 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 4040 kvm_set_cr8(vcpu, cr8); 4041 } 4042 } 4043 4044 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 4045 { 4046 struct vcpu_svm *svm = to_svm(vcpu); 4047 u64 cr8; 4048 4049 if (nested_svm_virtualize_tpr(vcpu) || 4050 kvm_vcpu_apicv_active(vcpu)) 4051 return; 4052 4053 cr8 = kvm_get_cr8(vcpu); 4054 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 4055 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 4056 } 4057 4058 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, 4059 int type) 4060 { 4061 bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); 4062 bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); 4063 struct vcpu_svm *svm = to_svm(vcpu); 4064 4065 /* 4066 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's 4067 * associated with the original soft exception/interrupt. next_rip is 4068 * cleared on all exits that can occur while vectoring an event, so KVM 4069 * needs to manually set next_rip for re-injection. Unlike the !nrips 4070 * case below, this needs to be done if and only if KVM is re-injecting 4071 * the same event, i.e. if the event is a soft exception/interrupt, 4072 * otherwise next_rip is unused on VMRUN. 4073 */ 4074 if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && 4075 kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) 4076 svm->vmcb->control.next_rip = svm->soft_int_next_rip; 4077 /* 4078 * If NRIPS isn't enabled, KVM must manually advance RIP prior to 4079 * injecting the soft exception/interrupt. That advancement needs to 4080 * be unwound if vectoring didn't complete. Note, the new event may 4081 * not be the injected event, e.g. if KVM injected an INTn, the INTn 4082 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will 4083 * be the reported vectored event, but RIP still needs to be unwound. 4084 */ 4085 else if (!nrips && (is_soft || is_exception) && 4086 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) 4087 kvm_rip_write(vcpu, svm->soft_int_old_rip); 4088 } 4089 4090 static void svm_complete_interrupts(struct kvm_vcpu *vcpu) 4091 { 4092 struct vcpu_svm *svm = to_svm(vcpu); 4093 u8 vector; 4094 int type; 4095 u32 exitintinfo = svm->vmcb->control.exit_int_info; 4096 bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; 4097 bool soft_int_injected = svm->soft_int_injected; 4098 4099 svm->nmi_l1_to_l2 = false; 4100 svm->soft_int_injected = false; 4101 4102 /* 4103 * If we've made progress since setting awaiting_iret_completion, we've 4104 * executed an IRET and can allow NMI injection. 4105 */ 4106 if (svm->awaiting_iret_completion && 4107 kvm_rip_read(vcpu) != svm->nmi_iret_rip) { 4108 svm->awaiting_iret_completion = false; 4109 svm->nmi_masked = false; 4110 kvm_make_request(KVM_REQ_EVENT, vcpu); 4111 } 4112 4113 vcpu->arch.nmi_injected = false; 4114 kvm_clear_exception_queue(vcpu); 4115 kvm_clear_interrupt_queue(vcpu); 4116 4117 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 4118 return; 4119 4120 kvm_make_request(KVM_REQ_EVENT, vcpu); 4121 4122 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 4123 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 4124 4125 if (soft_int_injected) 4126 svm_complete_soft_interrupt(vcpu, vector, type); 4127 4128 switch (type) { 4129 case SVM_EXITINTINFO_TYPE_NMI: 4130 vcpu->arch.nmi_injected = true; 4131 svm->nmi_l1_to_l2 = nmi_l1_to_l2; 4132 break; 4133 case SVM_EXITINTINFO_TYPE_EXEPT: { 4134 u32 error_code = 0; 4135 4136 /* 4137 * Never re-inject a #VC exception. 4138 */ 4139 if (vector == X86_TRAP_VC) 4140 break; 4141 4142 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) 4143 error_code = svm->vmcb->control.exit_int_info_err; 4144 4145 kvm_requeue_exception(vcpu, vector, 4146 exitintinfo & SVM_EXITINTINFO_VALID_ERR, 4147 error_code); 4148 break; 4149 } 4150 case SVM_EXITINTINFO_TYPE_INTR: 4151 kvm_queue_interrupt(vcpu, vector, false); 4152 break; 4153 case SVM_EXITINTINFO_TYPE_SOFT: 4154 kvm_queue_interrupt(vcpu, vector, true); 4155 break; 4156 default: 4157 break; 4158 } 4159 4160 } 4161 4162 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 4163 { 4164 struct vcpu_svm *svm = to_svm(vcpu); 4165 struct vmcb_control_area *control = &svm->vmcb->control; 4166 4167 control->exit_int_info = control->event_inj; 4168 control->exit_int_info_err = control->event_inj_err; 4169 control->event_inj = 0; 4170 svm_complete_interrupts(vcpu); 4171 } 4172 4173 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 4174 { 4175 if (to_kvm_sev_info(vcpu->kvm)->need_init) 4176 return -EINVAL; 4177 4178 return 1; 4179 } 4180 4181 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 4182 { 4183 struct vcpu_svm *svm = to_svm(vcpu); 4184 4185 if (is_guest_mode(vcpu)) 4186 return EXIT_FASTPATH_NONE; 4187 4188 switch (svm->vmcb->control.exit_code) { 4189 case SVM_EXIT_MSR: 4190 if (!svm->vmcb->control.exit_info_1) 4191 break; 4192 return handle_fastpath_set_msr_irqoff(vcpu); 4193 case SVM_EXIT_HLT: 4194 return handle_fastpath_hlt(vcpu); 4195 default: 4196 break; 4197 } 4198 4199 return EXIT_FASTPATH_NONE; 4200 } 4201 4202 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) 4203 { 4204 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 4205 struct vcpu_svm *svm = to_svm(vcpu); 4206 4207 guest_state_enter_irqoff(); 4208 4209 /* 4210 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of 4211 * VMRUN controls whether or not physical IRQs are masked (KVM always 4212 * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the 4213 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow 4214 * into guest state if delivery of an event during VMRUN triggers a 4215 * #VMEXIT, and the guest_state transitions already tell lockdep that 4216 * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of 4217 * this path, so IRQs aren't actually unmasked while running host code. 4218 */ 4219 raw_local_irq_enable(); 4220 4221 amd_clear_divider(); 4222 4223 if (sev_es_guest(vcpu->kvm)) 4224 __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, 4225 sev_es_host_save_area(sd)); 4226 else 4227 __svm_vcpu_run(svm, spec_ctrl_intercepted); 4228 4229 raw_local_irq_disable(); 4230 4231 guest_state_exit_irqoff(); 4232 } 4233 4234 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 4235 { 4236 bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; 4237 struct vcpu_svm *svm = to_svm(vcpu); 4238 bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); 4239 4240 trace_kvm_entry(vcpu, force_immediate_exit); 4241 4242 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4243 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4244 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4245 4246 /* 4247 * Disable singlestep if we're injecting an interrupt/exception. 4248 * We don't want our modified rflags to be pushed on the stack where 4249 * we might not be able to easily reset them if we disabled NMI 4250 * singlestep later. 4251 */ 4252 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { 4253 /* 4254 * Event injection happens before external interrupts cause a 4255 * vmexit and interrupts are disabled here, so smp_send_reschedule 4256 * is enough to force an immediate vmexit. 4257 */ 4258 disable_nmi_singlestep(svm); 4259 force_immediate_exit = true; 4260 } 4261 4262 if (force_immediate_exit) 4263 smp_send_reschedule(vcpu->cpu); 4264 4265 if (pre_svm_run(vcpu)) { 4266 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 4267 vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; 4268 vcpu->run->fail_entry.cpu = vcpu->cpu; 4269 return EXIT_FASTPATH_EXIT_USERSPACE; 4270 } 4271 4272 sync_lapic_to_cr8(vcpu); 4273 4274 if (unlikely(svm->asid != svm->vmcb->control.asid)) { 4275 svm->vmcb->control.asid = svm->asid; 4276 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 4277 } 4278 svm->vmcb->save.cr2 = vcpu->arch.cr2; 4279 4280 svm_hv_update_vp_id(svm->vmcb, vcpu); 4281 4282 /* 4283 * Run with all-zero DR6 unless the guest can write DR6 freely, so that 4284 * KVM can get the exact cause of a #DB. Note, loading guest DR6 from 4285 * KVM's snapshot is only necessary when DR accesses won't exit. 4286 */ 4287 if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6)) 4288 svm_set_dr6(vcpu, vcpu->arch.dr6); 4289 else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) 4290 svm_set_dr6(vcpu, DR6_ACTIVE_LOW); 4291 4292 clgi(); 4293 kvm_load_guest_xsave_state(vcpu); 4294 4295 /* 4296 * Hardware only context switches DEBUGCTL if LBR virtualization is 4297 * enabled. Manually load DEBUGCTL if necessary (and restore it after 4298 * VM-Exit), as running with the host's DEBUGCTL can negatively affect 4299 * guest state and can even be fatal, e.g. due to Bus Lock Detect. 4300 */ 4301 if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4302 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4303 update_debugctlmsr(svm->vmcb->save.dbgctl); 4304 4305 kvm_wait_lapic_expire(vcpu); 4306 4307 /* 4308 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 4309 * it's non-zero. Since vmentry is serialising on affected CPUs, there 4310 * is no need to worry about the conditional branch over the wrmsr 4311 * being speculatively taken. 4312 */ 4313 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 4314 x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); 4315 4316 svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); 4317 4318 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 4319 x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); 4320 4321 if (!sev_es_guest(vcpu->kvm)) { 4322 vcpu->arch.cr2 = svm->vmcb->save.cr2; 4323 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 4324 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 4325 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 4326 } 4327 vcpu->arch.regs_dirty = 0; 4328 4329 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 4330 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 4331 4332 if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && 4333 vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) 4334 update_debugctlmsr(vcpu->arch.host_debugctl); 4335 4336 kvm_load_host_xsave_state(vcpu); 4337 stgi(); 4338 4339 /* Any pending NMI will happen here */ 4340 4341 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 4342 kvm_after_interrupt(vcpu); 4343 4344 sync_cr8_to_lapic(vcpu); 4345 4346 svm->next_rip = 0; 4347 if (is_guest_mode(vcpu)) { 4348 nested_sync_control_from_vmcb02(svm); 4349 4350 /* Track VMRUNs that have made past consistency checking */ 4351 if (svm->nested.nested_run_pending && 4352 svm->vmcb->control.exit_code != SVM_EXIT_ERR) 4353 ++vcpu->stat.nested_run; 4354 4355 svm->nested.nested_run_pending = 0; 4356 } 4357 4358 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 4359 vmcb_mark_all_clean(svm->vmcb); 4360 4361 /* if exit due to PF check for async PF */ 4362 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 4363 vcpu->arch.apf.host_apf_flags = 4364 kvm_read_and_reset_apf_flags(); 4365 4366 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 4367 4368 /* 4369 * We need to handle MC intercepts here before the vcpu has a chance to 4370 * change the physical cpu 4371 */ 4372 if (unlikely(svm->vmcb->control.exit_code == 4373 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 4374 svm_handle_mce(vcpu); 4375 4376 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4377 4378 svm_complete_interrupts(vcpu); 4379 4380 return svm_exit_handlers_fastpath(vcpu); 4381 } 4382 4383 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 4384 int root_level) 4385 { 4386 struct vcpu_svm *svm = to_svm(vcpu); 4387 unsigned long cr3; 4388 4389 if (npt_enabled) { 4390 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); 4391 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 4392 4393 hv_track_root_tdp(vcpu, root_hpa); 4394 4395 cr3 = vcpu->arch.cr3; 4396 } else if (root_level >= PT64_ROOT_4LEVEL) { 4397 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); 4398 } else { 4399 /* PCID in the guest should be impossible with a 32-bit MMU. */ 4400 WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); 4401 cr3 = root_hpa; 4402 } 4403 4404 svm->vmcb->save.cr3 = cr3; 4405 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 4406 } 4407 4408 static void 4409 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 4410 { 4411 /* 4412 * Patch in the VMMCALL instruction: 4413 */ 4414 hypercall[0] = 0x0f; 4415 hypercall[1] = 0x01; 4416 hypercall[2] = 0xd9; 4417 } 4418 4419 /* 4420 * The kvm parameter can be NULL (module initialization, or invocation before 4421 * VM creation). Be sure to check the kvm parameter before using it. 4422 */ 4423 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) 4424 { 4425 switch (index) { 4426 case MSR_IA32_MCG_EXT_CTL: 4427 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 4428 return false; 4429 case MSR_IA32_SMBASE: 4430 if (!IS_ENABLED(CONFIG_KVM_SMM)) 4431 return false; 4432 /* SEV-ES guests do not support SMM, so report false */ 4433 if (kvm && sev_es_guest(kvm)) 4434 return false; 4435 break; 4436 default: 4437 break; 4438 } 4439 4440 return true; 4441 } 4442 4443 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 4444 { 4445 struct vcpu_svm *svm = to_svm(vcpu); 4446 4447 /* 4448 * SVM doesn't provide a way to disable just XSAVES in the guest, KVM 4449 * can only disable all variants of by disallowing CR4.OSXSAVE from 4450 * being set. As a result, if the host has XSAVE and XSAVES, and the 4451 * guest has XSAVE enabled, the guest can execute XSAVES without 4452 * faulting. Treat XSAVES as enabled in this case regardless of 4453 * whether it's advertised to the guest so that KVM context switches 4454 * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give 4455 * the guest read/write access to the host's XSS. 4456 */ 4457 guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, 4458 boot_cpu_has(X86_FEATURE_XSAVES) && 4459 guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); 4460 4461 /* 4462 * Intercept VMLOAD if the vCPU model is Intel in order to emulate that 4463 * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing 4464 * SVM on Intel is bonkers and extremely unlikely to work). 4465 */ 4466 if (guest_cpuid_is_intel_compatible(vcpu)) 4467 guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); 4468 4469 if (sev_guest(vcpu->kvm)) 4470 sev_vcpu_after_set_cpuid(svm); 4471 4472 svm_recalc_intercepts_after_set_cpuid(vcpu); 4473 } 4474 4475 static bool svm_has_wbinvd_exit(void) 4476 { 4477 return true; 4478 } 4479 4480 #define PRE_EX(exit) { .exit_code = (exit), \ 4481 .stage = X86_ICPT_PRE_EXCEPT, } 4482 #define POST_EX(exit) { .exit_code = (exit), \ 4483 .stage = X86_ICPT_POST_EXCEPT, } 4484 #define POST_MEM(exit) { .exit_code = (exit), \ 4485 .stage = X86_ICPT_POST_MEMACCESS, } 4486 4487 static const struct __x86_intercept { 4488 u32 exit_code; 4489 enum x86_intercept_stage stage; 4490 } x86_intercept_map[] = { 4491 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), 4492 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), 4493 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), 4494 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), 4495 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), 4496 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), 4497 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), 4498 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), 4499 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), 4500 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), 4501 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), 4502 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), 4503 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), 4504 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), 4505 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), 4506 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), 4507 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), 4508 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), 4509 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), 4510 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), 4511 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), 4512 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), 4513 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), 4514 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), 4515 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), 4516 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), 4517 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), 4518 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), 4519 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), 4520 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), 4521 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), 4522 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), 4523 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), 4524 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), 4525 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), 4526 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), 4527 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), 4528 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), 4529 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), 4530 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), 4531 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), 4532 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), 4533 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), 4534 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), 4535 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), 4536 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), 4537 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), 4538 }; 4539 4540 #undef PRE_EX 4541 #undef POST_EX 4542 #undef POST_MEM 4543 4544 static int svm_check_intercept(struct kvm_vcpu *vcpu, 4545 struct x86_instruction_info *info, 4546 enum x86_intercept_stage stage, 4547 struct x86_exception *exception) 4548 { 4549 struct vcpu_svm *svm = to_svm(vcpu); 4550 int vmexit, ret = X86EMUL_CONTINUE; 4551 struct __x86_intercept icpt_info; 4552 struct vmcb *vmcb = svm->vmcb; 4553 4554 if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) 4555 goto out; 4556 4557 icpt_info = x86_intercept_map[info->intercept]; 4558 4559 if (stage != icpt_info.stage) 4560 goto out; 4561 4562 switch (icpt_info.exit_code) { 4563 case SVM_EXIT_READ_CR0: 4564 if (info->intercept == x86_intercept_cr_read) 4565 icpt_info.exit_code += info->modrm_reg; 4566 break; 4567 case SVM_EXIT_WRITE_CR0: { 4568 unsigned long cr0, val; 4569 4570 if (info->intercept == x86_intercept_cr_write) 4571 icpt_info.exit_code += info->modrm_reg; 4572 4573 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || 4574 info->intercept == x86_intercept_clts) 4575 break; 4576 4577 if (!(vmcb12_is_intercept(&svm->nested.ctl, 4578 INTERCEPT_SELECTIVE_CR0))) 4579 break; 4580 4581 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; 4582 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; 4583 4584 if (info->intercept == x86_intercept_lmsw) { 4585 cr0 &= 0xfUL; 4586 val &= 0xfUL; 4587 /* lmsw can't clear PE - catch this here */ 4588 if (cr0 & X86_CR0_PE) 4589 val |= X86_CR0_PE; 4590 } 4591 4592 if (cr0 ^ val) 4593 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; 4594 4595 break; 4596 } 4597 case SVM_EXIT_READ_DR0: 4598 case SVM_EXIT_WRITE_DR0: 4599 icpt_info.exit_code += info->modrm_reg; 4600 break; 4601 case SVM_EXIT_MSR: 4602 if (info->intercept == x86_intercept_wrmsr) 4603 vmcb->control.exit_info_1 = 1; 4604 else 4605 vmcb->control.exit_info_1 = 0; 4606 break; 4607 case SVM_EXIT_PAUSE: 4608 /* 4609 * We get this for NOP only, but pause 4610 * is rep not, check this here 4611 */ 4612 if (info->rep_prefix != REPE_PREFIX) 4613 goto out; 4614 break; 4615 case SVM_EXIT_IOIO: { 4616 u64 exit_info; 4617 u32 bytes; 4618 4619 if (info->intercept == x86_intercept_in || 4620 info->intercept == x86_intercept_ins) { 4621 exit_info = ((info->src_val & 0xffff) << 16) | 4622 SVM_IOIO_TYPE_MASK; 4623 bytes = info->dst_bytes; 4624 } else { 4625 exit_info = (info->dst_val & 0xffff) << 16; 4626 bytes = info->src_bytes; 4627 } 4628 4629 if (info->intercept == x86_intercept_outs || 4630 info->intercept == x86_intercept_ins) 4631 exit_info |= SVM_IOIO_STR_MASK; 4632 4633 if (info->rep_prefix) 4634 exit_info |= SVM_IOIO_REP_MASK; 4635 4636 bytes = min(bytes, 4u); 4637 4638 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; 4639 4640 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); 4641 4642 vmcb->control.exit_info_1 = exit_info; 4643 vmcb->control.exit_info_2 = info->next_rip; 4644 4645 break; 4646 } 4647 default: 4648 break; 4649 } 4650 4651 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ 4652 if (static_cpu_has(X86_FEATURE_NRIPS)) 4653 vmcb->control.next_rip = info->next_rip; 4654 vmcb->control.exit_code = icpt_info.exit_code; 4655 vmexit = nested_svm_exit_handled(svm); 4656 4657 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED 4658 : X86EMUL_CONTINUE; 4659 4660 out: 4661 return ret; 4662 } 4663 4664 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 4665 { 4666 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) 4667 vcpu->arch.at_instruction_boundary = true; 4668 } 4669 4670 static void svm_setup_mce(struct kvm_vcpu *vcpu) 4671 { 4672 /* [63:9] are reserved. */ 4673 vcpu->arch.mcg_cap &= 0x1ff; 4674 } 4675 4676 #ifdef CONFIG_KVM_SMM 4677 bool svm_smi_blocked(struct kvm_vcpu *vcpu) 4678 { 4679 struct vcpu_svm *svm = to_svm(vcpu); 4680 4681 /* Per APM Vol.2 15.22.2 "Response to SMI" */ 4682 if (!gif_set(svm)) 4683 return true; 4684 4685 return is_smm(vcpu); 4686 } 4687 4688 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4689 { 4690 struct vcpu_svm *svm = to_svm(vcpu); 4691 if (svm->nested.nested_run_pending) 4692 return -EBUSY; 4693 4694 if (svm_smi_blocked(vcpu)) 4695 return 0; 4696 4697 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ 4698 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) 4699 return -EBUSY; 4700 4701 return 1; 4702 } 4703 4704 static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 4705 { 4706 struct vcpu_svm *svm = to_svm(vcpu); 4707 struct kvm_host_map map_save; 4708 int ret; 4709 4710 if (!is_guest_mode(vcpu)) 4711 return 0; 4712 4713 /* 4714 * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is 4715 * responsible for ensuring nested SVM and SMIs are mutually exclusive. 4716 */ 4717 4718 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 4719 return 1; 4720 4721 smram->smram64.svm_guest_flag = 1; 4722 smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; 4723 4724 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 4725 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 4726 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 4727 4728 ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); 4729 if (ret) 4730 return ret; 4731 4732 /* 4733 * KVM uses VMCB01 to store L1 host state while L2 runs but 4734 * VMCB01 is going to be used during SMM and thus the state will 4735 * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save 4736 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the 4737 * format of the area is identical to guest save area offsetted 4738 * by 0x400 (matches the offset of 'struct vmcb_save_area' 4739 * within 'struct vmcb'). Note: HSAVE area may also be used by 4740 * L1 hypervisor to save additional host context (e.g. KVM does 4741 * that, see svm_prepare_switch_to_guest()) which must be 4742 * preserved. 4743 */ 4744 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) 4745 return 1; 4746 4747 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); 4748 4749 svm_copy_vmrun_state(map_save.hva + 0x400, 4750 &svm->vmcb01.ptr->save); 4751 4752 kvm_vcpu_unmap(vcpu, &map_save); 4753 return 0; 4754 } 4755 4756 static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 4757 { 4758 struct vcpu_svm *svm = to_svm(vcpu); 4759 struct kvm_host_map map, map_save; 4760 struct vmcb *vmcb12; 4761 int ret; 4762 4763 const struct kvm_smram_state_64 *smram64 = &smram->smram64; 4764 4765 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 4766 return 0; 4767 4768 /* Non-zero if SMI arrived while vCPU was in guest mode. */ 4769 if (!smram64->svm_guest_flag) 4770 return 0; 4771 4772 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) 4773 return 1; 4774 4775 if (!(smram64->efer & EFER_SVME)) 4776 return 1; 4777 4778 if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) 4779 return 1; 4780 4781 ret = 1; 4782 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) 4783 goto unmap_map; 4784 4785 if (svm_allocate_nested(svm)) 4786 goto unmap_save; 4787 4788 /* 4789 * Restore L1 host state from L1 HSAVE area as VMCB01 was 4790 * used during SMM (see svm_enter_smm()) 4791 */ 4792 4793 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); 4794 4795 /* 4796 * Enter the nested guest now 4797 */ 4798 4799 vmcb_mark_all_dirty(svm->vmcb01.ptr); 4800 4801 vmcb12 = map.hva; 4802 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); 4803 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); 4804 ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false); 4805 4806 if (ret) 4807 goto unmap_save; 4808 4809 svm->nested.nested_run_pending = 1; 4810 4811 unmap_save: 4812 kvm_vcpu_unmap(vcpu, &map_save); 4813 unmap_map: 4814 kvm_vcpu_unmap(vcpu, &map); 4815 return ret; 4816 } 4817 4818 static void svm_enable_smi_window(struct kvm_vcpu *vcpu) 4819 { 4820 struct vcpu_svm *svm = to_svm(vcpu); 4821 4822 if (!gif_set(svm)) { 4823 if (vgif) 4824 svm_set_intercept(svm, INTERCEPT_STGI); 4825 /* STGI will cause a vm exit */ 4826 } else { 4827 /* We must be in SMM; RSM will cause a vmexit anyway. */ 4828 } 4829 } 4830 #endif 4831 4832 static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 4833 void *insn, int insn_len) 4834 { 4835 struct vcpu_svm *svm = to_svm(vcpu); 4836 bool smep, smap, is_user; 4837 u64 error_code; 4838 4839 /* Check that emulation is possible during event vectoring */ 4840 if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && 4841 !kvm_can_emulate_event_vectoring(emul_type)) 4842 return X86EMUL_UNHANDLEABLE_VECTORING; 4843 4844 /* Emulation is always possible when KVM has access to all guest state. */ 4845 if (!sev_guest(vcpu->kvm)) 4846 return X86EMUL_CONTINUE; 4847 4848 /* #UD and #GP should never be intercepted for SEV guests. */ 4849 WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | 4850 EMULTYPE_TRAP_UD_FORCED | 4851 EMULTYPE_VMWARE_GP)); 4852 4853 /* 4854 * Emulation is impossible for SEV-ES guests as KVM doesn't have access 4855 * to guest register state. 4856 */ 4857 if (sev_es_guest(vcpu->kvm)) 4858 return X86EMUL_RETRY_INSTR; 4859 4860 /* 4861 * Emulation is possible if the instruction is already decoded, e.g. 4862 * when completing I/O after returning from userspace. 4863 */ 4864 if (emul_type & EMULTYPE_NO_DECODE) 4865 return X86EMUL_CONTINUE; 4866 4867 /* 4868 * Emulation is possible for SEV guests if and only if a prefilled 4869 * buffer containing the bytes of the intercepted instruction is 4870 * available. SEV guest memory is encrypted with a guest specific key 4871 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and 4872 * decode garbage. 4873 * 4874 * If KVM is NOT trying to simply skip an instruction, inject #UD if 4875 * KVM reached this point without an instruction buffer. In practice, 4876 * this path should never be hit by a well-behaved guest, e.g. KVM 4877 * doesn't intercept #UD or #GP for SEV guests, but this path is still 4878 * theoretically reachable, e.g. via unaccelerated fault-like AVIC 4879 * access, and needs to be handled by KVM to avoid putting the guest 4880 * into an infinite loop. Injecting #UD is somewhat arbitrary, but 4881 * its the least awful option given lack of insight into the guest. 4882 * 4883 * If KVM is trying to skip an instruction, simply resume the guest. 4884 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM 4885 * will attempt to re-inject the INT3/INTO and skip the instruction. 4886 * In that scenario, retrying the INT3/INTO and hoping the guest will 4887 * make forward progress is the only option that has a chance of 4888 * success (and in practice it will work the vast majority of the time). 4889 */ 4890 if (unlikely(!insn)) { 4891 if (emul_type & EMULTYPE_SKIP) 4892 return X86EMUL_UNHANDLEABLE; 4893 4894 kvm_queue_exception(vcpu, UD_VECTOR); 4895 return X86EMUL_PROPAGATE_FAULT; 4896 } 4897 4898 /* 4899 * Emulate for SEV guests if the insn buffer is not empty. The buffer 4900 * will be empty if the DecodeAssist microcode cannot fetch bytes for 4901 * the faulting instruction because the code fetch itself faulted, e.g. 4902 * the guest attempted to fetch from emulated MMIO or a guest page 4903 * table used to translate CS:RIP resides in emulated MMIO. 4904 */ 4905 if (likely(insn_len)) 4906 return X86EMUL_CONTINUE; 4907 4908 /* 4909 * Detect and workaround Errata 1096 Fam_17h_00_0Fh. 4910 * 4911 * Errata: 4912 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is 4913 * possible that CPU microcode implementing DecodeAssist will fail to 4914 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly 4915 * be '0'. This happens because microcode reads CS:RIP using a _data_ 4916 * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode 4917 * gives up and does not fill the instruction bytes buffer. 4918 * 4919 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU 4920 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler 4921 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the 4922 * GuestIntrBytes field of the VMCB. 4923 * 4924 * This does _not_ mean that the erratum has been encountered, as the 4925 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate 4926 * #PF, e.g. if the guest attempt to execute from emulated MMIO and 4927 * encountered a reserved/not-present #PF. 4928 * 4929 * To hit the erratum, the following conditions must be true: 4930 * 1. CR4.SMAP=1 (obviously). 4931 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot 4932 * have been hit as the guest would have encountered a SMEP 4933 * violation #PF, not a #NPF. 4934 * 3. The #NPF is not due to a code fetch, in which case failure to 4935 * retrieve the instruction bytes is legitimate (see abvoe). 4936 * 4937 * In addition, don't apply the erratum workaround if the #NPF occurred 4938 * while translating guest page tables (see below). 4939 */ 4940 error_code = svm->vmcb->control.exit_info_1; 4941 if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) 4942 goto resume_guest; 4943 4944 smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP); 4945 smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP); 4946 is_user = svm_get_cpl(vcpu) == 3; 4947 if (smap && (!smep || is_user)) { 4948 pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); 4949 4950 /* 4951 * If the fault occurred in userspace, arbitrarily inject #GP 4952 * to avoid killing the guest and to hopefully avoid confusing 4953 * the guest kernel too much, e.g. injecting #PF would not be 4954 * coherent with respect to the guest's page tables. Request 4955 * triple fault if the fault occurred in the kernel as there's 4956 * no fault that KVM can inject without confusing the guest. 4957 * In practice, the triple fault is moot as no sane SEV kernel 4958 * will execute from user memory while also running with SMAP=1. 4959 */ 4960 if (is_user) 4961 kvm_inject_gp(vcpu, 0); 4962 else 4963 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4964 return X86EMUL_PROPAGATE_FAULT; 4965 } 4966 4967 resume_guest: 4968 /* 4969 * If the erratum was not hit, simply resume the guest and let it fault 4970 * again. While awful, e.g. the vCPU may get stuck in an infinite loop 4971 * if the fault is at CPL=0, it's the lesser of all evils. Exiting to 4972 * userspace will kill the guest, and letting the emulator read garbage 4973 * will yield random behavior and potentially corrupt the guest. 4974 * 4975 * Simply resuming the guest is technically not a violation of the SEV 4976 * architecture. AMD's APM states that all code fetches and page table 4977 * accesses for SEV guest are encrypted, regardless of the C-Bit. The 4978 * APM also states that encrypted accesses to MMIO are "ignored", but 4979 * doesn't explicitly define "ignored", i.e. doing nothing and letting 4980 * the guest spin is technically "ignoring" the access. 4981 */ 4982 return X86EMUL_RETRY_INSTR; 4983 } 4984 4985 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 4986 { 4987 struct vcpu_svm *svm = to_svm(vcpu); 4988 4989 return !gif_set(svm); 4990 } 4991 4992 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 4993 { 4994 if (!sev_es_guest(vcpu->kvm)) 4995 return kvm_vcpu_deliver_sipi_vector(vcpu, vector); 4996 4997 sev_vcpu_deliver_sipi_vector(vcpu, vector); 4998 } 4999 5000 static void svm_vm_destroy(struct kvm *kvm) 5001 { 5002 avic_vm_destroy(kvm); 5003 sev_vm_destroy(kvm); 5004 5005 svm_srso_vm_destroy(); 5006 } 5007 5008 static int svm_vm_init(struct kvm *kvm) 5009 { 5010 int type = kvm->arch.vm_type; 5011 5012 if (type != KVM_X86_DEFAULT_VM && 5013 type != KVM_X86_SW_PROTECTED_VM) { 5014 kvm->arch.has_protected_state = 5015 (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); 5016 to_kvm_sev_info(kvm)->need_init = true; 5017 5018 kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); 5019 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 5020 } 5021 5022 if (!pause_filter_count || !pause_filter_thresh) 5023 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 5024 5025 if (enable_apicv) { 5026 int ret = avic_vm_init(kvm); 5027 if (ret) 5028 return ret; 5029 } 5030 5031 svm_srso_vm_init(); 5032 return 0; 5033 } 5034 5035 static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) 5036 { 5037 struct page *page = snp_safe_alloc_page(); 5038 5039 if (!page) 5040 return NULL; 5041 5042 return page_address(page); 5043 } 5044 5045 static struct kvm_x86_ops svm_x86_ops __initdata = { 5046 .name = KBUILD_MODNAME, 5047 5048 .check_processor_compatibility = svm_check_processor_compat, 5049 5050 .hardware_unsetup = svm_hardware_unsetup, 5051 .enable_virtualization_cpu = svm_enable_virtualization_cpu, 5052 .disable_virtualization_cpu = svm_disable_virtualization_cpu, 5053 .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, 5054 .has_emulated_msr = svm_has_emulated_msr, 5055 5056 .vcpu_create = svm_vcpu_create, 5057 .vcpu_free = svm_vcpu_free, 5058 .vcpu_reset = svm_vcpu_reset, 5059 5060 .vm_size = sizeof(struct kvm_svm), 5061 .vm_init = svm_vm_init, 5062 .vm_destroy = svm_vm_destroy, 5063 5064 .prepare_switch_to_guest = svm_prepare_switch_to_guest, 5065 .vcpu_load = svm_vcpu_load, 5066 .vcpu_put = svm_vcpu_put, 5067 .vcpu_blocking = avic_vcpu_blocking, 5068 .vcpu_unblocking = avic_vcpu_unblocking, 5069 5070 .update_exception_bitmap = svm_update_exception_bitmap, 5071 .get_feature_msr = svm_get_feature_msr, 5072 .get_msr = svm_get_msr, 5073 .set_msr = svm_set_msr, 5074 .get_segment_base = svm_get_segment_base, 5075 .get_segment = svm_get_segment, 5076 .set_segment = svm_set_segment, 5077 .get_cpl = svm_get_cpl, 5078 .get_cpl_no_cache = svm_get_cpl, 5079 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 5080 .is_valid_cr0 = svm_is_valid_cr0, 5081 .set_cr0 = svm_set_cr0, 5082 .post_set_cr3 = sev_post_set_cr3, 5083 .is_valid_cr4 = svm_is_valid_cr4, 5084 .set_cr4 = svm_set_cr4, 5085 .set_efer = svm_set_efer, 5086 .get_idt = svm_get_idt, 5087 .set_idt = svm_set_idt, 5088 .get_gdt = svm_get_gdt, 5089 .set_gdt = svm_set_gdt, 5090 .set_dr7 = svm_set_dr7, 5091 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, 5092 .cache_reg = svm_cache_reg, 5093 .get_rflags = svm_get_rflags, 5094 .set_rflags = svm_set_rflags, 5095 .get_if_flag = svm_get_if_flag, 5096 5097 .flush_tlb_all = svm_flush_tlb_all, 5098 .flush_tlb_current = svm_flush_tlb_current, 5099 .flush_tlb_gva = svm_flush_tlb_gva, 5100 .flush_tlb_guest = svm_flush_tlb_asid, 5101 5102 .vcpu_pre_run = svm_vcpu_pre_run, 5103 .vcpu_run = svm_vcpu_run, 5104 .handle_exit = svm_handle_exit, 5105 .skip_emulated_instruction = svm_skip_emulated_instruction, 5106 .update_emulated_instruction = NULL, 5107 .set_interrupt_shadow = svm_set_interrupt_shadow, 5108 .get_interrupt_shadow = svm_get_interrupt_shadow, 5109 .patch_hypercall = svm_patch_hypercall, 5110 .inject_irq = svm_inject_irq, 5111 .inject_nmi = svm_inject_nmi, 5112 .is_vnmi_pending = svm_is_vnmi_pending, 5113 .set_vnmi_pending = svm_set_vnmi_pending, 5114 .inject_exception = svm_inject_exception, 5115 .cancel_injection = svm_cancel_injection, 5116 .interrupt_allowed = svm_interrupt_allowed, 5117 .nmi_allowed = svm_nmi_allowed, 5118 .get_nmi_mask = svm_get_nmi_mask, 5119 .set_nmi_mask = svm_set_nmi_mask, 5120 .enable_nmi_window = svm_enable_nmi_window, 5121 .enable_irq_window = svm_enable_irq_window, 5122 .update_cr8_intercept = svm_update_cr8_intercept, 5123 5124 .x2apic_icr_is_split = true, 5125 .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, 5126 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 5127 .apicv_post_state_restore = avic_apicv_post_state_restore, 5128 .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, 5129 5130 .get_exit_info = svm_get_exit_info, 5131 .get_entry_info = svm_get_entry_info, 5132 5133 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, 5134 5135 .has_wbinvd_exit = svm_has_wbinvd_exit, 5136 5137 .get_l2_tsc_offset = svm_get_l2_tsc_offset, 5138 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, 5139 .write_tsc_offset = svm_write_tsc_offset, 5140 .write_tsc_multiplier = svm_write_tsc_multiplier, 5141 5142 .load_mmu_pgd = svm_load_mmu_pgd, 5143 5144 .check_intercept = svm_check_intercept, 5145 .handle_exit_irqoff = svm_handle_exit_irqoff, 5146 5147 .nested_ops = &svm_nested_ops, 5148 5149 .deliver_interrupt = svm_deliver_interrupt, 5150 .pi_update_irte = avic_pi_update_irte, 5151 .setup_mce = svm_setup_mce, 5152 5153 #ifdef CONFIG_KVM_SMM 5154 .smi_allowed = svm_smi_allowed, 5155 .enter_smm = svm_enter_smm, 5156 .leave_smm = svm_leave_smm, 5157 .enable_smi_window = svm_enable_smi_window, 5158 #endif 5159 5160 #ifdef CONFIG_KVM_AMD_SEV 5161 .dev_get_attr = sev_dev_get_attr, 5162 .mem_enc_ioctl = sev_mem_enc_ioctl, 5163 .mem_enc_register_region = sev_mem_enc_register_region, 5164 .mem_enc_unregister_region = sev_mem_enc_unregister_region, 5165 .guest_memory_reclaimed = sev_guest_memory_reclaimed, 5166 5167 .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, 5168 .vm_move_enc_context_from = sev_vm_move_enc_context_from, 5169 #endif 5170 .check_emulate_instruction = svm_check_emulate_instruction, 5171 5172 .apic_init_signal_blocked = svm_apic_init_signal_blocked, 5173 5174 .recalc_msr_intercepts = svm_recalc_msr_intercepts, 5175 .complete_emulated_msr = svm_complete_emulated_msr, 5176 5177 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, 5178 .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, 5179 .alloc_apic_backing_page = svm_alloc_apic_backing_page, 5180 5181 .gmem_prepare = sev_gmem_prepare, 5182 .gmem_invalidate = sev_gmem_invalidate, 5183 .private_max_mapping_level = sev_private_max_mapping_level, 5184 }; 5185 5186 /* 5187 * The default MMIO mask is a single bit (excluding the present bit), 5188 * which could conflict with the memory encryption bit. Check for 5189 * memory encryption support and override the default MMIO mask if 5190 * memory encryption is enabled. 5191 */ 5192 static __init void svm_adjust_mmio_mask(void) 5193 { 5194 unsigned int enc_bit, mask_bit; 5195 u64 msr, mask; 5196 5197 /* If there is no memory encryption support, use existing mask */ 5198 if (cpuid_eax(0x80000000) < 0x8000001f) 5199 return; 5200 5201 /* If memory encryption is not enabled, use existing mask */ 5202 rdmsrq(MSR_AMD64_SYSCFG, msr); 5203 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) 5204 return; 5205 5206 enc_bit = cpuid_ebx(0x8000001f) & 0x3f; 5207 mask_bit = boot_cpu_data.x86_phys_bits; 5208 5209 /* Increment the mask bit if it is the same as the encryption bit */ 5210 if (enc_bit == mask_bit) 5211 mask_bit++; 5212 5213 /* 5214 * If the mask bit location is below 52, then some bits above the 5215 * physical addressing limit will always be reserved, so use the 5216 * rsvd_bits() function to generate the mask. This mask, along with 5217 * the present bit, will be used to generate a page fault with 5218 * PFER.RSV = 1. 5219 * 5220 * If the mask bit location is 52 (or above), then clear the mask. 5221 */ 5222 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 5223 5224 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 5225 } 5226 5227 static __init void svm_set_cpu_caps(void) 5228 { 5229 kvm_set_cpu_caps(); 5230 5231 kvm_caps.supported_perf_cap = 0; 5232 kvm_caps.supported_xss = 0; 5233 5234 /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 5235 if (nested) { 5236 kvm_cpu_cap_set(X86_FEATURE_SVM); 5237 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); 5238 5239 /* 5240 * KVM currently flushes TLBs on *every* nested SVM transition, 5241 * and so for all intents and purposes KVM supports flushing by 5242 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush. 5243 */ 5244 kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID); 5245 5246 if (nrips) 5247 kvm_cpu_cap_set(X86_FEATURE_NRIPS); 5248 5249 if (npt_enabled) 5250 kvm_cpu_cap_set(X86_FEATURE_NPT); 5251 5252 if (tsc_scaling) 5253 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); 5254 5255 if (vls) 5256 kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); 5257 if (lbrv) 5258 kvm_cpu_cap_set(X86_FEATURE_LBRV); 5259 5260 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) 5261 kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); 5262 5263 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) 5264 kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); 5265 5266 if (vgif) 5267 kvm_cpu_cap_set(X86_FEATURE_VGIF); 5268 5269 if (vnmi) 5270 kvm_cpu_cap_set(X86_FEATURE_VNMI); 5271 5272 /* Nested VM can receive #VMEXIT instead of triggering #GP */ 5273 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 5274 } 5275 5276 if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) 5277 kvm_caps.has_bus_lock_exit = true; 5278 5279 /* CPUID 0x80000008 */ 5280 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 5281 boot_cpu_has(X86_FEATURE_AMD_SSBD)) 5282 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 5283 5284 if (enable_pmu) { 5285 /* 5286 * Enumerate support for PERFCTR_CORE if and only if KVM has 5287 * access to enough counters to virtualize "core" support, 5288 * otherwise limit vPMU support to the legacy number of counters. 5289 */ 5290 if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) 5291 kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, 5292 kvm_pmu_cap.num_counters_gp); 5293 else 5294 kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); 5295 5296 if (kvm_pmu_cap.version != 2 || 5297 !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) 5298 kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); 5299 } 5300 5301 /* CPUID 0x8000001F (SME/SEV features) */ 5302 sev_set_cpu_caps(); 5303 5304 /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ 5305 kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); 5306 } 5307 5308 static __init int svm_hardware_setup(void) 5309 { 5310 void *iopm_va; 5311 int cpu, r; 5312 5313 /* 5314 * NX is required for shadow paging and for NPT if the NX huge pages 5315 * mitigation is enabled. 5316 */ 5317 if (!boot_cpu_has(X86_FEATURE_NX)) { 5318 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 5319 return -EOPNOTSUPP; 5320 } 5321 kvm_enable_efer_bits(EFER_NX); 5322 5323 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 5324 XFEATURE_MASK_BNDCSR); 5325 5326 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 5327 kvm_enable_efer_bits(EFER_FFXSR); 5328 5329 if (tsc_scaling) { 5330 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 5331 tsc_scaling = false; 5332 } else { 5333 pr_info("TSC scaling supported\n"); 5334 kvm_caps.has_tsc_control = true; 5335 } 5336 } 5337 kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; 5338 kvm_caps.tsc_scaling_ratio_frac_bits = 32; 5339 5340 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 5341 5342 if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) 5343 kvm_enable_efer_bits(EFER_AUTOIBRS); 5344 5345 /* Check for pause filtering support */ 5346 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 5347 pause_filter_count = 0; 5348 pause_filter_thresh = 0; 5349 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 5350 pause_filter_thresh = 0; 5351 } 5352 5353 if (nested) { 5354 pr_info("Nested Virtualization enabled\n"); 5355 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 5356 5357 r = nested_svm_init_msrpm_merge_offsets(); 5358 if (r) 5359 return r; 5360 } 5361 5362 /* 5363 * KVM's MMU doesn't support using 2-level paging for itself, and thus 5364 * NPT isn't supported if the host is using 2-level paging since host 5365 * CR4 is unchanged on VMRUN. 5366 */ 5367 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 5368 npt_enabled = false; 5369 5370 if (!boot_cpu_has(X86_FEATURE_NPT)) 5371 npt_enabled = false; 5372 5373 /* Force VM NPT level equal to the host's paging level */ 5374 kvm_configure_mmu(npt_enabled, get_npt_level(), 5375 get_npt_level(), PG_LEVEL_1G); 5376 pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); 5377 5378 /* Setup shadow_me_value and shadow_me_mask */ 5379 kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); 5380 5381 svm_adjust_mmio_mask(); 5382 5383 nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); 5384 5385 if (lbrv) { 5386 if (!boot_cpu_has(X86_FEATURE_LBRV)) 5387 lbrv = false; 5388 else 5389 pr_info("LBR virtualization supported\n"); 5390 } 5391 5392 iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); 5393 if (!iopm_va) 5394 return -ENOMEM; 5395 5396 iopm_base = __sme_set(__pa(iopm_va)); 5397 5398 /* 5399 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which 5400 * may be modified by svm_adjust_mmio_mask()), as well as nrips. 5401 */ 5402 sev_hardware_setup(); 5403 5404 svm_hv_hardware_setup(); 5405 5406 for_each_possible_cpu(cpu) { 5407 r = svm_cpu_init(cpu); 5408 if (r) 5409 goto err; 5410 } 5411 5412 enable_apicv = avic = avic && avic_hardware_setup(); 5413 5414 if (!enable_apicv) { 5415 enable_ipiv = false; 5416 svm_x86_ops.vcpu_blocking = NULL; 5417 svm_x86_ops.vcpu_unblocking = NULL; 5418 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; 5419 } else if (!x2avic_enabled) { 5420 svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; 5421 } 5422 5423 if (vls) { 5424 if (!npt_enabled || 5425 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 5426 !IS_ENABLED(CONFIG_X86_64)) { 5427 vls = false; 5428 } else { 5429 pr_info("Virtual VMLOAD VMSAVE supported\n"); 5430 } 5431 } 5432 5433 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) 5434 svm_gp_erratum_intercept = false; 5435 5436 if (vgif) { 5437 if (!boot_cpu_has(X86_FEATURE_VGIF)) 5438 vgif = false; 5439 else 5440 pr_info("Virtual GIF supported\n"); 5441 } 5442 5443 vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI); 5444 if (vnmi) 5445 pr_info("Virtual NMI enabled\n"); 5446 5447 if (!vnmi) { 5448 svm_x86_ops.is_vnmi_pending = NULL; 5449 svm_x86_ops.set_vnmi_pending = NULL; 5450 } 5451 5452 if (!enable_pmu) 5453 pr_info("PMU virtualization is disabled\n"); 5454 5455 svm_set_cpu_caps(); 5456 5457 /* 5458 * It seems that on AMD processors PTE's accessed bit is 5459 * being set by the CPU hardware before the NPF vmexit. 5460 * This is not expected behaviour and our tests fail because 5461 * of it. 5462 * A workaround here is to disable support for 5463 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 5464 * In this case userspace can know if there is support using 5465 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 5466 * it 5467 * If future AMD CPU models change the behaviour described above, 5468 * this variable can be changed accordingly 5469 */ 5470 allow_smaller_maxphyaddr = !npt_enabled; 5471 5472 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; 5473 return 0; 5474 5475 err: 5476 svm_hardware_unsetup(); 5477 return r; 5478 } 5479 5480 5481 static struct kvm_x86_init_ops svm_init_ops __initdata = { 5482 .hardware_setup = svm_hardware_setup, 5483 5484 .runtime_ops = &svm_x86_ops, 5485 .pmu_ops = &amd_pmu_ops, 5486 }; 5487 5488 static void __svm_exit(void) 5489 { 5490 kvm_x86_vendor_exit(); 5491 } 5492 5493 static int __init svm_init(void) 5494 { 5495 int r; 5496 5497 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); 5498 5499 __unused_size_checks(); 5500 5501 if (!kvm_is_svm_supported()) 5502 return -EOPNOTSUPP; 5503 5504 r = kvm_x86_vendor_init(&svm_init_ops); 5505 if (r) 5506 return r; 5507 5508 /* 5509 * Common KVM initialization _must_ come last, after this, /dev/kvm is 5510 * exposed to userspace! 5511 */ 5512 r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), 5513 THIS_MODULE); 5514 if (r) 5515 goto err_kvm_init; 5516 5517 return 0; 5518 5519 err_kvm_init: 5520 __svm_exit(); 5521 return r; 5522 } 5523 5524 static void __exit svm_exit(void) 5525 { 5526 kvm_exit(); 5527 __svm_exit(); 5528 } 5529 5530 module_init(svm_init) 5531 module_exit(svm_exit) 5532