1 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 2 3 #include <linux/kvm_host.h> 4 5 #include "irq.h" 6 #include "mmu.h" 7 #include "kvm_cache_regs.h" 8 #include "x86.h" 9 #include "smm.h" 10 #include "cpuid.h" 11 #include "pmu.h" 12 13 #include <linux/module.h> 14 #include <linux/mod_devicetable.h> 15 #include <linux/kernel.h> 16 #include <linux/vmalloc.h> 17 #include <linux/highmem.h> 18 #include <linux/amd-iommu.h> 19 #include <linux/sched.h> 20 #include <linux/trace_events.h> 21 #include <linux/slab.h> 22 #include <linux/hashtable.h> 23 #include <linux/objtool.h> 24 #include <linux/psp-sev.h> 25 #include <linux/file.h> 26 #include <linux/pagemap.h> 27 #include <linux/swap.h> 28 #include <linux/rwsem.h> 29 #include <linux/cc_platform.h> 30 #include <linux/smp.h> 31 #include <linux/string_choices.h> 32 #include <linux/mutex.h> 33 34 #include <asm/apic.h> 35 #include <asm/msr.h> 36 #include <asm/perf_event.h> 37 #include <asm/tlbflush.h> 38 #include <asm/desc.h> 39 #include <asm/debugreg.h> 40 #include <asm/kvm_para.h> 41 #include <asm/irq_remapping.h> 42 #include <asm/spec-ctrl.h> 43 #include <asm/cpu_device_id.h> 44 #include <asm/traps.h> 45 #include <asm/reboot.h> 46 #include <asm/fpu/api.h> 47 48 #include <trace/events/ipi.h> 49 50 #include "trace.h" 51 52 #include "svm.h" 53 #include "svm_ops.h" 54 55 #include "kvm_onhyperv.h" 56 #include "svm_onhyperv.h" 57 58 MODULE_AUTHOR("Qumranet"); 59 MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions"); 60 MODULE_LICENSE("GPL"); 61 62 #ifdef MODULE 63 static const struct x86_cpu_id svm_cpu_id[] = { 64 X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL), 65 {} 66 }; 67 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); 68 #endif 69 70 #define SEG_TYPE_LDT 2 71 #define SEG_TYPE_BUSY_TSS16 3 72 73 static bool erratum_383_found __read_mostly; 74 75 /* 76 * Set osvw_len to higher value when updated Revision Guides 77 * are published and we know what the new status bits are 78 */ 79 static uint64_t osvw_len = 4, osvw_status; 80 81 static 
DEFINE_PER_CPU(u64, current_tsc_ratio); 82 83 /* 84 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 85 * pause_filter_count: On processors that support Pause filtering(indicated 86 * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter 87 * count value. On VMRUN this value is loaded into an internal counter. 88 * Each time a pause instruction is executed, this counter is decremented 89 * until it reaches zero at which time a #VMEXIT is generated if pause 90 * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause 91 * Intercept Filtering for more details. 92 * This also indicate if ple logic enabled. 93 * 94 * pause_filter_thresh: In addition, some processor families support advanced 95 * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on 96 * the amount of time a guest is allowed to execute in a pause loop. 97 * In this mode, a 16-bit pause filter threshold field is added in the 98 * VMCB. The threshold value is a cycle count that is used to reset the 99 * pause counter. As with simple pause filtering, VMRUN loads the pause 100 * count value from VMCB into an internal counter. Then, on each pause 101 * instruction the hardware checks the elapsed number of cycles since 102 * the most recent pause instruction against the pause filter threshold. 103 * If the elapsed cycle count is greater than the pause filter threshold, 104 * then the internal pause count is reloaded from the VMCB and execution 105 * continues. If the elapsed cycle count is less than the pause filter 106 * threshold, then the internal pause count is decremented. If the count 107 * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is 108 * triggered. If advanced pause filtering is supported and pause filter 109 * threshold field is set to zero, the filter will operate in the simpler, 110 * count only mode. 
111 */ 112 113 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP; 114 module_param(pause_filter_thresh, ushort, 0444); 115 116 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW; 117 module_param(pause_filter_count, ushort, 0444); 118 119 /* Default doubles per-vcpu window every exit. */ 120 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 121 module_param(pause_filter_count_grow, ushort, 0444); 122 123 /* Default resets per-vcpu window every exit to pause_filter_count. */ 124 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 125 module_param(pause_filter_count_shrink, ushort, 0444); 126 127 /* Default is to compute the maximum so we can never overflow. */ 128 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX; 129 module_param(pause_filter_count_max, ushort, 0444); 130 131 /* 132 * Use nested page tables by default. Note, NPT may get forced off by 133 * svm_hardware_setup() if it's unsupported by hardware or the host kernel. 
134 */ 135 bool npt_enabled = true; 136 module_param_named(npt, npt_enabled, bool, 0444); 137 138 /* allow nested virtualization in KVM/SVM */ 139 static int nested = true; 140 module_param(nested, int, 0444); 141 142 /* enable/disable Next RIP Save */ 143 int nrips = true; 144 module_param(nrips, int, 0444); 145 146 /* enable/disable Virtual VMLOAD VMSAVE */ 147 static int vls = true; 148 module_param(vls, int, 0444); 149 150 /* enable/disable Virtual GIF */ 151 int vgif = true; 152 module_param(vgif, int, 0444); 153 154 /* enable/disable LBR virtualization */ 155 int lbrv = true; 156 module_param(lbrv, int, 0444); 157 158 static int tsc_scaling = true; 159 module_param(tsc_scaling, int, 0444); 160 161 module_param(enable_device_posted_irqs, bool, 0444); 162 163 bool __read_mostly dump_invalid_vmcb; 164 module_param(dump_invalid_vmcb, bool, 0644); 165 166 167 bool intercept_smi = true; 168 module_param(intercept_smi, bool, 0444); 169 170 bool vnmi = true; 171 module_param(vnmi, bool, 0444); 172 173 module_param(enable_mediated_pmu, bool, 0444); 174 175 static bool svm_gp_erratum_intercept = true; 176 177 static u8 rsm_ins_bytes[] = "\x0f\xaa"; 178 179 static unsigned long iopm_base; 180 181 DEFINE_PER_CPU(struct svm_cpu_data, svm_data); 182 183 static DEFINE_MUTEX(vmcb_dump_mutex); 184 185 /* 186 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via 187 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. 188 * 189 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to 190 * defer the restoration of TSC_AUX until the CPU returns to userspace. 191 */ 192 int tsc_aux_uret_slot __ro_after_init = -1; 193 194 static int get_npt_level(void) 195 { 196 #ifdef CONFIG_X86_64 197 return pgtable_l5_enabled() ? 
PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; 198 #else 199 return PT32E_ROOT_LEVEL; 200 #endif 201 } 202 203 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 204 { 205 struct vcpu_svm *svm = to_svm(vcpu); 206 u64 old_efer = vcpu->arch.efer; 207 vcpu->arch.efer = efer; 208 209 if (!npt_enabled) { 210 /* Shadow paging assumes NX to be available. */ 211 efer |= EFER_NX; 212 213 if (!(efer & EFER_LMA)) 214 efer &= ~EFER_LME; 215 } 216 217 if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { 218 if (!(efer & EFER_SVME)) { 219 svm_leave_nested(vcpu); 220 /* #GP intercept is still needed for vmware backdoor */ 221 if (!enable_vmware_backdoor) 222 clr_exception_intercept(svm, GP_VECTOR); 223 224 /* 225 * Free the nested guest state, unless we are in SMM. 226 * In this case we will return to the nested guest 227 * as soon as we leave SMM. 228 */ 229 if (!is_smm(vcpu)) 230 svm_free_nested(svm); 231 232 } else { 233 int ret = svm_allocate_nested(svm); 234 235 if (ret) { 236 vcpu->arch.efer = old_efer; 237 return ret; 238 } 239 240 /* 241 * Never intercept #GP for SEV guests, KVM can't 242 * decrypt guest memory to workaround the erratum. 
243 */ 244 if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) 245 set_exception_intercept(svm, GP_VECTOR); 246 } 247 } 248 249 svm->vmcb->save.efer = efer | EFER_SVME; 250 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 251 return 0; 252 } 253 254 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) 255 { 256 struct vcpu_svm *svm = to_svm(vcpu); 257 u32 ret = 0; 258 259 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 260 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; 261 return ret; 262 } 263 264 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 265 { 266 struct vcpu_svm *svm = to_svm(vcpu); 267 268 if (mask == 0) 269 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 270 else 271 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; 272 273 } 274 275 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, 276 int emul_type, 277 bool commit_side_effects) 278 { 279 struct vcpu_svm *svm = to_svm(vcpu); 280 unsigned long old_rflags; 281 282 /* 283 * SEV-ES does not expose the next RIP. The RIP update is controlled by 284 * the type of exit and the #VC handler in the guest. 
285 */ 286 if (sev_es_guest(vcpu->kvm)) 287 goto done; 288 289 if (nrips && svm->vmcb->control.next_rip != 0) { 290 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); 291 svm->next_rip = svm->vmcb->control.next_rip; 292 } 293 294 if (!svm->next_rip) { 295 if (unlikely(!commit_side_effects)) 296 old_rflags = svm->vmcb->save.rflags; 297 298 if (!kvm_emulate_instruction(vcpu, emul_type)) 299 return 0; 300 301 if (unlikely(!commit_side_effects)) 302 svm->vmcb->save.rflags = old_rflags; 303 } else { 304 kvm_rip_write(vcpu, svm->next_rip); 305 } 306 307 done: 308 if (likely(commit_side_effects)) 309 svm_set_interrupt_shadow(vcpu, 0); 310 311 return 1; 312 } 313 314 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) 315 { 316 return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); 317 } 318 319 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector) 320 { 321 const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT | 322 EMULTYPE_SET_SOFT_INT_VECTOR(vector); 323 unsigned long rip, old_rip = kvm_rip_read(vcpu); 324 struct vcpu_svm *svm = to_svm(vcpu); 325 326 /* 327 * Due to architectural shortcomings, the CPU doesn't always provide 328 * NextRIP, e.g. if KVM intercepted an exception that occurred while 329 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip 330 * the instruction even if NextRIP is supported to acquire the next 331 * RIP so that it can be shoved into the NextRIP field, otherwise 332 * hardware will fail to advance guest RIP during event injection. 333 * Drop the exception/interrupt if emulation fails and effectively 334 * retry the instruction, it's the least awful option. If NRIPS is 335 * in use, the skip must not commit any side effects such as clearing 336 * the interrupt shadow or RFLAGS.RF. 
337 */ 338 if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips)) 339 return -EIO; 340 341 rip = kvm_rip_read(vcpu); 342 343 /* 344 * Save the injection information, even when using next_rip, as the 345 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection 346 * doesn't complete due to a VM-Exit occurring while the CPU is 347 * vectoring the event. Decoding the instruction isn't guaranteed to 348 * work as there may be no backing instruction, e.g. if the event is 349 * being injected by L1 for L2, or if the guest is patching INT3 into 350 * a different instruction. 351 */ 352 svm->soft_int_injected = true; 353 svm->soft_int_csbase = svm->vmcb->save.cs.base; 354 svm->soft_int_old_rip = old_rip; 355 svm->soft_int_next_rip = rip; 356 357 if (nrips) 358 kvm_rip_write(vcpu, old_rip); 359 360 if (static_cpu_has(X86_FEATURE_NRIPS)) 361 svm->vmcb->control.next_rip = rip; 362 363 return 0; 364 } 365 366 static void svm_inject_exception(struct kvm_vcpu *vcpu) 367 { 368 struct kvm_queued_exception *ex = &vcpu->arch.exception; 369 struct vcpu_svm *svm = to_svm(vcpu); 370 371 kvm_deliver_exception_payload(vcpu, ex); 372 373 if (kvm_exception_is_soft(ex->vector) && 374 svm_update_soft_interrupt_rip(vcpu, ex->vector)) 375 return; 376 377 svm->vmcb->control.event_inj = ex->vector 378 | SVM_EVTINJ_VALID 379 | (ex->has_error_code ? 
SVM_EVTINJ_VALID_ERR : 0) 380 | SVM_EVTINJ_TYPE_EXEPT; 381 svm->vmcb->control.event_inj_err = ex->error_code; 382 } 383 384 static void svm_init_erratum_383(void) 385 { 386 u64 val; 387 388 if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) 389 return; 390 391 /* Use _safe variants to not break nested virtualization */ 392 if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) 393 return; 394 395 val |= (1ULL << 47); 396 397 native_write_msr_safe(MSR_AMD64_DC_CFG, val); 398 399 erratum_383_found = true; 400 } 401 402 static void svm_init_osvw(struct kvm_vcpu *vcpu) 403 { 404 /* 405 * Guests should see errata 400 and 415 as fixed (assuming that 406 * HLT and IO instructions are intercepted). 407 */ 408 vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; 409 vcpu->arch.osvw.status = osvw_status & ~(6ULL); 410 411 /* 412 * By increasing VCPU's osvw.length to 3 we are telling the guest that 413 * all osvw.status bits inside that length, including bit 0 (which is 414 * reserved for erratum 298), are valid. However, if host processor's 415 * osvw_len is 0 then osvw_status[0] carries no information. We need to 416 * be conservative here and therefore we tell the guest that erratum 298 417 * is present (because we really don't know). 
418 */ 419 if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) 420 vcpu->arch.osvw.status |= 1; 421 } 422 423 static bool __kvm_is_svm_supported(void) 424 { 425 int cpu = smp_processor_id(); 426 struct cpuinfo_x86 *c = &cpu_data(cpu); 427 428 if (c->x86_vendor != X86_VENDOR_AMD && 429 c->x86_vendor != X86_VENDOR_HYGON) { 430 pr_err("CPU %d isn't AMD or Hygon\n", cpu); 431 return false; 432 } 433 434 if (!cpu_has(c, X86_FEATURE_SVM)) { 435 pr_err("SVM not supported by CPU %d\n", cpu); 436 return false; 437 } 438 439 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { 440 pr_info("KVM is unsupported when running as an SEV guest\n"); 441 return false; 442 } 443 444 return true; 445 } 446 447 static bool kvm_is_svm_supported(void) 448 { 449 bool supported; 450 451 migrate_disable(); 452 supported = __kvm_is_svm_supported(); 453 migrate_enable(); 454 455 return supported; 456 } 457 458 static int svm_check_processor_compat(void) 459 { 460 if (!__kvm_is_svm_supported()) 461 return -EIO; 462 463 return 0; 464 } 465 466 static void __svm_write_tsc_multiplier(u64 multiplier) 467 { 468 if (multiplier == __this_cpu_read(current_tsc_ratio)) 469 return; 470 471 wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); 472 __this_cpu_write(current_tsc_ratio, multiplier); 473 } 474 475 static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) 476 { 477 return &sd->save_area->host_sev_es_save; 478 } 479 480 static inline void kvm_cpu_svm_disable(void) 481 { 482 uint64_t efer; 483 484 wrmsrq(MSR_VM_HSAVE_PA, 0); 485 rdmsrq(MSR_EFER, efer); 486 if (efer & EFER_SVME) { 487 /* 488 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and 489 * NMI aren't blocked. 
490 */ 491 stgi(); 492 wrmsrq(MSR_EFER, efer & ~EFER_SVME); 493 } 494 } 495 496 static void svm_emergency_disable_virtualization_cpu(void) 497 { 498 kvm_rebooting = true; 499 500 kvm_cpu_svm_disable(); 501 } 502 503 static void svm_disable_virtualization_cpu(void) 504 { 505 /* Make sure we clean up behind us */ 506 if (tsc_scaling) 507 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 508 509 kvm_cpu_svm_disable(); 510 511 amd_pmu_disable_virt(); 512 } 513 514 static int svm_enable_virtualization_cpu(void) 515 { 516 517 struct svm_cpu_data *sd; 518 uint64_t efer; 519 int me = raw_smp_processor_id(); 520 521 rdmsrq(MSR_EFER, efer); 522 if (efer & EFER_SVME) 523 return -EBUSY; 524 525 sd = per_cpu_ptr(&svm_data, me); 526 sd->asid_generation = 1; 527 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 528 sd->next_asid = sd->max_asid + 1; 529 sd->min_asid = max_sev_asid + 1; 530 531 wrmsrq(MSR_EFER, efer | EFER_SVME); 532 533 wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); 534 535 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { 536 /* 537 * Set the default value, even if we don't use TSC scaling 538 * to avoid having stale value in the msr 539 */ 540 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); 541 } 542 543 544 /* 545 * Get OSVW bits. 546 * 547 * Note that it is possible to have a system with mixed processor 548 * revisions and therefore different OSVW bits. If bits are not the same 549 * on different processors then choose the worst case (i.e. if erratum 550 * is present on one processor and not on another then assume that the 551 * erratum is present everywhere). 
552 */ 553 if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { 554 u64 len, status = 0; 555 int err; 556 557 err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); 558 if (!err) 559 err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); 560 561 if (err) 562 osvw_status = osvw_len = 0; 563 else { 564 if (len < osvw_len) 565 osvw_len = len; 566 osvw_status |= status; 567 osvw_status &= (1ULL << osvw_len) - 1; 568 } 569 } else 570 osvw_status = osvw_len = 0; 571 572 svm_init_erratum_383(); 573 574 amd_pmu_enable_virt(); 575 576 return 0; 577 } 578 579 static void svm_cpu_uninit(int cpu) 580 { 581 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 582 583 if (!sd->save_area) 584 return; 585 586 kfree(sd->sev_vmcbs); 587 __free_page(__sme_pa_to_page(sd->save_area_pa)); 588 sd->save_area_pa = 0; 589 sd->save_area = NULL; 590 } 591 592 static int svm_cpu_init(int cpu) 593 { 594 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 595 struct page *save_area_page; 596 int ret = -ENOMEM; 597 598 memset(sd, 0, sizeof(struct svm_cpu_data)); 599 save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); 600 if (!save_area_page) 601 return ret; 602 603 ret = sev_cpu_init(sd); 604 if (ret) 605 goto free_save_area; 606 607 sd->save_area = page_address(save_area_page); 608 sd->save_area_pa = __sme_page_pa(save_area_page); 609 return 0; 610 611 free_save_area: 612 __free_page(save_area_page); 613 return ret; 614 615 } 616 617 static void set_dr_intercepts(struct vcpu_svm *svm) 618 { 619 struct vmcb *vmcb = svm->vmcb01.ptr; 620 621 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); 622 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); 623 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); 624 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); 625 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); 626 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); 627 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); 628 
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); 629 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); 630 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); 631 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); 632 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); 633 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); 634 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); 635 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 636 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); 637 638 recalc_intercepts(svm); 639 } 640 641 static void clr_dr_intercepts(struct vcpu_svm *svm) 642 { 643 struct vmcb *vmcb = svm->vmcb01.ptr; 644 645 vmcb->control.intercepts[INTERCEPT_DR] = 0; 646 647 recalc_intercepts(svm); 648 } 649 650 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) 651 { 652 /* 653 * For non-nested case: 654 * If the L01 MSR bitmap does not intercept the MSR, then we need to 655 * save it. 656 * 657 * For nested case: 658 * If the L02 MSR bitmap does not intercept the MSR, then we need to 659 * save it. 660 */ 661 void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm : 662 to_svm(vcpu)->msrpm; 663 664 return svm_test_msr_bitmap_write(msrpm, msr); 665 } 666 667 void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) 668 { 669 struct vcpu_svm *svm = to_svm(vcpu); 670 void *msrpm = svm->msrpm; 671 672 /* Don't disable interception for MSRs userspace wants to handle. 
*/ 673 if (type & MSR_TYPE_R) { 674 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 675 svm_clear_msr_bitmap_read(msrpm, msr); 676 else 677 svm_set_msr_bitmap_read(msrpm, msr); 678 } 679 680 if (type & MSR_TYPE_W) { 681 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 682 svm_clear_msr_bitmap_write(msrpm, msr); 683 else 684 svm_set_msr_bitmap_write(msrpm, msr); 685 } 686 687 svm_hv_vmcb_dirty_nested_enlightenments(vcpu); 688 svm->nested.force_msr_bitmap_recalc = true; 689 } 690 691 void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) 692 { 693 unsigned int order = get_order(size); 694 struct page *pages = alloc_pages(gfp_mask, order); 695 void *pm; 696 697 if (!pages) 698 return NULL; 699 700 /* 701 * Set all bits in the permissions map so that all MSR and I/O accesses 702 * are intercepted by default. 703 */ 704 pm = page_address(pages); 705 memset(pm, 0xff, PAGE_SIZE * (1 << order)); 706 707 return pm; 708 } 709 710 static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) 711 { 712 struct vcpu_svm *svm = to_svm(vcpu); 713 bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); 714 715 if (intercept == svm->lbr_msrs_intercepted) 716 return; 717 718 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); 719 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); 720 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept); 721 svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept); 722 723 if (sev_es_guest(vcpu->kvm)) 724 svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); 725 726 svm->lbr_msrs_intercepted = intercept; 727 } 728 729 void svm_vcpu_free_msrpm(void *msrpm) 730 { 731 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); 732 } 733 734 static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) 735 { 736 bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu); 
737 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 738 int i; 739 740 if (!enable_mediated_pmu) 741 return; 742 743 /* Legacy counters are always available for AMD CPUs with a PMU. */ 744 for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++) 745 svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i, 746 MSR_TYPE_RW, intercept); 747 748 intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); 749 for (i = 0; i < pmu->nr_arch_gp_counters; i++) 750 svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, 751 MSR_TYPE_RW, intercept); 752 753 for ( ; i < kvm_pmu_cap.num_counters_gp; i++) 754 svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, 755 MSR_TYPE_RW); 756 757 intercept = kvm_need_perf_global_ctrl_intercept(vcpu); 758 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 759 MSR_TYPE_RW, intercept); 760 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 761 MSR_TYPE_RW, intercept); 762 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 763 MSR_TYPE_RW, intercept); 764 svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, 765 MSR_TYPE_RW, intercept); 766 } 767 768 static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 769 { 770 struct vcpu_svm *svm = to_svm(vcpu); 771 772 svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW); 773 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 774 775 #ifdef CONFIG_X86_64 776 svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 777 svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 778 svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 779 svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW); 780 svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW); 781 svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW); 782 #endif 783 784 if (lbrv) 785 svm_recalc_lbr_msr_intercepts(vcpu); 786 787 if (cpu_feature_enabled(X86_FEATURE_IBPB)) 788 
svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 789 !guest_has_pred_cmd_msr(vcpu)); 790 791 if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) 792 svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 793 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 794 795 /* 796 * Disable interception of SPEC_CTRL if KVM doesn't need to manually 797 * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if 798 * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively 799 * using SPEC_CTRL. 800 */ 801 if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL)) 802 svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 803 !guest_has_spec_ctrl_msr(vcpu)); 804 else 805 svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, 806 !svm->spec_ctrl); 807 808 /* 809 * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU, 810 * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits. 811 */ 812 svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW, 813 guest_cpuid_is_intel_compatible(vcpu)); 814 svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, 815 guest_cpuid_is_intel_compatible(vcpu)); 816 817 if (kvm_aperfmperf_in_guest(vcpu->kvm)) { 818 svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); 819 svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); 820 } 821 822 if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { 823 bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); 824 825 svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled); 826 svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled); 827 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled); 828 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled); 829 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled); 830 svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, 
!shstk_enabled); 831 } 832 833 if (sev_es_guest(vcpu->kvm)) 834 sev_es_recalc_msr_intercepts(vcpu); 835 836 svm_recalc_pmu_msr_intercepts(vcpu); 837 838 /* 839 * x2APIC intercepts are modified on-demand and cannot be filtered by 840 * userspace. 841 */ 842 } 843 844 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) 845 { 846 to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; 847 to_vmcb->save.br_from = from_vmcb->save.br_from; 848 to_vmcb->save.br_to = from_vmcb->save.br_to; 849 to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; 850 to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; 851 852 vmcb_mark_dirty(to_vmcb, VMCB_LBR); 853 } 854 855 static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) 856 { 857 to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; 858 } 859 860 void svm_enable_lbrv(struct kvm_vcpu *vcpu) 861 { 862 __svm_enable_lbrv(vcpu); 863 svm_recalc_lbr_msr_intercepts(vcpu); 864 } 865 866 static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) 867 { 868 KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); 869 to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; 870 } 871 872 void svm_update_lbrv(struct kvm_vcpu *vcpu) 873 { 874 struct vcpu_svm *svm = to_svm(vcpu); 875 bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; 876 bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || 877 (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && 878 (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); 879 880 if (enable_lbrv && !current_enable_lbrv) 881 __svm_enable_lbrv(vcpu); 882 else if (!enable_lbrv && current_enable_lbrv) 883 __svm_disable_lbrv(vcpu); 884 885 /* 886 * During nested transitions, it is possible that the current VMCB has 887 * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). 888 * In this case, even though LBR_CTL does not need an update, intercepts 889 * do, so always recalculate the intercepts here. 
890 */ 891 svm_recalc_lbr_msr_intercepts(vcpu); 892 } 893 894 void disable_nmi_singlestep(struct vcpu_svm *svm) 895 { 896 svm->nmi_singlestep = false; 897 898 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { 899 /* Clear our flags if they were not set by the guest */ 900 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 901 svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; 902 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 903 svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; 904 } 905 } 906 907 static void grow_ple_window(struct kvm_vcpu *vcpu) 908 { 909 struct vcpu_svm *svm = to_svm(vcpu); 910 struct vmcb_control_area *control = &svm->vmcb->control; 911 int old = control->pause_filter_count; 912 913 if (kvm_pause_in_guest(vcpu->kvm)) 914 return; 915 916 control->pause_filter_count = __grow_ple_window(old, 917 pause_filter_count, 918 pause_filter_count_grow, 919 pause_filter_count_max); 920 921 if (control->pause_filter_count != old) { 922 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 923 trace_kvm_ple_window_update(vcpu->vcpu_id, 924 control->pause_filter_count, old); 925 } 926 } 927 928 static void shrink_ple_window(struct kvm_vcpu *vcpu) 929 { 930 struct vcpu_svm *svm = to_svm(vcpu); 931 struct vmcb_control_area *control = &svm->vmcb->control; 932 int old = control->pause_filter_count; 933 934 if (kvm_pause_in_guest(vcpu->kvm)) 935 return; 936 937 control->pause_filter_count = 938 __shrink_ple_window(old, 939 pause_filter_count, 940 pause_filter_count_shrink, 941 pause_filter_count); 942 if (control->pause_filter_count != old) { 943 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 944 trace_kvm_ple_window_update(vcpu->vcpu_id, 945 control->pause_filter_count, old); 946 } 947 } 948 949 static void svm_hardware_unsetup(void) 950 { 951 int cpu; 952 953 avic_hardware_unsetup(); 954 955 sev_hardware_unsetup(); 956 957 for_each_possible_cpu(cpu) 958 svm_cpu_uninit(cpu); 959 960 __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); 961 iopm_base = 0; 
962 } 963 964 static void init_seg(struct vmcb_seg *seg) 965 { 966 seg->selector = 0; 967 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 968 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 969 seg->limit = 0xffff; 970 seg->base = 0; 971 } 972 973 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) 974 { 975 seg->selector = 0; 976 seg->attrib = SVM_SELECTOR_P_MASK | type; 977 seg->limit = 0xffff; 978 seg->base = 0; 979 } 980 981 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 982 { 983 struct vcpu_svm *svm = to_svm(vcpu); 984 985 return svm->nested.ctl.tsc_offset; 986 } 987 988 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 989 { 990 struct vcpu_svm *svm = to_svm(vcpu); 991 992 return svm->tsc_ratio_msr; 993 } 994 995 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) 996 { 997 struct vcpu_svm *svm = to_svm(vcpu); 998 999 svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; 1000 svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; 1001 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 1002 } 1003 1004 void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1005 { 1006 preempt_disable(); 1007 if (to_svm(vcpu)->guest_state_loaded) 1008 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); 1009 preempt_enable(); 1010 } 1011 1012 /* Evaluate instruction intercepts that depend on guest CPUID features. */ 1013 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 1014 { 1015 struct vcpu_svm *svm = to_svm(vcpu); 1016 1017 /* 1018 * Intercept INVPCID if shadow paging is enabled to sync/free shadow 1019 * roots, or if INVPCID is disabled in the guest to inject #UD. 
1020 */ 1021 if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { 1022 if (!npt_enabled || 1023 !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) 1024 svm_set_intercept(svm, INTERCEPT_INVPCID); 1025 else 1026 svm_clr_intercept(svm, INTERCEPT_INVPCID); 1027 } 1028 1029 if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { 1030 if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) 1031 svm_clr_intercept(svm, INTERCEPT_RDTSCP); 1032 else 1033 svm_set_intercept(svm, INTERCEPT_RDTSCP); 1034 } 1035 1036 /* 1037 * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is 1038 * always set if vls is enabled. If the intercepts are set, the bit is 1039 * meaningless anyway. 1040 */ 1041 if (guest_cpuid_is_intel_compatible(vcpu)) { 1042 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1043 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1044 } else { 1045 /* 1046 * If hardware supports Virtual VMLOAD VMSAVE then enable it 1047 * in VMCB and clear intercepts to avoid #VMEXIT. 1048 */ 1049 if (vls) { 1050 svm_clr_intercept(svm, INTERCEPT_VMLOAD); 1051 svm_clr_intercept(svm, INTERCEPT_VMSAVE); 1052 } 1053 } 1054 1055 if (kvm_need_rdpmc_intercept(vcpu)) 1056 svm_set_intercept(svm, INTERCEPT_RDPMC); 1057 else 1058 svm_clr_intercept(svm, INTERCEPT_RDPMC); 1059 } 1060 1061 static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) 1062 { 1063 svm_recalc_instruction_intercepts(vcpu); 1064 svm_recalc_msr_intercepts(vcpu); 1065 } 1066 1067 static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) 1068 { 1069 struct vcpu_svm *svm = to_svm(vcpu); 1070 struct vmcb *vmcb = svm->vmcb01.ptr; 1071 struct vmcb_control_area *control = &vmcb->control; 1072 struct vmcb_save_area *save = &vmcb->save; 1073 1074 svm_set_intercept(svm, INTERCEPT_CR0_READ); 1075 svm_set_intercept(svm, INTERCEPT_CR3_READ); 1076 svm_set_intercept(svm, INTERCEPT_CR4_READ); 1077 svm_set_intercept(svm, INTERCEPT_CR0_WRITE); 1078 svm_set_intercept(svm, INTERCEPT_CR3_WRITE); 1079 svm_set_intercept(svm, INTERCEPT_CR4_WRITE); 1080 svm_set_intercept(svm, 
INTERCEPT_CR8_WRITE); 1081 1082 set_dr_intercepts(svm); 1083 1084 set_exception_intercept(svm, PF_VECTOR); 1085 set_exception_intercept(svm, UD_VECTOR); 1086 set_exception_intercept(svm, MC_VECTOR); 1087 set_exception_intercept(svm, AC_VECTOR); 1088 set_exception_intercept(svm, DB_VECTOR); 1089 /* 1090 * Guest access to VMware backdoor ports could legitimately 1091 * trigger #GP because of TSS I/O permission bitmap. 1092 * We intercept those #GP and allow access to them anyway 1093 * as VMware does. 1094 */ 1095 if (enable_vmware_backdoor) 1096 set_exception_intercept(svm, GP_VECTOR); 1097 1098 svm_set_intercept(svm, INTERCEPT_INTR); 1099 svm_set_intercept(svm, INTERCEPT_NMI); 1100 1101 if (intercept_smi) 1102 svm_set_intercept(svm, INTERCEPT_SMI); 1103 1104 svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); 1105 svm_set_intercept(svm, INTERCEPT_RDPMC); 1106 svm_set_intercept(svm, INTERCEPT_CPUID); 1107 svm_set_intercept(svm, INTERCEPT_INVD); 1108 svm_set_intercept(svm, INTERCEPT_INVLPG); 1109 svm_set_intercept(svm, INTERCEPT_INVLPGA); 1110 svm_set_intercept(svm, INTERCEPT_IOIO_PROT); 1111 svm_set_intercept(svm, INTERCEPT_MSR_PROT); 1112 svm_set_intercept(svm, INTERCEPT_TASK_SWITCH); 1113 svm_set_intercept(svm, INTERCEPT_SHUTDOWN); 1114 svm_set_intercept(svm, INTERCEPT_VMRUN); 1115 svm_set_intercept(svm, INTERCEPT_VMMCALL); 1116 svm_set_intercept(svm, INTERCEPT_VMLOAD); 1117 svm_set_intercept(svm, INTERCEPT_VMSAVE); 1118 svm_set_intercept(svm, INTERCEPT_STGI); 1119 svm_set_intercept(svm, INTERCEPT_CLGI); 1120 svm_set_intercept(svm, INTERCEPT_SKINIT); 1121 svm_set_intercept(svm, INTERCEPT_WBINVD); 1122 svm_set_intercept(svm, INTERCEPT_XSETBV); 1123 svm_set_intercept(svm, INTERCEPT_RDPRU); 1124 svm_set_intercept(svm, INTERCEPT_RSM); 1125 1126 if (!kvm_mwait_in_guest(vcpu->kvm)) { 1127 svm_set_intercept(svm, INTERCEPT_MONITOR); 1128 svm_set_intercept(svm, INTERCEPT_MWAIT); 1129 } 1130 1131 if (!kvm_hlt_in_guest(vcpu->kvm)) { 1132 if 
(cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) 1133 svm_set_intercept(svm, INTERCEPT_IDLE_HLT); 1134 else 1135 svm_set_intercept(svm, INTERCEPT_HLT); 1136 } 1137 1138 control->iopm_base_pa = iopm_base; 1139 control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1140 control->int_ctl = V_INTR_MASKING_MASK; 1141 1142 init_seg(&save->es); 1143 init_seg(&save->ss); 1144 init_seg(&save->ds); 1145 init_seg(&save->fs); 1146 init_seg(&save->gs); 1147 1148 save->cs.selector = 0xf000; 1149 save->cs.base = 0xffff0000; 1150 /* Executable/Readable Code Segment */ 1151 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | 1152 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; 1153 save->cs.limit = 0xffff; 1154 1155 save->gdtr.base = 0; 1156 save->gdtr.limit = 0xffff; 1157 save->idtr.base = 0; 1158 save->idtr.limit = 0xffff; 1159 1160 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1161 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1162 1163 if (npt_enabled) { 1164 /* Setup VMCB for Nested Paging */ 1165 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; 1166 svm_clr_intercept(svm, INTERCEPT_INVLPG); 1167 clr_exception_intercept(svm, PF_VECTOR); 1168 svm_clr_intercept(svm, INTERCEPT_CR3_READ); 1169 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); 1170 save->g_pat = vcpu->arch.pat; 1171 save->cr3 = 0; 1172 } 1173 svm->current_vmcb->asid_generation = 0; 1174 svm->asid = 0; 1175 1176 svm->nested.vmcb12_gpa = INVALID_GPA; 1177 svm->nested.last_vmcb12_gpa = INVALID_GPA; 1178 1179 if (!kvm_pause_in_guest(vcpu->kvm)) { 1180 control->pause_filter_count = pause_filter_count; 1181 if (pause_filter_thresh) 1182 control->pause_filter_thresh = pause_filter_thresh; 1183 svm_set_intercept(svm, INTERCEPT_PAUSE); 1184 } else { 1185 svm_clr_intercept(svm, INTERCEPT_PAUSE); 1186 } 1187 1188 if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) 1189 svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP; 1190 1191 if (enable_apicv && irqchip_in_kernel(vcpu->kvm)) 1192 avic_init_vmcb(svm, vmcb); 1193 1194 if 
(vnmi) 1195 svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; 1196 1197 if (vgif) { 1198 svm_clr_intercept(svm, INTERCEPT_STGI); 1199 svm_clr_intercept(svm, INTERCEPT_CLGI); 1200 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; 1201 } 1202 1203 if (vls) 1204 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; 1205 1206 if (vcpu->kvm->arch.bus_lock_detection_enabled) 1207 svm_set_intercept(svm, INTERCEPT_BUSLOCK); 1208 1209 if (sev_guest(vcpu->kvm)) 1210 sev_init_vmcb(svm, init_event); 1211 1212 svm_hv_init_vmcb(vmcb); 1213 1214 kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); 1215 1216 vmcb_mark_all_dirty(vmcb); 1217 1218 enable_gif(svm); 1219 } 1220 1221 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) 1222 { 1223 struct vcpu_svm *svm = to_svm(vcpu); 1224 1225 svm_init_osvw(vcpu); 1226 1227 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 1228 vcpu->arch.microcode_version = 0x01000065; 1229 svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; 1230 1231 svm->nmi_masked = false; 1232 svm->awaiting_iret_completion = false; 1233 } 1234 1235 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 1236 { 1237 struct vcpu_svm *svm = to_svm(vcpu); 1238 1239 svm->spec_ctrl = 0; 1240 svm->virt_spec_ctrl = 0; 1241 1242 init_vmcb(vcpu, init_event); 1243 1244 if (!init_event) 1245 __svm_vcpu_reset(vcpu); 1246 } 1247 1248 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) 1249 { 1250 svm->current_vmcb = target_vmcb; 1251 svm->vmcb = target_vmcb->ptr; 1252 } 1253 1254 static int svm_vcpu_precreate(struct kvm *kvm) 1255 { 1256 return avic_alloc_physical_id_table(kvm); 1257 } 1258 1259 static int svm_vcpu_create(struct kvm_vcpu *vcpu) 1260 { 1261 struct vcpu_svm *svm; 1262 struct page *vmcb01_page; 1263 int err; 1264 1265 BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); 1266 svm = to_svm(vcpu); 1267 1268 err = -ENOMEM; 1269 vmcb01_page = snp_safe_alloc_page(); 1270 if (!vmcb01_page) 1271 goto out; 
1272 1273 err = sev_vcpu_create(vcpu); 1274 if (err) 1275 goto error_free_vmcb_page; 1276 1277 err = avic_init_vcpu(svm); 1278 if (err) 1279 goto error_free_sev; 1280 1281 svm->msrpm = svm_vcpu_alloc_msrpm(); 1282 if (!svm->msrpm) { 1283 err = -ENOMEM; 1284 goto error_free_sev; 1285 } 1286 1287 svm->x2avic_msrs_intercepted = true; 1288 svm->lbr_msrs_intercepted = true; 1289 1290 svm->vmcb01.ptr = page_address(vmcb01_page); 1291 svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); 1292 svm_switch_vmcb(svm, &svm->vmcb01); 1293 1294 svm->guest_state_loaded = false; 1295 1296 return 0; 1297 1298 error_free_sev: 1299 sev_free_vcpu(vcpu); 1300 error_free_vmcb_page: 1301 __free_page(vmcb01_page); 1302 out: 1303 return err; 1304 } 1305 1306 static void svm_vcpu_free(struct kvm_vcpu *vcpu) 1307 { 1308 struct vcpu_svm *svm = to_svm(vcpu); 1309 1310 WARN_ON_ONCE(!list_empty(&svm->ir_list)); 1311 1312 svm_leave_nested(vcpu); 1313 svm_free_nested(svm); 1314 1315 sev_free_vcpu(vcpu); 1316 1317 __free_page(__sme_pa_to_page(svm->vmcb01.pa)); 1318 svm_vcpu_free_msrpm(svm->msrpm); 1319 } 1320 1321 #ifdef CONFIG_CPU_MITIGATIONS 1322 static DEFINE_SPINLOCK(srso_lock); 1323 static atomic_t srso_nr_vms; 1324 1325 static void svm_srso_clear_bp_spec_reduce(void *ign) 1326 { 1327 struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); 1328 1329 if (!sd->bp_spec_reduce_set) 1330 return; 1331 1332 msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); 1333 sd->bp_spec_reduce_set = false; 1334 } 1335 1336 static void svm_srso_vm_destroy(void) 1337 { 1338 if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) 1339 return; 1340 1341 if (atomic_dec_return(&srso_nr_vms)) 1342 return; 1343 1344 guard(spinlock)(&srso_lock); 1345 1346 /* 1347 * Verify a new VM didn't come along, acquire the lock, and increment 1348 * the count before this task acquired the lock. 
1349 */ 1350 if (atomic_read(&srso_nr_vms)) 1351 return; 1352 1353 on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); 1354 } 1355 1356 static void svm_srso_vm_init(void) 1357 { 1358 if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) 1359 return; 1360 1361 /* 1362 * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 1363 * transition, i.e. destroying the last VM, is fully complete, e.g. so 1364 * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. 1365 */ 1366 if (atomic_inc_not_zero(&srso_nr_vms)) 1367 return; 1368 1369 guard(spinlock)(&srso_lock); 1370 1371 atomic_inc(&srso_nr_vms); 1372 } 1373 #else 1374 static void svm_srso_vm_init(void) { } 1375 static void svm_srso_vm_destroy(void) { } 1376 #endif 1377 1378 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1379 { 1380 struct vcpu_svm *svm = to_svm(vcpu); 1381 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 1382 1383 if (sev_es_guest(vcpu->kvm)) 1384 sev_es_unmap_ghcb(svm); 1385 1386 if (svm->guest_state_loaded) 1387 return; 1388 1389 /* 1390 * Save additional host state that will be restored on VMEXIT (sev-es) 1391 * or subsequent vmload of host save area. 1392 */ 1393 vmsave(sd->save_area_pa); 1394 if (sev_es_guest(vcpu->kvm)) 1395 sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); 1396 1397 if (tsc_scaling) 1398 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); 1399 1400 /* 1401 * TSC_AUX is always virtualized (context switched by hardware) for 1402 * SEV-ES guests when the feature is available. For non-SEV-ES guests, 1403 * context switch TSC_AUX via the user_return MSR infrastructure (not 1404 * all CPUs support TSC_AUX virtualization). 
1405 */ 1406 if (likely(tsc_aux_uret_slot >= 0) && 1407 (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) 1408 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); 1409 1410 if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && 1411 !sd->bp_spec_reduce_set) { 1412 sd->bp_spec_reduce_set = true; 1413 msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); 1414 } 1415 svm->guest_state_loaded = true; 1416 } 1417 1418 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) 1419 { 1420 to_svm(vcpu)->guest_state_loaded = false; 1421 } 1422 1423 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1424 { 1425 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1426 shrink_ple_window(vcpu); 1427 1428 if (kvm_vcpu_apicv_active(vcpu)) 1429 avic_vcpu_load(vcpu, cpu); 1430 } 1431 1432 static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1433 { 1434 if (kvm_vcpu_apicv_active(vcpu)) 1435 avic_vcpu_put(vcpu); 1436 1437 svm_prepare_host_switch(vcpu); 1438 1439 ++vcpu->stat.host_state_reload; 1440 } 1441 1442 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1443 { 1444 struct vcpu_svm *svm = to_svm(vcpu); 1445 unsigned long rflags = svm->vmcb->save.rflags; 1446 1447 if (svm->nmi_singlestep) { 1448 /* Hide our flags if they were not set by the guest */ 1449 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) 1450 rflags &= ~X86_EFLAGS_TF; 1451 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) 1452 rflags &= ~X86_EFLAGS_RF; 1453 } 1454 return rflags; 1455 } 1456 1457 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1458 { 1459 if (to_svm(vcpu)->nmi_singlestep) 1460 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 1461 1462 /* 1463 * Any change of EFLAGS.VM is accompanied by a reload of SS 1464 * (caused by either a task switch or an inter-privilege IRET), 1465 * so we do not need to update the CPL here. 
1466 */ 1467 to_svm(vcpu)->vmcb->save.rflags = rflags; 1468 } 1469 1470 static bool svm_get_if_flag(struct kvm_vcpu *vcpu) 1471 { 1472 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1473 1474 return sev_es_guest(vcpu->kvm) 1475 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK 1476 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; 1477 } 1478 1479 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 1480 { 1481 kvm_register_mark_available(vcpu, reg); 1482 1483 switch (reg) { 1484 case VCPU_EXREG_PDPTR: 1485 /* 1486 * When !npt_enabled, mmu->pdptrs[] is already available since 1487 * it is always updated per SDM when moving to CRs. 1488 */ 1489 if (npt_enabled) 1490 load_pdptrs(vcpu, kvm_read_cr3(vcpu)); 1491 break; 1492 default: 1493 KVM_BUG_ON(1, vcpu->kvm); 1494 } 1495 } 1496 1497 static void svm_set_vintr(struct vcpu_svm *svm) 1498 { 1499 struct vmcb_control_area *control; 1500 1501 /* 1502 * The following fields are ignored when AVIC is enabled 1503 */ 1504 WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); 1505 1506 svm_set_intercept(svm, INTERCEPT_VINTR); 1507 1508 /* 1509 * Recalculating intercepts may have cleared the VINTR intercept. If 1510 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF 1511 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. 1512 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as 1513 * interrupts will never be unblocked while L2 is running. 1514 */ 1515 if (!svm_is_intercept(svm, INTERCEPT_VINTR)) 1516 return; 1517 1518 /* 1519 * This is just a dummy VINTR to actually cause a vmexit to happen. 1520 * Actual injection of virtual interrupts happens through EVENTINJ. 
1521 */ 1522 control = &svm->vmcb->control; 1523 control->int_vector = 0x0; 1524 control->int_ctl &= ~V_INTR_PRIO_MASK; 1525 control->int_ctl |= V_IRQ_MASK | 1526 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 1527 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 1528 } 1529 1530 static void svm_clear_vintr(struct vcpu_svm *svm) 1531 { 1532 svm_clr_intercept(svm, INTERCEPT_VINTR); 1533 1534 /* Drop int_ctl fields related to VINTR injection. */ 1535 svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; 1536 if (is_guest_mode(&svm->vcpu)) { 1537 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; 1538 1539 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != 1540 (svm->nested.ctl.int_ctl & V_TPR_MASK)); 1541 1542 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & 1543 V_IRQ_INJECTION_BITS_MASK; 1544 1545 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; 1546 } 1547 1548 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 1549 } 1550 1551 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1552 { 1553 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 1554 struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save; 1555 1556 switch (seg) { 1557 case VCPU_SREG_CS: return &save->cs; 1558 case VCPU_SREG_DS: return &save->ds; 1559 case VCPU_SREG_ES: return &save->es; 1560 case VCPU_SREG_FS: return &save01->fs; 1561 case VCPU_SREG_GS: return &save01->gs; 1562 case VCPU_SREG_SS: return &save->ss; 1563 case VCPU_SREG_TR: return &save01->tr; 1564 case VCPU_SREG_LDTR: return &save01->ldtr; 1565 } 1566 BUG(); 1567 return NULL; 1568 } 1569 1570 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) 1571 { 1572 struct vmcb_seg *s = svm_seg(vcpu, seg); 1573 1574 return s->base; 1575 } 1576 1577 static void svm_get_segment(struct kvm_vcpu *vcpu, 1578 struct kvm_segment *var, int seg) 1579 { 1580 struct vmcb_seg *s = svm_seg(vcpu, seg); 1581 1582 var->base = s->base; 1583 var->limit = s->limit; 1584 var->selector = s->selector; 
1585 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; 1586 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; 1587 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; 1588 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; 1589 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; 1590 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; 1591 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1592 1593 /* 1594 * AMD CPUs circa 2014 track the G bit for all segments except CS. 1595 * However, the SVM spec states that the G bit is not observed by the 1596 * CPU, and some VMware virtual CPUs drop the G bit for all segments. 1597 * So let's synthesize a legal G bit for all segments, this helps 1598 * running KVM nested. It also helps cross-vendor migration, because 1599 * Intel's vmentry has a check on the 'G' bit. 1600 */ 1601 var->g = s->limit > 0xfffff; 1602 1603 /* 1604 * AMD's VMCB does not have an explicit unusable field, so emulate it 1605 * for cross vendor migration purposes by "not present" 1606 */ 1607 var->unusable = !var->present; 1608 1609 switch (seg) { 1610 case VCPU_SREG_TR: 1611 /* 1612 * Work around a bug where the busy flag in the tr selector 1613 * isn't exposed 1614 */ 1615 var->type |= 0x2; 1616 break; 1617 case VCPU_SREG_DS: 1618 case VCPU_SREG_ES: 1619 case VCPU_SREG_FS: 1620 case VCPU_SREG_GS: 1621 /* 1622 * The accessed bit must always be set in the segment 1623 * descriptor cache, although it can be cleared in the 1624 * descriptor, the cached bit always remains at 1. Since 1625 * Intel has a check on this, set it here to support 1626 * cross-vendor migration. 1627 */ 1628 if (!var->unusable) 1629 var->type |= 0x1; 1630 break; 1631 case VCPU_SREG_SS: 1632 /* 1633 * On AMD CPUs sometimes the DB bit in the segment 1634 * descriptor is left as 1, although the whole segment has 1635 * been made unusable. Clear it here to pass an Intel VMX 1636 * entry check when cross vendor migrating. 
1637 */ 1638 if (var->unusable) 1639 var->db = 0; 1640 /* This is symmetric with svm_set_segment() */ 1641 var->dpl = to_svm(vcpu)->vmcb->save.cpl; 1642 break; 1643 } 1644 } 1645 1646 static int svm_get_cpl(struct kvm_vcpu *vcpu) 1647 { 1648 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; 1649 1650 return save->cpl; 1651 } 1652 1653 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 1654 { 1655 struct kvm_segment cs; 1656 1657 svm_get_segment(vcpu, &cs, VCPU_SREG_CS); 1658 *db = cs.db; 1659 *l = cs.l; 1660 } 1661 1662 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1663 { 1664 struct vcpu_svm *svm = to_svm(vcpu); 1665 1666 dt->size = svm->vmcb->save.idtr.limit; 1667 dt->address = svm->vmcb->save.idtr.base; 1668 } 1669 1670 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1671 { 1672 struct vcpu_svm *svm = to_svm(vcpu); 1673 1674 svm->vmcb->save.idtr.limit = dt->size; 1675 svm->vmcb->save.idtr.base = dt->address ; 1676 vmcb_mark_dirty(svm->vmcb, VMCB_DT); 1677 } 1678 1679 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1680 { 1681 struct vcpu_svm *svm = to_svm(vcpu); 1682 1683 dt->size = svm->vmcb->save.gdtr.limit; 1684 dt->address = svm->vmcb->save.gdtr.base; 1685 } 1686 1687 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1688 { 1689 struct vcpu_svm *svm = to_svm(vcpu); 1690 1691 svm->vmcb->save.gdtr.limit = dt->size; 1692 svm->vmcb->save.gdtr.base = dt->address ; 1693 vmcb_mark_dirty(svm->vmcb, VMCB_DT); 1694 } 1695 1696 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1697 { 1698 struct vcpu_svm *svm = to_svm(vcpu); 1699 1700 /* 1701 * For guests that don't set guest_state_protected, the cr3 update is 1702 * handled via kvm_mmu_load() while entering the guest. 
For guests 1703 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to 1704 * VMCB save area now, since the save area will become the initial 1705 * contents of the VMSA, and future VMCB save area updates won't be 1706 * seen. 1707 */ 1708 if (sev_es_guest(vcpu->kvm)) { 1709 svm->vmcb->save.cr3 = cr3; 1710 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1711 } 1712 } 1713 1714 static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1715 { 1716 return true; 1717 } 1718 1719 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1720 { 1721 struct vcpu_svm *svm = to_svm(vcpu); 1722 u64 hcr0 = cr0; 1723 bool old_paging = is_paging(vcpu); 1724 1725 #ifdef CONFIG_X86_64 1726 if (vcpu->arch.efer & EFER_LME) { 1727 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1728 vcpu->arch.efer |= EFER_LMA; 1729 if (!vcpu->arch.guest_state_protected) 1730 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1731 } 1732 1733 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1734 vcpu->arch.efer &= ~EFER_LMA; 1735 if (!vcpu->arch.guest_state_protected) 1736 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1737 } 1738 } 1739 #endif 1740 vcpu->arch.cr0 = cr0; 1741 1742 if (!npt_enabled) { 1743 hcr0 |= X86_CR0_PG | X86_CR0_WP; 1744 if (old_paging != is_paging(vcpu)) 1745 svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); 1746 } 1747 1748 /* 1749 * re-enable caching here because the QEMU bios 1750 * does not do it - this results in some delay at 1751 * reboot 1752 */ 1753 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 1754 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1755 1756 svm->vmcb->save.cr0 = hcr0; 1757 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 1758 1759 /* 1760 * SEV-ES guests must always keep the CR intercepts cleared. CR 1761 * tracking is done using the CR write traps. 1762 */ 1763 if (sev_es_guest(vcpu->kvm)) 1764 return; 1765 1766 if (hcr0 == cr0) { 1767 /* Selective CR0 write remains on. 
*/ 1768 svm_clr_intercept(svm, INTERCEPT_CR0_READ); 1769 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); 1770 } else { 1771 svm_set_intercept(svm, INTERCEPT_CR0_READ); 1772 svm_set_intercept(svm, INTERCEPT_CR0_WRITE); 1773 } 1774 } 1775 1776 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1777 { 1778 return true; 1779 } 1780 1781 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1782 { 1783 unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; 1784 unsigned long old_cr4 = vcpu->arch.cr4; 1785 1786 vcpu->arch.cr4 = cr4; 1787 if (!npt_enabled) { 1788 cr4 |= X86_CR4_PAE; 1789 1790 if (!is_paging(vcpu)) 1791 cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 1792 } 1793 cr4 |= host_cr4_mce; 1794 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1795 vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); 1796 1797 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 1798 vcpu->arch.cpuid_dynamic_bits_dirty = true; 1799 } 1800 1801 static void svm_set_segment(struct kvm_vcpu *vcpu, 1802 struct kvm_segment *var, int seg) 1803 { 1804 struct vcpu_svm *svm = to_svm(vcpu); 1805 struct vmcb_seg *s = svm_seg(vcpu, seg); 1806 1807 s->base = var->base; 1808 s->limit = var->limit; 1809 s->selector = var->selector; 1810 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); 1811 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; 1812 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; 1813 s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; 1814 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; 1815 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; 1816 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; 1817 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 1818 1819 /* 1820 * This is always accurate, except if SYSRET returned to a segment 1821 * with SS.DPL != 3. 
Intel does not have this quirk, and always 1822 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it 1823 * would entail passing the CPL to userspace and back. 1824 */ 1825 if (seg == VCPU_SREG_SS) 1826 /* This is symmetric with svm_get_segment() */ 1827 svm->vmcb->save.cpl = (var->dpl & 3); 1828 1829 vmcb_mark_dirty(svm->vmcb, VMCB_SEG); 1830 } 1831 1832 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) 1833 { 1834 struct vcpu_svm *svm = to_svm(vcpu); 1835 1836 clr_exception_intercept(svm, BP_VECTOR); 1837 1838 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1839 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1840 set_exception_intercept(svm, BP_VECTOR); 1841 } 1842 } 1843 1844 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1845 { 1846 if (sd->next_asid > sd->max_asid) { 1847 ++sd->asid_generation; 1848 sd->next_asid = sd->min_asid; 1849 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1850 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 1851 } 1852 1853 svm->current_vmcb->asid_generation = sd->asid_generation; 1854 svm->asid = sd->next_asid++; 1855 } 1856 1857 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) 1858 { 1859 struct vmcb *vmcb = to_svm(vcpu)->vmcb; 1860 1861 if (vcpu->arch.guest_state_protected) 1862 return; 1863 1864 if (unlikely(value != vmcb->save.dr6)) { 1865 vmcb->save.dr6 = value; 1866 vmcb_mark_dirty(vmcb, VMCB_DR); 1867 } 1868 } 1869 1870 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 1871 { 1872 struct vcpu_svm *svm = to_svm(vcpu); 1873 1874 if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) 1875 return; 1876 1877 get_debugreg(vcpu->arch.db[0], 0); 1878 get_debugreg(vcpu->arch.db[1], 1); 1879 get_debugreg(vcpu->arch.db[2], 2); 1880 get_debugreg(vcpu->arch.db[3], 3); 1881 /* 1882 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, 1883 * because db_interception might need it. We can do it before vmentry. 
1884 */ 1885 vcpu->arch.dr6 = svm->vmcb->save.dr6; 1886 vcpu->arch.dr7 = svm->vmcb->save.dr7; 1887 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 1888 set_dr_intercepts(svm); 1889 } 1890 1891 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1892 { 1893 struct vcpu_svm *svm = to_svm(vcpu); 1894 1895 if (vcpu->arch.guest_state_protected) 1896 return; 1897 1898 svm->vmcb->save.dr7 = value; 1899 vmcb_mark_dirty(svm->vmcb, VMCB_DR); 1900 } 1901 1902 static int pf_interception(struct kvm_vcpu *vcpu) 1903 { 1904 struct vcpu_svm *svm = to_svm(vcpu); 1905 1906 u64 fault_address = svm->vmcb->control.exit_info_2; 1907 u64 error_code = svm->vmcb->control.exit_info_1; 1908 1909 return kvm_handle_page_fault(vcpu, error_code, fault_address, 1910 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 1911 svm->vmcb->control.insn_bytes : NULL, 1912 svm->vmcb->control.insn_len); 1913 } 1914 1915 static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1916 void *insn, int insn_len); 1917 1918 static int npf_interception(struct kvm_vcpu *vcpu) 1919 { 1920 struct vcpu_svm *svm = to_svm(vcpu); 1921 int rc; 1922 1923 u64 error_code = svm->vmcb->control.exit_info_1; 1924 gpa_t gpa = svm->vmcb->control.exit_info_2; 1925 1926 /* 1927 * WARN if hardware generates a fault with an error code that collides 1928 * with KVM-defined sythentic flags. Clear the flags and continue on, 1929 * i.e. don't terminate the VM, as KVM can't possibly be relying on a 1930 * flag that KVM doesn't know about. 1931 */ 1932 if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) 1933 error_code &= ~PFERR_SYNTHETIC_MASK; 1934 1935 /* 1936 * Expedite fast MMIO kicks if the next RIP is known and KVM is allowed 1937 * emulate a page fault, e.g. skipping the current instruction is wrong 1938 * if the #NPF occurred while vectoring an event. 
1939 */ 1940 if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) { 1941 const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE; 1942 1943 if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0)) 1944 return 1; 1945 1946 if (nrips && svm->vmcb->control.next_rip && 1947 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 1948 trace_kvm_fast_mmio(gpa); 1949 return kvm_skip_emulated_instruction(vcpu); 1950 } 1951 } 1952 1953 if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) 1954 error_code |= PFERR_PRIVATE_ACCESS; 1955 1956 trace_kvm_page_fault(vcpu, gpa, error_code); 1957 rc = kvm_mmu_page_fault(vcpu, gpa, error_code, 1958 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 1959 svm->vmcb->control.insn_bytes : NULL, 1960 svm->vmcb->control.insn_len); 1961 1962 if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) 1963 sev_handle_rmp_fault(vcpu, gpa, error_code); 1964 1965 return rc; 1966 } 1967 1968 static int db_interception(struct kvm_vcpu *vcpu) 1969 { 1970 struct kvm_run *kvm_run = vcpu->run; 1971 struct vcpu_svm *svm = to_svm(vcpu); 1972 1973 if (!(vcpu->guest_debug & 1974 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1975 !svm->nmi_singlestep) { 1976 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; 1977 kvm_queue_exception_p(vcpu, DB_VECTOR, payload); 1978 return 1; 1979 } 1980 1981 if (svm->nmi_singlestep) { 1982 disable_nmi_singlestep(svm); 1983 /* Make sure we check for pending NMIs upon entry */ 1984 kvm_make_request(KVM_REQ_EVENT, vcpu); 1985 } 1986 1987 if (vcpu->guest_debug & 1988 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { 1989 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1990 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; 1991 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; 1992 kvm_run->debug.arch.pc = 1993 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1994 kvm_run->debug.arch.exception = DB_VECTOR; 1995 return 0; 1996 } 1997 1998 return 1; 1999 } 2000 2001 static int bp_interception(struct kvm_vcpu *vcpu) 2002 
{ 2003 struct vcpu_svm *svm = to_svm(vcpu); 2004 struct kvm_run *kvm_run = vcpu->run; 2005 2006 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2007 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2008 kvm_run->debug.arch.exception = BP_VECTOR; 2009 return 0; 2010 } 2011 2012 static int ud_interception(struct kvm_vcpu *vcpu) 2013 { 2014 return handle_ud(vcpu); 2015 } 2016 2017 static int ac_interception(struct kvm_vcpu *vcpu) 2018 { 2019 kvm_queue_exception_e(vcpu, AC_VECTOR, 0); 2020 return 1; 2021 } 2022 2023 static bool is_erratum_383(void) 2024 { 2025 int i; 2026 u64 value; 2027 2028 if (!erratum_383_found) 2029 return false; 2030 2031 if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) 2032 return false; 2033 2034 /* Bit 62 may or may not be set for this mce */ 2035 value &= ~(1ULL << 62); 2036 2037 if (value != 0xb600000000010015ULL) 2038 return false; 2039 2040 /* Clear MCi_STATUS registers */ 2041 for (i = 0; i < 6; ++i) 2042 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); 2043 2044 if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { 2045 value &= ~(1ULL << 2); 2046 native_write_msr_safe(MSR_IA32_MCG_STATUS, value); 2047 } 2048 2049 /* Flush tlb to evict multi-match entries */ 2050 __flush_tlb_all(); 2051 2052 return true; 2053 } 2054 2055 static void svm_handle_mce(struct kvm_vcpu *vcpu) 2056 { 2057 if (is_erratum_383()) { 2058 /* 2059 * Erratum 383 triggered. Guest state is corrupt so kill the 2060 * guest. 2061 */ 2062 pr_err("Guest triggered AMD Erratum 383\n"); 2063 2064 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2065 2066 return; 2067 } 2068 2069 /* 2070 * On an #MC intercept the MCE handler is not called automatically in 2071 * the host. So do it by hand here. 
2072 */ 2073 kvm_machine_check(); 2074 } 2075 2076 static int mc_interception(struct kvm_vcpu *vcpu) 2077 { 2078 return 1; 2079 } 2080 2081 static int shutdown_interception(struct kvm_vcpu *vcpu) 2082 { 2083 struct kvm_run *kvm_run = vcpu->run; 2084 struct vcpu_svm *svm = to_svm(vcpu); 2085 2086 2087 /* 2088 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put 2089 * the VMCB in a known good state. Unfortuately, KVM doesn't have 2090 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking 2091 * userspace. At a platform view, INIT is acceptable behavior as 2092 * there exist bare metal platforms that automatically INIT the CPU 2093 * in response to shutdown. 2094 * 2095 * The VM save area for SEV-ES guests has already been encrypted so it 2096 * cannot be reinitialized, i.e. synthesizing INIT is futile. 2097 */ 2098 if (!sev_es_guest(vcpu->kvm)) { 2099 clear_page(svm->vmcb); 2100 #ifdef CONFIG_KVM_SMM 2101 if (is_smm(vcpu)) 2102 kvm_smm_changed(vcpu, false); 2103 #endif 2104 kvm_vcpu_reset(vcpu, true); 2105 } 2106 2107 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2108 return 0; 2109 } 2110 2111 static int io_interception(struct kvm_vcpu *vcpu) 2112 { 2113 struct vcpu_svm *svm = to_svm(vcpu); 2114 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? 
*/ 2115 int size, in, string; 2116 unsigned port; 2117 2118 ++vcpu->stat.io_exits; 2119 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2120 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2121 port = io_info >> 16; 2122 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2123 2124 if (string) { 2125 if (sev_es_guest(vcpu->kvm)) 2126 return sev_es_string_io(svm, size, port, in); 2127 else 2128 return kvm_emulate_instruction(vcpu, 0); 2129 } 2130 2131 svm->next_rip = svm->vmcb->control.exit_info_2; 2132 2133 return kvm_fast_pio(vcpu, size, port, in); 2134 } 2135 2136 static int nmi_interception(struct kvm_vcpu *vcpu) 2137 { 2138 return 1; 2139 } 2140 2141 static int smi_interception(struct kvm_vcpu *vcpu) 2142 { 2143 return 1; 2144 } 2145 2146 static int intr_interception(struct kvm_vcpu *vcpu) 2147 { 2148 ++vcpu->stat.irq_exits; 2149 return 1; 2150 } 2151 2152 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 2153 { 2154 struct vcpu_svm *svm = to_svm(vcpu); 2155 struct vmcb *vmcb12; 2156 struct kvm_host_map map; 2157 int ret; 2158 2159 if (nested_svm_check_permissions(vcpu)) 2160 return 1; 2161 2162 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 2163 if (ret) { 2164 if (ret == -EINVAL) 2165 kvm_inject_gp(vcpu, 0); 2166 return 1; 2167 } 2168 2169 vmcb12 = map.hva; 2170 2171 ret = kvm_skip_emulated_instruction(vcpu); 2172 2173 /* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */ 2174 if (vmload) { 2175 svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12); 2176 svm->sysenter_eip_hi = 0; 2177 svm->sysenter_esp_hi = 0; 2178 } else { 2179 svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr); 2180 } 2181 2182 kvm_vcpu_unmap(vcpu, &map); 2183 2184 return ret; 2185 } 2186 2187 static int vmload_interception(struct kvm_vcpu *vcpu) 2188 { 2189 return vmload_vmsave_interception(vcpu, true); 2190 } 2191 2192 static int vmsave_interception(struct kvm_vcpu *vcpu) 2193 { 2194 return vmload_vmsave_interception(vcpu, false); 
2195 } 2196 2197 static int vmrun_interception(struct kvm_vcpu *vcpu) 2198 { 2199 if (nested_svm_check_permissions(vcpu)) 2200 return 1; 2201 2202 return nested_svm_vmrun(vcpu); 2203 } 2204 2205 enum { 2206 NONE_SVM_INSTR, 2207 SVM_INSTR_VMRUN, 2208 SVM_INSTR_VMLOAD, 2209 SVM_INSTR_VMSAVE, 2210 }; 2211 2212 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ 2213 static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2214 { 2215 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2216 2217 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2218 return NONE_SVM_INSTR; 2219 2220 switch (ctxt->modrm) { 2221 case 0xd8: /* VMRUN */ 2222 return SVM_INSTR_VMRUN; 2223 case 0xda: /* VMLOAD */ 2224 return SVM_INSTR_VMLOAD; 2225 case 0xdb: /* VMSAVE */ 2226 return SVM_INSTR_VMSAVE; 2227 default: 2228 break; 2229 } 2230 2231 return NONE_SVM_INSTR; 2232 } 2233 2234 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) 2235 { 2236 const int guest_mode_exit_codes[] = { 2237 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, 2238 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, 2239 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, 2240 }; 2241 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { 2242 [SVM_INSTR_VMRUN] = vmrun_interception, 2243 [SVM_INSTR_VMLOAD] = vmload_interception, 2244 [SVM_INSTR_VMSAVE] = vmsave_interception, 2245 }; 2246 struct vcpu_svm *svm = to_svm(vcpu); 2247 int ret; 2248 2249 if (is_guest_mode(vcpu)) { 2250 /* Returns '1' or -errno on failure, '0' on success. */ 2251 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); 2252 if (ret) 2253 return ret; 2254 return 1; 2255 } 2256 return svm_instr_handlers[opcode](vcpu); 2257 } 2258 2259 /* 2260 * #GP handling code. Note that #GP can be triggered under the following two 2261 * cases: 2262 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on 2263 * some AMD CPUs when EAX of these instructions are in the reserved memory 2264 * regions (e.g. SMM memory on host). 
2265 * 2) VMware backdoor 2266 */ 2267 static int gp_interception(struct kvm_vcpu *vcpu) 2268 { 2269 struct vcpu_svm *svm = to_svm(vcpu); 2270 u32 error_code = svm->vmcb->control.exit_info_1; 2271 int opcode; 2272 2273 /* Both #GP cases have zero error_code */ 2274 if (error_code) 2275 goto reinject; 2276 2277 /* Decode the instruction for usage later */ 2278 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) 2279 goto reinject; 2280 2281 opcode = svm_instr_opcode(vcpu); 2282 2283 if (opcode == NONE_SVM_INSTR) { 2284 if (!enable_vmware_backdoor) 2285 goto reinject; 2286 2287 /* 2288 * VMware backdoor emulation on #GP interception only handles 2289 * IN{S}, OUT{S}, and RDPMC. 2290 */ 2291 if (!is_guest_mode(vcpu)) 2292 return kvm_emulate_instruction(vcpu, 2293 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2294 } else { 2295 /* All SVM instructions expect page aligned RAX */ 2296 if (svm->vmcb->save.rax & ~PAGE_MASK) 2297 goto reinject; 2298 2299 return emulate_svm_instr(vcpu, opcode); 2300 } 2301 2302 reinject: 2303 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2304 return 1; 2305 } 2306 2307 void svm_set_gif(struct vcpu_svm *svm, bool value) 2308 { 2309 if (value) { 2310 /* 2311 * If VGIF is enabled, the STGI intercept is only added to 2312 * detect the opening of the SMI/NMI window; remove it now. 2313 * Likewise, clear the VINTR intercept, we will set it 2314 * again while processing KVM_REQ_EVENT if needed. 2315 */ 2316 if (vgif) 2317 svm_clr_intercept(svm, INTERCEPT_STGI); 2318 if (svm_is_intercept(svm, INTERCEPT_VINTR)) 2319 svm_clear_vintr(svm); 2320 2321 enable_gif(svm); 2322 if (svm->vcpu.arch.smi_pending || 2323 svm->vcpu.arch.nmi_pending || 2324 kvm_cpu_has_injectable_intr(&svm->vcpu) || 2325 kvm_apic_has_pending_init_or_sipi(&svm->vcpu)) 2326 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 2327 } else { 2328 disable_gif(svm); 2329 2330 /* 2331 * After a CLGI no interrupts should come. 
But if vGIF is 2332 * in use, we still rely on the VINTR intercept (rather than 2333 * STGI) to detect an open interrupt window. 2334 */ 2335 if (!vgif) 2336 svm_clear_vintr(svm); 2337 } 2338 } 2339 2340 static int stgi_interception(struct kvm_vcpu *vcpu) 2341 { 2342 int ret; 2343 2344 if (nested_svm_check_permissions(vcpu)) 2345 return 1; 2346 2347 ret = kvm_skip_emulated_instruction(vcpu); 2348 svm_set_gif(to_svm(vcpu), true); 2349 return ret; 2350 } 2351 2352 static int clgi_interception(struct kvm_vcpu *vcpu) 2353 { 2354 int ret; 2355 2356 if (nested_svm_check_permissions(vcpu)) 2357 return 1; 2358 2359 ret = kvm_skip_emulated_instruction(vcpu); 2360 svm_set_gif(to_svm(vcpu), false); 2361 return ret; 2362 } 2363 2364 static int invlpga_interception(struct kvm_vcpu *vcpu) 2365 { 2366 gva_t gva = kvm_rax_read(vcpu); 2367 u32 asid = kvm_rcx_read(vcpu); 2368 2369 /* FIXME: Handle an address size prefix. */ 2370 if (!is_long_mode(vcpu)) 2371 gva = (u32)gva; 2372 2373 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); 2374 2375 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
*/ 2376 kvm_mmu_invlpg(vcpu, gva); 2377 2378 return kvm_skip_emulated_instruction(vcpu); 2379 } 2380 2381 static int skinit_interception(struct kvm_vcpu *vcpu) 2382 { 2383 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); 2384 2385 kvm_queue_exception(vcpu, UD_VECTOR); 2386 return 1; 2387 } 2388 2389 static int task_switch_interception(struct kvm_vcpu *vcpu) 2390 { 2391 struct vcpu_svm *svm = to_svm(vcpu); 2392 u16 tss_selector; 2393 int reason; 2394 int int_type = svm->vmcb->control.exit_int_info & 2395 SVM_EXITINTINFO_TYPE_MASK; 2396 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 2397 uint32_t type = 2398 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2399 uint32_t idt_v = 2400 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2401 bool has_error_code = false; 2402 u32 error_code = 0; 2403 2404 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2405 2406 if (svm->vmcb->control.exit_info_2 & 2407 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 2408 reason = TASK_SWITCH_IRET; 2409 else if (svm->vmcb->control.exit_info_2 & 2410 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 2411 reason = TASK_SWITCH_JMP; 2412 else if (idt_v) 2413 reason = TASK_SWITCH_GATE; 2414 else 2415 reason = TASK_SWITCH_CALL; 2416 2417 if (reason == TASK_SWITCH_GATE) { 2418 switch (type) { 2419 case SVM_EXITINTINFO_TYPE_NMI: 2420 vcpu->arch.nmi_injected = false; 2421 break; 2422 case SVM_EXITINTINFO_TYPE_EXEPT: 2423 if (svm->vmcb->control.exit_info_2 & 2424 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 2425 has_error_code = true; 2426 error_code = 2427 (u32)svm->vmcb->control.exit_info_2; 2428 } 2429 kvm_clear_exception_queue(vcpu); 2430 break; 2431 case SVM_EXITINTINFO_TYPE_INTR: 2432 case SVM_EXITINTINFO_TYPE_SOFT: 2433 kvm_clear_interrupt_queue(vcpu); 2434 break; 2435 default: 2436 break; 2437 } 2438 } 2439 2440 if (reason != TASK_SWITCH_GATE || 2441 int_type == SVM_EXITINTINFO_TYPE_SOFT || 2442 (int_type == 
SVM_EXITINTINFO_TYPE_EXEPT && 2443 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 2444 if (!svm_skip_emulated_instruction(vcpu)) 2445 return 0; 2446 } 2447 2448 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 2449 int_vec = -1; 2450 2451 return kvm_task_switch(vcpu, tss_selector, int_vec, reason, 2452 has_error_code, error_code); 2453 } 2454 2455 static void svm_clr_iret_intercept(struct vcpu_svm *svm) 2456 { 2457 if (!sev_es_guest(svm->vcpu.kvm)) 2458 svm_clr_intercept(svm, INTERCEPT_IRET); 2459 } 2460 2461 static void svm_set_iret_intercept(struct vcpu_svm *svm) 2462 { 2463 if (!sev_es_guest(svm->vcpu.kvm)) 2464 svm_set_intercept(svm, INTERCEPT_IRET); 2465 } 2466 2467 static int iret_interception(struct kvm_vcpu *vcpu) 2468 { 2469 struct vcpu_svm *svm = to_svm(vcpu); 2470 2471 WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); 2472 2473 ++vcpu->stat.nmi_window_exits; 2474 svm->awaiting_iret_completion = true; 2475 2476 svm_clr_iret_intercept(svm); 2477 svm->nmi_iret_rip = kvm_rip_read(vcpu); 2478 2479 kvm_make_request(KVM_REQ_EVENT, vcpu); 2480 return 1; 2481 } 2482 2483 static int invlpg_interception(struct kvm_vcpu *vcpu) 2484 { 2485 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2486 return kvm_emulate_instruction(vcpu, 0); 2487 2488 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); 2489 return kvm_skip_emulated_instruction(vcpu); 2490 } 2491 2492 static int emulate_on_interception(struct kvm_vcpu *vcpu) 2493 { 2494 return kvm_emulate_instruction(vcpu, 0); 2495 } 2496 2497 static int rsm_interception(struct kvm_vcpu *vcpu) 2498 { 2499 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); 2500 } 2501 2502 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, 2503 unsigned long val) 2504 { 2505 struct vcpu_svm *svm = to_svm(vcpu); 2506 unsigned long cr0 = vcpu->arch.cr0; 2507 bool ret = false; 2508 2509 if (!is_guest_mode(vcpu) || 2510 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) 2511 return false; 2512 2513 
cr0 &= ~SVM_CR0_SELECTIVE_MASK; 2514 val &= ~SVM_CR0_SELECTIVE_MASK; 2515 2516 if (cr0 ^ val) { 2517 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 2518 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 2519 } 2520 2521 return ret; 2522 } 2523 2524 #define CR_VALID (1ULL << 63) 2525 2526 static int cr_interception(struct kvm_vcpu *vcpu) 2527 { 2528 struct vcpu_svm *svm = to_svm(vcpu); 2529 int reg, cr; 2530 unsigned long val; 2531 int err; 2532 2533 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2534 return emulate_on_interception(vcpu); 2535 2536 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 2537 return emulate_on_interception(vcpu); 2538 2539 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2540 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 2541 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 2542 else 2543 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2544 2545 err = 0; 2546 if (cr >= 16) { /* mov to cr */ 2547 cr -= 16; 2548 val = kvm_register_read(vcpu, reg); 2549 trace_kvm_cr_write(cr, val); 2550 switch (cr) { 2551 case 0: 2552 if (!check_selective_cr0_intercepted(vcpu, val)) 2553 err = kvm_set_cr0(vcpu, val); 2554 else 2555 return 1; 2556 2557 break; 2558 case 3: 2559 err = kvm_set_cr3(vcpu, val); 2560 break; 2561 case 4: 2562 err = kvm_set_cr4(vcpu, val); 2563 break; 2564 case 8: 2565 err = kvm_set_cr8(vcpu, val); 2566 break; 2567 default: 2568 WARN(1, "unhandled write to CR%d", cr); 2569 kvm_queue_exception(vcpu, UD_VECTOR); 2570 return 1; 2571 } 2572 } else { /* mov from cr */ 2573 switch (cr) { 2574 case 0: 2575 val = kvm_read_cr0(vcpu); 2576 break; 2577 case 2: 2578 val = vcpu->arch.cr2; 2579 break; 2580 case 3: 2581 val = kvm_read_cr3(vcpu); 2582 break; 2583 case 4: 2584 val = kvm_read_cr4(vcpu); 2585 break; 2586 case 8: 2587 val = kvm_get_cr8(vcpu); 2588 break; 2589 default: 2590 WARN(1, "unhandled read from CR%d", cr); 2591 kvm_queue_exception(vcpu, UD_VECTOR); 2592 return 1; 2593 } 
2594 kvm_register_write(vcpu, reg, val); 2595 trace_kvm_cr_read(cr, val); 2596 } 2597 return kvm_complete_insn_gp(vcpu, err); 2598 } 2599 2600 static int cr_trap(struct kvm_vcpu *vcpu) 2601 { 2602 struct vcpu_svm *svm = to_svm(vcpu); 2603 unsigned long old_value, new_value; 2604 unsigned int cr; 2605 int ret = 0; 2606 2607 new_value = (unsigned long)svm->vmcb->control.exit_info_1; 2608 2609 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; 2610 switch (cr) { 2611 case 0: 2612 old_value = kvm_read_cr0(vcpu); 2613 svm_set_cr0(vcpu, new_value); 2614 2615 kvm_post_set_cr0(vcpu, old_value, new_value); 2616 break; 2617 case 4: 2618 old_value = kvm_read_cr4(vcpu); 2619 svm_set_cr4(vcpu, new_value); 2620 2621 kvm_post_set_cr4(vcpu, old_value, new_value); 2622 break; 2623 case 8: 2624 ret = kvm_set_cr8(vcpu, new_value); 2625 break; 2626 default: 2627 WARN(1, "unhandled CR%d write trap", cr); 2628 kvm_queue_exception(vcpu, UD_VECTOR); 2629 return 1; 2630 } 2631 2632 return kvm_complete_insn_gp(vcpu, ret); 2633 } 2634 2635 static int dr_interception(struct kvm_vcpu *vcpu) 2636 { 2637 struct vcpu_svm *svm = to_svm(vcpu); 2638 int reg, dr; 2639 int err = 0; 2640 2641 /* 2642 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT 2643 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. 2644 */ 2645 if (sev_es_guest(vcpu->kvm)) 2646 return 1; 2647 2648 if (vcpu->guest_debug == 0) { 2649 /* 2650 * No more DR vmexits; force a reload of the debug registers 2651 * and reenter on this instruction. The next vmexit will 2652 * retrieve the full state of the debug registers. 
2653 */ 2654 clr_dr_intercepts(svm); 2655 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 2656 return 1; 2657 } 2658 2659 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 2660 return emulate_on_interception(vcpu); 2661 2662 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2663 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2664 if (dr >= 16) { /* mov to DRn */ 2665 dr -= 16; 2666 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 2667 } else { 2668 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 2669 } 2670 2671 return kvm_complete_insn_gp(vcpu, err); 2672 } 2673 2674 static int cr8_write_interception(struct kvm_vcpu *vcpu) 2675 { 2676 u8 cr8_prev = kvm_get_cr8(vcpu); 2677 int r; 2678 2679 WARN_ON_ONCE(kvm_vcpu_apicv_active(vcpu)); 2680 2681 /* instruction emulation calls kvm_set_cr8() */ 2682 r = cr_interception(vcpu); 2683 if (lapic_in_kernel(vcpu)) 2684 return r; 2685 if (cr8_prev <= kvm_get_cr8(vcpu)) 2686 return r; 2687 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 2688 return 0; 2689 } 2690 2691 static int efer_trap(struct kvm_vcpu *vcpu) 2692 { 2693 struct msr_data msr_info; 2694 int ret; 2695 2696 /* 2697 * Clear the EFER_SVME bit from EFER. The SVM code always sets this 2698 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against 2699 * whether the guest has X86_FEATURE_SVM - this avoids a failure if 2700 * the guest doesn't have X86_FEATURE_SVM. 
2701 */ 2702 msr_info.host_initiated = false; 2703 msr_info.index = MSR_EFER; 2704 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; 2705 ret = kvm_set_msr_common(vcpu, &msr_info); 2706 2707 return kvm_complete_insn_gp(vcpu, ret); 2708 } 2709 2710 static int svm_get_feature_msr(u32 msr, u64 *data) 2711 { 2712 *data = 0; 2713 2714 switch (msr) { 2715 case MSR_AMD64_DE_CFG: 2716 if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) 2717 *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; 2718 break; 2719 default: 2720 return KVM_MSR_RET_UNSUPPORTED; 2721 } 2722 2723 return 0; 2724 } 2725 2726 static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, 2727 struct msr_data *msr_info) 2728 { 2729 return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && 2730 msr_info->index != MSR_IA32_XSS && 2731 !msr_write_intercepted(vcpu, msr_info->index); 2732 } 2733 2734 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2735 { 2736 struct vcpu_svm *svm = to_svm(vcpu); 2737 2738 if (sev_es_prevent_msr_access(vcpu, msr_info)) { 2739 msr_info->data = 0; 2740 return vcpu->kvm->arch.has_protected_state ? 
-EINVAL : 0; 2741 } 2742 2743 switch (msr_info->index) { 2744 case MSR_AMD64_TSC_RATIO: 2745 if (!msr_info->host_initiated && 2746 !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) 2747 return 1; 2748 msr_info->data = svm->tsc_ratio_msr; 2749 break; 2750 case MSR_STAR: 2751 msr_info->data = svm->vmcb01.ptr->save.star; 2752 break; 2753 #ifdef CONFIG_X86_64 2754 case MSR_LSTAR: 2755 msr_info->data = svm->vmcb01.ptr->save.lstar; 2756 break; 2757 case MSR_CSTAR: 2758 msr_info->data = svm->vmcb01.ptr->save.cstar; 2759 break; 2760 case MSR_GS_BASE: 2761 msr_info->data = svm->vmcb01.ptr->save.gs.base; 2762 break; 2763 case MSR_FS_BASE: 2764 msr_info->data = svm->vmcb01.ptr->save.fs.base; 2765 break; 2766 case MSR_KERNEL_GS_BASE: 2767 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; 2768 break; 2769 case MSR_SYSCALL_MASK: 2770 msr_info->data = svm->vmcb01.ptr->save.sfmask; 2771 break; 2772 #endif 2773 case MSR_IA32_SYSENTER_CS: 2774 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; 2775 break; 2776 case MSR_IA32_SYSENTER_EIP: 2777 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; 2778 if (guest_cpuid_is_intel_compatible(vcpu)) 2779 msr_info->data |= (u64)svm->sysenter_eip_hi << 32; 2780 break; 2781 case MSR_IA32_SYSENTER_ESP: 2782 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2783 if (guest_cpuid_is_intel_compatible(vcpu)) 2784 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2785 break; 2786 case MSR_IA32_S_CET: 2787 msr_info->data = svm->vmcb->save.s_cet; 2788 break; 2789 case MSR_IA32_INT_SSP_TAB: 2790 msr_info->data = svm->vmcb->save.isst_addr; 2791 break; 2792 case MSR_KVM_INTERNAL_GUEST_SSP: 2793 msr_info->data = svm->vmcb->save.ssp; 2794 break; 2795 case MSR_TSC_AUX: 2796 msr_info->data = svm->tsc_aux; 2797 break; 2798 case MSR_IA32_DEBUGCTLMSR: 2799 msr_info->data = svm->vmcb->save.dbgctl; 2800 break; 2801 case MSR_IA32_LASTBRANCHFROMIP: 2802 msr_info->data = svm->vmcb->save.br_from; 2803 break; 2804 case MSR_IA32_LASTBRANCHTOIP: 2805 
msr_info->data = svm->vmcb->save.br_to; 2806 break; 2807 case MSR_IA32_LASTINTFROMIP: 2808 msr_info->data = svm->vmcb->save.last_excp_from; 2809 break; 2810 case MSR_IA32_LASTINTTOIP: 2811 msr_info->data = svm->vmcb->save.last_excp_to; 2812 break; 2813 case MSR_VM_HSAVE_PA: 2814 msr_info->data = svm->nested.hsave_msr; 2815 break; 2816 case MSR_VM_CR: 2817 msr_info->data = svm->nested.vm_cr_msr; 2818 break; 2819 case MSR_IA32_SPEC_CTRL: 2820 if (!msr_info->host_initiated && 2821 !guest_has_spec_ctrl_msr(vcpu)) 2822 return 1; 2823 2824 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2825 msr_info->data = svm->vmcb->save.spec_ctrl; 2826 else 2827 msr_info->data = svm->spec_ctrl; 2828 break; 2829 case MSR_AMD64_VIRT_SPEC_CTRL: 2830 if (!msr_info->host_initiated && 2831 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2832 return 1; 2833 2834 msr_info->data = svm->virt_spec_ctrl; 2835 break; 2836 case MSR_F15H_IC_CFG: { 2837 2838 int family, model; 2839 2840 family = guest_cpuid_family(vcpu); 2841 model = guest_cpuid_model(vcpu); 2842 2843 if (family < 0 || model < 0) 2844 return kvm_get_msr_common(vcpu, msr_info); 2845 2846 msr_info->data = 0; 2847 2848 if (family == 0x15 && 2849 (model >= 0x2 && model < 0x20)) 2850 msr_info->data = 0x1E; 2851 } 2852 break; 2853 case MSR_AMD64_DE_CFG: 2854 msr_info->data = svm->msr_decfg; 2855 break; 2856 default: 2857 return kvm_get_msr_common(vcpu, msr_info); 2858 } 2859 return 0; 2860 } 2861 2862 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2863 { 2864 struct vcpu_svm *svm = to_svm(vcpu); 2865 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2866 return kvm_complete_insn_gp(vcpu, err); 2867 2868 svm_vmgexit_inject_exception(svm, X86_TRAP_GP); 2869 return 1; 2870 } 2871 2872 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 2873 { 2874 struct vcpu_svm *svm = to_svm(vcpu); 2875 int svm_dis, chg_mask; 2876 2877 if (data & ~SVM_VM_CR_VALID_MASK) 2878 return 1; 2879 2880 chg_mask = 
SVM_VM_CR_VALID_MASK; 2881 2882 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 2883 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 2884 2885 svm->nested.vm_cr_msr &= ~chg_mask; 2886 svm->nested.vm_cr_msr |= (data & chg_mask); 2887 2888 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 2889 2890 /* check for svm_disable while efer.svme is set */ 2891 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 2892 return 1; 2893 2894 return 0; 2895 } 2896 2897 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2898 { 2899 struct vcpu_svm *svm = to_svm(vcpu); 2900 int ret = 0; 2901 2902 u32 ecx = msr->index; 2903 u64 data = msr->data; 2904 2905 if (sev_es_prevent_msr_access(vcpu, msr)) 2906 return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; 2907 2908 switch (ecx) { 2909 case MSR_AMD64_TSC_RATIO: 2910 2911 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { 2912 2913 if (!msr->host_initiated) 2914 return 1; 2915 /* 2916 * In case TSC scaling is not enabled, always 2917 * leave this MSR at the default value. 2918 * 2919 * Due to bug in qemu 6.2.0, it would try to set 2920 * this msr to 0 if tsc scaling is not enabled. 2921 * Ignore this value as well. 
2922 */ 2923 if (data != 0 && data != svm->tsc_ratio_msr) 2924 return 1; 2925 break; 2926 } 2927 2928 if (data & SVM_TSC_RATIO_RSVD) 2929 return 1; 2930 2931 svm->tsc_ratio_msr = data; 2932 2933 if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && 2934 is_guest_mode(vcpu)) 2935 nested_svm_update_tsc_ratio_msr(vcpu); 2936 2937 break; 2938 case MSR_IA32_CR_PAT: 2939 ret = kvm_set_msr_common(vcpu, msr); 2940 if (ret) 2941 break; 2942 2943 svm->vmcb01.ptr->save.g_pat = data; 2944 if (is_guest_mode(vcpu)) 2945 nested_vmcb02_compute_g_pat(svm); 2946 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 2947 break; 2948 case MSR_IA32_SPEC_CTRL: 2949 if (!msr->host_initiated && 2950 !guest_has_spec_ctrl_msr(vcpu)) 2951 return 1; 2952 2953 if (kvm_spec_ctrl_test_value(data)) 2954 return 1; 2955 2956 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2957 svm->vmcb->save.spec_ctrl = data; 2958 else 2959 svm->spec_ctrl = data; 2960 if (!data) 2961 break; 2962 2963 /* 2964 * For non-nested: 2965 * When it's written (to non-zero) for the first time, pass 2966 * it through. 2967 * 2968 * For nested: 2969 * The handling of the MSR bitmap for L2 guests is done in 2970 * nested_svm_merge_msrpm(). 2971 * We update the L1 MSR bit as well since it will end up 2972 * touching the MSR anyway now. 
2973 */ 2974 svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 2975 break; 2976 case MSR_AMD64_VIRT_SPEC_CTRL: 2977 if (!msr->host_initiated && 2978 !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2979 return 1; 2980 2981 if (data & ~SPEC_CTRL_SSBD) 2982 return 1; 2983 2984 svm->virt_spec_ctrl = data; 2985 break; 2986 case MSR_STAR: 2987 svm->vmcb01.ptr->save.star = data; 2988 break; 2989 #ifdef CONFIG_X86_64 2990 case MSR_LSTAR: 2991 svm->vmcb01.ptr->save.lstar = data; 2992 break; 2993 case MSR_CSTAR: 2994 svm->vmcb01.ptr->save.cstar = data; 2995 break; 2996 case MSR_GS_BASE: 2997 svm->vmcb01.ptr->save.gs.base = data; 2998 break; 2999 case MSR_FS_BASE: 3000 svm->vmcb01.ptr->save.fs.base = data; 3001 break; 3002 case MSR_KERNEL_GS_BASE: 3003 svm->vmcb01.ptr->save.kernel_gs_base = data; 3004 break; 3005 case MSR_SYSCALL_MASK: 3006 svm->vmcb01.ptr->save.sfmask = data; 3007 break; 3008 #endif 3009 case MSR_IA32_SYSENTER_CS: 3010 svm->vmcb01.ptr->save.sysenter_cs = data; 3011 break; 3012 case MSR_IA32_SYSENTER_EIP: 3013 svm->vmcb01.ptr->save.sysenter_eip = (u32)data; 3014 /* 3015 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs 3016 * when we spoof an Intel vendor ID (for cross vendor migration). 3017 * In this case we use this intercept to track the high 3018 * 32 bit part of these msrs to support Intel's 3019 * implementation of SYSENTER/SYSEXIT. 3020 */ 3021 svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; 3022 break; 3023 case MSR_IA32_SYSENTER_ESP: 3024 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 3025 svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? 
(data >> 32) : 0; 3026 break; 3027 case MSR_IA32_S_CET: 3028 svm->vmcb->save.s_cet = data; 3029 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); 3030 break; 3031 case MSR_IA32_INT_SSP_TAB: 3032 svm->vmcb->save.isst_addr = data; 3033 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); 3034 break; 3035 case MSR_KVM_INTERNAL_GUEST_SSP: 3036 svm->vmcb->save.ssp = data; 3037 vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); 3038 break; 3039 case MSR_TSC_AUX: 3040 /* 3041 * TSC_AUX is always virtualized for SEV-ES guests when the 3042 * feature is available. The user return MSR support is not 3043 * required in this case because TSC_AUX is restored on #VMEXIT 3044 * from the host save area. 3045 */ 3046 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) 3047 break; 3048 3049 /* 3050 * TSC_AUX is usually changed only during boot and never read 3051 * directly. Intercept TSC_AUX and switch it via user return. 3052 */ 3053 preempt_disable(); 3054 ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 3055 preempt_enable(); 3056 if (ret) 3057 break; 3058 3059 svm->tsc_aux = data; 3060 break; 3061 case MSR_IA32_DEBUGCTLMSR: 3062 if (!lbrv) { 3063 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3064 break; 3065 } 3066 3067 /* 3068 * Suppress BTF as KVM doesn't virtualize BTF, but there's no 3069 * way to communicate lack of support to the guest. 3070 */ 3071 if (data & DEBUGCTLMSR_BTF) { 3072 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 3073 data &= ~DEBUGCTLMSR_BTF; 3074 } 3075 3076 if (data & DEBUGCTL_RESERVED_BITS) 3077 return 1; 3078 3079 if (svm->vmcb->save.dbgctl == data) 3080 break; 3081 3082 svm->vmcb->save.dbgctl = data; 3083 vmcb_mark_dirty(svm->vmcb, VMCB_LBR); 3084 svm_update_lbrv(vcpu); 3085 break; 3086 case MSR_VM_HSAVE_PA: 3087 /* 3088 * Old kernels did not validate the value written to 3089 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid 3090 * value to allow live migrating buggy or malicious guests 3091 * originating from those kernels. 
3092 */ 3093 if (!msr->host_initiated && !page_address_valid(vcpu, data)) 3094 return 1; 3095 3096 svm->nested.hsave_msr = data & PAGE_MASK; 3097 break; 3098 case MSR_VM_CR: 3099 return svm_set_vm_cr(vcpu, data); 3100 case MSR_VM_IGNNE: 3101 kvm_pr_unimpl_wrmsr(vcpu, ecx, data); 3102 break; 3103 case MSR_AMD64_DE_CFG: { 3104 u64 supported_de_cfg; 3105 3106 if (svm_get_feature_msr(ecx, &supported_de_cfg)) 3107 return 1; 3108 3109 if (data & ~supported_de_cfg) 3110 return 1; 3111 3112 svm->msr_decfg = data; 3113 break; 3114 } 3115 default: 3116 return kvm_set_msr_common(vcpu, msr); 3117 } 3118 return ret; 3119 } 3120 3121 static int msr_interception(struct kvm_vcpu *vcpu) 3122 { 3123 if (to_svm(vcpu)->vmcb->control.exit_info_1) 3124 return kvm_emulate_wrmsr(vcpu); 3125 else 3126 return kvm_emulate_rdmsr(vcpu); 3127 } 3128 3129 static int interrupt_window_interception(struct kvm_vcpu *vcpu) 3130 { 3131 kvm_make_request(KVM_REQ_EVENT, vcpu); 3132 svm_clear_vintr(to_svm(vcpu)); 3133 3134 /* 3135 * If not running nested, for AVIC, the only reason to end up here is ExtINTs. 3136 * In this case AVIC was temporarily disabled for 3137 * requesting the IRQ window and we have to re-enable it. 3138 * 3139 * If running nested, still remove the VM wide AVIC inhibit to 3140 * support case in which the interrupt window was requested when the 3141 * vCPU was not running nested. 3142 3143 * All vCPUs which run still run nested, will remain to have their 3144 * AVIC still inhibited due to per-cpu AVIC inhibition. 3145 */ 3146 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3147 3148 ++vcpu->stat.irq_window_exits; 3149 return 1; 3150 } 3151 3152 static int pause_interception(struct kvm_vcpu *vcpu) 3153 { 3154 bool in_kernel; 3155 /* 3156 * CPL is not made available for an SEV-ES guest, therefore 3157 * vcpu->arch.preempted_in_kernel can never be true. Just 3158 * set in_kernel to false as well. 
3159 */ 3160 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 3161 3162 grow_ple_window(vcpu); 3163 3164 kvm_vcpu_on_spin(vcpu, in_kernel); 3165 return kvm_skip_emulated_instruction(vcpu); 3166 } 3167 3168 static int invpcid_interception(struct kvm_vcpu *vcpu) 3169 { 3170 struct vcpu_svm *svm = to_svm(vcpu); 3171 unsigned long type; 3172 gva_t gva; 3173 3174 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 3175 kvm_queue_exception(vcpu, UD_VECTOR); 3176 return 1; 3177 } 3178 3179 /* 3180 * For an INVPCID intercept: 3181 * EXITINFO1 provides the linear address of the memory operand. 3182 * EXITINFO2 provides the contents of the register operand. 3183 */ 3184 type = svm->vmcb->control.exit_info_2; 3185 gva = svm->vmcb->control.exit_info_1; 3186 3187 /* 3188 * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the 3189 * stack segment is used. The intercept takes priority over all 3190 * #GP checks except CPL>0, but somehow still generates a linear 3191 * address? The APM is sorely lacking. 3192 */ 3193 if (is_noncanonical_address(gva, vcpu, 0)) { 3194 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 3195 return 1; 3196 } 3197 3198 return kvm_handle_invpcid(vcpu, type, gva); 3199 } 3200 3201 static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) 3202 { 3203 struct vcpu_svm *svm = to_svm(vcpu); 3204 3205 /* 3206 * If userspace has NOT changed RIP, then KVM's ABI is to let the guest 3207 * execute the bus-locking instruction. Set the bus lock counter to '1' 3208 * to effectively step past the bus lock. 
3209 */ 3210 if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) 3211 svm->vmcb->control.bus_lock_counter = 1; 3212 3213 return 1; 3214 } 3215 3216 static int bus_lock_exit(struct kvm_vcpu *vcpu) 3217 { 3218 struct vcpu_svm *svm = to_svm(vcpu); 3219 3220 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 3221 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 3222 3223 vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); 3224 vcpu->arch.complete_userspace_io = complete_userspace_buslock; 3225 3226 if (is_guest_mode(vcpu)) 3227 svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; 3228 3229 return 0; 3230 } 3231 3232 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 3233 [SVM_EXIT_READ_CR0] = cr_interception, 3234 [SVM_EXIT_READ_CR3] = cr_interception, 3235 [SVM_EXIT_READ_CR4] = cr_interception, 3236 [SVM_EXIT_READ_CR8] = cr_interception, 3237 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 3238 [SVM_EXIT_WRITE_CR0] = cr_interception, 3239 [SVM_EXIT_WRITE_CR3] = cr_interception, 3240 [SVM_EXIT_WRITE_CR4] = cr_interception, 3241 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3242 [SVM_EXIT_READ_DR0] = dr_interception, 3243 [SVM_EXIT_READ_DR1] = dr_interception, 3244 [SVM_EXIT_READ_DR2] = dr_interception, 3245 [SVM_EXIT_READ_DR3] = dr_interception, 3246 [SVM_EXIT_READ_DR4] = dr_interception, 3247 [SVM_EXIT_READ_DR5] = dr_interception, 3248 [SVM_EXIT_READ_DR6] = dr_interception, 3249 [SVM_EXIT_READ_DR7] = dr_interception, 3250 [SVM_EXIT_WRITE_DR0] = dr_interception, 3251 [SVM_EXIT_WRITE_DR1] = dr_interception, 3252 [SVM_EXIT_WRITE_DR2] = dr_interception, 3253 [SVM_EXIT_WRITE_DR3] = dr_interception, 3254 [SVM_EXIT_WRITE_DR4] = dr_interception, 3255 [SVM_EXIT_WRITE_DR5] = dr_interception, 3256 [SVM_EXIT_WRITE_DR6] = dr_interception, 3257 [SVM_EXIT_WRITE_DR7] = dr_interception, 3258 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3259 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3260 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3261 
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3262 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3263 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3264 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 3265 [SVM_EXIT_INTR] = intr_interception, 3266 [SVM_EXIT_NMI] = nmi_interception, 3267 [SVM_EXIT_SMI] = smi_interception, 3268 [SVM_EXIT_VINTR] = interrupt_window_interception, 3269 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, 3270 [SVM_EXIT_CPUID] = kvm_emulate_cpuid, 3271 [SVM_EXIT_IRET] = iret_interception, 3272 [SVM_EXIT_INVD] = kvm_emulate_invd, 3273 [SVM_EXIT_PAUSE] = pause_interception, 3274 [SVM_EXIT_HLT] = kvm_emulate_halt, 3275 [SVM_EXIT_INVLPG] = invlpg_interception, 3276 [SVM_EXIT_INVLPGA] = invlpga_interception, 3277 [SVM_EXIT_IOIO] = io_interception, 3278 [SVM_EXIT_MSR] = msr_interception, 3279 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3280 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3281 [SVM_EXIT_VMRUN] = vmrun_interception, 3282 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3283 [SVM_EXIT_VMLOAD] = vmload_interception, 3284 [SVM_EXIT_VMSAVE] = vmsave_interception, 3285 [SVM_EXIT_STGI] = stgi_interception, 3286 [SVM_EXIT_CLGI] = clgi_interception, 3287 [SVM_EXIT_SKINIT] = skinit_interception, 3288 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3289 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3290 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3291 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, 3292 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, 3293 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, 3294 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, 3295 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, 3296 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, 3297 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, 3298 [SVM_EXIT_INVPCID] = invpcid_interception, 3299 [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, 3300 [SVM_EXIT_NPF] = npf_interception, 3301 [SVM_EXIT_BUS_LOCK] = bus_lock_exit, 3302 [SVM_EXIT_RSM] = rsm_interception, 3303 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3304 
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 3305 #ifdef CONFIG_KVM_AMD_SEV 3306 [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, 3307 #endif 3308 }; 3309 3310 static void dump_vmcb(struct kvm_vcpu *vcpu) 3311 { 3312 struct vcpu_svm *svm = to_svm(vcpu); 3313 struct vmcb_control_area *control = &svm->vmcb->control; 3314 struct vmcb_save_area *save = &svm->vmcb->save; 3315 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3316 char *vm_type; 3317 3318 if (!dump_invalid_vmcb) { 3319 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3320 return; 3321 } 3322 3323 guard(mutex)(&vmcb_dump_mutex); 3324 3325 vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : 3326 sev_es_guest(vcpu->kvm) ? "SEV-ES" : 3327 sev_guest(vcpu->kvm) ? "SEV" : "SVM"; 3328 3329 pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", 3330 vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3331 pr_err("VMCB Control Area:\n"); 3332 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3333 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); 3334 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); 3335 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); 3336 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); 3337 pr_err("%-20s%08x %08x\n", "intercepts:", 3338 control->intercepts[INTERCEPT_WORD3], 3339 control->intercepts[INTERCEPT_WORD4]); 3340 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 3341 pr_err("%-20s%d\n", "pause filter threshold:", 3342 control->pause_filter_thresh); 3343 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 3344 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 3345 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 3346 pr_err("%-20s%d\n", "asid:", control->asid); 3347 
pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 3348 pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl); 3349 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 3350 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 3351 pr_err("%-20s%08x\n", "int_state:", control->int_state); 3352 pr_err("%-20s%016llx\n", "exit_code:", control->exit_code); 3353 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 3354 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3355 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3356 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3357 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3358 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3359 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3360 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3361 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3362 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3363 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3364 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3365 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3366 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3367 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3368 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3369 pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); 3370 pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); 3371 3372 if (sev_es_guest(vcpu->kvm)) { 3373 save = sev_decrypt_vmsa(vcpu); 3374 if (!save) 3375 goto no_vmsa; 3376 3377 save01 = save; 3378 } 3379 3380 pr_err("VMCB State Save Area:\n"); 3381 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3382 "es:", 3383 save->es.selector, save->es.attrib, 3384 save->es.limit, save->es.base); 3385 pr_err("%-5s s: %04x a: %04x 
l: %08x b: %016llx\n", 3386 "cs:", 3387 save->cs.selector, save->cs.attrib, 3388 save->cs.limit, save->cs.base); 3389 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3390 "ss:", 3391 save->ss.selector, save->ss.attrib, 3392 save->ss.limit, save->ss.base); 3393 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3394 "ds:", 3395 save->ds.selector, save->ds.attrib, 3396 save->ds.limit, save->ds.base); 3397 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3398 "fs:", 3399 save01->fs.selector, save01->fs.attrib, 3400 save01->fs.limit, save01->fs.base); 3401 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3402 "gs:", 3403 save01->gs.selector, save01->gs.attrib, 3404 save01->gs.limit, save01->gs.base); 3405 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3406 "gdtr:", 3407 save->gdtr.selector, save->gdtr.attrib, 3408 save->gdtr.limit, save->gdtr.base); 3409 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3410 "ldtr:", 3411 save01->ldtr.selector, save01->ldtr.attrib, 3412 save01->ldtr.limit, save01->ldtr.base); 3413 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3414 "idtr:", 3415 save->idtr.selector, save->idtr.attrib, 3416 save->idtr.limit, save->idtr.base); 3417 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3418 "tr:", 3419 save01->tr.selector, save01->tr.attrib, 3420 save01->tr.limit, save01->tr.base); 3421 pr_err("vmpl: %d cpl: %d efer: %016llx\n", 3422 save->vmpl, save->cpl, save->efer); 3423 pr_err("%-15s %016llx %-13s %016llx\n", 3424 "cr0:", save->cr0, "cr2:", save->cr2); 3425 pr_err("%-15s %016llx %-13s %016llx\n", 3426 "cr3:", save->cr3, "cr4:", save->cr4); 3427 pr_err("%-15s %016llx %-13s %016llx\n", 3428 "dr6:", save->dr6, "dr7:", save->dr7); 3429 pr_err("%-15s %016llx %-13s %016llx\n", 3430 "rip:", save->rip, "rflags:", save->rflags); 3431 pr_err("%-15s %016llx %-13s %016llx\n", 3432 "rsp:", save->rsp, "rax:", save->rax); 3433 pr_err("%-15s %016llx %-13s %016llx\n", 3434 "s_cet:", save->s_cet, "ssp:", save->ssp); 3435 
pr_err("%-15s %016llx\n", 3436 "isst_addr:", save->isst_addr); 3437 pr_err("%-15s %016llx %-13s %016llx\n", 3438 "star:", save01->star, "lstar:", save01->lstar); 3439 pr_err("%-15s %016llx %-13s %016llx\n", 3440 "cstar:", save01->cstar, "sfmask:", save01->sfmask); 3441 pr_err("%-15s %016llx %-13s %016llx\n", 3442 "kernel_gs_base:", save01->kernel_gs_base, 3443 "sysenter_cs:", save01->sysenter_cs); 3444 pr_err("%-15s %016llx %-13s %016llx\n", 3445 "sysenter_esp:", save01->sysenter_esp, 3446 "sysenter_eip:", save01->sysenter_eip); 3447 pr_err("%-15s %016llx %-13s %016llx\n", 3448 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 3449 pr_err("%-15s %016llx %-13s %016llx\n", 3450 "br_from:", save->br_from, "br_to:", save->br_to); 3451 pr_err("%-15s %016llx %-13s %016llx\n", 3452 "excp_from:", save->last_excp_from, 3453 "excp_to:", save->last_excp_to); 3454 3455 if (sev_es_guest(vcpu->kvm)) { 3456 struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; 3457 3458 pr_err("%-15s %016llx\n", 3459 "sev_features", vmsa->sev_features); 3460 3461 pr_err("%-15s %016llx %-13s %016llx\n", 3462 "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp); 3463 pr_err("%-15s %016llx %-13s %016llx\n", 3464 "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp); 3465 pr_err("%-15s %016llx\n", 3466 "u_cet:", vmsa->u_cet); 3467 3468 pr_err("%-15s %016llx %-13s %016llx\n", 3469 "rax:", vmsa->rax, "rbx:", vmsa->rbx); 3470 pr_err("%-15s %016llx %-13s %016llx\n", 3471 "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); 3472 pr_err("%-15s %016llx %-13s %016llx\n", 3473 "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); 3474 pr_err("%-15s %016llx %-13s %016llx\n", 3475 "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); 3476 pr_err("%-15s %016llx %-13s %016llx\n", 3477 "r8:", vmsa->r8, "r9:", vmsa->r9); 3478 pr_err("%-15s %016llx %-13s %016llx\n", 3479 "r10:", vmsa->r10, "r11:", vmsa->r11); 3480 pr_err("%-15s %016llx %-13s %016llx\n", 3481 "r12:", vmsa->r12, "r13:", vmsa->r13); 3482 pr_err("%-15s %016llx %-13s %016llx\n", 3483 
"r14:", vmsa->r14, "r15:", vmsa->r15); 3484 pr_err("%-15s %016llx %-13s %016llx\n", 3485 "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); 3486 } else { 3487 pr_err("%-15s %016llx %-13s %016lx\n", 3488 "rax:", save->rax, "rbx:", 3489 vcpu->arch.regs[VCPU_REGS_RBX]); 3490 pr_err("%-15s %016lx %-13s %016lx\n", 3491 "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], 3492 "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); 3493 pr_err("%-15s %016lx %-13s %016lx\n", 3494 "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], 3495 "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); 3496 pr_err("%-15s %016lx %-13s %016llx\n", 3497 "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], 3498 "rsp:", save->rsp); 3499 #ifdef CONFIG_X86_64 3500 pr_err("%-15s %016lx %-13s %016lx\n", 3501 "r8:", vcpu->arch.regs[VCPU_REGS_R8], 3502 "r9:", vcpu->arch.regs[VCPU_REGS_R9]); 3503 pr_err("%-15s %016lx %-13s %016lx\n", 3504 "r10:", vcpu->arch.regs[VCPU_REGS_R10], 3505 "r11:", vcpu->arch.regs[VCPU_REGS_R11]); 3506 pr_err("%-15s %016lx %-13s %016lx\n", 3507 "r12:", vcpu->arch.regs[VCPU_REGS_R12], 3508 "r13:", vcpu->arch.regs[VCPU_REGS_R13]); 3509 pr_err("%-15s %016lx %-13s %016lx\n", 3510 "r14:", vcpu->arch.regs[VCPU_REGS_R14], 3511 "r15:", vcpu->arch.regs[VCPU_REGS_R15]); 3512 #endif 3513 } 3514 3515 no_vmsa: 3516 if (sev_es_guest(vcpu->kvm)) 3517 sev_free_decrypted_vmsa(vcpu, save); 3518 } 3519 3520 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code) 3521 { 3522 u32 exit_code = __exit_code; 3523 3524 /* 3525 * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN 3526 * failed. Report all such errors to userspace (note, VMEXIT_INVALID, 3527 * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()). Skip 3528 * the check when running as a VM, as KVM has historically left garbage 3529 * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if 3530 * the underlying kernel is buggy. 
3531 */ 3532 if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) && 3533 (u64)exit_code != __exit_code) 3534 goto unexpected_vmexit; 3535 3536 #ifdef CONFIG_MITIGATION_RETPOLINE 3537 if (exit_code == SVM_EXIT_MSR) 3538 return msr_interception(vcpu); 3539 else if (exit_code == SVM_EXIT_VINTR) 3540 return interrupt_window_interception(vcpu); 3541 else if (exit_code == SVM_EXIT_INTR) 3542 return intr_interception(vcpu); 3543 else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) 3544 return kvm_emulate_halt(vcpu); 3545 else if (exit_code == SVM_EXIT_NPF) 3546 return npf_interception(vcpu); 3547 #ifdef CONFIG_KVM_AMD_SEV 3548 else if (exit_code == SVM_EXIT_VMGEXIT) 3549 return sev_handle_vmgexit(vcpu); 3550 #endif 3551 #endif 3552 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) 3553 goto unexpected_vmexit; 3554 3555 exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers)); 3556 if (!svm_exit_handlers[exit_code]) 3557 goto unexpected_vmexit; 3558 3559 return svm_exit_handlers[exit_code](vcpu); 3560 3561 unexpected_vmexit: 3562 dump_vmcb(vcpu); 3563 kvm_prepare_unexpected_reason_exit(vcpu, __exit_code); 3564 return 0; 3565 } 3566 3567 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 3568 u64 *info1, u64 *info2, 3569 u32 *intr_info, u32 *error_code) 3570 { 3571 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3572 3573 *reason = control->exit_code; 3574 *info1 = control->exit_info_1; 3575 *info2 = control->exit_info_2; 3576 *intr_info = control->exit_int_info; 3577 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3578 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3579 *error_code = control->exit_int_info_err; 3580 else 3581 *error_code = 0; 3582 } 3583 3584 static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, 3585 u32 *error_code) 3586 { 3587 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3588 3589 *intr_info = control->event_inj; 3590 3591 if ((*intr_info & SVM_EXITINTINFO_VALID) && 
3592 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3593 *error_code = control->event_inj_err; 3594 else 3595 *error_code = 0; 3596 3597 } 3598 3599 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 3600 { 3601 struct vcpu_svm *svm = to_svm(vcpu); 3602 struct kvm_run *kvm_run = vcpu->run; 3603 3604 /* SEV-ES guests must use the CR write traps to track CR registers. */ 3605 if (!sev_es_guest(vcpu->kvm)) { 3606 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3607 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3608 if (npt_enabled) 3609 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3610 } 3611 3612 if (is_guest_mode(vcpu)) { 3613 int vmexit; 3614 3615 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); 3616 3617 vmexit = nested_svm_exit_special(svm); 3618 3619 if (vmexit == NESTED_EXIT_CONTINUE) 3620 vmexit = nested_svm_exit_handled(svm); 3621 3622 if (vmexit == NESTED_EXIT_DONE) 3623 return 1; 3624 } 3625 3626 if (svm_is_vmrun_failure(svm->vmcb->control.exit_code)) { 3627 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3628 kvm_run->fail_entry.hardware_entry_failure_reason 3629 = svm->vmcb->control.exit_code; 3630 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 3631 dump_vmcb(vcpu); 3632 return 0; 3633 } 3634 3635 if (exit_fastpath != EXIT_FASTPATH_NONE) 3636 return 1; 3637 3638 return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); 3639 } 3640 3641 static int pre_svm_run(struct kvm_vcpu *vcpu) 3642 { 3643 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); 3644 struct vcpu_svm *svm = to_svm(vcpu); 3645 3646 /* 3647 * If the previous vmrun of the vmcb occurred on a different physical 3648 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's 3649 * vmcb clean bits are per logical CPU, as are KVM's asid assignments. 
3650 */ 3651 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { 3652 svm->current_vmcb->asid_generation = 0; 3653 vmcb_mark_all_dirty(svm->vmcb); 3654 svm->current_vmcb->cpu = vcpu->cpu; 3655 } 3656 3657 if (sev_guest(vcpu->kvm)) 3658 return pre_sev_run(svm, vcpu->cpu); 3659 3660 /* FIXME: handle wraparound of asid_generation */ 3661 if (svm->current_vmcb->asid_generation != sd->asid_generation) 3662 new_asid(svm, sd); 3663 3664 return 0; 3665 } 3666 3667 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 3668 { 3669 struct vcpu_svm *svm = to_svm(vcpu); 3670 3671 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3672 3673 if (svm->nmi_l1_to_l2) 3674 return; 3675 3676 /* 3677 * No need to manually track NMI masking when vNMI is enabled, hardware 3678 * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the 3679 * case where software directly injects an NMI. 3680 */ 3681 if (!is_vnmi_enabled(svm)) { 3682 svm->nmi_masked = true; 3683 svm_set_iret_intercept(svm); 3684 } 3685 ++vcpu->stat.nmi_injections; 3686 } 3687 3688 static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) 3689 { 3690 struct vcpu_svm *svm = to_svm(vcpu); 3691 3692 if (!is_vnmi_enabled(svm)) 3693 return false; 3694 3695 return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK); 3696 } 3697 3698 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) 3699 { 3700 struct vcpu_svm *svm = to_svm(vcpu); 3701 3702 if (!is_vnmi_enabled(svm)) 3703 return false; 3704 3705 if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) 3706 return false; 3707 3708 svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; 3709 vmcb_mark_dirty(svm->vmcb, VMCB_INTR); 3710 3711 /* 3712 * Because the pending NMI is serviced by hardware, KVM can't know when 3713 * the NMI is "injected", but for all intents and purposes, passing the 3714 * NMI off to hardware counts as injection. 
3715 */ 3716 ++vcpu->stat.nmi_injections; 3717 3718 return true; 3719 } 3720 3721 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 3722 { 3723 struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt; 3724 struct vcpu_svm *svm = to_svm(vcpu); 3725 u32 type; 3726 3727 if (intr->soft) { 3728 if (svm_update_soft_interrupt_rip(vcpu, intr->nr)) 3729 return; 3730 3731 type = SVM_EVTINJ_TYPE_SOFT; 3732 } else { 3733 type = SVM_EVTINJ_TYPE_INTR; 3734 } 3735 3736 trace_kvm_inj_virq(intr->nr, intr->soft, reinjected); 3737 ++vcpu->stat.irq_injections; 3738 3739 svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type; 3740 } 3741 3742 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, 3743 int trig_mode, int vector) 3744 { 3745 /* 3746 * apic->apicv_active must be read after vcpu->mode. 3747 * Pairs with smp_store_release in vcpu_enter_guest. 3748 */ 3749 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); 3750 3751 /* Note, this is called iff the local APIC is in-kernel. */ 3752 if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { 3753 /* Process the interrupt via kvm_check_and_inject_events(). */ 3754 kvm_make_request(KVM_REQ_EVENT, vcpu); 3755 kvm_vcpu_kick(vcpu); 3756 return; 3757 } 3758 3759 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 3760 if (in_guest_mode) { 3761 /* 3762 * Signal the doorbell to tell hardware to inject the IRQ. If 3763 * the vCPU exits the guest before the doorbell chimes, hardware 3764 * will automatically process AVIC interrupts at the next VMRUN. 3765 */ 3766 avic_ring_doorbell(vcpu); 3767 } else { 3768 /* 3769 * Wake the vCPU if it was blocking. KVM will then detect the 3770 * pending IRQ when checking if the vCPU has a wake event. 
3771 */ 3772 kvm_vcpu_wake_up(vcpu); 3773 } 3774 } 3775 3776 static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 3777 int trig_mode, int vector) 3778 { 3779 kvm_lapic_set_irr(vector, apic); 3780 3781 /* 3782 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in 3783 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before 3784 * the read of guest_mode. This guarantees that either VMRUN will see 3785 * and process the new vIRR entry, or that svm_complete_interrupt_delivery 3786 * will signal the doorbell if the CPU has already entered the guest. 3787 */ 3788 smp_mb__after_atomic(); 3789 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); 3790 } 3791 3792 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 3793 { 3794 struct vcpu_svm *svm = to_svm(vcpu); 3795 3796 /* 3797 * SEV-ES guests must always keep the CR intercepts cleared. CR 3798 * tracking is done using the CR write traps. 3799 */ 3800 if (sev_es_guest(vcpu->kvm)) 3801 return; 3802 3803 if (nested_svm_virtualize_tpr(vcpu)) 3804 return; 3805 3806 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 3807 3808 if (irr == -1) 3809 return; 3810 3811 if (tpr >= irr) 3812 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 3813 } 3814 3815 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 3816 { 3817 struct vcpu_svm *svm = to_svm(vcpu); 3818 3819 if (is_vnmi_enabled(svm)) 3820 return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; 3821 else 3822 return svm->nmi_masked; 3823 } 3824 3825 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 3826 { 3827 struct vcpu_svm *svm = to_svm(vcpu); 3828 3829 if (is_vnmi_enabled(svm)) { 3830 if (masked) 3831 svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; 3832 else 3833 svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; 3834 3835 } else { 3836 svm->nmi_masked = masked; 3837 if (masked) 3838 svm_set_iret_intercept(svm); 3839 else 3840 svm_clr_iret_intercept(svm); 3841 } 3842 
} 3843 3844 bool svm_nmi_blocked(struct kvm_vcpu *vcpu) 3845 { 3846 struct vcpu_svm *svm = to_svm(vcpu); 3847 struct vmcb *vmcb = svm->vmcb; 3848 3849 if (!gif_set(svm)) 3850 return true; 3851 3852 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3853 return false; 3854 3855 if (svm_get_nmi_mask(vcpu)) 3856 return true; 3857 3858 return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; 3859 } 3860 3861 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3862 { 3863 struct vcpu_svm *svm = to_svm(vcpu); 3864 if (svm->nested.nested_run_pending) 3865 return -EBUSY; 3866 3867 if (svm_nmi_blocked(vcpu)) 3868 return 0; 3869 3870 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 3871 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3872 return -EBUSY; 3873 return 1; 3874 } 3875 3876 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) 3877 { 3878 struct vcpu_svm *svm = to_svm(vcpu); 3879 struct vmcb *vmcb = svm->vmcb; 3880 3881 if (!gif_set(svm)) 3882 return true; 3883 3884 if (is_guest_mode(vcpu)) { 3885 /* As long as interrupts are being delivered... */ 3886 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3887 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) 3888 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3889 return true; 3890 3891 /* ... vmexits aren't blocked by the interrupt shadow */ 3892 if (nested_exit_on_intr(svm)) 3893 return false; 3894 } else { 3895 if (!svm_get_if_flag(vcpu)) 3896 return true; 3897 } 3898 3899 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); 3900 } 3901 3902 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3903 { 3904 struct vcpu_svm *svm = to_svm(vcpu); 3905 3906 if (svm->nested.nested_run_pending) 3907 return -EBUSY; 3908 3909 if (svm_interrupt_blocked(vcpu)) 3910 return 0; 3911 3912 /* 3913 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 3914 * e.g. if the IRQ arrived asynchronously after checking nested events. 
3915 */ 3916 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) 3917 return -EBUSY; 3918 3919 return 1; 3920 } 3921 3922 static void svm_enable_irq_window(struct kvm_vcpu *vcpu) 3923 { 3924 struct vcpu_svm *svm = to_svm(vcpu); 3925 3926 /* 3927 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 3928 * 1, because that's a separate STGI/VMRUN intercept. The next time we 3929 * get that intercept, this function will be called again though and 3930 * we'll get the vintr intercept. However, if the vGIF feature is 3931 * enabled, the STGI interception will not occur. Enable the irq 3932 * window under the assumption that the hardware will set the GIF. 3933 */ 3934 if (vgif || gif_set(svm)) { 3935 /* 3936 * IRQ window is not needed when AVIC is enabled, 3937 * unless we have pending ExtINT since it cannot be injected 3938 * via AVIC. In such case, KVM needs to temporarily disable AVIC, 3939 * and fallback to injecting IRQ via V_IRQ. 3940 * 3941 * If running nested, AVIC is already locally inhibited 3942 * on this vCPU, therefore there is no need to request 3943 * the VM wide AVIC inhibition. 3944 */ 3945 if (!is_guest_mode(vcpu)) 3946 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3947 3948 svm_set_vintr(svm); 3949 } 3950 } 3951 3952 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) 3953 { 3954 struct vcpu_svm *svm = to_svm(vcpu); 3955 3956 /* 3957 * If NMIs are outright masked, i.e. the vCPU is already handling an 3958 * NMI, and KVM has not yet intercepted an IRET, then there is nothing 3959 * more to do at this time as KVM has already enabled IRET intercepts. 3960 * If KVM has already intercepted IRET, then single-step over the IRET, 3961 * as NMIs aren't architecturally unmasked until the IRET completes. 3962 * 3963 * If vNMI is enabled, KVM should never request an NMI window if NMIs 3964 * are masked, as KVM allows at most one to-be-injected NMI and one 3965 * pending NMI. 
If two NMIs arrive simultaneously, KVM will inject one 3966 * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are 3967 * unmasked. KVM _will_ request an NMI window in some situations, e.g. 3968 * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately 3969 * inject the NMI. In those situations, KVM needs to single-step over 3970 * the STI shadow or intercept STGI. 3971 */ 3972 if (svm_get_nmi_mask(vcpu)) { 3973 WARN_ON_ONCE(is_vnmi_enabled(svm)); 3974 3975 if (!svm->awaiting_iret_completion) 3976 return; /* IRET will cause a vm exit */ 3977 } 3978 3979 /* 3980 * SEV-ES guests are responsible for signaling when a vCPU is ready to 3981 * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. 3982 * KVM can't intercept and single-step IRET to detect when NMIs are 3983 * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. 3984 * 3985 * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware 3986 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not 3987 * supported NAEs in the GHCB protocol. 3988 */ 3989 if (sev_es_guest(vcpu->kvm)) 3990 return; 3991 3992 if (!gif_set(svm)) { 3993 if (vgif) 3994 svm_set_intercept(svm, INTERCEPT_STGI); 3995 return; /* STGI will cause a vm exit */ 3996 } 3997 3998 /* 3999 * Something prevents NMI from been injected. Single step over possible 4000 * problem (IRET or exception injection or interrupt shadow) 4001 */ 4002 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 4003 svm->nmi_singlestep = true; 4004 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 4005 } 4006 4007 static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) 4008 { 4009 struct vcpu_svm *svm = to_svm(vcpu); 4010 4011 /* 4012 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. 4013 * A TLB flush for the current ASID flushes both "host" and "guest" TLB 4014 * entries, and thus is a superset of Hyper-V's fine grained flushing. 
4015 */ 4016 kvm_hv_vcpu_purge_flush_tlb(vcpu); 4017 4018 /* 4019 * Flush only the current ASID even if the TLB flush was invoked via 4020 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all 4021 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and 4022 * unconditionally does a TLB flush on both nested VM-Enter and nested 4023 * VM-Exit (via kvm_mmu_reset_context()). 4024 */ 4025 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 4026 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 4027 else 4028 svm->current_vmcb->asid_generation--; 4029 } 4030 4031 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 4032 { 4033 hpa_t root_tdp = vcpu->arch.mmu->root.hpa; 4034 4035 /* 4036 * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly 4037 * flush the NPT mappings via hypercall as flushing the ASID only 4038 * affects virtual to physical mappings, it does not invalidate guest 4039 * physical to host physical mappings. 4040 */ 4041 if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) 4042 hyperv_flush_guest_mapping(root_tdp); 4043 4044 svm_flush_tlb_asid(vcpu); 4045 } 4046 4047 static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) 4048 { 4049 /* 4050 * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB 4051 * flushes should be routed to hv_flush_remote_tlbs() without requesting 4052 * a "regular" remote flush. Reaching this point means either there's 4053 * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of 4054 * which might be fatal to the guest. Yell, but try to recover. 
4055 */ 4056 if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) 4057 hv_flush_remote_tlbs(vcpu->kvm); 4058 4059 svm_flush_tlb_asid(vcpu); 4060 } 4061 4062 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 4063 { 4064 struct vcpu_svm *svm = to_svm(vcpu); 4065 4066 invlpga(gva, svm->vmcb->control.asid); 4067 } 4068 4069 static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu) 4070 { 4071 kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); 4072 4073 svm_flush_tlb_asid(vcpu); 4074 } 4075 4076 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 4077 { 4078 struct vcpu_svm *svm = to_svm(vcpu); 4079 4080 if (nested_svm_virtualize_tpr(vcpu)) 4081 return; 4082 4083 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { 4084 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 4085 kvm_set_cr8(vcpu, cr8); 4086 } 4087 } 4088 4089 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 4090 { 4091 struct vcpu_svm *svm = to_svm(vcpu); 4092 u64 cr8; 4093 4094 if (nested_svm_virtualize_tpr(vcpu)) 4095 return; 4096 4097 cr8 = kvm_get_cr8(vcpu); 4098 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 4099 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 4100 } 4101 4102 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, 4103 int type) 4104 { 4105 bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); 4106 bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); 4107 struct vcpu_svm *svm = to_svm(vcpu); 4108 4109 /* 4110 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's 4111 * associated with the original soft exception/interrupt. next_rip is 4112 * cleared on all exits that can occur while vectoring an event, so KVM 4113 * needs to manually set next_rip for re-injection. Unlike the !nrips 4114 * case below, this needs to be done if and only if KVM is re-injecting 4115 * the same event, i.e. if the event is a soft exception/interrupt, 4116 * otherwise next_rip is unused on VMRUN. 
4117 */ 4118 if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && 4119 kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) 4120 svm->vmcb->control.next_rip = svm->soft_int_next_rip; 4121 /* 4122 * If NRIPS isn't enabled, KVM must manually advance RIP prior to 4123 * injecting the soft exception/interrupt. That advancement needs to 4124 * be unwound if vectoring didn't complete. Note, the new event may 4125 * not be the injected event, e.g. if KVM injected an INTn, the INTn 4126 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will 4127 * be the reported vectored event, but RIP still needs to be unwound. 4128 */ 4129 else if (!nrips && (is_soft || is_exception) && 4130 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) 4131 kvm_rip_write(vcpu, svm->soft_int_old_rip); 4132 } 4133 4134 static void svm_complete_interrupts(struct kvm_vcpu *vcpu) 4135 { 4136 struct vcpu_svm *svm = to_svm(vcpu); 4137 u8 vector; 4138 int type; 4139 u32 exitintinfo = svm->vmcb->control.exit_int_info; 4140 bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; 4141 bool soft_int_injected = svm->soft_int_injected; 4142 4143 svm->nmi_l1_to_l2 = false; 4144 svm->soft_int_injected = false; 4145 4146 /* 4147 * If we've made progress since setting awaiting_iret_completion, we've 4148 * executed an IRET and can allow NMI injection. 
4149 */ 4150 if (svm->awaiting_iret_completion && 4151 kvm_rip_read(vcpu) != svm->nmi_iret_rip) { 4152 svm->awaiting_iret_completion = false; 4153 svm->nmi_masked = false; 4154 kvm_make_request(KVM_REQ_EVENT, vcpu); 4155 } 4156 4157 vcpu->arch.nmi_injected = false; 4158 kvm_clear_exception_queue(vcpu); 4159 kvm_clear_interrupt_queue(vcpu); 4160 4161 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 4162 return; 4163 4164 kvm_make_request(KVM_REQ_EVENT, vcpu); 4165 4166 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 4167 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 4168 4169 if (soft_int_injected) 4170 svm_complete_soft_interrupt(vcpu, vector, type); 4171 4172 switch (type) { 4173 case SVM_EXITINTINFO_TYPE_NMI: 4174 vcpu->arch.nmi_injected = true; 4175 svm->nmi_l1_to_l2 = nmi_l1_to_l2; 4176 break; 4177 case SVM_EXITINTINFO_TYPE_EXEPT: { 4178 u32 error_code = 0; 4179 4180 /* 4181 * Never re-inject a #VC exception. 4182 */ 4183 if (vector == X86_TRAP_VC) 4184 break; 4185 4186 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) 4187 error_code = svm->vmcb->control.exit_int_info_err; 4188 4189 kvm_requeue_exception(vcpu, vector, 4190 exitintinfo & SVM_EXITINTINFO_VALID_ERR, 4191 error_code); 4192 break; 4193 } 4194 case SVM_EXITINTINFO_TYPE_INTR: 4195 kvm_queue_interrupt(vcpu, vector, false); 4196 break; 4197 case SVM_EXITINTINFO_TYPE_SOFT: 4198 kvm_queue_interrupt(vcpu, vector, true); 4199 break; 4200 default: 4201 break; 4202 } 4203 4204 } 4205 4206 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 4207 { 4208 struct vcpu_svm *svm = to_svm(vcpu); 4209 struct vmcb_control_area *control = &svm->vmcb->control; 4210 4211 control->exit_int_info = control->event_inj; 4212 control->exit_int_info_err = control->event_inj_err; 4213 control->event_inj = 0; 4214 svm_complete_interrupts(vcpu); 4215 } 4216 4217 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 4218 { 4219 if (to_kvm_sev_info(vcpu->kvm)->need_init) 4220 return -EINVAL; 4221 4222 return 1; 4223 } 4224 4225 static 
/*
 * Try to handle selected exits directly; called as the last step of
 * svm_vcpu_run().  Returns EXIT_FASTPATH_NONE when the exit is not handled
 * here, otherwise the result of the matching handle_fastpath_*() helper.
 */
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;

	/*
	 * Next RIP must be provided as IRQs are disabled, and accessing guest
	 * memory to decode the instruction might fault, i.e. might sleep.
	 */
	if (!nrips || !control->next_rip)
		return EXIT_FASTPATH_NONE;

	/* No fastpath handling while the vCPU is in (nested) guest mode. */
	if (is_guest_mode(vcpu))
		return EXIT_FASTPATH_NONE;

	switch (control->exit_code) {
	case SVM_EXIT_MSR:
		/* Only the exit_info_1 != 0 flavour (WRMSR) has a fastpath. */
		if (!control->exit_info_1)
			break;
		return handle_fastpath_wrmsr(vcpu);
	case SVM_EXIT_HLT:
		return handle_fastpath_hlt(vcpu);
	case SVM_EXIT_INVD:
		return handle_fastpath_invd(vcpu);
	default:
		break;
	}

	return EXIT_FASTPATH_NONE;
}

/*
 * Innermost entry/exit sequence: enter guest-state tracking, enable
 * RFLAGS.IF, run the guest through the SEV-ES or the regular VMRUN helper,
 * then disable IRQs and leave guest-state tracking again.
 *
 * @spec_ctrl_intercepted is passed through unchanged to the VMRUN helper.
 */
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	guest_state_enter_irqoff();

	/*
	 * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
	 * VMRUN controls whether or not physical IRQs are masked (KVM always
	 * runs with V_INTR_MASKING_MASK).  Toggle RFLAGS.IF here to avoid the
	 * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
	 * into guest state if delivery of an event during VMRUN triggers a
	 * #VMEXIT, and the guest_state transitions already tell lockdep that
	 * IRQs are being enabled/disabled.  Note!  GIF=0 for the entirety of
	 * this path, so IRQs aren't actually unmasked while running host code.
	 */
	raw_local_irq_enable();

	amd_clear_divider();

	/* SEV-ES guests use a dedicated run helper that is also given the host save area. */
	if (sev_es_guest(vcpu->kvm))
		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
				      sev_es_host_save_area(sd));
	else
		__svm_vcpu_run(svm, spec_ctrl_intercepted);

	raw_local_irq_disable();

	guest_state_exit_irqoff();
}
/*
 * Run the guest once.
 *
 * Copies RAX/RSP/RIP, CR2, ASID and related control state into the VMCB,
 * clears GIF, enters the guest via svm_vcpu_enter_exit(), then copies the
 * post-exit state back out of the VMCB and re-enables GIF before completing
 * interrupted event delivery.
 *
 * @run_flags: KVM_RUN_* flags; KVM_RUN_FORCE_IMMEDIATE_EXIT and
 *             KVM_RUN_LOAD_GUEST_DR6 are the ones consumed here.
 *
 * Returns EXIT_FASTPATH_EXIT_USERSPACE (with vcpu->run filled in as
 * KVM_EXIT_FAIL_ENTRY) if pre_svm_run() fails, otherwise the result of
 * svm_exit_handlers_fastpath().
 */
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
	bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
	struct vcpu_svm *svm = to_svm(vcpu);
	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* These three registers live in the VMCB save area rather than in regs[]. */
	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	/*
	 * Disable singlestep if we're injecting an interrupt/exception.
	 * We don't want our modified rflags to be pushed on the stack where
	 * we might not be able to easily reset them if we disabled NMI
	 * singlestep later.
	 */
	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
		/*
		 * Event injection happens before external interrupts cause a
		 * vmexit and interrupts are disabled here, so smp_send_reschedule
		 * is enough to force an immediate vmexit.
		 */
		disable_nmi_singlestep(svm);
		force_immediate_exit = true;
	}

	if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	if (pre_svm_run(vcpu)) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR;
		vcpu->run->fail_entry.cpu = vcpu->cpu;
		return EXIT_FASTPATH_EXIT_USERSPACE;
	}

	sync_lapic_to_cr8(vcpu);

	/* Propagate a changed ASID into the VMCB and flag the field dirty. */
	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
		svm->vmcb->control.asid = svm->asid;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}
	svm->vmcb->save.cr2 = vcpu->arch.cr2;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) &&
	    kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS))
		svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;

	svm_hv_update_vp_id(svm->vmcb, vcpu);

	/*
	 * Run with all-zero DR6 unless the guest can write DR6 freely, so that
	 * KVM can get the exact cause of a #DB.  Note, loading guest DR6 from
	 * KVM's snapshot is only necessary when DR accesses won't exit.
	 */
	if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
		svm_set_dr6(vcpu, vcpu->arch.dr6);
	else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
		svm_set_dr6(vcpu, DR6_ACTIVE_LOW);

	clgi();

	/*
	 * Hardware only context switches DEBUGCTL if LBR virtualization is
	 * enabled.  Manually load DEBUGCTL if necessary (and restore it after
	 * VM-Exit), as running with the host's DEBUGCTL can negatively affect
	 * guest state and can even be fatal, e.g. due to Bus Lock Detect.
	 */
	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
		update_debugctlmsr(svm->vmcb->save.dbgctl);

	kvm_wait_lapic_expire(vcpu);

	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);

	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);

	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);

	/* For SEV-ES guests these VMCB save-area fields are not copied back. */
	if (!sev_es_guest(vcpu->kvm)) {
		vcpu->arch.cr2 = svm->vmcb->save.cr2;
		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
	}
	vcpu->arch.regs_dirty = 0;

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

	/* Mirror of the pre-entry DEBUGCTL load: put the host value back. */
	if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
	    vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

	stgi();

	/* Any pending NMI will happen here */

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_after_interrupt(vcpu);

	sync_cr8_to_lapic(vcpu);

	svm->next_rip = 0;
	if (is_guest_mode(vcpu)) {
		nested_sync_control_from_vmcb02(svm);

		/* Track VMRUNs that have made past consistency checking */
		if (svm->nested.nested_run_pending &&
		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
			++vcpu->stat.nested_run;

		svm->nested.nested_run_pending = 0;
	}

	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;

	/*
	 * Unconditionally mask off the CLEAR_RAP bit, the AND is just as cheap
	 * as the TEST+Jcc to avoid it.
	 */
	if (cpu_feature_enabled(X86_FEATURE_ERAPS))
		svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP;

	vmcb_mark_all_clean(svm->vmcb);

	/* if exit due to PF check for async PF */
	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
		vcpu->arch.apf.host_apf_flags =
			kvm_read_and_reset_apf_flags();

	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;

	/* If guest writes to the global PMU control MSR are not intercepted, re-read it. */
	if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL))
		rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl);

	trace_kvm_exit(vcpu, KVM_ISA_SVM);

	svm_complete_interrupts(vcpu);

	return svm_exit_handlers_fastpath(vcpu);
}
4419 */ 4420 if (cpu_feature_enabled(X86_FEATURE_ERAPS)) 4421 svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP; 4422 4423 vmcb_mark_all_clean(svm->vmcb); 4424 4425 /* if exit due to PF check for async PF */ 4426 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 4427 vcpu->arch.apf.host_apf_flags = 4428 kvm_read_and_reset_apf_flags(); 4429 4430 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 4431 4432 if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) 4433 rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); 4434 4435 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4436 4437 svm_complete_interrupts(vcpu); 4438 4439 return svm_exit_handlers_fastpath(vcpu); 4440 } 4441 4442 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 4443 int root_level) 4444 { 4445 struct vcpu_svm *svm = to_svm(vcpu); 4446 unsigned long cr3; 4447 4448 if (npt_enabled) { 4449 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); 4450 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 4451 4452 hv_track_root_tdp(vcpu, root_hpa); 4453 4454 cr3 = vcpu->arch.cr3; 4455 } else if (root_level >= PT64_ROOT_4LEVEL) { 4456 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); 4457 } else { 4458 /* PCID in the guest should be impossible with a 32-bit MMU. */ 4459 WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); 4460 cr3 = root_hpa; 4461 } 4462 4463 svm->vmcb->save.cr3 = cr3; 4464 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 4465 } 4466 4467 static void 4468 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 4469 { 4470 /* 4471 * Patch in the VMMCALL instruction: 4472 */ 4473 hypercall[0] = 0x0f; 4474 hypercall[1] = 0x01; 4475 hypercall[2] = 0xd9; 4476 } 4477 4478 /* 4479 * The kvm parameter can be NULL (module initialization, or invocation before 4480 * VM creation). Be sure to check the kvm parameter before using it. 
/*
 * Report whether KVM on SVM emulates MSR @index.
 *
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation). Be sure to check the kvm parameter before using it.
 *
 * Returns false for MSR_IA32_MCG_EXT_CTL, for the emulated-VMX MSR range, and
 * for MSR_IA32_SMBASE when SMM support is compiled out or the VM is an SEV-ES
 * guest; true for everything else.
 */
static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
	switch (index) {
	case MSR_IA32_MCG_EXT_CTL:
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		return false;
	case MSR_IA32_SMBASE:
		if (!IS_ENABLED(CONFIG_KVM_SMM))
			return false;
		/* SEV-ES guests do not support SMM, so report false */
		if (kvm && sev_es_guest(kvm))
			return false;
		break;
	default:
		break;
	}

	return true;
}

/*
 * Adjust SVM-specific guest CPU capabilities after userspace has set the
 * vCPU's CPUID, then let the SEV code do the same for SEV guests.
 */
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
	 * can only disable all variants of XSAVE by disallowing CR4.OSXSAVE
	 * from being set.  As a result, if the host has XSAVE and XSAVES, and
	 * the guest has XSAVE enabled, the guest can execute XSAVES without
	 * faulting.  Treat XSAVES as enabled in this case regardless of
	 * whether it's advertised to the guest so that KVM context switches
	 * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
	 * the guest read/write access to the host's XSS.
	 */
	guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES,
			     boot_cpu_has(X86_FEATURE_XSAVES) &&
			     guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE));

	/*
	 * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
	 * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
	 * SVM on Intel is bonkers and extremely unlikely to work).
	 */
	if (guest_cpuid_is_intel_compatible(vcpu))
		guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);

	if (sev_guest(vcpu->kvm))
		sev_vcpu_after_set_cpuid(svm);
}
/* WBINVD exiting is always reported as available on SVM. */
static bool svm_has_wbinvd_exit(void)
{
	return true;
}

/*
 * Initializer shorthands for x86_intercept_map[]: each pairs an SVM exit code
 * with the emulator stage at which that intercept is checked.
 */
#define PRE_EX(exit)  { .exit_code = (exit), \
			.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit) { .exit_code = (exit), \
			.stage = X86_ICPT_POST_MEMACCESS, }

/*
 * Maps the emulator's x86_intercept_* identifiers (array index) to the SVM
 * exit code to report and the stage at which svm_check_intercept() acts on
 * it.  Entries that share a base exit code (CRn/DRn accesses, MSR, IOIO) are
 * refined in svm_check_intercept().
 */
static const struct __x86_intercept {
	u32 exit_code;
	enum x86_intercept_stage stage;
} x86_intercept_map[] = {
	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM
/*
 * Check whether an instruction being emulated hits a nested intercept.
 *
 * Looks up @info->intercept in x86_intercept_map[], refines the exit code and
 * exit info for the instruction at hand, stores them in the current VMCB and
 * asks nested_svm_exit_handled() for the verdict.
 *
 * Returns X86EMUL_INTERCEPTED when nested_svm_exit_handled() reports
 * NESTED_EXIT_DONE.  Returns X86EMUL_CONTINUE otherwise, including when the
 * intercept is outside the map, @stage is not the stage recorded for it, or
 * the PAUSE check below bails out.
 *
 * NOTE(review): @exception is not used in this function.
 */
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		/* MOV-from-CRn: select the exit code for the CR in ModRM.reg. */
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;

		/*
		 * Adjust the exit code accordingly if a CR other than CR0 is
		 * being written, and skip straight to the common handling as
		 * only CR0 has an additional selective intercept.
		 */
		if (info->intercept == x86_intercept_cr_write && info->modrm_reg) {
			icpt_info.exit_code += info->modrm_reg;
			break;
		}

		/*
		 * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a
		 * selective CR0 intercept is triggered (the common logic will
		 * treat the selective intercept as being enabled).  Note, the
		 * unconditional intercept has higher priority, i.e. this is
		 * only relevant if *only* the selective intercept is enabled.
		 */
		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) ||
		    !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))
			break;

		/* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */
		if (info->intercept == x86_intercept_clts)
			break;

		/* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */
		if (info->intercept == x86_intercept_lmsw) {
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
			break;
		}

		/*
		 * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit
		 * other than SVM_CR0_SELECTIVE_MASK is changed.
		 */
		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		/* exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * This intercept is reported for NOP; PAUSE is NOP with a REP
		 * prefix, so only treat the instruction as PAUSE when that
		 * prefix is present.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		/* Port number goes in bits 31:16; IN/INS also set the type bit. */
		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip  = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}

/*
 * Exit handling that is dispatched purely on the exit code: machine-check
 * exceptions go to svm_handle_mce(), and an external-interrupt exit marks the
 * vCPU as being at an instruction boundary.
 */
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
	switch (to_svm(vcpu)->vmcb->control.exit_code) {
	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
		svm_handle_mce(vcpu);
		break;
	case SVM_EXIT_INTR:
		vcpu->arch.at_instruction_boundary = true;
		break;
	default:
		break;
	}
}

/* Restrict the vCPU's MCG_CAP to its low nine bits. */
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}
/* An SMI is blocked while GIF is clear or while the vCPU is already in SMM. */
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Per APM Vol.2 15.22.2 "Response to SMI" */
	if (!gif_set(svm))
		return true;

	return is_smm(vcpu);
}

/*
 * Returns 1 if an SMI may be taken now, 0 if it is blocked, and -EBUSY if a
 * nested run is pending or if the SMI is being injected while in guest mode
 * and nested_exit_on_smi() says it must exit instead.
 */
static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_smi_blocked(vcpu))
		return 0;

	/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
		return -EBUSY;

	return 1;
}

/*
 * SMM entry hook.  Nothing to do unless the vCPU is in guest mode; in that
 * case record the nested state in the 64-bit SMRAM image, force a nested
 * #VMEXIT, and stash VMCB01's VMRUN state in the L1 host save area.
 *
 * Returns 0 on success, 1 if the guest lacks long mode or the host save area
 * cannot be mapped, or the error returned by nested_svm_simple_vmexit().
 */
static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map_save;
	int ret;

	if (!is_guest_mode(vcpu))
		return 0;

	/*
	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
	 */

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return 1;

	smram->smram64.svm_guest_flag = 1;
	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;

	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
	if (ret)
		return ret;

	/*
	 * KVM uses VMCB01 to store L1 host state while L2 runs but
	 * VMCB01 is going to be used during SMM and thus the state will
	 * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
	 * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
	 * format of the area is identical to the guest save area, offset
	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
	 * within 'struct vmcb'). Note: HSAVE area may also be used by
	 * L1 hypervisor to save additional host context (e.g. KVM does
	 * that, see svm_prepare_switch_to_guest()) which must be
	 * preserved.
	 */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		return 1;

	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

	svm_copy_vmrun_state(map_save.hva + 0x400,
			     &svm->vmcb01.ptr->save);

	kvm_vcpu_unmap(vcpu, &map_save);
	return 0;
}
/*
 * SMM exit hook (counterpart of svm_enter_smm()).  If the SMRAM image says
 * the SMI arrived while the vCPU was in guest mode, restore VMCB01's VMRUN
 * state from the L1 host save area and re-enter the nested guest described
 * by the saved vmcb12 GPA.
 *
 * Returns 0 on success or when there is nothing to restore, 1 on any
 * failure.  Both guest mappings are released on every path that took them.
 */
static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map, map_save;
	struct vmcb *vmcb12;
	int ret;

	const struct kvm_smram_state_64 *smram64 = &smram->smram64;

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
		return 0;

	/* Non-zero if SMI arrived while vCPU was in guest mode. */
	if (!smram64->svm_guest_flag)
		return 0;

	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
		return 1;

	if (!(smram64->efer & EFER_SVME))
		return 1;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
		return 1;

	/* From here on, every failure must unwind through the labels below. */
	ret = 1;
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		goto unmap_map;

	if (svm_allocate_nested(svm))
		goto unmap_save;

	/*
	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
	 * used during SMM (see svm_enter_smm())
	 */

	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);

	/*
	 * Enter the nested guest now
	 */

	vmcb_mark_all_dirty(svm->vmcb01.ptr);

	vmcb12 = map.hva;
	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);

	if (nested_svm_check_cached_vmcb12(vcpu) < 0)
		goto unmap_save;

	if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa,
				 vmcb12, false) != 0)
		goto unmap_save;

	ret = 0;
	svm->nested.nested_run_pending = 1;

unmap_save:
	kvm_vcpu_unmap(vcpu, &map_save);
unmap_map:
	kvm_vcpu_unmap(vcpu, &map);
	return ret;
}
*/ 4848 if (!smram64->svm_guest_flag) 4849 return 0; 4850 4851 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) 4852 return 1; 4853 4854 if (!(smram64->efer & EFER_SVME)) 4855 return 1; 4856 4857 if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map)) 4858 return 1; 4859 4860 ret = 1; 4861 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save)) 4862 goto unmap_map; 4863 4864 if (svm_allocate_nested(svm)) 4865 goto unmap_save; 4866 4867 /* 4868 * Restore L1 host state from L1 HSAVE area as VMCB01 was 4869 * used during SMM (see svm_enter_smm()) 4870 */ 4871 4872 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400); 4873 4874 /* 4875 * Enter the nested guest now 4876 */ 4877 4878 vmcb_mark_all_dirty(svm->vmcb01.ptr); 4879 4880 vmcb12 = map.hva; 4881 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); 4882 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); 4883 4884 if (nested_svm_check_cached_vmcb12(vcpu) < 0) 4885 goto unmap_save; 4886 4887 if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, 4888 vmcb12, false) != 0) 4889 goto unmap_save; 4890 4891 ret = 0; 4892 svm->nested.nested_run_pending = 1; 4893 4894 unmap_save: 4895 kvm_vcpu_unmap(vcpu, &map_save); 4896 unmap_map: 4897 kvm_vcpu_unmap(vcpu, &map); 4898 return ret; 4899 } 4900 4901 static void svm_enable_smi_window(struct kvm_vcpu *vcpu) 4902 { 4903 struct vcpu_svm *svm = to_svm(vcpu); 4904 4905 if (!gif_set(svm)) { 4906 if (vgif) 4907 svm_set_intercept(svm, INTERCEPT_STGI); 4908 /* STGI will cause a vm exit */ 4909 } else { 4910 /* We must be in SMM; RSM will cause a vmexit anyway. 
/*
 * Decide whether KVM may emulate the instruction that caused the current
 * exit.
 *
 * @emul_type: EMULTYPE_* flags describing the emulation request.
 * @insn:      prefilled instruction bytes, or NULL if none are available.
 * @insn_len:  number of valid bytes in @insn.
 *
 * Returns X86EMUL_CONTINUE if emulation may proceed, or one of
 * X86EMUL_UNHANDLEABLE_VECTORING, X86EMUL_RETRY_INSTR, X86EMUL_UNHANDLEABLE
 * and X86EMUL_PROPAGATE_FAULT as explained at each return below.
 */
static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					 void *insn, int insn_len)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool smep, smap, is_user;
	u64 error_code;

	/* Check that emulation is possible during event vectoring */
	if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) &&
	    !kvm_can_emulate_event_vectoring(emul_type))
		return X86EMUL_UNHANDLEABLE_VECTORING;

	/* Emulation is always possible when KVM has access to all guest state. */
	if (!sev_guest(vcpu->kvm))
		return X86EMUL_CONTINUE;

	/* #UD and #GP should never be intercepted for SEV guests. */
	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
				  EMULTYPE_TRAP_UD_FORCED |
				  EMULTYPE_VMWARE_GP));

	/*
	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
	 * to guest register state.
	 */
	if (sev_es_guest(vcpu->kvm))
		return X86EMUL_RETRY_INSTR;

	/*
	 * Emulation is possible if the instruction is already decoded, e.g.
	 * when completing I/O after returning from userspace.
	 */
	if (emul_type & EMULTYPE_NO_DECODE)
		return X86EMUL_CONTINUE;

	/*
	 * Emulation is possible for SEV guests if and only if a prefilled
	 * buffer containing the bytes of the intercepted instruction is
	 * available. SEV guest memory is encrypted with a guest specific key
	 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
	 * decode garbage.
	 *
	 * If KVM is NOT trying to simply skip an instruction, inject #UD if
	 * KVM reached this point without an instruction buffer.  In practice,
	 * this path should never be hit by a well-behaved guest, e.g. KVM
	 * doesn't intercept #UD or #GP for SEV guests, but this path is still
	 * theoretically reachable, e.g. via unaccelerated fault-like AVIC
	 * access, and needs to be handled by KVM to avoid putting the guest
	 * into an infinite loop.   Injecting #UD is somewhat arbitrary, but
	 * it's the least awful option given lack of insight into the guest.
	 *
	 * If KVM is trying to skip an instruction, simply resume the guest.
	 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
	 * will attempt to re-inject the INT3/INTO and skip the instruction.
	 * In that scenario, retrying the INT3/INTO and hoping the guest will
	 * make forward progress is the only option that has a chance of
	 * success (and in practice it will work the vast majority of the time).
	 */
	if (unlikely(!insn)) {
		if (emul_type & EMULTYPE_SKIP)
			return X86EMUL_UNHANDLEABLE;

		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/*
	 * Emulate for SEV guests if the insn buffer is not empty.  The buffer
	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
	 * the faulting instruction because the code fetch itself faulted, e.g.
	 * the guest attempted to fetch from emulated MMIO or a guest page
	 * table used to translate CS:RIP resides in emulated MMIO.
	 */
	if (likely(insn_len))
		return X86EMUL_CONTINUE;

	/*
	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
	 *
	 * Errata:
	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
	 * possible that CPU microcode implementing DecodeAssist will fail to
	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
	 * be '0'.  This happens because microcode reads CS:RIP using a _data_
	 * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
	 * gives up and does not fill the instruction bytes buffer.
	 *
	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
	 * GuestIntrBytes field of the VMCB.
	 *
	 * This does _not_ mean that the erratum has been encountered, as the
	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
	 * #PF, e.g. if the guest attempts to execute from emulated MMIO and
	 * encountered a reserved/not-present #PF.
	 *
	 * To hit the erratum, the following conditions must be true:
	 *    1. CR4.SMAP=1 (obviously).
	 *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
	 *       have been hit as the guest would have encountered a SMEP
	 *       violation #PF, not a #NPF.
	 *    3. The #NPF is not due to a code fetch, in which case failure to
	 *       retrieve the instruction bytes is legitimate (see above).
	 *
	 * In addition, don't apply the erratum workaround if the #NPF occurred
	 * while translating guest page tables (see below).
	 */
	error_code = svm->vmcb->control.exit_info_1;
	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
		goto resume_guest;

	smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
	smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
	is_user = svm_get_cpl(vcpu) == 3;
	if (smap && (!smep || is_user)) {
		pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");

		/*
		 * If the fault occurred in userspace, arbitrarily inject #GP
		 * to avoid killing the guest and to hopefully avoid confusing
		 * the guest kernel too much, e.g. injecting #PF would not be
		 * coherent with respect to the guest's page tables.  Request
		 * triple fault if the fault occurred in the kernel as there's
		 * no fault that KVM can inject without confusing the guest.
		 * In practice, the triple fault is moot as no sane SEV kernel
		 * will execute from user memory while also running with SMAP=1.
		 */
		if (is_user)
			kvm_inject_gp(vcpu, 0);
		else
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return X86EMUL_PROPAGATE_FAULT;
	}

resume_guest:
	/*
	 * If the erratum was not hit, simply resume the guest and let it fault
	 * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
	 * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
	 * userspace will kill the guest, and letting the emulator read garbage
	 * will yield random behavior and potentially corrupt the guest.
	 *
	 * Simply resuming the guest is technically not a violation of the SEV
	 * architecture.  AMD's APM states that all code fetches and page table
	 * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
	 * APM also states that encrypted accesses to MMIO are "ignored", but
	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
	 * the guest spin is technically "ignoring" the access.
	 */
	return X86EMUL_RETRY_INSTR;
}
5064 */ 5065 return X86EMUL_RETRY_INSTR; 5066 } 5067 5068 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 5069 { 5070 struct vcpu_svm *svm = to_svm(vcpu); 5071 5072 return !gif_set(svm); 5073 } 5074 5075 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 5076 { 5077 if (!sev_es_guest(vcpu->kvm)) 5078 return kvm_vcpu_deliver_sipi_vector(vcpu, vector); 5079 5080 sev_vcpu_deliver_sipi_vector(vcpu, vector); 5081 } 5082 5083 static void svm_vm_destroy(struct kvm *kvm) 5084 { 5085 avic_vm_destroy(kvm); 5086 sev_vm_destroy(kvm); 5087 5088 svm_srso_vm_destroy(); 5089 } 5090 5091 static int svm_vm_init(struct kvm *kvm) 5092 { 5093 int type = kvm->arch.vm_type; 5094 5095 if (type != KVM_X86_DEFAULT_VM && 5096 type != KVM_X86_SW_PROTECTED_VM) { 5097 kvm->arch.has_protected_state = 5098 (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); 5099 to_kvm_sev_info(kvm)->need_init = true; 5100 5101 kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); 5102 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 5103 } 5104 5105 if (!pause_filter_count || !pause_filter_thresh) 5106 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 5107 5108 if (enable_apicv) { 5109 int ret = avic_vm_init(kvm); 5110 if (ret) 5111 return ret; 5112 } 5113 5114 svm_srso_vm_init(); 5115 return 0; 5116 } 5117 5118 static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) 5119 { 5120 struct page *page = snp_safe_alloc_page(); 5121 5122 if (!page) 5123 return NULL; 5124 5125 return page_address(page); 5126 } 5127 5128 struct kvm_x86_ops svm_x86_ops __initdata = { 5129 .name = KBUILD_MODNAME, 5130 5131 .check_processor_compatibility = svm_check_processor_compat, 5132 5133 .hardware_unsetup = svm_hardware_unsetup, 5134 .enable_virtualization_cpu = svm_enable_virtualization_cpu, 5135 .disable_virtualization_cpu = svm_disable_virtualization_cpu, 5136 .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, 5137 .has_emulated_msr = 
/*
 * SVM implementation of the kvm_x86_ops callback table.  The object is
 * marked __initdata.  SMM and SEV callbacks are only populated when
 * CONFIG_KVM_SMM and CONFIG_KVM_AMD_SEV respectively are enabled.
 */
struct kvm_x86_ops svm_x86_ops __initdata = {
	.name = KBUILD_MODNAME,

	.check_processor_compatibility = svm_check_processor_compat,

	.hardware_unsetup = svm_hardware_unsetup,
	.enable_virtualization_cpu = svm_enable_virtualization_cpu,
	.disable_virtualization_cpu = svm_disable_virtualization_cpu,
	.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
	.has_emulated_msr = svm_has_emulated_msr,

	.vcpu_precreate = svm_vcpu_precreate,
	.vcpu_create = svm_vcpu_create,
	.vcpu_free = svm_vcpu_free,
	.vcpu_reset = svm_vcpu_reset,

	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = avic_vcpu_blocking,
	.vcpu_unblocking = avic_vcpu_unblocking,

	.update_exception_bitmap = svm_update_exception_bitmap,
	.get_feature_msr = svm_get_feature_msr,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cpl_no_cache = svm_get_cpl,
	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
	.is_valid_cr0 = svm_is_valid_cr0,
	.set_cr0 = svm_set_cr0,
	.post_set_cr3 = sev_post_set_cr3,
	.is_valid_cr4 = svm_is_valid_cr4,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,
	.get_if_flag = svm_get_if_flag,

	.flush_tlb_all = svm_flush_tlb_all,
	.flush_tlb_current = svm_flush_tlb_current,
	.flush_tlb_gva = svm_flush_tlb_gva,
	.flush_tlb_guest = svm_flush_tlb_guest,

	.vcpu_pre_run = svm_vcpu_pre_run,
	.vcpu_run = svm_vcpu_run,
	.handle_exit = svm_handle_exit,
	.skip_emulated_instruction = svm_skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.inject_irq = svm_inject_irq,
	.inject_nmi = svm_inject_nmi,
	.is_vnmi_pending = svm_is_vnmi_pending,
	.set_vnmi_pending = svm_set_vnmi_pending,
	.inject_exception = svm_inject_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = svm_enable_nmi_window,
	.enable_irq_window = svm_enable_irq_window,
	.update_cr8_intercept = svm_update_cr8_intercept,

	.x2apic_icr_is_split = true,
	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
	.apicv_post_state_restore = avic_apicv_post_state_restore,
	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,

	.get_exit_info = svm_get_exit_info,
	.get_entry_info = svm_get_entry_info,

	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
	.write_tsc_offset = svm_write_tsc_offset,
	.write_tsc_multiplier = svm_write_tsc_multiplier,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.nested_ops = &svm_nested_ops,

	.deliver_interrupt = svm_deliver_interrupt,
	.pi_update_irte = avic_pi_update_irte,
	.setup_mce = svm_setup_mce,

#ifdef CONFIG_KVM_SMM
	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
	.leave_smm = svm_leave_smm,
	.enable_smi_window = svm_enable_smi_window,
#endif

#ifdef CONFIG_KVM_AMD_SEV
	.dev_get_attr = sev_dev_get_attr,
	.mem_enc_ioctl = sev_mem_enc_ioctl,
	.mem_enc_register_region = sev_mem_enc_register_region,
	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
	.guest_memory_reclaimed = sev_guest_memory_reclaimed,

	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
#endif
	.check_emulate_instruction = svm_check_emulate_instruction,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,

	.recalc_intercepts = svm_recalc_intercepts,
	.complete_emulated_msr = svm_complete_emulated_msr,

	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
	.alloc_apic_backing_page = svm_alloc_apic_backing_page,

	.gmem_prepare = sev_gmem_prepare,
	.gmem_invalidate = sev_gmem_invalidate,
	.gmem_max_mapping_level = sev_gmem_max_mapping_level,
};
This mask, along with 5301 * the present bit, will be used to generate a page fault with 5302 * PFER.RSV = 1. 5303 * 5304 * If the mask bit location is 52 (or above), then clear the mask. 5305 */ 5306 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 5307 5308 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 5309 } 5310 5311 static __init void svm_set_cpu_caps(void) 5312 { 5313 kvm_initialize_cpu_caps(); 5314 5315 kvm_caps.supported_perf_cap = 0; 5316 5317 kvm_cpu_cap_clear(X86_FEATURE_IBT); 5318 5319 /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 5320 if (nested) { 5321 kvm_cpu_cap_set(X86_FEATURE_SVM); 5322 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); 5323 5324 /* 5325 * KVM currently flushes TLBs on *every* nested SVM transition, 5326 * and so for all intents and purposes KVM supports flushing by 5327 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush. 5328 */ 5329 kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID); 5330 5331 if (nrips) 5332 kvm_cpu_cap_set(X86_FEATURE_NRIPS); 5333 5334 if (npt_enabled) 5335 kvm_cpu_cap_set(X86_FEATURE_NPT); 5336 5337 if (tsc_scaling) 5338 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); 5339 5340 if (vls) 5341 kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); 5342 if (lbrv) 5343 kvm_cpu_cap_set(X86_FEATURE_LBRV); 5344 5345 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) 5346 kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); 5347 5348 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) 5349 kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); 5350 5351 if (vgif) 5352 kvm_cpu_cap_set(X86_FEATURE_VGIF); 5353 5354 if (vnmi) 5355 kvm_cpu_cap_set(X86_FEATURE_VNMI); 5356 5357 /* Nested VM can receive #VMEXIT instead of triggering #GP */ 5358 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 5359 } 5360 5361 if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) 5362 kvm_caps.has_bus_lock_exit = true; 5363 5364 /* CPUID 0x80000008 */ 5365 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 5366 boot_cpu_has(X86_FEATURE_AMD_SSBD)) 5367 
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 5368 5369 if (enable_pmu) { 5370 /* 5371 * Enumerate support for PERFCTR_CORE if and only if KVM has 5372 * access to enough counters to virtualize "core" support, 5373 * otherwise limit vPMU support to the legacy number of counters. 5374 */ 5375 if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) 5376 kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, 5377 kvm_pmu_cap.num_counters_gp); 5378 else 5379 kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); 5380 5381 if (kvm_pmu_cap.version != 2 || 5382 !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) 5383 kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); 5384 } 5385 5386 /* CPUID 0x8000001F (SME/SEV features) */ 5387 sev_set_cpu_caps(); 5388 5389 /* 5390 * Clear capabilities that are automatically configured by common code, 5391 * but that require explicit SVM support (that isn't yet implemented). 5392 */ 5393 kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); 5394 kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM); 5395 5396 kvm_setup_xss_caps(); 5397 kvm_finalize_cpu_caps(); 5398 } 5399 5400 static __init int svm_hardware_setup(void) 5401 { 5402 void *iopm_va; 5403 int cpu, r; 5404 5405 /* 5406 * NX is required for shadow paging and for NPT if the NX huge pages 5407 * mitigation is enabled. 
5408 */ 5409 if (!boot_cpu_has(X86_FEATURE_NX)) { 5410 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 5411 return -EOPNOTSUPP; 5412 } 5413 kvm_enable_efer_bits(EFER_NX); 5414 5415 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 5416 XFEATURE_MASK_BNDCSR); 5417 5418 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 5419 kvm_enable_efer_bits(EFER_FFXSR); 5420 5421 if (tsc_scaling) { 5422 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 5423 tsc_scaling = false; 5424 } else { 5425 pr_info("TSC scaling supported\n"); 5426 kvm_caps.has_tsc_control = true; 5427 } 5428 } 5429 kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; 5430 kvm_caps.tsc_scaling_ratio_frac_bits = 32; 5431 5432 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 5433 5434 if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) 5435 kvm_enable_efer_bits(EFER_AUTOIBRS); 5436 5437 /* Check for pause filtering support */ 5438 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 5439 pause_filter_count = 0; 5440 pause_filter_thresh = 0; 5441 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 5442 pause_filter_thresh = 0; 5443 } 5444 5445 if (nested) { 5446 pr_info("Nested Virtualization enabled\n"); 5447 kvm_enable_efer_bits(EFER_SVME); 5448 if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ)) 5449 kvm_enable_efer_bits(EFER_LMSLE); 5450 5451 r = nested_svm_init_msrpm_merge_offsets(); 5452 if (r) 5453 return r; 5454 } 5455 5456 /* 5457 * KVM's MMU doesn't support using 2-level paging for itself, and thus 5458 * NPT isn't supported if the host is using 2-level paging since host 5459 * CR4 is unchanged on VMRUN. 
5460 */ 5461 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 5462 npt_enabled = false; 5463 5464 if (!boot_cpu_has(X86_FEATURE_NPT)) 5465 npt_enabled = false; 5466 5467 /* Force VM NPT level equal to the host's paging level */ 5468 kvm_configure_mmu(npt_enabled, get_npt_level(), 5469 get_npt_level(), PG_LEVEL_1G); 5470 pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); 5471 5472 /* 5473 * It seems that on AMD processors PTE's accessed bit is 5474 * being set by the CPU hardware before the NPF vmexit. 5475 * This is not expected behaviour and our tests fail because 5476 * of it. 5477 * A workaround here is to disable support for 5478 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 5479 * In this case userspace can know if there is support using 5480 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 5481 * it 5482 * If future AMD CPU models change the behaviour described above, 5483 * this variable can be changed accordingly 5484 */ 5485 allow_smaller_maxphyaddr = !npt_enabled; 5486 5487 /* Setup shadow_me_value and shadow_me_mask */ 5488 kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); 5489 5490 svm_adjust_mmio_mask(); 5491 5492 nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); 5493 5494 if (lbrv) { 5495 if (!boot_cpu_has(X86_FEATURE_LBRV)) 5496 lbrv = false; 5497 else 5498 pr_info("LBR virtualization supported\n"); 5499 } 5500 5501 iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL); 5502 if (!iopm_va) 5503 return -ENOMEM; 5504 5505 iopm_base = __sme_set(__pa(iopm_va)); 5506 5507 /* 5508 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which 5509 * may be modified by svm_adjust_mmio_mask()), as well as nrips. 
5510 */ 5511 sev_hardware_setup(); 5512 5513 svm_hv_hardware_setup(); 5514 5515 enable_apicv = avic_hardware_setup(); 5516 if (!enable_apicv) { 5517 enable_ipiv = false; 5518 svm_x86_ops.vcpu_blocking = NULL; 5519 svm_x86_ops.vcpu_unblocking = NULL; 5520 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; 5521 } 5522 5523 if (vls) { 5524 if (!npt_enabled || 5525 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 5526 !IS_ENABLED(CONFIG_X86_64)) { 5527 vls = false; 5528 } else { 5529 pr_info("Virtual VMLOAD VMSAVE supported\n"); 5530 } 5531 } 5532 5533 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) 5534 svm_gp_erratum_intercept = false; 5535 5536 if (vgif) { 5537 if (!boot_cpu_has(X86_FEATURE_VGIF)) 5538 vgif = false; 5539 else 5540 pr_info("Virtual GIF supported\n"); 5541 } 5542 5543 vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI); 5544 if (vnmi) 5545 pr_info("Virtual NMI enabled\n"); 5546 5547 if (!vnmi) { 5548 svm_x86_ops.is_vnmi_pending = NULL; 5549 svm_x86_ops.set_vnmi_pending = NULL; 5550 } 5551 5552 if (!enable_pmu) 5553 pr_info("PMU virtualization is disabled\n"); 5554 5555 svm_set_cpu_caps(); 5556 5557 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; 5558 5559 for_each_possible_cpu(cpu) { 5560 r = svm_cpu_init(cpu); 5561 if (r) 5562 goto err; 5563 } 5564 5565 return 0; 5566 5567 err: 5568 svm_hardware_unsetup(); 5569 return r; 5570 } 5571 5572 5573 static struct kvm_x86_init_ops svm_init_ops __initdata = { 5574 .hardware_setup = svm_hardware_setup, 5575 5576 .runtime_ops = &svm_x86_ops, 5577 .pmu_ops = &amd_pmu_ops, 5578 }; 5579 5580 static void __svm_exit(void) 5581 { 5582 kvm_x86_vendor_exit(); 5583 } 5584 5585 static int __init svm_init(void) 5586 { 5587 int r; 5588 5589 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm); 5590 5591 __unused_size_checks(); 5592 5593 if (!kvm_is_svm_supported()) 5594 return -EOPNOTSUPP; 5595 5596 r = kvm_x86_vendor_init(&svm_init_ops); 5597 if (r) 5598 return r; 5599 5600 /* 5601 * Common KVM initialization _must_ come 
last, after this, /dev/kvm is 5602 * exposed to userspace! 5603 */ 5604 r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), 5605 THIS_MODULE); 5606 if (r) 5607 goto err_kvm_init; 5608 5609 return 0; 5610 5611 err_kvm_init: 5612 __svm_exit(); 5613 return r; 5614 } 5615 5616 static void __exit svm_exit(void) 5617 { 5618 kvm_exit(); 5619 __svm_exit(); 5620 } 5621 5622 module_init(svm_init) 5623 module_exit(svm_exit) 5624