// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/fred.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/reboot.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>

#include <trace/events/ipi.h>

#include "capabilities.h"
#include "common.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
#include "posted_intr.h"

#include "mmu/spte.h"

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, 0444);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, 0444);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, 0444);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, 0444);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);

module_param(enable_apicv, bool, 0444);
module_param(enable_ipiv, bool, 0444);

module_param(enable_device_posted_irqs, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e. guests may use
 * VMX and be a hypervisor for their own guests.  If nested=0, guests may
 * not use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop.  Also indicates whether PLE is
 *             enabled.  According to tests, this time is usually smaller than
 *             128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop.  Tests indicate that most spinlocks are held
 *             for less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
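
/*
 * Note: the PLE knobs above are read-only (0444) module parameters, i.e. they
 * can only be set when the kvm_intel module is loaded (e.g. via modprobe
 * options or modprobe.d), not changed at runtime through sysfs.
 */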

/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
#endif

struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_AUTO:
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sysfs_emit(s, "???\n");

	return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 msr;

	if (!vmx->disable_fb_clear)
		return;

	msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
	msr |= FB_CLEAR_DIS;
	native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
	/* Cache the MSR value to avoid reading it later */
	vmx->msr_ia32_mcu_opt_ctrl = msr;
}

static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (!vmx->disable_fb_clear)
		return;

	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
	native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}

static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	/*
	 * Disable VERW's behavior of clearing CPU buffers for the guest if the
	 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
	 * the mitigation. Disabling the clearing behavior provides a
	 * performance boost for guests that aren't aware that manually clearing
	 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
	 * and VM-Exit.
	 */
	vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
				(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
				!boot_cpu_has_bug(X86_BUG_MDS) &&
				!boot_cpu_has_bug(X86_BUG_TAA);

	/*
	 * If the guest will not execute VERW, there is no need to set
	 * FB_CLEAR_DIS at VM-Entry.  Skip the MSR read/write when a guest has
	 * no use case to execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static u32 vmx_segment_access_rights(struct kvm_segment *var);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

noinline void vmread_error(unsigned long field)
{
	vmx_insn_failed("vmread failed: field=%lx\n", field);
}

#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
{
	if (fault) {
		kvm_spurious_fault();
	} else {
		instrumentation_begin();
		vmread_error(field);
		instrumentation_end();
	}
}
#endif

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp)
{
	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCSs loaded on that CPU.  This is
 * needed when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded
 * on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability vmx_capability __ro_after_init;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};


static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);

	if (partition_assist_page == INVALID_PAGE)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page = partition_assist_page;
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above.
	 */
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	     KVM_EVMCS_VERSION) {

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("Using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&__kvm_is_using_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vt_x86_ops.enable_l2_tlb_flush
				= hv_enable_l2_tlb_flush;
	} else {
		enlightened_vmcs = false;
	}
}

static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *vp_ap;

	if (!kvm_is_using_evmcs())
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled but the
	 * CPU doesn't have a VP assist page allocated.
	 */
	vp_ap = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!vp_ap))
		return;

	/*
	 * Reset everything to support using non-enlightened VMCS access later
	 * (e.g. when we reload the module with enlightened_vmcs=0)
	 */
	vp_ap->nested_control.features.directhypercall = 0;
	vp_ap->current_nested_vmcs = 0;
	vp_ap->enlighten_vmentry = 0;
}

#else /* IS_ENABLED(CONFIG_HYPERV) */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment's format: document - errata name - stepping - processor name.
 * Taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}

static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}

/*
 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
 *
 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 * atomically track post-VMXON state, e.g. this may be called in NMI context.
 * Eat all faults, as all other faults on VMXOFF are mode related, i.e. faults
 * are guaranteed to be due to the !post-VMXON check unless the CPU is
 * magically in RM, VM86, compat mode, or at CPL>0.
 */
static int kvm_cpu_vmxoff(void)
{
	asm goto("1: vmxoff\n\t"
			  _ASM_EXTABLE(1b, %l[fault])
			  ::: "cc", "memory" : fault);

	cr4_clear_bits(X86_CR4_VMXE);
	return 0;

fault:
	cr4_clear_bits(X86_CR4_VMXE);
	return -EIO;
}

void vmx_emergency_disable_virtualization_cpu(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	kvm_rebooting = true;

	/*
	 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
	 * set in task context.  If this races with VMX being disabled by an
	 * NMI, VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due
	 * to kvm_rebooting being set.
	 */
	if (!(__read_cr4() & X86_CR4_VMXE))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link) {
		vmcs_clear(v->vmcs);
		if (v->shadow_vmcs)
			vmcs_clear(v->shadow_vmcs);
	}

	kvm_cpu_vmxoff();
}

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
	 * and add loaded_vmcs to its percpu list before it's deleted from this
	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * #VE isn't used for VMX.  To test against unexpected changes
	 * related to #VE for VMX, intercept unexpected #VE and warn on it.
	 */
	if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
		eb |= 1u << VE_VECTOR;
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;
	else {
		int mask = 0, match = 0;

		if (enable_ept && (eb & (1u << PF_VECTOR))) {
			/*
			 * If EPT is enabled, #PF is currently only intercepted
			 * if MAXPHYADDR is smaller on the guest than on the
			 * host.  In that case we only care about present,
			 * non-reserved faults.  For vmcs02, however, PFEC_MASK
			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
			 */
			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
			match = PFERR_PRESENT_MASK;
		}
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
	}

	/*
	 * Disabling xfd interception indicates that dynamic xfeatures
	 * might be used in the guest. Always trap #NM in this case
	 * to save guest xfd_err timely.
	 */
	if (vcpu->arch.xfd_no_write_intercept)
		eb |= (1u << NM_VECTOR);

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check if the MSR is intercepted for the currently loaded MSR bitmap.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
		return true;

	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}

unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
{
	unsigned int flags = 0;

	if (vmx->loaded_vmcs->launched)
		flags |= VMX_RUN_VMRESUME;

	/*
	 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
	 * to change it directly without causing a vmexit.  In that case read
	 * it after vmexit and store it in vmx->spec_ctrl.
	 */
	if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
		flags |= VMX_RUN_SAVE_SPEC_CTRL;

	if (static_branch_unlikely(&cpu_buf_vm_clear) &&
	    kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
		flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;

	return flags;
}

static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i) {
		if (m->val[i].index == msr)
			return i;
	}
	return -ENOENT;
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (i < 0)
		return;

	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}

static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 &&  m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != kvm_host.efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, kvm_host.efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	guest_efer &= ~ignore_bits;
	guest_efer |= kvm_host.efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif
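
/*
 * Intel PT (Processor Trace) helpers.  Guest writes to the RTIT MSRs are only
 * allowed in host-guest mode and while tracing is disabled; in that mode the
 * PT MSR context is swapped between host and guest around VM-Entry/VM-Exit,
 * see pt_guest_enter() and pt_guest_exit() below.
 */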
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		wrmsrq(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vcpu_vt *vt = to_vt(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_uret_msrs_loaded) {
		vmx->guest_uret_msrs_loaded = true;
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (!vmx->guest_uret_msrs[i].load_into_hardware)
				continue;

			kvm_set_user_return_msr(i,
						vmx->guest_uret_msrs[i].data,
						vmx->guest_uret_msrs[i].mask);
		}
	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vt->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vt->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vt->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->vt.guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->vt.guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->vt.guest_state_loaded)
		rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->vt.guest_state_loaded)
		wrmsrq(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif
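
/*
 * Grow the vCPU's PLE window by ple_window_grow, capped at ple_window_max,
 * and mark it dirty so the new value is written to the VMCS before the next
 * VM-Enter.  The window is shrunk again (see shrink_ple_window()) when the
 * vCPU is reloaded after having been scheduled out.
 */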
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old, ple_window,
					    ple_window_grow,
					    ple_window_max);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old, ple_window,
					      ple_window_shrink,
					      ple_window);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switch to the specified vcpu, until a matching vcpu_put(); assumes the
 * vcpu mutex is already taken.
 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu);

	vmx_vcpu_pi_load(vcpu, cpu);
}

void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	/*
	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
	 * is an unrestricted guest in order to mark L2 as needing emulation
	 * if L1 runs L2 as a restricted guest.
	 */
	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}

bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}
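
/*
 * Validate a guest WRMSR to IA32_RTIT_CTL against the Processor Trace
 * capabilities enumerated to the guest; a non-zero return means the write
 * uses reserved bits or encodings and must be rejected (#GP).
 */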
KVM doesn't emulate T-bit traps 1736 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1737 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1738 * as ICEBP is higher priority than both. As instruction emulation is 1739 * completed at this point (i.e. KVM is at the instruction boundary), 1740 * any #DB exception pending delivery must be a debug-trap of lower 1741 * priority than MTF. Record the pending MTF state to be delivered in 1742 * vmx_check_nested_events(). 1743 */ 1744 if (nested_cpu_has_mtf(vmcs12) && 1745 (!vcpu->arch.exception.pending || 1746 vcpu->arch.exception.vector == DB_VECTOR) && 1747 (!vcpu->arch.exception_vmexit.pending || 1748 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1749 vmx->nested.mtf_pending = true; 1750 kvm_make_request(KVM_REQ_EVENT, vcpu); 1751 } else { 1752 vmx->nested.mtf_pending = false; 1753 } 1754 } 1755 1756 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1757 { 1758 vmx_update_emulated_instruction(vcpu); 1759 return skip_emulated_instruction(vcpu); 1760 } 1761 1762 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1763 { 1764 /* 1765 * Ensure that we clear the HLT state in the VMCS. We don't need to 1766 * explicitly skip the instruction because if the HLT state is set, 1767 * then the instruction is already executing and RIP has already been 1768 * advanced. 1769 */ 1770 if (kvm_hlt_in_guest(vcpu->kvm) && 1771 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1772 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1773 } 1774 1775 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1776 { 1777 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1778 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1779 struct vcpu_vmx *vmx = to_vmx(vcpu); 1780 1781 kvm_deliver_exception_payload(vcpu, ex); 1782 1783 if (ex->has_error_code) { 1784 /* 1785 * Despite the error code being architecturally defined as 32 1786 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1787 * VMX don't actually supporting setting bits 31:16. Hardware 1788 * will (should) never provide a bogus error code, but AMD CPUs 1789 * do generate error codes with bits 31:16 set, and so KVM's 1790 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1791 * the upper bits to avoid VM-Fail, losing information that 1792 * doesn't really exist is preferable to killing the VM. 
1793 */ 1794 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1795 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1796 } 1797 1798 if (vmx->rmode.vm86_active) { 1799 int inc_eip = 0; 1800 if (kvm_exception_is_soft(ex->vector)) 1801 inc_eip = vcpu->arch.event_exit_inst_len; 1802 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1803 return; 1804 } 1805 1806 WARN_ON_ONCE(vmx->vt.emulation_required); 1807 1808 if (kvm_exception_is_soft(ex->vector)) { 1809 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1810 vmx->vcpu.arch.event_exit_inst_len); 1811 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1812 } else 1813 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1814 1815 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1816 1817 vmx_clear_hlt(vcpu); 1818 } 1819 1820 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1821 bool load_into_hardware) 1822 { 1823 struct vmx_uret_msr *uret_msr; 1824 1825 uret_msr = vmx_find_uret_msr(vmx, msr); 1826 if (!uret_msr) 1827 return; 1828 1829 uret_msr->load_into_hardware = load_into_hardware; 1830 } 1831 1832 /* 1833 * Configuring user return MSRs to automatically save, load, and restore MSRs 1834 * that need to be shoved into hardware when running the guest. Note, omitting 1835 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1836 * loaded into hardware when running the guest. 1837 */ 1838 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1839 { 1840 #ifdef CONFIG_X86_64 1841 bool load_syscall_msrs; 1842 1843 /* 1844 * The SYSCALL MSRs are only needed on long mode guests, and only 1845 * when EFER.SCE is set. 1846 */ 1847 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1848 (vmx->vcpu.arch.efer & EFER_SCE); 1849 1850 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1851 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1852 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1853 #endif 1854 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1855 1856 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1857 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1858 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1859 1860 /* 1861 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1862 * kernel and old userspace. If those guests run on a tsx=off host, do 1863 * allow guests to use TSX_CTRL, but don't change the value in hardware 1864 * so that TSX remains always disabled. 1865 */ 1866 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1867 1868 /* 1869 * The set of MSRs to load may have changed, reload MSRs before the 1870 * next VM-Enter. 
1871 */ 1872 vmx->guest_uret_msrs_loaded = false; 1873 } 1874 1875 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1876 { 1877 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1878 1879 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1880 return vmcs12->tsc_offset; 1881 1882 return 0; 1883 } 1884 1885 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1886 { 1887 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1888 1889 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1890 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1891 return vmcs12->tsc_multiplier; 1892 1893 return kvm_caps.default_tsc_scaling_ratio; 1894 } 1895 1896 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1897 { 1898 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1899 } 1900 1901 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1902 { 1903 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1904 } 1905 1906 /* 1907 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1908 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1909 * backwards compatibility even though KVM doesn't support emulating SMX. And 1910 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1911 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1912 */ 1913 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1914 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1915 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1916 FEAT_CTL_SGX_LC_ENABLED | \ 1917 FEAT_CTL_SGX_ENABLED | \ 1918 FEAT_CTL_LMCE_ENABLED) 1919 1920 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1921 struct msr_data *msr) 1922 { 1923 uint64_t valid_bits; 1924 1925 /* 1926 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1927 * exposed to the guest. 1928 */ 1929 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1930 ~KVM_SUPPORTED_FEATURE_CONTROL); 1931 1932 if (!msr->host_initiated && 1933 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1934 return false; 1935 1936 if (msr->host_initiated) 1937 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 1938 else 1939 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 1940 1941 return !(msr->data & ~valid_bits); 1942 } 1943 1944 int vmx_get_feature_msr(u32 msr, u64 *data) 1945 { 1946 switch (msr) { 1947 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 1948 if (!nested) 1949 return 1; 1950 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 1951 default: 1952 return KVM_MSR_RET_UNSUPPORTED; 1953 } 1954 } 1955 1956 /* 1957 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 1958 * Returns 0 on success, non-0 otherwise. 1959 * Assumes vcpu_load() was already called. 
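 *
 * Minimal usage sketch (caller-side names are illustrative only); a
 * non-zero return means the emulated RDMSR faults and the caller is
 * expected to inject #GP:
 *
 *	struct msr_data msr = {
 *		.index = MSR_IA32_SYSENTER_CS,
 *		.host_initiated = false,
 *	};
 *	if (vmx_get_msr(vcpu, &msr))
 *		return 1;
 *	val = msr.data;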
1960 */ 1961 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1962 { 1963 struct vcpu_vmx *vmx = to_vmx(vcpu); 1964 struct vmx_uret_msr *msr; 1965 u32 index; 1966 1967 switch (msr_info->index) { 1968 #ifdef CONFIG_X86_64 1969 case MSR_FS_BASE: 1970 msr_info->data = vmcs_readl(GUEST_FS_BASE); 1971 break; 1972 case MSR_GS_BASE: 1973 msr_info->data = vmcs_readl(GUEST_GS_BASE); 1974 break; 1975 case MSR_KERNEL_GS_BASE: 1976 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 1977 break; 1978 #endif 1979 case MSR_EFER: 1980 return kvm_get_msr_common(vcpu, msr_info); 1981 case MSR_IA32_TSX_CTRL: 1982 if (!msr_info->host_initiated && 1983 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 1984 return 1; 1985 goto find_uret_msr; 1986 case MSR_IA32_UMWAIT_CONTROL: 1987 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 1988 return 1; 1989 1990 msr_info->data = vmx->msr_ia32_umwait_control; 1991 break; 1992 case MSR_IA32_SPEC_CTRL: 1993 if (!msr_info->host_initiated && 1994 !guest_has_spec_ctrl_msr(vcpu)) 1995 return 1; 1996 1997 msr_info->data = to_vmx(vcpu)->spec_ctrl; 1998 break; 1999 case MSR_IA32_SYSENTER_CS: 2000 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2001 break; 2002 case MSR_IA32_SYSENTER_EIP: 2003 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2004 break; 2005 case MSR_IA32_SYSENTER_ESP: 2006 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2007 break; 2008 case MSR_IA32_BNDCFGS: 2009 if (!kvm_mpx_supported() || 2010 (!msr_info->host_initiated && 2011 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2012 return 1; 2013 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2014 break; 2015 case MSR_IA32_MCG_EXT_CTL: 2016 if (!msr_info->host_initiated && 2017 !(vmx->msr_ia32_feature_control & 2018 FEAT_CTL_LMCE_ENABLED)) 2019 return 1; 2020 msr_info->data = vcpu->arch.mcg_ext_ctl; 2021 break; 2022 case MSR_IA32_FEAT_CTL: 2023 msr_info->data = vmx->msr_ia32_feature_control; 2024 break; 2025 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2026 if (!msr_info->host_initiated && 2027 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 2028 return 1; 2029 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2030 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2031 break; 2032 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2033 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2034 return 1; 2035 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2036 &msr_info->data)) 2037 return 1; 2038 #ifdef CONFIG_KVM_HYPERV 2039 /* 2040 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2041 * instead of just ignoring the features, different Hyper-V 2042 * versions are either trying to use them and fail or do some 2043 * sanity checking and refuse to boot. Filter all unsupported 2044 * features out. 
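 *
 * Sketch of the effect (not the exact code): for a control MSR such as
 * MSR_IA32_VMX_PROCBASED_CTLS2, the "allowed-1" settings in bits 63:32
 * have the controls without an eVMCS v1 field masked off, so an
 * eVMCS-aware L1 sees those features as unavailable and never tries to
 * enable something KVM cannot shadow through the eVMCS.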
2045 */ 2046 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) 2047 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2048 &msr_info->data); 2049 #endif 2050 break; 2051 case MSR_IA32_RTIT_CTL: 2052 if (!vmx_pt_mode_is_host_guest()) 2053 return 1; 2054 msr_info->data = vmx->pt_desc.guest.ctl; 2055 break; 2056 case MSR_IA32_RTIT_STATUS: 2057 if (!vmx_pt_mode_is_host_guest()) 2058 return 1; 2059 msr_info->data = vmx->pt_desc.guest.status; 2060 break; 2061 case MSR_IA32_RTIT_CR3_MATCH: 2062 if (!vmx_pt_mode_is_host_guest() || 2063 !intel_pt_validate_cap(vmx->pt_desc.caps, 2064 PT_CAP_cr3_filtering)) 2065 return 1; 2066 msr_info->data = vmx->pt_desc.guest.cr3_match; 2067 break; 2068 case MSR_IA32_RTIT_OUTPUT_BASE: 2069 if (!vmx_pt_mode_is_host_guest() || 2070 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2071 PT_CAP_topa_output) && 2072 !intel_pt_validate_cap(vmx->pt_desc.caps, 2073 PT_CAP_single_range_output))) 2074 return 1; 2075 msr_info->data = vmx->pt_desc.guest.output_base; 2076 break; 2077 case MSR_IA32_RTIT_OUTPUT_MASK: 2078 if (!vmx_pt_mode_is_host_guest() || 2079 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2080 PT_CAP_topa_output) && 2081 !intel_pt_validate_cap(vmx->pt_desc.caps, 2082 PT_CAP_single_range_output))) 2083 return 1; 2084 msr_info->data = vmx->pt_desc.guest.output_mask; 2085 break; 2086 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2087 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2088 if (!vmx_pt_mode_is_host_guest() || 2089 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2090 return 1; 2091 if (index % 2) 2092 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2093 else 2094 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2095 break; 2096 case MSR_IA32_DEBUGCTLMSR: 2097 msr_info->data = vmx_guest_debugctl_read(); 2098 break; 2099 default: 2100 find_uret_msr: 2101 msr = vmx_find_uret_msr(vmx, msr_info->index); 2102 if (msr) { 2103 msr_info->data = msr->data; 2104 break; 2105 } 2106 return kvm_get_msr_common(vcpu, msr_info); 2107 } 2108 2109 return 0; 2110 } 2111 2112 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2113 u64 data) 2114 { 2115 #ifdef CONFIG_X86_64 2116 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2117 return (u32)data; 2118 #endif 2119 return (unsigned long)data; 2120 } 2121 2122 u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2123 { 2124 u64 debugctl = 0; 2125 2126 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2127 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2128 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2129 2130 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2131 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2132 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2133 2134 if (boot_cpu_has(X86_FEATURE_RTM) && 2135 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) 2136 debugctl |= DEBUGCTLMSR_RTM_DEBUG; 2137 2138 return debugctl; 2139 } 2140 2141 bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated) 2142 { 2143 u64 invalid; 2144 2145 invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); 2146 if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { 2147 kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); 2148 invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); 2149 } 2150 return !invalid; 2151 } 2152 2153 /* 2154 * Writes msr value into the appropriate "register". 2155 * Returns 0 on success, non-0 otherwise. 2156 * Assumes vcpu_load() was already called. 
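 *
 * Note the recurring msr_info->host_initiated pattern in the cases
 * below: a guest WRMSR is bounded by guest CPUID (e.g. MSR_IA32_BNDCFGS
 * requires X86_FEATURE_MPX), whereas a host-initiated write, e.g. from
 * the KVM_SET_MSRS ioctl, is permitted so that userspace can restore
 * MSR state regardless of the order in which it configures the vCPU.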
2157 */ 2158 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2159 { 2160 struct vcpu_vmx *vmx = to_vmx(vcpu); 2161 struct vmx_uret_msr *msr; 2162 int ret = 0; 2163 u32 msr_index = msr_info->index; 2164 u64 data = msr_info->data; 2165 u32 index; 2166 2167 switch (msr_index) { 2168 case MSR_EFER: 2169 ret = kvm_set_msr_common(vcpu, msr_info); 2170 break; 2171 #ifdef CONFIG_X86_64 2172 case MSR_FS_BASE: 2173 vmx_segment_cache_clear(vmx); 2174 vmcs_writel(GUEST_FS_BASE, data); 2175 break; 2176 case MSR_GS_BASE: 2177 vmx_segment_cache_clear(vmx); 2178 vmcs_writel(GUEST_GS_BASE, data); 2179 break; 2180 case MSR_KERNEL_GS_BASE: 2181 vmx_write_guest_kernel_gs_base(vmx, data); 2182 break; 2183 case MSR_IA32_XFD: 2184 ret = kvm_set_msr_common(vcpu, msr_info); 2185 /* 2186 * Always intercepting WRMSR could incur non-negligible 2187 * overhead given xfd might be changed frequently in 2188 * guest context switch. Disable write interception 2189 * upon the first write with a non-zero value (indicating 2190 * potential usage on dynamic xfeatures). Also update 2191 * exception bitmap to trap #NM for proper virtualization 2192 * of guest xfd_err. 2193 */ 2194 if (!ret && data) { 2195 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2196 MSR_TYPE_RW); 2197 vcpu->arch.xfd_no_write_intercept = true; 2198 vmx_update_exception_bitmap(vcpu); 2199 } 2200 break; 2201 #endif 2202 case MSR_IA32_SYSENTER_CS: 2203 if (is_guest_mode(vcpu)) 2204 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2205 vmcs_write32(GUEST_SYSENTER_CS, data); 2206 break; 2207 case MSR_IA32_SYSENTER_EIP: 2208 if (is_guest_mode(vcpu)) { 2209 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2210 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2211 } 2212 vmcs_writel(GUEST_SYSENTER_EIP, data); 2213 break; 2214 case MSR_IA32_SYSENTER_ESP: 2215 if (is_guest_mode(vcpu)) { 2216 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2217 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2218 } 2219 vmcs_writel(GUEST_SYSENTER_ESP, data); 2220 break; 2221 case MSR_IA32_DEBUGCTLMSR: 2222 if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) 2223 return 1; 2224 2225 data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2226 2227 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2228 VM_EXIT_SAVE_DEBUG_CONTROLS) 2229 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2230 2231 vmx_guest_debugctl_write(vcpu, data); 2232 2233 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2234 (data & DEBUGCTLMSR_LBR)) 2235 intel_pmu_create_guest_lbr_event(vcpu); 2236 return 0; 2237 case MSR_IA32_BNDCFGS: 2238 if (!kvm_mpx_supported() || 2239 (!msr_info->host_initiated && 2240 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2241 return 1; 2242 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2243 (data & MSR_IA32_BNDCFGS_RSVD)) 2244 return 1; 2245 2246 if (is_guest_mode(vcpu) && 2247 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2248 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2249 get_vmcs12(vcpu)->guest_bndcfgs = data; 2250 2251 vmcs_write64(GUEST_BNDCFGS, data); 2252 break; 2253 case MSR_IA32_UMWAIT_CONTROL: 2254 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2255 return 1; 2256 2257 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2258 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2259 return 1; 2260 2261 vmx->msr_ia32_umwait_control = data; 2262 break; 2263 case MSR_IA32_SPEC_CTRL: 2264 if (!msr_info->host_initiated && 2265 
!guest_has_spec_ctrl_msr(vcpu)) 2266 return 1; 2267 2268 if (kvm_spec_ctrl_test_value(data)) 2269 return 1; 2270 2271 vmx->spec_ctrl = data; 2272 if (!data) 2273 break; 2274 2275 /* 2276 * For non-nested: 2277 * When it's written (to non-zero) for the first time, pass 2278 * it through. 2279 * 2280 * For nested: 2281 * The handling of the MSR bitmap for L2 guests is done in 2282 * nested_vmx_prepare_msr_bitmap. We should not touch the 2283 * vmcs02.msr_bitmap here since it gets completely overwritten 2284 * in the merging. We update the vmcs01 here for L1 as well 2285 * since it will end up touching the MSR anyway now. 2286 */ 2287 vmx_disable_intercept_for_msr(vcpu, 2288 MSR_IA32_SPEC_CTRL, 2289 MSR_TYPE_RW); 2290 break; 2291 case MSR_IA32_TSX_CTRL: 2292 if (!msr_info->host_initiated && 2293 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2294 return 1; 2295 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2296 return 1; 2297 goto find_uret_msr; 2298 case MSR_IA32_CR_PAT: 2299 ret = kvm_set_msr_common(vcpu, msr_info); 2300 if (ret) 2301 break; 2302 2303 if (is_guest_mode(vcpu) && 2304 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2305 get_vmcs12(vcpu)->guest_ia32_pat = data; 2306 2307 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2308 vmcs_write64(GUEST_IA32_PAT, data); 2309 break; 2310 case MSR_IA32_MCG_EXT_CTL: 2311 if ((!msr_info->host_initiated && 2312 !(to_vmx(vcpu)->msr_ia32_feature_control & 2313 FEAT_CTL_LMCE_ENABLED)) || 2314 (data & ~MCG_EXT_CTL_LMCE_EN)) 2315 return 1; 2316 vcpu->arch.mcg_ext_ctl = data; 2317 break; 2318 case MSR_IA32_FEAT_CTL: 2319 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2320 return 1; 2321 2322 vmx->msr_ia32_feature_control = data; 2323 if (msr_info->host_initiated && data == 0) 2324 vmx_leave_nested(vcpu); 2325 2326 /* SGX may be enabled/disabled by guest's firmware */ 2327 vmx_write_encls_bitmap(vcpu, NULL); 2328 break; 2329 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2330 /* 2331 * On real hardware, the LE hash MSRs are writable before 2332 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2333 * at which point SGX related bits in IA32_FEATURE_CONTROL 2334 * become writable. 2335 * 2336 * KVM does not emulate SGX activation for simplicity, so 2337 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2338 * is unlocked. This is technically not architectural 2339 * behavior, but it's close enough. 2340 */ 2341 if (!msr_info->host_initiated && 2342 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2343 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2344 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2345 return 1; 2346 vmx->msr_ia32_sgxlepubkeyhash 2347 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2348 break; 2349 case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: 2350 if (!msr_info->host_initiated) 2351 return 1; /* they are read-only */ 2352 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2353 return 1; 2354 return vmx_set_vmx_msr(vcpu, msr_index, data); 2355 case MSR_IA32_RTIT_CTL: 2356 if (!vmx_pt_mode_is_host_guest() || 2357 vmx_rtit_ctl_check(vcpu, data) || 2358 vmx->nested.vmxon) 2359 return 1; 2360 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2361 vmx->pt_desc.guest.ctl = data; 2362 pt_update_intercept_for_msr(vcpu); 2363 break; 2364 case MSR_IA32_RTIT_STATUS: 2365 if (!pt_can_write_msr(vmx)) 2366 return 1; 2367 if (data & MSR_IA32_RTIT_STATUS_MASK) 2368 return 1; 2369 vmx->pt_desc.guest.status = data; 2370 break; 2371 case MSR_IA32_RTIT_CR3_MATCH: 2372 if (!pt_can_write_msr(vmx)) 2373 return 1; 2374 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2375 PT_CAP_cr3_filtering)) 2376 return 1; 2377 vmx->pt_desc.guest.cr3_match = data; 2378 break; 2379 case MSR_IA32_RTIT_OUTPUT_BASE: 2380 if (!pt_can_write_msr(vmx)) 2381 return 1; 2382 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2383 PT_CAP_topa_output) && 2384 !intel_pt_validate_cap(vmx->pt_desc.caps, 2385 PT_CAP_single_range_output)) 2386 return 1; 2387 if (!pt_output_base_valid(vcpu, data)) 2388 return 1; 2389 vmx->pt_desc.guest.output_base = data; 2390 break; 2391 case MSR_IA32_RTIT_OUTPUT_MASK: 2392 if (!pt_can_write_msr(vmx)) 2393 return 1; 2394 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2395 PT_CAP_topa_output) && 2396 !intel_pt_validate_cap(vmx->pt_desc.caps, 2397 PT_CAP_single_range_output)) 2398 return 1; 2399 vmx->pt_desc.guest.output_mask = data; 2400 break; 2401 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2402 if (!pt_can_write_msr(vmx)) 2403 return 1; 2404 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2405 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2406 return 1; 2407 if (is_noncanonical_msr_address(data, vcpu)) 2408 return 1; 2409 if (index % 2) 2410 vmx->pt_desc.guest.addr_b[index / 2] = data; 2411 else 2412 vmx->pt_desc.guest.addr_a[index / 2] = data; 2413 break; 2414 case MSR_IA32_PERF_CAPABILITIES: 2415 if (data & PMU_CAP_LBR_FMT) { 2416 if ((data & PMU_CAP_LBR_FMT) != 2417 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2418 return 1; 2419 if (!cpuid_model_is_consistent(vcpu)) 2420 return 1; 2421 } 2422 if (data & PERF_CAP_PEBS_FORMAT) { 2423 if ((data & PERF_CAP_PEBS_MASK) != 2424 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2425 return 1; 2426 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2427 return 1; 2428 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2429 return 1; 2430 if (!cpuid_model_is_consistent(vcpu)) 2431 return 1; 2432 } 2433 ret = kvm_set_msr_common(vcpu, msr_info); 2434 break; 2435 2436 default: 2437 find_uret_msr: 2438 msr = vmx_find_uret_msr(vmx, msr_index); 2439 if (msr) 2440 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2441 else 2442 ret = kvm_set_msr_common(vcpu, msr_info); 2443 } 2444 2445 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2446 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2447 vmx_update_fb_clear_dis(vcpu, vmx); 2448 2449 return ret; 2450 } 2451 2452 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2453 { 2454 unsigned long guest_owned_bits; 2455 2456 kvm_register_mark_available(vcpu, reg); 2457 2458 switch (reg) { 2459 case VCPU_REGS_RSP: 2460 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2461 break; 2462 case VCPU_REGS_RIP: 2463 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2464 break; 2465 case VCPU_EXREG_PDPTR: 2466 if 
(enable_ept) 2467 ept_save_pdptrs(vcpu); 2468 break; 2469 case VCPU_EXREG_CR0: 2470 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2471 2472 vcpu->arch.cr0 &= ~guest_owned_bits; 2473 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2474 break; 2475 case VCPU_EXREG_CR3: 2476 /* 2477 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2478 * CR3 is loaded into hardware, not the guest's CR3. 2479 */ 2480 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2481 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2482 break; 2483 case VCPU_EXREG_CR4: 2484 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2485 2486 vcpu->arch.cr4 &= ~guest_owned_bits; 2487 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2488 break; 2489 default: 2490 KVM_BUG_ON(1, vcpu->kvm); 2491 break; 2492 } 2493 } 2494 2495 /* 2496 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2497 * directly instead of going through cpu_has(), to ensure KVM is trapping 2498 * ENCLS whenever it's supported in hardware. It does not matter whether 2499 * the host OS supports or has enabled SGX. 2500 */ 2501 static bool cpu_has_sgx(void) 2502 { 2503 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2504 } 2505 2506 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2507 { 2508 u32 vmx_msr_low, vmx_msr_high; 2509 u32 ctl = ctl_min | ctl_opt; 2510 2511 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2512 2513 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2514 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2515 2516 /* Ensure minimum (required) set of control bits are supported. */ 2517 if (ctl_min & ~ctl) 2518 return -EIO; 2519 2520 *result = ctl; 2521 return 0; 2522 } 2523 2524 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2525 { 2526 u64 allowed; 2527 2528 rdmsrq(msr, allowed); 2529 2530 return ctl_opt & allowed; 2531 } 2532 2533 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ 2534 ({ \ 2535 int i, r = 0; \ 2536 \ 2537 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ 2538 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ 2539 \ 2540 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ 2541 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ 2542 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ 2543 \ 2544 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ 2545 continue; \ 2546 \ 2547 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ 2548 "entry = %llx (%llx), exit = %llx (%llx)\n", \ 2549 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ 2550 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ 2551 \ 2552 if (error_on_inconsistent_vmcs_config) \ 2553 r = -EIO; \ 2554 \ 2555 entry_controls &= ~n_ctrl; \ 2556 exit_controls &= ~x_ctrl; \ 2557 } \ 2558 r; \ 2559 }) 2560 2561 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2562 struct vmx_capability *vmx_cap) 2563 { 2564 u32 _pin_based_exec_control = 0; 2565 u32 _cpu_based_exec_control = 0; 2566 u32 _cpu_based_2nd_exec_control = 0; 2567 u64 _cpu_based_3rd_exec_control = 0; 2568 u32 _vmexit_control = 0; 2569 u32 _vmentry_control = 0; 2570 u64 basic_msr; 2571 u64 misc_msr; 2572 2573 /* 2574 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2575 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2576 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
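 *
 * Worked example of how the table below is consumed: if the adjusted
 * controls ended up with VM_ENTRY_LOAD_IA32_PAT set but
 * VM_EXIT_LOAD_IA32_PAT clear, vmx_check_entry_exit_pairs() warns and
 * either fails setup with -EIO (if error_on_inconsistent_vmcs_config)
 * or clears both controls, so KVM never runs with a lopsided save/load
 * configuration.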
2577 */ 2578 struct { 2579 u32 entry_control; 2580 u32 exit_control; 2581 } const vmcs_entry_exit_pairs[] = { 2582 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2583 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2584 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2585 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2586 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2587 }; 2588 2589 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2590 2591 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2592 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2593 MSR_IA32_VMX_PROCBASED_CTLS, 2594 &_cpu_based_exec_control)) 2595 return -EIO; 2596 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2597 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2598 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2599 MSR_IA32_VMX_PROCBASED_CTLS2, 2600 &_cpu_based_2nd_exec_control)) 2601 return -EIO; 2602 } 2603 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2604 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2605 2606 #ifndef CONFIG_X86_64 2607 if (!(_cpu_based_2nd_exec_control & 2608 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2609 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2610 #endif 2611 2612 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2613 _cpu_based_2nd_exec_control &= ~( 2614 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2615 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2616 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2617 2618 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2619 &vmx_cap->ept, &vmx_cap->vpid); 2620 2621 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2622 vmx_cap->ept) { 2623 pr_warn_once("EPT CAP should not exist if not support " 2624 "1-setting enable EPT VM-execution control\n"); 2625 2626 if (error_on_inconsistent_vmcs_config) 2627 return -EIO; 2628 2629 vmx_cap->ept = 0; 2630 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2631 } 2632 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2633 vmx_cap->vpid) { 2634 pr_warn_once("VPID CAP should not exist if not support " 2635 "1-setting enable VPID VM-execution control\n"); 2636 2637 if (error_on_inconsistent_vmcs_config) 2638 return -EIO; 2639 2640 vmx_cap->vpid = 0; 2641 } 2642 2643 if (!cpu_has_sgx()) 2644 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2645 2646 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2647 _cpu_based_3rd_exec_control = 2648 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2649 MSR_IA32_VMX_PROCBASED_CTLS3); 2650 2651 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2652 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2653 MSR_IA32_VMX_EXIT_CTLS, 2654 &_vmexit_control)) 2655 return -EIO; 2656 2657 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2658 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2659 MSR_IA32_VMX_PINBASED_CTLS, 2660 &_pin_based_exec_control)) 2661 return -EIO; 2662 2663 if (cpu_has_broken_vmx_preemption_timer()) 2664 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2665 if (!(_cpu_based_2nd_exec_control & 2666 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2667 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2668 2669 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2670 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2671 MSR_IA32_VMX_ENTRY_CTLS, 2672 &_vmentry_control)) 2673 return -EIO; 2674 2675 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, 2676 _vmentry_control, 
_vmexit_control)) 2677 return -EIO; 2678 2679 /* 2680 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2681 * can't be used due to an errata where VM Exit may incorrectly clear 2682 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2683 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2684 */ 2685 switch (boot_cpu_data.x86_vfm) { 2686 case INTEL_NEHALEM_EP: /* AAK155 */ 2687 case INTEL_NEHALEM: /* AAP115 */ 2688 case INTEL_WESTMERE: /* AAT100 */ 2689 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2690 case INTEL_NEHALEM_EX: /* BA97 */ 2691 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2692 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2693 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2694 "does not work properly. Using workaround\n"); 2695 break; 2696 default: 2697 break; 2698 } 2699 2700 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 2701 2702 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2703 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2704 return -EIO; 2705 2706 #ifdef CONFIG_X86_64 2707 /* 2708 * KVM expects to be able to shove all legal physical addresses into 2709 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2710 * 0 for processors that support Intel 64 architecture". 2711 */ 2712 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2713 return -EIO; 2714 #endif 2715 2716 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2717 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2718 return -EIO; 2719 2720 rdmsrq(MSR_IA32_VMX_MISC, misc_msr); 2721 2722 vmcs_conf->basic = basic_msr; 2723 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2724 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2725 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2726 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2727 vmcs_conf->vmexit_ctrl = _vmexit_control; 2728 vmcs_conf->vmentry_ctrl = _vmentry_control; 2729 vmcs_conf->misc = misc_msr; 2730 2731 #if IS_ENABLED(CONFIG_HYPERV) 2732 if (enlightened_vmcs) 2733 evmcs_sanitize_exec_ctrls(vmcs_conf); 2734 #endif 2735 2736 return 0; 2737 } 2738 2739 static bool __kvm_is_vmx_supported(void) 2740 { 2741 int cpu = smp_processor_id(); 2742 2743 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2744 pr_err("VMX not supported by CPU %d\n", cpu); 2745 return false; 2746 } 2747 2748 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2749 !this_cpu_has(X86_FEATURE_VMX)) { 2750 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2751 return false; 2752 } 2753 2754 return true; 2755 } 2756 2757 static bool kvm_is_vmx_supported(void) 2758 { 2759 bool supported; 2760 2761 migrate_disable(); 2762 supported = __kvm_is_vmx_supported(); 2763 migrate_enable(); 2764 2765 return supported; 2766 } 2767 2768 int vmx_check_processor_compat(void) 2769 { 2770 int cpu = raw_smp_processor_id(); 2771 struct vmcs_config vmcs_conf; 2772 struct vmx_capability vmx_cap; 2773 2774 if (!__kvm_is_vmx_supported()) 2775 return -EIO; 2776 2777 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2778 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2779 return -EIO; 2780 } 2781 if (nested) 2782 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2783 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2784 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2785 return -EIO; 2786 } 2787 return 0; 2788 } 2789 2790 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2791 { 2792 u64 msr; 2793 2794 
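	/*
	 * Sequence sketch for what follows: set CR4.VMXE (a hard prerequisite
	 * for VMXON), then execute VMXON with the physical address of this
	 * CPU's 4 KiB-aligned VMXON region as the operand.  If VMXON faults,
	 * e.g. because MSR_IA32_FEAT_CTL is locked with VMX disabled, the
	 * exception fixup branches to the "fault" label below, which warns,
	 * clears CR4.VMXE again and returns -EFAULT.
	 */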
cr4_set_bits(X86_CR4_VMXE); 2795 2796 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2797 _ASM_EXTABLE(1b, %l[fault]) 2798 : : [vmxon_pointer] "m"(vmxon_pointer) 2799 : : fault); 2800 return 0; 2801 2802 fault: 2803 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2804 rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 2805 cr4_clear_bits(X86_CR4_VMXE); 2806 2807 return -EFAULT; 2808 } 2809 2810 int vmx_enable_virtualization_cpu(void) 2811 { 2812 int cpu = raw_smp_processor_id(); 2813 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2814 int r; 2815 2816 if (cr4_read_shadow() & X86_CR4_VMXE) 2817 return -EBUSY; 2818 2819 /* 2820 * This can happen if we hot-added a CPU but failed to allocate 2821 * VP assist page for it. 2822 */ 2823 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2824 return -EFAULT; 2825 2826 intel_pt_handle_vmx(1); 2827 2828 r = kvm_cpu_vmxon(phys_addr); 2829 if (r) { 2830 intel_pt_handle_vmx(0); 2831 return r; 2832 } 2833 2834 return 0; 2835 } 2836 2837 static void vmclear_local_loaded_vmcss(void) 2838 { 2839 int cpu = raw_smp_processor_id(); 2840 struct loaded_vmcs *v, *n; 2841 2842 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2843 loaded_vmcss_on_cpu_link) 2844 __loaded_vmcs_clear(v); 2845 } 2846 2847 void vmx_disable_virtualization_cpu(void) 2848 { 2849 vmclear_local_loaded_vmcss(); 2850 2851 if (kvm_cpu_vmxoff()) 2852 kvm_spurious_fault(); 2853 2854 hv_reset_evmcs(); 2855 2856 intel_pt_handle_vmx(0); 2857 } 2858 2859 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2860 { 2861 int node = cpu_to_node(cpu); 2862 struct page *pages; 2863 struct vmcs *vmcs; 2864 2865 pages = __alloc_pages_node(node, flags, 0); 2866 if (!pages) 2867 return NULL; 2868 vmcs = page_address(pages); 2869 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2870 2871 /* KVM supports Enlightened VMCS v1 only */ 2872 if (kvm_is_using_evmcs()) 2873 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2874 else 2875 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2876 2877 if (shadow) 2878 vmcs->hdr.shadow_vmcs = 1; 2879 return vmcs; 2880 } 2881 2882 void free_vmcs(struct vmcs *vmcs) 2883 { 2884 free_page((unsigned long)vmcs); 2885 } 2886 2887 /* 2888 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2889 */ 2890 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2891 { 2892 if (!loaded_vmcs->vmcs) 2893 return; 2894 loaded_vmcs_clear(loaded_vmcs); 2895 free_vmcs(loaded_vmcs->vmcs); 2896 loaded_vmcs->vmcs = NULL; 2897 if (loaded_vmcs->msr_bitmap) 2898 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2899 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2900 } 2901 2902 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2903 { 2904 loaded_vmcs->vmcs = alloc_vmcs(false); 2905 if (!loaded_vmcs->vmcs) 2906 return -ENOMEM; 2907 2908 vmcs_clear(loaded_vmcs->vmcs); 2909 2910 loaded_vmcs->shadow_vmcs = NULL; 2911 loaded_vmcs->hv_timer_soft_disabled = false; 2912 loaded_vmcs->cpu = -1; 2913 loaded_vmcs->launched = 0; 2914 2915 if (cpu_has_vmx_msr_bitmap()) { 2916 loaded_vmcs->msr_bitmap = (unsigned long *) 2917 __get_free_page(GFP_KERNEL_ACCOUNT); 2918 if (!loaded_vmcs->msr_bitmap) 2919 goto out_vmcs; 2920 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2921 } 2922 2923 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2924 memset(&loaded_vmcs->controls_shadow, 0, 2925 sizeof(struct vmcs_controls_shadow)); 2926 2927 return 0; 2928 2929 out_vmcs: 2930 
free_loaded_vmcs(loaded_vmcs); 2931 return -ENOMEM; 2932 } 2933 2934 static void free_kvm_area(void) 2935 { 2936 int cpu; 2937 2938 for_each_possible_cpu(cpu) { 2939 free_vmcs(per_cpu(vmxarea, cpu)); 2940 per_cpu(vmxarea, cpu) = NULL; 2941 } 2942 } 2943 2944 static __init int alloc_kvm_area(void) 2945 { 2946 int cpu; 2947 2948 for_each_possible_cpu(cpu) { 2949 struct vmcs *vmcs; 2950 2951 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2952 if (!vmcs) { 2953 free_kvm_area(); 2954 return -ENOMEM; 2955 } 2956 2957 /* 2958 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2959 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2960 * revision_id reported by MSR_IA32_VMX_BASIC. 2961 * 2962 * However, even though not explicitly documented by 2963 * TLFS, VMXArea passed as VMXON argument should 2964 * still be marked with revision_id reported by 2965 * physical CPU. 2966 */ 2967 if (kvm_is_using_evmcs()) 2968 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2969 2970 per_cpu(vmxarea, cpu) = vmcs; 2971 } 2972 return 0; 2973 } 2974 2975 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 2976 struct kvm_segment *save) 2977 { 2978 if (!emulate_invalid_guest_state) { 2979 /* 2980 * CS and SS RPL should be equal during guest entry according 2981 * to VMX spec, but in reality it is not always so. Since vcpu 2982 * is in the middle of the transition from real mode to 2983 * protected mode it is safe to assume that RPL 0 is a good 2984 * default value. 2985 */ 2986 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 2987 save->selector &= ~SEGMENT_RPL_MASK; 2988 save->dpl = save->selector & SEGMENT_RPL_MASK; 2989 save->s = 1; 2990 } 2991 __vmx_set_segment(vcpu, save, seg); 2992 } 2993 2994 static void enter_pmode(struct kvm_vcpu *vcpu) 2995 { 2996 unsigned long flags; 2997 struct vcpu_vmx *vmx = to_vmx(vcpu); 2998 2999 /* 3000 * Update real mode segment cache. It may be not up-to-date if segment 3001 * register was written while vcpu was in a guest mode. 
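 * The cached values are written back to the VMCS at the end of this
 * function via fix_pmode_seg(), which (when invalid guest state is not
 * being emulated) strips the RPL from CS/SS and forces DPL to 0 so the
 * VMX guest-state checks pass on the next VM-Entry.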
3002 */ 3003 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3004 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3005 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3006 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3007 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3008 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3009 3010 vmx->rmode.vm86_active = 0; 3011 3012 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3013 3014 flags = vmcs_readl(GUEST_RFLAGS); 3015 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3016 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3017 vmcs_writel(GUEST_RFLAGS, flags); 3018 3019 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3020 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3021 3022 vmx_update_exception_bitmap(vcpu); 3023 3024 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3025 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3026 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3027 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3028 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3029 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3030 } 3031 3032 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3033 { 3034 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3035 struct kvm_segment var = *save; 3036 3037 var.dpl = 0x3; 3038 if (seg == VCPU_SREG_CS) 3039 var.type = 0x3; 3040 3041 if (!emulate_invalid_guest_state) { 3042 var.selector = var.base >> 4; 3043 var.base = var.base & 0xffff0; 3044 var.limit = 0xffff; 3045 var.g = 0; 3046 var.db = 0; 3047 var.present = 1; 3048 var.s = 1; 3049 var.l = 0; 3050 var.unusable = 0; 3051 var.type = 0x3; 3052 var.avl = 0; 3053 if (save->base & 0xf) 3054 pr_warn_once("segment base is not paragraph aligned " 3055 "when entering protected mode (seg=%d)", seg); 3056 } 3057 3058 vmcs_write16(sf->selector, var.selector); 3059 vmcs_writel(sf->base, var.base); 3060 vmcs_write32(sf->limit, var.limit); 3061 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3062 } 3063 3064 static void enter_rmode(struct kvm_vcpu *vcpu) 3065 { 3066 unsigned long flags; 3067 struct vcpu_vmx *vmx = to_vmx(vcpu); 3068 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3069 3070 /* 3071 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3072 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3073 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3074 * should VM-Fail and KVM should reject userspace attempts to stuff 3075 * CR0.PG=0 when L2 is active. 
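 * (The "reject userspace attempts" half of that contract is anchored in
 * vmx_is_valid_cr0() further down in this file, which defers to
 * nested_guest_cr0_valid() while L2 is active.)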
3076 */ 3077 WARN_ON_ONCE(is_guest_mode(vcpu)); 3078 3079 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3080 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3081 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3082 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3083 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3084 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3085 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3086 3087 vmx->rmode.vm86_active = 1; 3088 3089 vmx_segment_cache_clear(vmx); 3090 3091 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3092 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3093 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3094 3095 flags = vmcs_readl(GUEST_RFLAGS); 3096 vmx->rmode.save_rflags = flags; 3097 3098 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3099 3100 vmcs_writel(GUEST_RFLAGS, flags); 3101 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3102 vmx_update_exception_bitmap(vcpu); 3103 3104 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3105 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3106 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3107 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3108 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3109 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3110 } 3111 3112 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3113 { 3114 struct vcpu_vmx *vmx = to_vmx(vcpu); 3115 3116 /* Nothing to do if hardware doesn't support EFER. */ 3117 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3118 return 0; 3119 3120 vcpu->arch.efer = efer; 3121 #ifdef CONFIG_X86_64 3122 if (efer & EFER_LMA) 3123 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3124 else 3125 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3126 #else 3127 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3128 return 1; 3129 #endif 3130 3131 vmx_setup_uret_msrs(vmx); 3132 return 0; 3133 } 3134 3135 #ifdef CONFIG_X86_64 3136 3137 static void enter_lmode(struct kvm_vcpu *vcpu) 3138 { 3139 u32 guest_tr_ar; 3140 3141 vmx_segment_cache_clear(to_vmx(vcpu)); 3142 3143 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3144 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3145 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3146 __func__); 3147 vmcs_write32(GUEST_TR_AR_BYTES, 3148 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3149 | VMX_AR_TYPE_BUSY_64_TSS); 3150 } 3151 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3152 } 3153 3154 static void exit_lmode(struct kvm_vcpu *vcpu) 3155 { 3156 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3157 } 3158 3159 #endif 3160 3161 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3162 { 3163 struct vcpu_vmx *vmx = to_vmx(vcpu); 3164 3165 /* 3166 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3167 * the CPU is not required to invalidate guest-physical mappings on 3168 * VM-Entry, even if VPID is disabled. Guest-physical mappings are 3169 * associated with the root EPT structure and not any particular VPID 3170 * (INVVPID also isn't required to invalidate guest-physical mappings). 
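 * In terms of the instructions used below: ept_sync_global() issues a
 * global (all-context) INVEPT covering every EPT root, while the
 * VPID-only fallback prefers a global INVVPID and otherwise issues
 * single-context INVVPID for both the L1 vpid and the nested vpid02.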
3171 */ 3172 if (enable_ept) { 3173 ept_sync_global(); 3174 } else if (enable_vpid) { 3175 if (cpu_has_vmx_invvpid_global()) { 3176 vpid_sync_vcpu_global(); 3177 } else { 3178 vpid_sync_vcpu_single(vmx->vpid); 3179 vpid_sync_vcpu_single(vmx->nested.vpid02); 3180 } 3181 } 3182 } 3183 3184 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3185 { 3186 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3187 return nested_get_vpid02(vcpu); 3188 return to_vmx(vcpu)->vpid; 3189 } 3190 3191 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3192 { 3193 struct kvm_mmu *mmu = vcpu->arch.mmu; 3194 u64 root_hpa = mmu->root.hpa; 3195 3196 /* No flush required if the current context is invalid. */ 3197 if (!VALID_PAGE(root_hpa)) 3198 return; 3199 3200 if (enable_ept) 3201 ept_sync_context(construct_eptp(vcpu, root_hpa, 3202 mmu->root_role.level)); 3203 else 3204 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3205 } 3206 3207 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3208 { 3209 /* 3210 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3211 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3212 */ 3213 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3214 } 3215 3216 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3217 { 3218 /* 3219 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3220 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3221 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3222 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3223 * i.e. no explicit INVVPID is necessary. 3224 */ 3225 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3226 } 3227 3228 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3229 { 3230 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3231 3232 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3233 return; 3234 3235 if (is_pae_paging(vcpu)) { 3236 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3237 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3238 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3239 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3240 } 3241 } 3242 3243 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3244 { 3245 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3246 3247 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3248 return; 3249 3250 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3251 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3252 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3253 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3254 3255 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3256 } 3257 3258 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3259 CPU_BASED_CR3_STORE_EXITING) 3260 3261 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3262 { 3263 if (is_guest_mode(vcpu)) 3264 return nested_guest_cr0_valid(vcpu, cr0); 3265 3266 if (to_vmx(vcpu)->nested.vmxon) 3267 return nested_host_cr0_valid(vcpu, cr0); 3268 3269 return true; 3270 } 3271 3272 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3273 { 3274 struct vcpu_vmx *vmx = to_vmx(vcpu); 3275 unsigned long hw_cr0, old_cr0_pg; 3276 u32 tmp; 3277 3278 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3279 3280 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3281 if (enable_unrestricted_guest) 3282 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3283 else { 3284 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3285 if (!enable_ept) 3286 hw_cr0 |= X86_CR0_WP; 3287 3288 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3289 enter_pmode(vcpu); 3290 3291 if 
(!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3292 enter_rmode(vcpu); 3293 } 3294 3295 vmcs_writel(CR0_READ_SHADOW, cr0); 3296 vmcs_writel(GUEST_CR0, hw_cr0); 3297 vcpu->arch.cr0 = cr0; 3298 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3299 3300 #ifdef CONFIG_X86_64 3301 if (vcpu->arch.efer & EFER_LME) { 3302 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3303 enter_lmode(vcpu); 3304 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3305 exit_lmode(vcpu); 3306 } 3307 #endif 3308 3309 if (enable_ept && !enable_unrestricted_guest) { 3310 /* 3311 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3312 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3313 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3314 * KVM's CR3 is installed. 3315 */ 3316 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3317 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3318 3319 /* 3320 * When running with EPT but not unrestricted guest, KVM must 3321 * intercept CR3 accesses when paging is _disabled_. This is 3322 * necessary because restricted guests can't actually run with 3323 * paging disabled, and so KVM stuffs its own CR3 in order to 3324 * run the guest when identity mapped page tables. 3325 * 3326 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3327 * update, it may be stale with respect to CR3 interception, 3328 * e.g. after nested VM-Enter. 3329 * 3330 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3331 * stores to forward them to L1, even if KVM does not need to 3332 * intercept them to preserve its identity mapped page tables. 3333 */ 3334 if (!(cr0 & X86_CR0_PG)) { 3335 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3336 } else if (!is_guest_mode(vcpu)) { 3337 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3338 } else { 3339 tmp = exec_controls_get(vmx); 3340 tmp &= ~CR3_EXITING_BITS; 3341 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3342 exec_controls_set(vmx, tmp); 3343 } 3344 3345 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ 3346 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3347 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3348 3349 /* 3350 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3351 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3352 */ 3353 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3354 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3355 } 3356 3357 /* depends on vcpu->arch.cr0 to be set to a new value */ 3358 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 3359 } 3360 3361 static int vmx_get_max_ept_level(void) 3362 { 3363 if (cpu_has_vmx_ept_5levels()) 3364 return 5; 3365 return 4; 3366 } 3367 3368 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3369 { 3370 u64 eptp = VMX_EPTP_MT_WB; 3371 3372 eptp |= (root_level == 5) ? 
VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3373 3374 if (enable_ept_ad_bits && 3375 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3376 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3377 eptp |= root_hpa; 3378 3379 return eptp; 3380 } 3381 3382 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3383 { 3384 struct kvm *kvm = vcpu->kvm; 3385 bool update_guest_cr3 = true; 3386 unsigned long guest_cr3; 3387 u64 eptp; 3388 3389 if (enable_ept) { 3390 eptp = construct_eptp(vcpu, root_hpa, root_level); 3391 vmcs_write64(EPT_POINTER, eptp); 3392 3393 hv_track_root_tdp(vcpu, root_hpa); 3394 3395 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3396 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3397 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3398 guest_cr3 = vcpu->arch.cr3; 3399 else /* vmcs.GUEST_CR3 is already up-to-date. */ 3400 update_guest_cr3 = false; 3401 vmx_ept_load_pdptrs(vcpu); 3402 } else { 3403 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3404 kvm_get_active_cr3_lam_bits(vcpu); 3405 } 3406 3407 if (update_guest_cr3) 3408 vmcs_writel(GUEST_CR3, guest_cr3); 3409 } 3410 3411 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3412 { 3413 /* 3414 * We operate under the default treatment of SMM, so VMX cannot be 3415 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3416 * i.e. is a reserved bit, is handled by common x86 code. 3417 */ 3418 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3419 return false; 3420 3421 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3422 return false; 3423 3424 return true; 3425 } 3426 3427 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3428 { 3429 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3430 struct vcpu_vmx *vmx = to_vmx(vcpu); 3431 unsigned long hw_cr4; 3432 3433 /* 3434 * Pass through host's Machine Check Enable value to hw_cr4, which 3435 * is in force while we are in guest mode. Do not let guests control 3436 * this bit, even if host CR4.MCE == 0. 3437 */ 3438 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3439 if (enable_unrestricted_guest) 3440 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3441 else if (vmx->rmode.vm86_active) 3442 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3443 else 3444 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3445 3446 if (vmx_umip_emulated()) { 3447 if (cr4 & X86_CR4_UMIP) { 3448 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3449 hw_cr4 &= ~X86_CR4_UMIP; 3450 } else if (!is_guest_mode(vcpu) || 3451 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3452 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3453 } 3454 } 3455 3456 vcpu->arch.cr4 = cr4; 3457 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3458 3459 if (!enable_unrestricted_guest) { 3460 if (enable_ept) { 3461 if (!is_paging(vcpu)) { 3462 hw_cr4 &= ~X86_CR4_PAE; 3463 hw_cr4 |= X86_CR4_PSE; 3464 } else if (!(cr4 & X86_CR4_PAE)) { 3465 hw_cr4 &= ~X86_CR4_PAE; 3466 } 3467 } 3468 3469 /* 3470 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3471 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3472 * to be manually disabled when guest switches to non-paging 3473 * mode. 3474 * 3475 * If !enable_unrestricted_guest, the CPU is always running 3476 * with CR0.PG=1 and CR4 needs to be modified. 3477 * If enable_unrestricted_guest, the CPU automatically 3478 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 
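 *
 * Worked example: a restricted guest clears CR0.PG but leaves
 * CR4.SMEP=1.  Architecturally SMEP has no effect without paging, yet
 * the CPU underneath is still paging through KVM's identity map, so
 * the code below strips SMEP/SMAP/PKE from the value loaded into
 * GUEST_CR4 while CR4_READ_SHADOW keeps the guest-written value, i.e.
 * the guest still reads CR4.SMEP as 1.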
3479 */ 3480 if (!is_paging(vcpu)) 3481 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3482 } 3483 3484 vmcs_writel(CR4_READ_SHADOW, cr4); 3485 vmcs_writel(GUEST_CR4, hw_cr4); 3486 3487 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3488 vcpu->arch.cpuid_dynamic_bits_dirty = true; 3489 } 3490 3491 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3492 { 3493 struct vcpu_vmx *vmx = to_vmx(vcpu); 3494 u32 ar; 3495 3496 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3497 *var = vmx->rmode.segs[seg]; 3498 if (seg == VCPU_SREG_TR 3499 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3500 return; 3501 var->base = vmx_read_guest_seg_base(vmx, seg); 3502 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3503 return; 3504 } 3505 var->base = vmx_read_guest_seg_base(vmx, seg); 3506 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3507 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3508 ar = vmx_read_guest_seg_ar(vmx, seg); 3509 var->unusable = (ar >> 16) & 1; 3510 var->type = ar & 15; 3511 var->s = (ar >> 4) & 1; 3512 var->dpl = (ar >> 5) & 3; 3513 /* 3514 * Some userspaces do not preserve unusable property. Since usable 3515 * segment has to be present according to VMX spec we can use present 3516 * property to amend userspace bug by making unusable segment always 3517 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3518 * segment as unusable. 3519 */ 3520 var->present = !var->unusable; 3521 var->avl = (ar >> 12) & 1; 3522 var->l = (ar >> 13) & 1; 3523 var->db = (ar >> 14) & 1; 3524 var->g = (ar >> 15) & 1; 3525 } 3526 3527 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3528 { 3529 struct kvm_segment s; 3530 3531 if (to_vmx(vcpu)->rmode.vm86_active) { 3532 vmx_get_segment(vcpu, &s, seg); 3533 return s.base; 3534 } 3535 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3536 } 3537 3538 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3539 { 3540 struct vcpu_vmx *vmx = to_vmx(vcpu); 3541 int ar; 3542 3543 if (unlikely(vmx->rmode.vm86_active)) 3544 return 0; 3545 3546 if (no_cache) 3547 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3548 else 3549 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3550 return VMX_AR_DPL(ar); 3551 } 3552 3553 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3554 { 3555 return __vmx_get_cpl(vcpu, false); 3556 } 3557 3558 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3559 { 3560 return __vmx_get_cpl(vcpu, true); 3561 } 3562 3563 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3564 { 3565 u32 ar; 3566 3567 ar = var->type & 15; 3568 ar |= (var->s & 1) << 4; 3569 ar |= (var->dpl & 3) << 5; 3570 ar |= (var->present & 1) << 7; 3571 ar |= (var->avl & 1) << 12; 3572 ar |= (var->l & 1) << 13; 3573 ar |= (var->db & 1) << 14; 3574 ar |= (var->g & 1) << 15; 3575 ar |= (var->unusable || !var->present) << 16; 3576 3577 return ar; 3578 } 3579 3580 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3581 { 3582 struct vcpu_vmx *vmx = to_vmx(vcpu); 3583 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3584 3585 vmx_segment_cache_clear(vmx); 3586 3587 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3588 vmx->rmode.segs[seg] = *var; 3589 if (seg == VCPU_SREG_TR) 3590 vmcs_write16(sf->selector, var->selector); 3591 else if (var->s) 3592 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3593 return; 3594 } 3595 3596 vmcs_writel(sf->base, var->base); 3597 vmcs_write32(sf->limit, var->limit); 3598 vmcs_write16(sf->selector, 
var->selector); 3599 3600 /* 3601 * Fix the "Accessed" bit in AR field of segment registers for older 3602 * qemu binaries. 3603 * IA32 arch specifies that at the time of processor reset the 3604 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3605 * is setting it to 0 in the userland code. This causes invalid guest 3606 * state vmexit when "unrestricted guest" mode is turned on. 3607 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3608 * tree. Newer qemu binaries with that qemu fix would not need this 3609 * kvm hack. 3610 */ 3611 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3612 var->type |= 0x1; /* Accessed */ 3613 3614 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3615 } 3616 3617 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3618 { 3619 __vmx_set_segment(vcpu, var, seg); 3620 3621 to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); 3622 } 3623 3624 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3625 { 3626 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3627 3628 *db = (ar >> 14) & 1; 3629 *l = (ar >> 13) & 1; 3630 } 3631 3632 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3633 { 3634 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3635 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3636 } 3637 3638 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3639 { 3640 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3641 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3642 } 3643 3644 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3645 { 3646 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3647 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3648 } 3649 3650 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3651 { 3652 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3653 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3654 } 3655 3656 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3657 { 3658 struct kvm_segment var; 3659 u32 ar; 3660 3661 vmx_get_segment(vcpu, &var, seg); 3662 var.dpl = 0x3; 3663 if (seg == VCPU_SREG_CS) 3664 var.type = 0x3; 3665 ar = vmx_segment_access_rights(&var); 3666 3667 if (var.base != (var.selector << 4)) 3668 return false; 3669 if (var.limit != 0xffff) 3670 return false; 3671 if (ar != 0xf3) 3672 return false; 3673 3674 return true; 3675 } 3676 3677 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3678 { 3679 struct kvm_segment cs; 3680 unsigned int cs_rpl; 3681 3682 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3683 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3684 3685 if (cs.unusable) 3686 return false; 3687 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3688 return false; 3689 if (!cs.s) 3690 return false; 3691 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3692 if (cs.dpl > cs_rpl) 3693 return false; 3694 } else { 3695 if (cs.dpl != cs_rpl) 3696 return false; 3697 } 3698 if (!cs.present) 3699 return false; 3700 3701 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3702 return true; 3703 } 3704 3705 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3706 { 3707 struct kvm_segment ss; 3708 unsigned int ss_rpl; 3709 3710 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3711 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3712 3713 if (ss.unusable) 3714 return true; 3715 if (ss.type != 3 && ss.type != 7) 3716 return false; 3717 if (!ss.s) 3718 return false; 3719 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3720 return false; 3721 if (!ss.present) 
3722 return false; 3723 3724 return true; 3725 } 3726 3727 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3728 { 3729 struct kvm_segment var; 3730 unsigned int rpl; 3731 3732 vmx_get_segment(vcpu, &var, seg); 3733 rpl = var.selector & SEGMENT_RPL_MASK; 3734 3735 if (var.unusable) 3736 return true; 3737 if (!var.s) 3738 return false; 3739 if (!var.present) 3740 return false; 3741 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3742 if (var.dpl < rpl) /* DPL < RPL */ 3743 return false; 3744 } 3745 3746 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3747 * rights flags 3748 */ 3749 return true; 3750 } 3751 3752 static bool tr_valid(struct kvm_vcpu *vcpu) 3753 { 3754 struct kvm_segment tr; 3755 3756 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3757 3758 if (tr.unusable) 3759 return false; 3760 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3761 return false; 3762 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3763 return false; 3764 if (!tr.present) 3765 return false; 3766 3767 return true; 3768 } 3769 3770 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3771 { 3772 struct kvm_segment ldtr; 3773 3774 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3775 3776 if (ldtr.unusable) 3777 return true; 3778 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3779 return false; 3780 if (ldtr.type != 2) 3781 return false; 3782 if (!ldtr.present) 3783 return false; 3784 3785 return true; 3786 } 3787 3788 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3789 { 3790 struct kvm_segment cs, ss; 3791 3792 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3793 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3794 3795 return ((cs.selector & SEGMENT_RPL_MASK) == 3796 (ss.selector & SEGMENT_RPL_MASK)); 3797 } 3798 3799 /* 3800 * Check if guest state is valid. Returns true if valid, false if 3801 * not. 
3802 * We assume that registers are always usable 3803 */ 3804 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3805 { 3806 /* real mode guest state checks */ 3807 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3808 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3809 return false; 3810 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3811 return false; 3812 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3813 return false; 3814 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3815 return false; 3816 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3817 return false; 3818 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3819 return false; 3820 } else { 3821 /* protected mode guest state checks */ 3822 if (!cs_ss_rpl_check(vcpu)) 3823 return false; 3824 if (!code_segment_valid(vcpu)) 3825 return false; 3826 if (!stack_segment_valid(vcpu)) 3827 return false; 3828 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3829 return false; 3830 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3831 return false; 3832 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3833 return false; 3834 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3835 return false; 3836 if (!tr_valid(vcpu)) 3837 return false; 3838 if (!ldtr_valid(vcpu)) 3839 return false; 3840 } 3841 /* TODO: 3842 * - Add checks on RIP 3843 * - Add checks on RFLAGS 3844 */ 3845 3846 return true; 3847 } 3848 3849 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3850 { 3851 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3852 u16 data; 3853 int i; 3854 3855 for (i = 0; i < 3; i++) { 3856 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3857 return -EFAULT; 3858 } 3859 3860 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3861 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3862 return -EFAULT; 3863 3864 data = ~0; 3865 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3866 return -EFAULT; 3867 3868 return 0; 3869 } 3870 3871 static int init_rmode_identity_map(struct kvm *kvm) 3872 { 3873 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3874 int i, r = 0; 3875 void __user *uaddr; 3876 u32 tmp; 3877 3878 /* Protect kvm_vmx->ept_identity_pagetable_done. 
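 * The flag and the __x86_set_memory_region() call below are serialized by
 * slots_lock, so the identity-map table is initialized at most once per VM.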
*/ 3879 mutex_lock(&kvm->slots_lock); 3880 3881 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3882 goto out; 3883 3884 if (!kvm_vmx->ept_identity_map_addr) 3885 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3886 3887 uaddr = __x86_set_memory_region(kvm, 3888 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3889 kvm_vmx->ept_identity_map_addr, 3890 PAGE_SIZE); 3891 if (IS_ERR(uaddr)) { 3892 r = PTR_ERR(uaddr); 3893 goto out; 3894 } 3895 3896 /* Set up identity-mapping pagetable for EPT in real mode */ 3897 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3898 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3899 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3900 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3901 r = -EFAULT; 3902 goto out; 3903 } 3904 } 3905 kvm_vmx->ept_identity_pagetable_done = true; 3906 3907 out: 3908 mutex_unlock(&kvm->slots_lock); 3909 return r; 3910 } 3911 3912 static void seg_setup(int seg) 3913 { 3914 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3915 unsigned int ar; 3916 3917 vmcs_write16(sf->selector, 0); 3918 vmcs_writel(sf->base, 0); 3919 vmcs_write32(sf->limit, 0xffff); 3920 ar = 0x93; 3921 if (seg == VCPU_SREG_CS) 3922 ar |= 0x08; /* code segment */ 3923 3924 vmcs_write32(sf->ar_bytes, ar); 3925 } 3926 3927 int allocate_vpid(void) 3928 { 3929 int vpid; 3930 3931 if (!enable_vpid) 3932 return 0; 3933 spin_lock(&vmx_vpid_lock); 3934 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3935 if (vpid < VMX_NR_VPIDS) 3936 __set_bit(vpid, vmx_vpid_bitmap); 3937 else 3938 vpid = 0; 3939 spin_unlock(&vmx_vpid_lock); 3940 return vpid; 3941 } 3942 3943 void free_vpid(int vpid) 3944 { 3945 if (!enable_vpid || vpid == 0) 3946 return; 3947 spin_lock(&vmx_vpid_lock); 3948 __clear_bit(vpid, vmx_vpid_bitmap); 3949 spin_unlock(&vmx_vpid_lock); 3950 } 3951 3952 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 3953 { 3954 /* 3955 * When KVM is a nested hypervisor on top of Hyper-V and uses 3956 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 3957 * bitmap has changed. 3958 */ 3959 if (kvm_is_using_evmcs()) { 3960 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 3961 3962 if (evmcs->hv_enlightenments_control.msr_bitmap) 3963 evmcs->hv_clean_fields &= 3964 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 3965 } 3966 3967 vmx->nested.force_msr_bitmap_recalc = true; 3968 } 3969 3970 void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) 3971 { 3972 struct vcpu_vmx *vmx = to_vmx(vcpu); 3973 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3974 3975 if (!cpu_has_vmx_msr_bitmap()) 3976 return; 3977 3978 vmx_msr_bitmap_l01_changed(vmx); 3979 3980 if (type & MSR_TYPE_R) { 3981 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) 3982 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 3983 else 3984 vmx_set_msr_bitmap_read(msr_bitmap, msr); 3985 } 3986 3987 if (type & MSR_TYPE_W) { 3988 if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) 3989 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 3990 else 3991 vmx_set_msr_bitmap_write(msr_bitmap, msr); 3992 } 3993 } 3994 3995 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 3996 { 3997 /* 3998 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 3999 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4000 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 
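 * In the 4K MSR bitmap page, the write bitmap for low MSRs starts 0x800
 * bytes after the read bitmap, hence write_idx below is read_idx plus
 * 0x800 bytes' worth of u64 entries.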
4001 */ 4002 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4003 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4004 struct vcpu_vmx *vmx = to_vmx(vcpu); 4005 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4006 u8 mode; 4007 4008 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4009 return; 4010 4011 if (cpu_has_secondary_exec_ctrls() && 4012 (secondary_exec_controls_get(vmx) & 4013 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4014 mode = MSR_BITMAP_MODE_X2APIC; 4015 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4016 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4017 } else { 4018 mode = 0; 4019 } 4020 4021 if (mode == vmx->x2apic_msr_bitmap_mode) 4022 return; 4023 4024 vmx->x2apic_msr_bitmap_mode = mode; 4025 4026 /* 4027 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4028 * registers (0x840 and above) intercepted, KVM doesn't support them. 4029 * Intercept all writes by default and poke holes as needed. Pass 4030 * through reads for all valid registers by default in x2APIC+APICv 4031 * mode, only the current timer count needs on-demand emulation by KVM. 4032 */ 4033 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4034 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4035 else 4036 msr_bitmap[read_idx] = ~0ull; 4037 msr_bitmap[write_idx] = ~0ull; 4038 4039 /* 4040 * TPR reads and writes can be virtualized even if virtual interrupt 4041 * delivery is not in use. 4042 */ 4043 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4044 !(mode & MSR_BITMAP_MODE_X2APIC)); 4045 4046 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4047 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4048 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4049 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4050 if (enable_ipiv) 4051 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4052 } 4053 } 4054 4055 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4056 { 4057 struct vcpu_vmx *vmx = to_vmx(vcpu); 4058 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4059 u32 i; 4060 4061 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4062 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4063 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4064 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4065 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4066 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4067 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4068 } 4069 } 4070 4071 void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4072 { 4073 if (!cpu_has_vmx_msr_bitmap()) 4074 return; 4075 4076 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 4077 #ifdef CONFIG_X86_64 4078 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 4079 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 4080 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 4081 #endif 4082 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 4083 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 4084 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 4085 if (kvm_cstate_in_guest(vcpu->kvm)) { 4086 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 4087 
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
	}
	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
	}

	/* PT MSRs can be passed through iff PT is exposed to the guest. */
	if (vmx_pt_mode_is_host_guest())
		pt_update_intercept_for_msr(vcpu);

	if (vcpu->arch.xfd_no_write_intercept)
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);

	vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
				  !to_vmx(vcpu)->spec_ctrl);

	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));

	if (cpu_feature_enabled(X86_FEATURE_IBPB))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
					  !guest_has_pred_cmd_msr(vcpu));

	if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

	/*
	 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
	 * filtered by userspace.
	 */
}

static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
					       int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
	 * and freed, and must not be accessed outside of vcpu->mutex.  The
	 * vCPU's cached PI NV is valid if and only if posted interrupts are
	 * enabled in its vmcs12, i.e. checking the vector also checks that
	 * L1 has enabled posted interrupts for L2.
	 */
	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/*
		 * If the posted interrupt is not recognized by hardware,
		 * it will be delivered at the next VM-Entry.
		 */
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);

		/*
		 * This pairs with the smp_mb_*() after setting vcpu->mode in
		 * vcpu_enter_guest() to guarantee the vCPU sees the event
		 * request if triggering a posted interrupt "fails" because
		 * vcpu->mode != IN_GUEST_MODE.  The extra barrier is needed as
		 * the smp_wmb() in kvm_make_request() only ensures everything
		 * done before making the request is visible when the request
		 * is visible, it doesn't ensure ordering between the store to
		 * vcpu->requests and the load from vcpu->mode.
		 */
		smp_mb__after_atomic();

		/* The PIR and ON have been set by L1. */
		kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
		return 0;
	}
	return -1;
}
/*
 * Send an interrupt to the vCPU via posted interrupt:
 * 1. If the target vCPU is running (non-root mode), send a posted interrupt
 *    notification and hardware will sync PIR to vIRR atomically.
 * 2. If the target vCPU isn't running (root mode), kick it to pick up the
 *    interrupt from PIR on the next VM-Entry.
4169 */ 4170 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4171 { 4172 struct vcpu_vt *vt = to_vt(vcpu); 4173 int r; 4174 4175 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4176 if (!r) 4177 return 0; 4178 4179 /* Note, this is called iff the local APIC is in-kernel. */ 4180 if (!vcpu->arch.apic->apicv_active) 4181 return -1; 4182 4183 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); 4184 return 0; 4185 } 4186 4187 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4188 int trig_mode, int vector) 4189 { 4190 struct kvm_vcpu *vcpu = apic->vcpu; 4191 4192 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4193 kvm_lapic_set_irr(vector, apic); 4194 kvm_make_request(KVM_REQ_EVENT, vcpu); 4195 kvm_vcpu_kick(vcpu); 4196 } else { 4197 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4198 trig_mode, vector); 4199 } 4200 } 4201 4202 /* 4203 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4204 * will not change in the lifetime of the guest. 4205 * Note that host-state that does change is set elsewhere. E.g., host-state 4206 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4207 */ 4208 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4209 { 4210 u32 low32, high32; 4211 unsigned long tmpl; 4212 unsigned long cr0, cr3, cr4; 4213 4214 cr0 = read_cr0(); 4215 WARN_ON(cr0 & X86_CR0_TS); 4216 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4217 4218 /* 4219 * Save the most likely value for this task's CR3 in the VMCS. 4220 * We can't use __get_current_cr3_fast() because we're not atomic. 4221 */ 4222 cr3 = __read_cr3(); 4223 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4224 vmx->loaded_vmcs->host_state.cr3 = cr3; 4225 4226 /* Save the most likely value for this task's CR4 in the VMCS. */ 4227 cr4 = cr4_read_shadow(); 4228 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4229 vmx->loaded_vmcs->host_state.cr4 = cr4; 4230 4231 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4232 #ifdef CONFIG_X86_64 4233 /* 4234 * Load null selectors, so we can avoid reloading them in 4235 * vmx_prepare_switch_to_host(), in case userspace uses 4236 * the null selectors too (the expected case). 4237 */ 4238 vmcs_write16(HOST_DS_SELECTOR, 0); 4239 vmcs_write16(HOST_ES_SELECTOR, 0); 4240 #else 4241 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4242 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4243 #endif 4244 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4245 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4246 4247 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4248 4249 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4250 4251 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4252 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4253 4254 /* 4255 * SYSENTER is used for 32-bit system calls on either 32-bit or 4256 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4257 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4258 * have already done so!). 
4259 */ 4260 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4261 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4262 4263 rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); 4264 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4265 4266 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4267 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4268 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4269 } 4270 4271 if (cpu_has_load_ia32_efer()) 4272 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4273 } 4274 4275 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4276 { 4277 struct kvm_vcpu *vcpu = &vmx->vcpu; 4278 4279 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4280 ~vcpu->arch.cr4_guest_rsvd_bits; 4281 if (!enable_ept) { 4282 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4283 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4284 } 4285 if (is_guest_mode(&vmx->vcpu)) 4286 vcpu->arch.cr4_guest_owned_bits &= 4287 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4288 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4289 } 4290 4291 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4292 { 4293 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4294 4295 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4296 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4297 4298 if (!enable_vnmi) 4299 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4300 4301 if (!enable_preemption_timer) 4302 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4303 4304 return pin_based_exec_ctrl; 4305 } 4306 4307 static u32 vmx_vmentry_ctrl(void) 4308 { 4309 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4310 4311 if (vmx_pt_mode_is_system()) 4312 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4313 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4314 /* 4315 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 4316 */ 4317 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4318 VM_ENTRY_LOAD_IA32_EFER | 4319 VM_ENTRY_IA32E_MODE); 4320 4321 return vmentry_ctrl; 4322 } 4323 4324 static u32 vmx_vmexit_ctrl(void) 4325 { 4326 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4327 4328 /* 4329 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4330 * nested virtualization and thus allowed to be set in vmcs12. 
4331 */ 4332 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4333 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4334 4335 if (vmx_pt_mode_is_system()) 4336 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4337 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4338 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4339 return vmexit_ctrl & 4340 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4341 } 4342 4343 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4344 { 4345 struct vcpu_vmx *vmx = to_vmx(vcpu); 4346 4347 if (is_guest_mode(vcpu)) { 4348 vmx->nested.update_vmcs01_apicv_status = true; 4349 return; 4350 } 4351 4352 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4353 4354 if (kvm_vcpu_apicv_active(vcpu)) { 4355 secondary_exec_controls_setbit(vmx, 4356 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4357 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4358 if (enable_ipiv) 4359 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4360 } else { 4361 secondary_exec_controls_clearbit(vmx, 4362 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4363 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4364 if (enable_ipiv) 4365 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4366 } 4367 4368 vmx_update_msr_bitmap_x2apic(vcpu); 4369 } 4370 4371 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4372 { 4373 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4374 4375 /* 4376 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4377 * vmcs12 and propagated to vmcs02 when set in vmcs12. 4378 */ 4379 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4380 CPU_BASED_USE_IO_BITMAPS | 4381 CPU_BASED_MONITOR_TRAP_FLAG | 4382 CPU_BASED_PAUSE_EXITING); 4383 4384 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4385 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4386 CPU_BASED_NMI_WINDOW_EXITING); 4387 4388 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4389 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4390 4391 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4392 exec_control &= ~CPU_BASED_TPR_SHADOW; 4393 4394 #ifdef CONFIG_X86_64 4395 if (exec_control & CPU_BASED_TPR_SHADOW) 4396 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4397 CPU_BASED_CR8_STORE_EXITING); 4398 else 4399 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4400 CPU_BASED_CR8_LOAD_EXITING; 4401 #endif 4402 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4403 if (enable_ept) 4404 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4405 CPU_BASED_CR3_STORE_EXITING | 4406 CPU_BASED_INVLPG_EXITING); 4407 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4408 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4409 CPU_BASED_MONITOR_EXITING); 4410 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4411 exec_control &= ~CPU_BASED_HLT_EXITING; 4412 return exec_control; 4413 } 4414 4415 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4416 { 4417 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4418 4419 /* 4420 * IPI virtualization relies on APICv. Disable IPI virtualization if 4421 * APICv is inhibited. 4422 */ 4423 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4424 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4425 4426 return exec_control; 4427 } 4428 4429 /* 4430 * Adjust a single secondary execution control bit to intercept/allow an 4431 * instruction in the guest. This is usually done based on whether or not a 4432 * feature has been exposed to the guest in order to correctly emulate faults. 
 */
static inline void
vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
				  u32 control, bool enabled, bool exiting)
{
	/*
	 * If the control is for an opt-in feature, clear the control if the
	 * feature is not exposed to the guest, i.e. not enabled.  If the
	 * control is opt-out, i.e. an exiting control, clear the control if
	 * the feature _is_ exposed to the guest, i.e. exiting/interception is
	 * disabled for the associated instruction.  Note, the caller is
	 * responsible for presetting exec_control to set all supported bits.
	 */
	if (enabled == exiting)
		*exec_control &= ~control;

	/*
	 * Update the nested MSR settings so that a nested VMM can/can't set
	 * controls for features that are/aren't exposed to the guest.
	 */
	if (nested &&
	    kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
		/*
		 * All features that can be added or removed to VMX MSRs must
		 * be supported in the first place for nested virtualization.
		 */
		if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
			enabled = false;

		if (enabled)
			vmx->nested.msrs.secondary_ctls_high |= control;
		else
			vmx->nested.msrs.secondary_ctls_high &= ~control;
	}
}

/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
({											\
	struct kvm_vcpu *__vcpu = &(vmx)->vcpu;						\
	bool __enabled;									\
											\
	if (cpu_has_vmx_##name()) {							\
		__enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name);		\
		vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
						  __enabled, exiting);			\
	}										\
})

/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls.
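 * vmx_adjust_sec_exec_feature() handles opt-in ENABLE_* controls: the control
 * is cleared when the feature is hidden from the guest.
 * vmx_adjust_sec_exec_exiting() handles opt-out *_EXITING controls: the
 * control is cleared when the feature is exposed, so the instruction runs
 * without exiting.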
 */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)

static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

	if (vmx_pt_mode_is_system())
		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
		enable_unrestricted_guest = 0;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (kvm_pause_in_guest(vmx->vcpu.kvm))
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

	/*
	 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
	 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
	 */
	exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;

	/*
	 * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.
	 */
	exec_control &= ~SECONDARY_EXEC_DESC;

	/*
	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	 * (handle_vmptrld).  Shadow VMCS can NOT be enabled here because
	 * there is no current VMCS12 yet.
	 */
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	/*
	 * PML is enabled/disabled when dirty logging of memslots changes, but
	 * it needs to be set here when dirty logging is already active, e.g.
	 * if this vCPU was created after dirty logging was enabled.
	 */
	if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);

	/*
	 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
	 * feature is exposed to the guest.  This creates a virtualization hole
	 * if both are supported in hardware but only one is exposed to the
	 * guest, but letting the guest execute RDTSCP or RDPID when either one
	 * is advertised is preferable to emulating the advertised instruction
	 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4553 */ 4554 if (cpu_has_vmx_rdtscp()) { 4555 bool rdpid_or_rdtscp_enabled = 4556 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4557 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4558 4559 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4560 SECONDARY_EXEC_ENABLE_RDTSCP, 4561 rdpid_or_rdtscp_enabled, false); 4562 } 4563 4564 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4565 4566 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4567 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4568 4569 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4570 ENABLE_USR_WAIT_PAUSE, false); 4571 4572 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4573 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4574 4575 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4576 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4577 4578 return exec_control; 4579 } 4580 4581 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4582 { 4583 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4584 } 4585 4586 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4587 { 4588 struct page *pages; 4589 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4590 4591 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4592 return 0; 4593 4594 if (kvm_vmx->pid_table) 4595 return 0; 4596 4597 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4598 vmx_get_pid_table_order(kvm)); 4599 if (!pages) 4600 return -ENOMEM; 4601 4602 kvm_vmx->pid_table = (void *)page_address(pages); 4603 return 0; 4604 } 4605 4606 int vmx_vcpu_precreate(struct kvm *kvm) 4607 { 4608 return vmx_alloc_ipiv_pid_table(kvm); 4609 } 4610 4611 #define VMX_XSS_EXIT_BITMAP 0 4612 4613 static void init_vmcs(struct vcpu_vmx *vmx) 4614 { 4615 struct kvm *kvm = vmx->vcpu.kvm; 4616 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4617 4618 if (nested) 4619 nested_vmx_set_vmcs_shadowing_bitmap(); 4620 4621 if (cpu_has_vmx_msr_bitmap()) 4622 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4623 4624 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4625 4626 /* Control */ 4627 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4628 4629 exec_controls_set(vmx, vmx_exec_control(vmx)); 4630 4631 if (cpu_has_secondary_exec_ctrls()) { 4632 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4633 if (vmx->ve_info) 4634 vmcs_write64(VE_INFORMATION_ADDRESS, 4635 __pa(vmx->ve_info)); 4636 } 4637 4638 if (cpu_has_tertiary_exec_ctrls()) 4639 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4640 4641 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4642 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4643 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4644 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4645 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4646 4647 vmcs_write16(GUEST_INTR_STATUS, 0); 4648 4649 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4650 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); 4651 } 4652 4653 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4654 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4655 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4656 } 4657 4658 if (!kvm_pause_in_guest(kvm)) { 4659 vmcs_write32(PLE_GAP, ple_gap); 4660 vmx->ple_window = ple_window; 4661 vmx->ple_window_dirty = true; 4662 } 4663 4664 if (kvm_notify_vmexit_enabled(kvm)) 4665 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4666 4667 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4668 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4669 
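	/*
	 * With both the #PF error-code mask and match set to 0, every page
	 * fault "matches", so whether a #PF triggers a VM-Exit is governed
	 * solely by the #PF bit in the exception bitmap.
	 */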
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4670 4671 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4672 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4673 vmx_set_constant_host_state(vmx); 4674 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4675 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4676 4677 if (cpu_has_vmx_vmfunc()) 4678 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4679 4680 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4681 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4682 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4683 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4684 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4685 4686 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4687 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4688 4689 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4690 4691 /* 22.2.1, 20.8.1 */ 4692 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4693 4694 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4695 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4696 4697 set_cr4_guest_host_mask(vmx); 4698 4699 if (vmx->vpid != 0) 4700 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4701 4702 if (cpu_has_vmx_xsaves()) 4703 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4704 4705 if (enable_pml) { 4706 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4707 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 4708 } 4709 4710 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4711 4712 if (vmx_pt_mode_is_host_guest()) { 4713 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4714 /* Bit[6~0] are forced to 1, writes are ignored. */ 4715 vmx->pt_desc.guest.output_mask = 0x7F; 4716 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4717 } 4718 4719 vmcs_write32(GUEST_SYSENTER_CS, 0); 4720 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4721 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4722 4723 vmx_guest_debugctl_write(&vmx->vcpu, 0); 4724 4725 if (cpu_has_vmx_tpr_shadow()) { 4726 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4727 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4728 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4729 __pa(vmx->vcpu.arch.apic->regs)); 4730 vmcs_write32(TPR_THRESHOLD, 0); 4731 } 4732 4733 vmx_setup_uret_msrs(vmx); 4734 } 4735 4736 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4737 { 4738 struct vcpu_vmx *vmx = to_vmx(vcpu); 4739 4740 init_vmcs(vmx); 4741 4742 if (nested && 4743 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4744 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4745 4746 vcpu_setup_sgx_lepubkeyhash(vcpu); 4747 4748 vmx->nested.posted_intr_nv = -1; 4749 vmx->nested.vmxon_ptr = INVALID_GPA; 4750 vmx->nested.current_vmptr = INVALID_GPA; 4751 4752 #ifdef CONFIG_KVM_HYPERV 4753 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4754 #endif 4755 4756 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4757 vcpu->arch.microcode_version = 0x100000000ULL; 4758 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4759 4760 /* 4761 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4762 * or POSTED_INTR_WAKEUP_VECTOR. 
4763 */ 4764 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 4765 __pi_set_sn(&vmx->vt.pi_desc); 4766 } 4767 4768 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4769 { 4770 struct vcpu_vmx *vmx = to_vmx(vcpu); 4771 4772 if (!init_event) 4773 __vmx_vcpu_reset(vcpu); 4774 4775 vmx->rmode.vm86_active = 0; 4776 vmx->spec_ctrl = 0; 4777 4778 vmx->msr_ia32_umwait_control = 0; 4779 4780 vmx->hv_deadline_tsc = -1; 4781 kvm_set_cr8(vcpu, 0); 4782 4783 seg_setup(VCPU_SREG_CS); 4784 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4785 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4786 4787 seg_setup(VCPU_SREG_DS); 4788 seg_setup(VCPU_SREG_ES); 4789 seg_setup(VCPU_SREG_FS); 4790 seg_setup(VCPU_SREG_GS); 4791 seg_setup(VCPU_SREG_SS); 4792 4793 vmcs_write16(GUEST_TR_SELECTOR, 0); 4794 vmcs_writel(GUEST_TR_BASE, 0); 4795 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4796 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4797 4798 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4799 vmcs_writel(GUEST_LDTR_BASE, 0); 4800 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4801 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4802 4803 vmcs_writel(GUEST_GDTR_BASE, 0); 4804 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4805 4806 vmcs_writel(GUEST_IDTR_BASE, 0); 4807 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4808 4809 vmx_segment_cache_clear(vmx); 4810 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4811 4812 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4813 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4814 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4815 if (kvm_mpx_supported()) 4816 vmcs_write64(GUEST_BNDCFGS, 0); 4817 4818 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4819 4820 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4821 4822 vpid_sync_context(vmx->vpid); 4823 4824 vmx_update_fb_clear_dis(vcpu, vmx); 4825 } 4826 4827 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4828 { 4829 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4830 } 4831 4832 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4833 { 4834 if (!enable_vnmi || 4835 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4836 vmx_enable_irq_window(vcpu); 4837 return; 4838 } 4839 4840 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4841 } 4842 4843 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4844 { 4845 struct vcpu_vmx *vmx = to_vmx(vcpu); 4846 uint32_t intr; 4847 int irq = vcpu->arch.interrupt.nr; 4848 4849 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4850 4851 ++vcpu->stat.irq_injections; 4852 if (vmx->rmode.vm86_active) { 4853 int inc_eip = 0; 4854 if (vcpu->arch.interrupt.soft) 4855 inc_eip = vcpu->arch.event_exit_inst_len; 4856 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4857 return; 4858 } 4859 intr = irq | INTR_INFO_VALID_MASK; 4860 if (vcpu->arch.interrupt.soft) { 4861 intr |= INTR_TYPE_SOFT_INTR; 4862 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4863 vmx->vcpu.arch.event_exit_inst_len); 4864 } else 4865 intr |= INTR_TYPE_EXT_INTR; 4866 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4867 4868 vmx_clear_hlt(vcpu); 4869 } 4870 4871 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4872 { 4873 struct vcpu_vmx *vmx = to_vmx(vcpu); 4874 4875 if (!enable_vnmi) { 4876 /* 4877 * Tracking the NMI-blocked state in software is built upon 4878 * finding the next open IRQ window. This, in turn, depends on 4879 * well-behaving guests: They have to keep IRQs disabled at 4880 * least as long as the NMI handler runs. Otherwise we may 4881 * cause NMI nesting, maybe breaking the guest. 
But as this is 4882 * highly unlikely, we can live with the residual risk. 4883 */ 4884 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4885 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4886 } 4887 4888 ++vcpu->stat.nmi_injections; 4889 vmx->loaded_vmcs->nmi_known_unmasked = false; 4890 4891 if (vmx->rmode.vm86_active) { 4892 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4893 return; 4894 } 4895 4896 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4897 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4898 4899 vmx_clear_hlt(vcpu); 4900 } 4901 4902 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4903 { 4904 struct vcpu_vmx *vmx = to_vmx(vcpu); 4905 bool masked; 4906 4907 if (!enable_vnmi) 4908 return vmx->loaded_vmcs->soft_vnmi_blocked; 4909 if (vmx->loaded_vmcs->nmi_known_unmasked) 4910 return false; 4911 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4912 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4913 return masked; 4914 } 4915 4916 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4917 { 4918 struct vcpu_vmx *vmx = to_vmx(vcpu); 4919 4920 if (!enable_vnmi) { 4921 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 4922 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 4923 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4924 } 4925 } else { 4926 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4927 if (masked) 4928 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4929 GUEST_INTR_STATE_NMI); 4930 else 4931 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4932 GUEST_INTR_STATE_NMI); 4933 } 4934 } 4935 4936 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 4937 { 4938 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 4939 return false; 4940 4941 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 4942 return true; 4943 4944 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4945 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 4946 GUEST_INTR_STATE_NMI)); 4947 } 4948 4949 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4950 { 4951 if (to_vmx(vcpu)->nested.nested_run_pending) 4952 return -EBUSY; 4953 4954 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 4955 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 4956 return -EBUSY; 4957 4958 return !vmx_nmi_blocked(vcpu); 4959 } 4960 4961 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 4962 { 4963 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 4964 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4965 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 4966 } 4967 4968 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 4969 { 4970 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 4971 return false; 4972 4973 return __vmx_interrupt_blocked(vcpu); 4974 } 4975 4976 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4977 { 4978 if (to_vmx(vcpu)->nested.nested_run_pending) 4979 return -EBUSY; 4980 4981 /* 4982 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 4983 * e.g. if the IRQ arrived asynchronously after checking nested events. 
4984 */ 4985 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 4986 return -EBUSY; 4987 4988 return !vmx_interrupt_blocked(vcpu); 4989 } 4990 4991 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 4992 { 4993 void __user *ret; 4994 4995 if (enable_unrestricted_guest) 4996 return 0; 4997 4998 mutex_lock(&kvm->slots_lock); 4999 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5000 PAGE_SIZE * 3); 5001 mutex_unlock(&kvm->slots_lock); 5002 5003 if (IS_ERR(ret)) 5004 return PTR_ERR(ret); 5005 5006 to_kvm_vmx(kvm)->tss_addr = addr; 5007 5008 return init_rmode_tss(kvm, ret); 5009 } 5010 5011 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5012 { 5013 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5014 return 0; 5015 } 5016 5017 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5018 { 5019 switch (vec) { 5020 case BP_VECTOR: 5021 /* 5022 * Update instruction length as we may reinject the exception 5023 * from user space while in guest debugging mode. 5024 */ 5025 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5026 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5027 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5028 return false; 5029 fallthrough; 5030 case DB_VECTOR: 5031 return !(vcpu->guest_debug & 5032 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5033 case DE_VECTOR: 5034 case OF_VECTOR: 5035 case BR_VECTOR: 5036 case UD_VECTOR: 5037 case DF_VECTOR: 5038 case SS_VECTOR: 5039 case GP_VECTOR: 5040 case MF_VECTOR: 5041 return true; 5042 } 5043 return false; 5044 } 5045 5046 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5047 int vec, u32 err_code) 5048 { 5049 /* 5050 * Instruction with address size override prefix opcode 0x67 5051 * Cause the #SS fault with 0 error code in VM86 mode. 5052 */ 5053 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5054 if (kvm_emulate_instruction(vcpu, 0)) { 5055 if (vcpu->arch.halt_request) { 5056 vcpu->arch.halt_request = 0; 5057 return kvm_emulate_halt_noskip(vcpu); 5058 } 5059 return 1; 5060 } 5061 return 0; 5062 } 5063 5064 /* 5065 * Forward all other exceptions that are valid in real mode. 5066 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5067 * the required debugging infrastructure rework. 5068 */ 5069 kvm_queue_exception(vcpu, vec); 5070 return 1; 5071 } 5072 5073 static int handle_machine_check(struct kvm_vcpu *vcpu) 5074 { 5075 /* handled by vmx_vcpu_run() */ 5076 return 1; 5077 } 5078 5079 /* 5080 * If the host has split lock detection disabled, then #AC is 5081 * unconditionally injected into the guest, which is the pre split lock 5082 * detection behaviour. 
5083 * 5084 * If the host has split lock detection enabled then #AC is 5085 * only injected into the guest when: 5086 * - Guest CPL == 3 (user mode) 5087 * - Guest has #AC detection enabled in CR0 5088 * - Guest EFLAGS has AC bit set 5089 */ 5090 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5091 { 5092 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5093 return true; 5094 5095 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5096 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5097 } 5098 5099 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) 5100 { 5101 return vcpu->arch.guest_fpu.fpstate->xfd && 5102 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5103 } 5104 5105 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5106 { 5107 struct vcpu_vmx *vmx = to_vmx(vcpu); 5108 struct kvm_run *kvm_run = vcpu->run; 5109 u32 intr_info, ex_no, error_code; 5110 unsigned long cr2, dr6; 5111 u32 vect_info; 5112 5113 vect_info = vmx->idt_vectoring_info; 5114 intr_info = vmx_get_intr_info(vcpu); 5115 5116 /* 5117 * Machine checks are handled by handle_exception_irqoff(), or by 5118 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5119 * vmx_vcpu_enter_exit(). 5120 */ 5121 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5122 return 1; 5123 5124 /* 5125 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5126 * This ensures the nested_vmx check is not skipped so vmexit can 5127 * be reflected to L1 (when it intercepts #NM) before reaching this 5128 * point. 5129 */ 5130 if (is_nm_fault(intr_info)) { 5131 kvm_queue_exception_p(vcpu, NM_VECTOR, 5132 is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); 5133 return 1; 5134 } 5135 5136 if (is_invalid_opcode(intr_info)) 5137 return handle_ud(vcpu); 5138 5139 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5140 struct vmx_ve_information *ve_info = vmx->ve_info; 5141 5142 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5143 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5144 dump_vmcs(vcpu); 5145 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5146 return 1; 5147 } 5148 5149 error_code = 0; 5150 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5151 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5152 5153 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5154 WARN_ON_ONCE(!enable_vmware_backdoor); 5155 5156 /* 5157 * VMware backdoor emulation on #GP interception only handles 5158 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5159 * error code on #GP. 5160 */ 5161 if (error_code) { 5162 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5163 return 1; 5164 } 5165 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5166 } 5167 5168 /* 5169 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5170 * MMIO, it is better to report an internal error. 5171 * See the comments in vmx_handle_exit. 
5172 */ 5173 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5174 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5175 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5176 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5177 vcpu->run->internal.ndata = 4; 5178 vcpu->run->internal.data[0] = vect_info; 5179 vcpu->run->internal.data[1] = intr_info; 5180 vcpu->run->internal.data[2] = error_code; 5181 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5182 return 0; 5183 } 5184 5185 if (is_page_fault(intr_info)) { 5186 cr2 = vmx_get_exit_qual(vcpu); 5187 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5188 /* 5189 * EPT will cause page fault only if we need to 5190 * detect illegal GPAs. 5191 */ 5192 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5193 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5194 return 1; 5195 } else 5196 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5197 } 5198 5199 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5200 5201 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5202 return handle_rmode_exception(vcpu, ex_no, error_code); 5203 5204 switch (ex_no) { 5205 case DB_VECTOR: 5206 dr6 = vmx_get_exit_qual(vcpu); 5207 if (!(vcpu->guest_debug & 5208 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5209 /* 5210 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5211 * instruction. ICEBP generates a trap-like #DB, but 5212 * despite its interception control being tied to #DB, 5213 * is an instruction intercept, i.e. the VM-Exit occurs 5214 * on the ICEBP itself. Use the inner "skip" helper to 5215 * avoid single-step #DB and MTF updates, as ICEBP is 5216 * higher priority. Note, skipping ICEBP still clears 5217 * STI and MOVSS blocking. 5218 * 5219 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5220 * if single-step is enabled in RFLAGS and STI or MOVSS 5221 * blocking is active, as the CPU doesn't set the bit 5222 * on VM-Exit due to #DB interception. VM-Entry has a 5223 * consistency check that a single-step #DB is pending 5224 * in this scenario as the previous instruction cannot 5225 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5226 * don't modify RFLAGS), therefore the one instruction 5227 * delay when activating single-step breakpoints must 5228 * have already expired. Note, the CPU sets/clears BS 5229 * as appropriate for all other VM-Exits types. 5230 */ 5231 if (is_icebp(intr_info)) 5232 WARN_ON(!skip_emulated_instruction(vcpu)); 5233 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5234 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5235 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5236 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5237 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5238 5239 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5240 return 1; 5241 } 5242 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5243 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5244 fallthrough; 5245 case BP_VECTOR: 5246 /* 5247 * Update instruction length as we may reinject #BP from 5248 * user space while in guest debugging mode. Reading it for 5249 * #DB as well causes no harm, it is not used in that case. 
5250 */ 5251 vmx->vcpu.arch.event_exit_inst_len = 5252 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5253 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5254 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5255 kvm_run->debug.arch.exception = ex_no; 5256 break; 5257 case AC_VECTOR: 5258 if (vmx_guest_inject_ac(vcpu)) { 5259 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5260 return 1; 5261 } 5262 5263 /* 5264 * Handle split lock. Depending on detection mode this will 5265 * either warn and disable split lock detection for this 5266 * task or force SIGBUS on it. 5267 */ 5268 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5269 return 1; 5270 fallthrough; 5271 default: 5272 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5273 kvm_run->ex.exception = ex_no; 5274 kvm_run->ex.error_code = error_code; 5275 break; 5276 } 5277 return 0; 5278 } 5279 5280 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5281 { 5282 ++vcpu->stat.irq_exits; 5283 return 1; 5284 } 5285 5286 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5287 { 5288 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5289 vcpu->mmio_needed = 0; 5290 return 0; 5291 } 5292 5293 static int handle_io(struct kvm_vcpu *vcpu) 5294 { 5295 unsigned long exit_qualification; 5296 int size, in, string; 5297 unsigned port; 5298 5299 exit_qualification = vmx_get_exit_qual(vcpu); 5300 string = (exit_qualification & 16) != 0; 5301 5302 ++vcpu->stat.io_exits; 5303 5304 if (string) 5305 return kvm_emulate_instruction(vcpu, 0); 5306 5307 port = exit_qualification >> 16; 5308 size = (exit_qualification & 7) + 1; 5309 in = (exit_qualification & 8) != 0; 5310 5311 return kvm_fast_pio(vcpu, size, port, in); 5312 } 5313 5314 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5315 { 5316 /* 5317 * Patch in the VMCALL instruction: 5318 */ 5319 hypercall[0] = 0x0f; 5320 hypercall[1] = 0x01; 5321 hypercall[2] = 0xc1; 5322 } 5323 5324 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5325 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5326 { 5327 if (is_guest_mode(vcpu)) { 5328 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5329 unsigned long orig_val = val; 5330 5331 /* 5332 * We get here when L2 changed cr0 in a way that did not change 5333 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5334 * but did change L0 shadowed bits. So we first calculate the 5335 * effective cr0 value that L1 would like to write into the 5336 * hardware. It consists of the L2-owned bits from the new 5337 * value combined with the L1-owned bits from L1's guest_cr0. 5338 */ 5339 val = (val & ~vmcs12->cr0_guest_host_mask) | 5340 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5341 5342 if (kvm_set_cr0(vcpu, val)) 5343 return 1; 5344 vmcs_writel(CR0_READ_SHADOW, orig_val); 5345 return 0; 5346 } else { 5347 return kvm_set_cr0(vcpu, val); 5348 } 5349 } 5350 5351 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5352 { 5353 if (is_guest_mode(vcpu)) { 5354 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5355 unsigned long orig_val = val; 5356 5357 /* analogously to handle_set_cr0 */ 5358 val = (val & ~vmcs12->cr4_guest_host_mask) | 5359 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5360 if (kvm_set_cr4(vcpu, val)) 5361 return 1; 5362 vmcs_writel(CR4_READ_SHADOW, orig_val); 5363 return 0; 5364 } else 5365 return kvm_set_cr4(vcpu, val); 5366 } 5367 5368 static int handle_desc(struct kvm_vcpu *vcpu) 5369 { 5370 /* 5371 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. 
this 5372 * and other code needs to be updated if UMIP can be guest owned. 5373 */ 5374 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5375 5376 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5377 return kvm_emulate_instruction(vcpu, 0); 5378 } 5379 5380 static int handle_cr(struct kvm_vcpu *vcpu) 5381 { 5382 unsigned long exit_qualification, val; 5383 int cr; 5384 int reg; 5385 int err; 5386 int ret; 5387 5388 exit_qualification = vmx_get_exit_qual(vcpu); 5389 cr = exit_qualification & 15; 5390 reg = (exit_qualification >> 8) & 15; 5391 switch ((exit_qualification >> 4) & 3) { 5392 case 0: /* mov to cr */ 5393 val = kvm_register_read(vcpu, reg); 5394 trace_kvm_cr_write(cr, val); 5395 switch (cr) { 5396 case 0: 5397 err = handle_set_cr0(vcpu, val); 5398 return kvm_complete_insn_gp(vcpu, err); 5399 case 3: 5400 WARN_ON_ONCE(enable_unrestricted_guest); 5401 5402 err = kvm_set_cr3(vcpu, val); 5403 return kvm_complete_insn_gp(vcpu, err); 5404 case 4: 5405 err = handle_set_cr4(vcpu, val); 5406 return kvm_complete_insn_gp(vcpu, err); 5407 case 8: { 5408 u8 cr8_prev = kvm_get_cr8(vcpu); 5409 u8 cr8 = (u8)val; 5410 err = kvm_set_cr8(vcpu, cr8); 5411 ret = kvm_complete_insn_gp(vcpu, err); 5412 if (lapic_in_kernel(vcpu)) 5413 return ret; 5414 if (cr8_prev <= cr8) 5415 return ret; 5416 /* 5417 * TODO: we might be squashing a 5418 * KVM_GUESTDBG_SINGLESTEP-triggered 5419 * KVM_EXIT_DEBUG here. 5420 */ 5421 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5422 return 0; 5423 } 5424 } 5425 break; 5426 case 2: /* clts */ 5427 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5428 return -EIO; 5429 case 1: /*mov from cr*/ 5430 switch (cr) { 5431 case 3: 5432 WARN_ON_ONCE(enable_unrestricted_guest); 5433 5434 val = kvm_read_cr3(vcpu); 5435 kvm_register_write(vcpu, reg, val); 5436 trace_kvm_cr_read(cr, val); 5437 return kvm_skip_emulated_instruction(vcpu); 5438 case 8: 5439 val = kvm_get_cr8(vcpu); 5440 kvm_register_write(vcpu, reg, val); 5441 trace_kvm_cr_read(cr, val); 5442 return kvm_skip_emulated_instruction(vcpu); 5443 } 5444 break; 5445 case 3: /* lmsw */ 5446 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5447 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5448 kvm_lmsw(vcpu, val); 5449 5450 return kvm_skip_emulated_instruction(vcpu); 5451 default: 5452 break; 5453 } 5454 vcpu->run->exit_reason = 0; 5455 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5456 (int)(exit_qualification >> 4) & 3, cr); 5457 return 0; 5458 } 5459 5460 static int handle_dr(struct kvm_vcpu *vcpu) 5461 { 5462 unsigned long exit_qualification; 5463 int dr, dr7, reg; 5464 int err = 1; 5465 5466 exit_qualification = vmx_get_exit_qual(vcpu); 5467 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5468 5469 /* First, if DR does not exist, trigger UD */ 5470 if (!kvm_require_dr(vcpu, dr)) 5471 return 1; 5472 5473 if (vmx_get_cpl(vcpu) > 0) 5474 goto out; 5475 5476 dr7 = vmcs_readl(GUEST_DR7); 5477 if (dr7 & DR7_GD) { 5478 /* 5479 * As the vm-exit takes precedence over the debug trap, we 5480 * need to emulate the latter, either for the host or the 5481 * guest debugging itself. 
5482 */ 5483 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5484 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5485 vcpu->run->debug.arch.dr7 = dr7; 5486 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5487 vcpu->run->debug.arch.exception = DB_VECTOR; 5488 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5489 return 0; 5490 } else { 5491 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5492 return 1; 5493 } 5494 } 5495 5496 if (vcpu->guest_debug == 0) { 5497 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5498 5499 /* 5500 * No more DR vmexits; force a reload of the debug registers 5501 * and reenter on this instruction. The next vmexit will 5502 * retrieve the full state of the debug registers. 5503 */ 5504 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5505 return 1; 5506 } 5507 5508 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5509 if (exit_qualification & TYPE_MOV_FROM_DR) { 5510 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5511 err = 0; 5512 } else { 5513 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5514 } 5515 5516 out: 5517 return kvm_complete_insn_gp(vcpu, err); 5518 } 5519 5520 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5521 { 5522 get_debugreg(vcpu->arch.db[0], 0); 5523 get_debugreg(vcpu->arch.db[1], 1); 5524 get_debugreg(vcpu->arch.db[2], 2); 5525 get_debugreg(vcpu->arch.db[3], 3); 5526 get_debugreg(vcpu->arch.dr6, 6); 5527 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5528 5529 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5530 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5531 5532 /* 5533 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5534 * a stale dr6 from the guest. 5535 */ 5536 set_debugreg(DR6_RESERVED, 6); 5537 } 5538 5539 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5540 { 5541 vmcs_writel(GUEST_DR7, val); 5542 } 5543 5544 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5545 { 5546 kvm_apic_update_ppr(vcpu); 5547 return 1; 5548 } 5549 5550 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5551 { 5552 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5553 5554 kvm_make_request(KVM_REQ_EVENT, vcpu); 5555 5556 ++vcpu->stat.irq_window_exits; 5557 return 1; 5558 } 5559 5560 static int handle_invlpg(struct kvm_vcpu *vcpu) 5561 { 5562 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5563 5564 kvm_mmu_invlpg(vcpu, exit_qualification); 5565 return kvm_skip_emulated_instruction(vcpu); 5566 } 5567 5568 static int handle_apic_access(struct kvm_vcpu *vcpu) 5569 { 5570 if (likely(fasteoi)) { 5571 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5572 int access_type, offset; 5573 5574 access_type = exit_qualification & APIC_ACCESS_TYPE; 5575 offset = exit_qualification & APIC_ACCESS_OFFSET; 5576 /* 5577 * Sane guest uses MOV to write EOI, with written value 5578 * not cared. So make a short-circuit here by avoiding 5579 * heavy instruction emulation. 
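                 * (Per the SDM, the exit qualification for an APIC-access
                 * VM-exit encodes the accessed page offset in bits 11:0 and
                 * the access type in bits 15:12, which is what
                 * APIC_ACCESS_OFFSET and APIC_ACCESS_TYPE extract above.)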
5580 */ 5581 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5582 (offset == APIC_EOI)) { 5583 kvm_lapic_set_eoi(vcpu); 5584 return kvm_skip_emulated_instruction(vcpu); 5585 } 5586 } 5587 return kvm_emulate_instruction(vcpu, 0); 5588 } 5589 5590 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5591 { 5592 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5593 int vector = exit_qualification & 0xff; 5594 5595 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5596 kvm_apic_set_eoi_accelerated(vcpu, vector); 5597 return 1; 5598 } 5599 5600 static int handle_apic_write(struct kvm_vcpu *vcpu) 5601 { 5602 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5603 5604 /* 5605 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5606 * hardware has done any necessary aliasing, offset adjustments, etc... 5607 * for the access. I.e. the correct value has already been written to 5608 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5609 * retrieve the register value and emulate the access. 5610 */ 5611 u32 offset = exit_qualification & 0xff0; 5612 5613 kvm_apic_write_nodecode(vcpu, offset); 5614 return 1; 5615 } 5616 5617 static int handle_task_switch(struct kvm_vcpu *vcpu) 5618 { 5619 struct vcpu_vmx *vmx = to_vmx(vcpu); 5620 unsigned long exit_qualification; 5621 bool has_error_code = false; 5622 u32 error_code = 0; 5623 u16 tss_selector; 5624 int reason, type, idt_v, idt_index; 5625 5626 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5627 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5628 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5629 5630 exit_qualification = vmx_get_exit_qual(vcpu); 5631 5632 reason = (u32)exit_qualification >> 30; 5633 if (reason == TASK_SWITCH_GATE && idt_v) { 5634 switch (type) { 5635 case INTR_TYPE_NMI_INTR: 5636 vcpu->arch.nmi_injected = false; 5637 vmx_set_nmi_mask(vcpu, true); 5638 break; 5639 case INTR_TYPE_EXT_INTR: 5640 case INTR_TYPE_SOFT_INTR: 5641 kvm_clear_interrupt_queue(vcpu); 5642 break; 5643 case INTR_TYPE_HARD_EXCEPTION: 5644 if (vmx->idt_vectoring_info & 5645 VECTORING_INFO_DELIVER_CODE_MASK) { 5646 has_error_code = true; 5647 error_code = 5648 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5649 } 5650 fallthrough; 5651 case INTR_TYPE_SOFT_EXCEPTION: 5652 kvm_clear_exception_queue(vcpu); 5653 break; 5654 default: 5655 break; 5656 } 5657 } 5658 tss_selector = exit_qualification; 5659 5660 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5661 type != INTR_TYPE_EXT_INTR && 5662 type != INTR_TYPE_NMI_INTR)) 5663 WARN_ON(!skip_emulated_instruction(vcpu)); 5664 5665 /* 5666 * TODO: What about debug traps on tss switch? 5667 * Are we supposed to inject them and update dr6? 5668 */ 5669 return kvm_task_switch(vcpu, tss_selector, 5670 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 5671 reason, has_error_code, error_code); 5672 } 5673 5674 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5675 { 5676 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5677 gpa_t gpa; 5678 5679 /* 5680 * EPT violation happened while executing iret from NMI, 5681 * "blocked by NMI" bit has to be set before next VM entry. 5682 * There are errata that may cause this bit to not be set: 5683 * AAK134, BY25. 
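         * (A similar workaround is applied in the PML-full and NOTIFY exit
         * handlers below.)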
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
            enable_vnmi &&
            (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);

        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(vcpu, gpa, exit_qualification);

        /*
         * Check that the GPA doesn't exceed physical memory limits, as that is
         * a guest page fault.  We have to emulate the instruction here, because
         * if the illegal address is that of a paging structure, then the
         * EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
         * would also use advanced VM-exit information for EPT violations to
         * reconstruct the page fault error code.
         */
        if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
                return kvm_emulate_instruction(vcpu, 0);

        return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}

static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
        gpa_t gpa;

        if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
                return 1;

        /*
         * A nested guest cannot optimize MMIO vmexits, because we have an
         * nGPA here instead of the required GPA.
         */
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        if (!is_guest_mode(vcpu) &&
            !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
                return kvm_skip_emulated_instruction(vcpu);
        }

        return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
}

static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
        if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
                return -EIO;

        exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);

        return 1;
}

/*
 * Returns true if emulation is required (due to the vCPU having invalid state
 * with unrestricted guest mode disabled) and KVM can't faithfully emulate the
 * current vCPU state.
 */
static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!vmx->vt.emulation_required)
                return false;

        /*
         * It is architecturally impossible for emulation to be required when a
         * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
         * guest state is invalid and unrestricted guest is disabled, i.e. KVM
         * should synthesize VM-Fail instead of emulating L2 code.  This path is
         * only reachable if userspace modifies L2 guest state after KVM has
         * performed the nested VM-Enter consistency checks.
         */
        if (vmx->nested.nested_run_pending)
                return true;

        /*
         * KVM only supports emulating exceptions if the vCPU is in Real Mode.
         * If emulation is required, KVM can't perform a successful VM-Enter to
         * inject the exception.
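         * When that happens, the callers, vmx_vcpu_pre_run() and
         * handle_invalid_guest_state(), report an emulation failure to
         * userspace via kvm_prepare_emulation_failure_exit() instead.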
5767 */ 5768 return !vmx->rmode.vm86_active && 5769 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5770 } 5771 5772 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5773 { 5774 struct vcpu_vmx *vmx = to_vmx(vcpu); 5775 bool intr_window_requested; 5776 unsigned count = 130; 5777 5778 intr_window_requested = exec_controls_get(vmx) & 5779 CPU_BASED_INTR_WINDOW_EXITING; 5780 5781 while (vmx->vt.emulation_required && count-- != 0) { 5782 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5783 return handle_interrupt_window(&vmx->vcpu); 5784 5785 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5786 return 1; 5787 5788 if (!kvm_emulate_instruction(vcpu, 0)) 5789 return 0; 5790 5791 if (vmx_unhandleable_emulation_required(vcpu)) { 5792 kvm_prepare_emulation_failure_exit(vcpu); 5793 return 0; 5794 } 5795 5796 if (vcpu->arch.halt_request) { 5797 vcpu->arch.halt_request = 0; 5798 return kvm_emulate_halt_noskip(vcpu); 5799 } 5800 5801 /* 5802 * Note, return 1 and not 0, vcpu_run() will invoke 5803 * xfer_to_guest_mode() which will create a proper return 5804 * code. 5805 */ 5806 if (__xfer_to_guest_mode_work_pending()) 5807 return 1; 5808 } 5809 5810 return 1; 5811 } 5812 5813 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5814 { 5815 if (vmx_unhandleable_emulation_required(vcpu)) { 5816 kvm_prepare_emulation_failure_exit(vcpu); 5817 return 0; 5818 } 5819 5820 return 1; 5821 } 5822 5823 /* 5824 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5825 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5826 */ 5827 static int handle_pause(struct kvm_vcpu *vcpu) 5828 { 5829 if (!kvm_pause_in_guest(vcpu->kvm)) 5830 grow_ple_window(vcpu); 5831 5832 /* 5833 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5834 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5835 * never set PAUSE_EXITING and just set PLE if supported, 5836 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 5837 */ 5838 kvm_vcpu_on_spin(vcpu, true); 5839 return kvm_skip_emulated_instruction(vcpu); 5840 } 5841 5842 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5843 { 5844 return 1; 5845 } 5846 5847 static int handle_invpcid(struct kvm_vcpu *vcpu) 5848 { 5849 u32 vmx_instruction_info; 5850 unsigned long type; 5851 gva_t gva; 5852 struct { 5853 u64 pcid; 5854 u64 gla; 5855 } operand; 5856 int gpr_index; 5857 5858 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 5859 kvm_queue_exception(vcpu, UD_VECTOR); 5860 return 1; 5861 } 5862 5863 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5864 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5865 type = kvm_register_read(vcpu, gpr_index); 5866 5867 /* According to the Intel instruction reference, the memory operand 5868 * is read even if it isn't needed (e.g., for type==all) 5869 */ 5870 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5871 vmx_instruction_info, false, 5872 sizeof(operand), &gva)) 5873 return 1; 5874 5875 return kvm_handle_invpcid(vcpu, type, gva); 5876 } 5877 5878 static int handle_pml_full(struct kvm_vcpu *vcpu) 5879 { 5880 unsigned long exit_qualification; 5881 5882 trace_kvm_pml_full(vcpu->vcpu_id); 5883 5884 exit_qualification = vmx_get_exit_qual(vcpu); 5885 5886 /* 5887 * PML buffer FULL happened while executing iret from NMI, 5888 * "blocked by NMI" bit has to be set before next VM entry. 
5889 */ 5890 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5891 enable_vnmi && 5892 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5893 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5894 GUEST_INTR_STATE_NMI); 5895 5896 /* 5897 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5898 * here.., and there's no userspace involvement needed for PML. 5899 */ 5900 return 1; 5901 } 5902 5903 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 5904 bool force_immediate_exit) 5905 { 5906 struct vcpu_vmx *vmx = to_vmx(vcpu); 5907 5908 /* 5909 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 5910 * due to the timer expiring while it was "soft" disabled, just eat the 5911 * exit and re-enter the guest. 5912 */ 5913 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 5914 return EXIT_FASTPATH_REENTER_GUEST; 5915 5916 /* 5917 * If the timer expired because KVM used it to force an immediate exit, 5918 * then mission accomplished. 5919 */ 5920 if (force_immediate_exit) 5921 return EXIT_FASTPATH_EXIT_HANDLED; 5922 5923 /* 5924 * If L2 is active, go down the slow path as emulating the guest timer 5925 * expiration likely requires synthesizing a nested VM-Exit. 5926 */ 5927 if (is_guest_mode(vcpu)) 5928 return EXIT_FASTPATH_NONE; 5929 5930 kvm_lapic_expired_hv_timer(vcpu); 5931 return EXIT_FASTPATH_REENTER_GUEST; 5932 } 5933 5934 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 5935 { 5936 /* 5937 * This non-fastpath handler is reached if and only if the preemption 5938 * timer was being used to emulate a guest timer while L2 is active. 5939 * All other scenarios are supposed to be handled in the fastpath. 5940 */ 5941 WARN_ON_ONCE(!is_guest_mode(vcpu)); 5942 kvm_lapic_expired_hv_timer(vcpu); 5943 return 1; 5944 } 5945 5946 /* 5947 * When nested=0, all VMX instruction VM Exits filter here. The handlers 5948 * are overwritten by nested_vmx_hardware_setup() when nested=1. 5949 */ 5950 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 5951 { 5952 kvm_queue_exception(vcpu, UD_VECTOR); 5953 return 1; 5954 } 5955 5956 #ifndef CONFIG_X86_SGX_KVM 5957 static int handle_encls(struct kvm_vcpu *vcpu) 5958 { 5959 /* 5960 * SGX virtualization is disabled. There is no software enable bit for 5961 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 5962 * the guest from executing ENCLS (when SGX is supported by hardware). 5963 */ 5964 kvm_queue_exception(vcpu, UD_VECTOR); 5965 return 1; 5966 } 5967 #endif /* CONFIG_X86_SGX_KVM */ 5968 5969 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 5970 { 5971 /* 5972 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 5973 * VM-Exits. Unconditionally set the flag here and leave the handling to 5974 * vmx_handle_exit(). 5975 */ 5976 to_vt(vcpu)->exit_reason.bus_lock_detected = true; 5977 return 1; 5978 } 5979 5980 static int handle_notify(struct kvm_vcpu *vcpu) 5981 { 5982 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5983 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 5984 5985 ++vcpu->stat.notify_window_exits; 5986 5987 /* 5988 * Notify VM exit happened while executing iret from NMI, 5989 * "blocked by NMI" bit has to be set before next VM entry. 
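         * (Note that unlike the EPT-violation and PML-full paths above, there
         * is no IDT-vectoring-info check on this path.)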
5990 */ 5991 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 5992 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5993 GUEST_INTR_STATE_NMI); 5994 5995 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 5996 context_invalid) { 5997 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 5998 vcpu->run->notify.flags = context_invalid ? 5999 KVM_NOTIFY_CONTEXT_INVALID : 0; 6000 return 0; 6001 } 6002 6003 return 1; 6004 } 6005 6006 /* 6007 * The exit handlers return 1 if the exit was handled fully and guest execution 6008 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6009 * to be done to userspace and return 0. 6010 */ 6011 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6012 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6013 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6014 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6015 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6016 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6017 [EXIT_REASON_CR_ACCESS] = handle_cr, 6018 [EXIT_REASON_DR_ACCESS] = handle_dr, 6019 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6020 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6021 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6022 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6023 [EXIT_REASON_HLT] = kvm_emulate_halt, 6024 [EXIT_REASON_INVD] = kvm_emulate_invd, 6025 [EXIT_REASON_INVLPG] = handle_invlpg, 6026 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6027 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6028 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6029 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6030 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6031 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6032 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6033 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6034 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6035 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6036 [EXIT_REASON_VMON] = handle_vmx_instruction, 6037 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6038 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6039 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6040 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6041 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6042 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6043 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6044 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6045 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6046 [EXIT_REASON_LDTR_TR] = handle_desc, 6047 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6048 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6049 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6050 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6051 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6052 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6053 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6054 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6055 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6056 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6057 [EXIT_REASON_PML_FULL] = handle_pml_full, 6058 [EXIT_REASON_INVPCID] = handle_invpcid, 6059 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6060 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6061 [EXIT_REASON_ENCLS] = handle_encls, 6062 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6063 [EXIT_REASON_NOTIFY] = handle_notify, 6064 }; 6065 6066 static const int kvm_vmx_max_exit_handlers = 6067 ARRAY_SIZE(kvm_vmx_exit_handlers); 6068 6069 void 
vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6070 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6071 { 6072 struct vcpu_vmx *vmx = to_vmx(vcpu); 6073 6074 *reason = vmx->vt.exit_reason.full; 6075 *info1 = vmx_get_exit_qual(vcpu); 6076 if (!(vmx->vt.exit_reason.failed_vmentry)) { 6077 *info2 = vmx->idt_vectoring_info; 6078 *intr_info = vmx_get_intr_info(vcpu); 6079 if (is_exception_with_error_code(*intr_info)) 6080 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6081 else 6082 *error_code = 0; 6083 } else { 6084 *info2 = 0; 6085 *intr_info = 0; 6086 *error_code = 0; 6087 } 6088 } 6089 6090 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) 6091 { 6092 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 6093 if (is_exception_with_error_code(*intr_info)) 6094 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); 6095 else 6096 *error_code = 0; 6097 } 6098 6099 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6100 { 6101 if (vmx->pml_pg) { 6102 __free_page(vmx->pml_pg); 6103 vmx->pml_pg = NULL; 6104 } 6105 } 6106 6107 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6108 { 6109 struct vcpu_vmx *vmx = to_vmx(vcpu); 6110 u16 pml_idx, pml_tail_index; 6111 u64 *pml_buf; 6112 int i; 6113 6114 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6115 6116 /* Do nothing if PML buffer is empty */ 6117 if (pml_idx == PML_HEAD_INDEX) 6118 return; 6119 /* 6120 * PML index always points to the next available PML buffer entity 6121 * unless PML log has just overflowed. 6122 */ 6123 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; 6124 6125 /* 6126 * PML log is written backwards: the CPU first writes the entry 511 6127 * then the entry 510, and so on. 6128 * 6129 * Read the entries in the same order they were written, to ensure that 6130 * the dirty ring is filled in the same order the CPU wrote them. 
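         * (GUEST_PML_INDEX holds the index of the next free slot and is
         * decremented by hardware as entries are logged, so the valid entries
         * run from pml_tail_index up to PML_HEAD_INDEX; a value of
         * PML_LOG_NR_ENTRIES or more means the log overflowed and the whole
         * buffer is valid.)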
6131 */ 6132 pml_buf = page_address(vmx->pml_pg); 6133 6134 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { 6135 u64 gpa; 6136 6137 gpa = pml_buf[i]; 6138 WARN_ON(gpa & (PAGE_SIZE - 1)); 6139 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6140 } 6141 6142 /* reset PML index */ 6143 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 6144 } 6145 6146 static void vmx_dump_sel(char *name, uint32_t sel) 6147 { 6148 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6149 name, vmcs_read16(sel), 6150 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6151 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6152 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6153 } 6154 6155 static void vmx_dump_dtsel(char *name, uint32_t limit) 6156 { 6157 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6158 name, vmcs_read32(limit), 6159 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6160 } 6161 6162 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6163 { 6164 unsigned int i; 6165 struct vmx_msr_entry *e; 6166 6167 pr_err("MSR %s:\n", name); 6168 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6169 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6170 } 6171 6172 void dump_vmcs(struct kvm_vcpu *vcpu) 6173 { 6174 struct vcpu_vmx *vmx = to_vmx(vcpu); 6175 u32 vmentry_ctl, vmexit_ctl; 6176 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6177 u64 tertiary_exec_control; 6178 unsigned long cr4; 6179 int efer_slot; 6180 6181 if (!dump_invalid_vmcs) { 6182 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6183 return; 6184 } 6185 6186 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6187 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6188 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6189 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6190 cr4 = vmcs_readl(GUEST_CR4); 6191 6192 if (cpu_has_secondary_exec_ctrls()) 6193 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6194 else 6195 secondary_exec_control = 0; 6196 6197 if (cpu_has_tertiary_exec_ctrls()) 6198 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6199 else 6200 tertiary_exec_control = 0; 6201 6202 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6203 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6204 pr_err("*** Guest State ***\n"); 6205 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6206 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6207 vmcs_readl(CR0_GUEST_HOST_MASK)); 6208 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6209 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6210 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6211 if (cpu_has_vmx_ept()) { 6212 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6213 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6214 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 6215 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6216 } 6217 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6218 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6219 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6220 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6221 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6222 vmcs_readl(GUEST_SYSENTER_ESP), 6223 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6224 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6225 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6226 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6227 
vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6228 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6229 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6230 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6231 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6232 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6233 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6234 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6235 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6236 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6237 else if (efer_slot >= 0) 6238 pr_err("EFER= 0x%016llx (autoload)\n", 6239 vmx->msr_autoload.guest.val[efer_slot].value); 6240 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6241 pr_err("EFER= 0x%016llx (effective)\n", 6242 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6243 else 6244 pr_err("EFER= 0x%016llx (effective)\n", 6245 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6246 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6247 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6248 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6249 vmcs_read64(GUEST_IA32_DEBUGCTL), 6250 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6251 if (cpu_has_load_perf_global_ctrl() && 6252 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6253 pr_err("PerfGlobCtl = 0x%016llx\n", 6254 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6255 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6256 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6257 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6258 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6259 vmcs_read32(GUEST_ACTIVITY_STATE)); 6260 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6261 pr_err("InterruptStatus = %04x\n", 6262 vmcs_read16(GUEST_INTR_STATUS)); 6263 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6264 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6265 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6266 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6267 6268 pr_err("*** Host State ***\n"); 6269 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6270 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6271 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6272 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6273 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6274 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6275 vmcs_read16(HOST_TR_SELECTOR)); 6276 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6277 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6278 vmcs_readl(HOST_TR_BASE)); 6279 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6280 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6281 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6282 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6283 vmcs_readl(HOST_CR4)); 6284 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6285 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6286 vmcs_read32(HOST_IA32_SYSENTER_CS), 6287 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6288 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6289 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6290 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6291 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6292 if (cpu_has_load_perf_global_ctrl() && 6293 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6294 pr_err("PerfGlobCtl = 0x%016llx\n", 6295 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6296 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6297 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6298 6299 pr_err("*** Control State 
***\n"); 6300 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6301 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6302 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6303 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6304 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6305 vmcs_read32(EXCEPTION_BITMAP), 6306 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6307 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6308 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6309 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6310 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6311 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6312 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6313 vmcs_read32(VM_EXIT_INTR_INFO), 6314 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6315 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6316 pr_err(" reason=%08x qualification=%016lx\n", 6317 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6318 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6319 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6320 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6321 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6322 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6323 pr_err("TSC Multiplier = 0x%016llx\n", 6324 vmcs_read64(TSC_MULTIPLIER)); 6325 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6326 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6327 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6328 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6329 } 6330 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6331 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6332 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6333 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6334 } 6335 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6336 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6337 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6338 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6339 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6340 pr_err("PLE Gap=%08x Window=%08x\n", 6341 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6342 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6343 pr_err("Virtual processor ID = 0x%04x\n", 6344 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6345 if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { 6346 struct vmx_ve_information *ve_info = vmx->ve_info; 6347 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); 6348 6349 /* 6350 * If KVM is dumping the VMCS, then something has gone wrong 6351 * already. Derefencing an address from the VMCS, which could 6352 * very well be corrupted, is a terrible idea. The virtual 6353 * address is known so use it. 6354 */ 6355 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, 6356 ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); 6357 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", 6358 ve_info->exit_reason, ve_info->delivery, 6359 ve_info->exit_qualification, 6360 ve_info->guest_linear_address, 6361 ve_info->guest_physical_address, ve_info->eptp_index); 6362 } 6363 } 6364 6365 /* 6366 * The guest has exited. See if we can fix it or if we need userspace 6367 * assistance. 
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
        u32 vectoring_info = vmx->idt_vectoring_info;
        u16 exit_handler_index;

        /*
         * Flush the PML buffer of logged GPAs so that dirty_bitmap is kept up
         * to date.  A useful side effect is that kvm_vm_ioctl_get_dirty_log
         * only needs to kick vCPUs out of guest mode before querying
         * dirty_bitmap, as a vCPU that is back in root mode has necessarily
         * flushed its PML buffer already.  Note, PML is never enabled in
         * hardware while running L2.
         */
        if (enable_pml && !is_guest_mode(vcpu))
                vmx_flush_pml_buffer(vcpu);

        /*
         * KVM should never reach this point with a pending nested VM-Enter.
         * More specifically, short-circuiting VM-Entry to emulate L2 due to
         * invalid guest state should never happen as that means KVM knowingly
         * allowed a nested VM-Enter with an invalid vmcs12.  More below.
         */
        if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
                return -EIO;

        if (is_guest_mode(vcpu)) {
                /*
                 * PML is never enabled when running L2, bail immediately if a
                 * PML full exit occurs as something is horribly wrong.
                 */
                if (exit_reason.basic == EXIT_REASON_PML_FULL)
                        goto unexpected_vmexit;

                /*
                 * The host physical addresses of some pages of guest memory
                 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
                 * Page).  The CPU may write to these pages via their host
                 * physical address while L2 is running, bypassing any
                 * address-translation-based dirty tracking (e.g. EPT write
                 * protection).
                 *
                 * Mark them dirty on every exit from L2 to prevent them from
                 * getting out of sync with dirty tracking.
                 */
                nested_mark_vmcs12_pages_dirty(vcpu);

                /*
                 * Synthesize a triple fault if L2 state is invalid.  In normal
                 * operation, nested VM-Enter rejects any attempt to enter L2
                 * with invalid state.  However, those checks are skipped if
                 * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
                 * L2 state is invalid, it means either L1 modified SMRAM state
                 * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
                 * doing so is architecturally allowed in the RSM case, and is
                 * the least awful solution for the userspace case without
                 * risking false positives.
                 */
                if (vmx->vt.emulation_required) {
                        nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
                        return 1;
                }

                if (nested_vmx_reflect_vmexit(vcpu))
                        return 1;
        }

        /* If guest state is invalid, start emulating.  L2 is handled above.
*/ 6438 if (vmx->vt.emulation_required) 6439 return handle_invalid_guest_state(vcpu); 6440 6441 if (exit_reason.failed_vmentry) { 6442 dump_vmcs(vcpu); 6443 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6444 vcpu->run->fail_entry.hardware_entry_failure_reason 6445 = exit_reason.full; 6446 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6447 return 0; 6448 } 6449 6450 if (unlikely(vmx->fail)) { 6451 dump_vmcs(vcpu); 6452 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6453 vcpu->run->fail_entry.hardware_entry_failure_reason 6454 = vmcs_read32(VM_INSTRUCTION_ERROR); 6455 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6456 return 0; 6457 } 6458 6459 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6460 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && 6461 exit_reason.basic != EXIT_REASON_EPT_VIOLATION && 6462 exit_reason.basic != EXIT_REASON_PML_FULL && 6463 exit_reason.basic != EXIT_REASON_APIC_ACCESS && 6464 exit_reason.basic != EXIT_REASON_TASK_SWITCH && 6465 exit_reason.basic != EXIT_REASON_NOTIFY && 6466 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { 6467 kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); 6468 return 0; 6469 } 6470 6471 if (unlikely(!enable_vnmi && 6472 vmx->loaded_vmcs->soft_vnmi_blocked)) { 6473 if (!vmx_interrupt_blocked(vcpu)) { 6474 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6475 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && 6476 vcpu->arch.nmi_pending) { 6477 /* 6478 * This CPU don't support us in finding the end of an 6479 * NMI-blocked window if the guest runs with IRQs 6480 * disabled. So we pull the trigger after 1 s of 6481 * futile waiting, but inform the user about this. 6482 */ 6483 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6484 "state on VCPU %d after 1 s timeout\n", 6485 __func__, vcpu->vcpu_id); 6486 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6487 } 6488 } 6489 6490 if (exit_fastpath != EXIT_FASTPATH_NONE) 6491 return 1; 6492 6493 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6494 goto unexpected_vmexit; 6495 #ifdef CONFIG_MITIGATION_RETPOLINE 6496 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6497 return kvm_emulate_wrmsr(vcpu); 6498 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6499 return handle_preemption_timer(vcpu); 6500 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6501 return handle_interrupt_window(vcpu); 6502 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6503 return handle_external_interrupt(vcpu); 6504 else if (exit_reason.basic == EXIT_REASON_HLT) 6505 return kvm_emulate_halt(vcpu); 6506 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6507 return handle_ept_misconfig(vcpu); 6508 #endif 6509 6510 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6511 kvm_vmx_max_exit_handlers); 6512 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6513 goto unexpected_vmexit; 6514 6515 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6516 6517 unexpected_vmexit: 6518 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 6519 exit_reason.full); 6520 dump_vmcs(vcpu); 6521 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6522 vcpu->run->internal.suberror = 6523 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 6524 vcpu->run->internal.ndata = 2; 6525 vcpu->run->internal.data[0] = exit_reason.full; 6526 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 6527 return 0; 6528 } 6529 6530 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6531 { 6532 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6533 6534 /* 
6535 * Exit to user space when bus lock detected to inform that there is 6536 * a bus lock in guest. 6537 */ 6538 if (vmx_get_exit_reason(vcpu).bus_lock_detected) { 6539 if (ret > 0) 6540 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6541 6542 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6543 return 0; 6544 } 6545 return ret; 6546 } 6547 6548 /* 6549 * Software based L1D cache flush which is used when microcode providing 6550 * the cache control MSR is not loaded. 6551 * 6552 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6553 * flush it is required to read in 64 KiB because the replacement algorithm 6554 * is not exactly LRU. This could be sized at runtime via topology 6555 * information but as all relevant affected CPUs have 32KiB L1D cache size 6556 * there is no point in doing so. 6557 */ 6558 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6559 { 6560 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6561 6562 /* 6563 * This code is only executed when the flush mode is 'cond' or 6564 * 'always' 6565 */ 6566 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6567 bool flush_l1d; 6568 6569 /* 6570 * Clear the per-vcpu flush bit, it gets set again if the vCPU 6571 * is reloaded, i.e. if the vCPU is scheduled out or if KVM 6572 * exits to userspace, or if KVM reaches one of the unsafe 6573 * VMEXIT handlers, e.g. if KVM calls into the emulator. 6574 */ 6575 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6576 vcpu->arch.l1tf_flush_l1d = false; 6577 6578 /* 6579 * Clear the per-cpu flush bit, it gets set again from 6580 * the interrupt handlers. 6581 */ 6582 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6583 kvm_clear_cpu_l1tf_flush_l1d(); 6584 6585 if (!flush_l1d) 6586 return; 6587 } 6588 6589 vcpu->stat.l1d_flush++; 6590 6591 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6592 native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6593 return; 6594 } 6595 6596 asm volatile( 6597 /* First ensure the pages are in the TLB */ 6598 "xorl %%eax, %%eax\n" 6599 ".Lpopulate_tlb:\n\t" 6600 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6601 "addl $4096, %%eax\n\t" 6602 "cmpl %%eax, %[size]\n\t" 6603 "jne .Lpopulate_tlb\n\t" 6604 "xorl %%eax, %%eax\n\t" 6605 "cpuid\n\t" 6606 /* Now fill the cache */ 6607 "xorl %%eax, %%eax\n" 6608 ".Lfill_cache:\n" 6609 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6610 "addl $64, %%eax\n\t" 6611 "cmpl %%eax, %[size]\n\t" 6612 "jne .Lfill_cache\n\t" 6613 "lfence\n" 6614 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6615 [size] "r" (size) 6616 : "eax", "ebx", "ecx", "edx"); 6617 } 6618 6619 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6620 { 6621 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6622 int tpr_threshold; 6623 6624 if (is_guest_mode(vcpu) && 6625 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6626 return; 6627 6628 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6629 if (is_guest_mode(vcpu)) 6630 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6631 else 6632 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6633 } 6634 6635 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6636 { 6637 struct vcpu_vmx *vmx = to_vmx(vcpu); 6638 u32 sec_exec_control; 6639 6640 if (!lapic_in_kernel(vcpu)) 6641 return; 6642 6643 if (!flexpriority_enabled && 6644 !cpu_has_vmx_virtualize_x2apic_mode()) 6645 return; 6646 6647 /* Postpone execution until vmcs01 is the current VMCS. 
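         * The pending switch is recorded in
         * nested.change_vmcs01_virtual_apic_mode below so it can be applied
         * once vmcs01 is loaded again.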
*/ 6648 if (is_guest_mode(vcpu)) { 6649 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6650 return; 6651 } 6652 6653 sec_exec_control = secondary_exec_controls_get(vmx); 6654 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6655 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6656 6657 switch (kvm_get_apic_mode(vcpu)) { 6658 case LAPIC_MODE_INVALID: 6659 WARN_ONCE(true, "Invalid local APIC state"); 6660 break; 6661 case LAPIC_MODE_DISABLED: 6662 break; 6663 case LAPIC_MODE_XAPIC: 6664 if (flexpriority_enabled) { 6665 sec_exec_control |= 6666 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6667 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6668 6669 /* 6670 * Flush the TLB, reloading the APIC access page will 6671 * only do so if its physical address has changed, but 6672 * the guest may have inserted a non-APIC mapping into 6673 * the TLB while the APIC access page was disabled. 6674 */ 6675 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6676 } 6677 break; 6678 case LAPIC_MODE_X2APIC: 6679 if (cpu_has_vmx_virtualize_x2apic_mode()) 6680 sec_exec_control |= 6681 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6682 break; 6683 } 6684 secondary_exec_controls_set(vmx, sec_exec_control); 6685 6686 vmx_update_msr_bitmap_x2apic(vcpu); 6687 } 6688 6689 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6690 { 6691 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6692 struct kvm *kvm = vcpu->kvm; 6693 struct kvm_memslots *slots = kvm_memslots(kvm); 6694 struct kvm_memory_slot *slot; 6695 struct page *refcounted_page; 6696 unsigned long mmu_seq; 6697 kvm_pfn_t pfn; 6698 bool writable; 6699 6700 /* Defer reload until vmcs01 is the current VMCS. */ 6701 if (is_guest_mode(vcpu)) { 6702 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6703 return; 6704 } 6705 6706 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6707 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6708 return; 6709 6710 /* 6711 * Explicitly grab the memslot using KVM's internal slot ID to ensure 6712 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 6713 * be impossible for userspace to create a memslot for the APIC when 6714 * APICv is enabled, but paranoia won't hurt in this case. 6715 */ 6716 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6717 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6718 return; 6719 6720 /* 6721 * Ensure that the mmu_notifier sequence count is read before KVM 6722 * retrieves the pfn from the primary MMU. Note, the memslot is 6723 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6724 * in kvm_mmu_invalidate_end(). 6725 */ 6726 mmu_seq = kvm->mmu_invalidate_seq; 6727 smp_rmb(); 6728 6729 /* 6730 * No need to retry if the memslot does not exist or is invalid. KVM 6731 * controls the APIC-access page memslot, and only deletes the memslot 6732 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6733 */ 6734 pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); 6735 if (is_error_noslot_pfn(pfn)) 6736 return; 6737 6738 read_lock(&vcpu->kvm->mmu_lock); 6739 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) 6740 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6741 else 6742 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6743 6744 /* 6745 * Do not pin the APIC access page in memory so that it can be freely 6746 * migrated, the MMU notifier will call us again if it is migrated or 6747 * swapped out. 
KVM backs the memslot with anonymous memory, the pfn
         * should always point at a refcounted page (if the pfn is valid).
         */
        if (!WARN_ON_ONCE(!refcounted_page))
                kvm_release_page_clean(refcounted_page);

        /*
         * No need for a manual TLB flush at this point, KVM has already done a
         * flush if there were SPTEs pointing at the previous page.
         */
        read_unlock(&vcpu->kvm->mmu_lock);
}

void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
        u16 status;
        u8 old;

        /*
         * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
         * is only relevant if Virtual Interrupt Delivery is enabled in vmcs12,
         * and if VID is enabled then L2 EOIs affect L2's vAPIC, not L1's
         * vAPIC.  KVM must update vmcs01 on the next nested VM-Exit, otherwise
         * L1 will run with a stale SVI.
         */
        if (is_guest_mode(vcpu)) {
                /*
                 * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
                 * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
                 * Note, userspace can stuff state while L2 is active; assert
                 * that VID is disabled if and only if the vCPU is in KVM_RUN
                 * to avoid false positives if userspace is setting APIC state.
                 */
                WARN_ON_ONCE(vcpu->wants_to_run &&
                             nested_cpu_has_vid(get_vmcs12(vcpu)));
                to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
                return;
        }

        if (max_isr == -1)
                max_isr = 0;

        status = vmcs_read16(GUEST_INTR_STATUS);
        old = status >> 8;
        if (max_isr != old) {
                status &= 0xff;
                status |= max_isr << 8;
                vmcs_write16(GUEST_INTR_STATUS, status);
        }
}

static void vmx_set_rvi(int vector)
{
        u16 status;
        u8 old;

        if (vector == -1)
                vector = 0;

        status = vmcs_read16(GUEST_INTR_STATUS);
        old = (u8)status & 0xff;
        if ((u8)vector != old) {
                status &= ~0xff;
                status |= (u8)vector;
                vmcs_write16(GUEST_INTR_STATUS, status);
        }
}

int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
        struct vcpu_vt *vt = to_vt(vcpu);
        int max_irr;
        bool got_posted_interrupt;

        if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
                return -EIO;

        if (pi_test_on(&vt->pi_desc)) {
                pi_clear_on(&vt->pi_desc);
                /*
                 * IOMMU can write to PID.ON, so the barrier matters even on UP.
                 * But on x86 this is just a compiler barrier anyway.
                 */
                smp_mb__after_atomic();
                got_posted_interrupt =
                        kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
        } else {
                max_irr = kvm_lapic_find_highest_irr(vcpu);
                got_posted_interrupt = false;
        }

        /*
         * Newly recognized interrupts are injected via either virtual interrupt
         * delivery (RVI) or KVM_REQ_EVENT.  Virtual interrupt delivery is
         * disabled in two cases:
         *
         * 1) If L2 is running and the vCPU has a new pending interrupt.  If L1
         * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
         * VM-Exit to L1.  If L1 doesn't want to exit, the interrupt is injected
         * into L2, but KVM doesn't use virtual interrupt delivery to inject
         * interrupts into L2, and so KVM_REQ_EVENT is again needed.
         *
         * 2) If APICv is disabled for this vCPU, assigned devices may still
         * attempt to post interrupts.  The posted interrupt vector will cause
         * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6852 */ 6853 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6854 vmx_set_rvi(max_irr); 6855 else if (got_posted_interrupt) 6856 kvm_make_request(KVM_REQ_EVENT, vcpu); 6857 6858 return max_irr; 6859 } 6860 6861 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6862 { 6863 if (!kvm_vcpu_apicv_active(vcpu)) 6864 return; 6865 6866 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6867 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6868 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6869 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6870 } 6871 6872 void vmx_do_interrupt_irqoff(unsigned long entry); 6873 void vmx_do_nmi_irqoff(void); 6874 6875 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6876 { 6877 /* 6878 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6879 * MSR value is not clobbered by the host activity before the guest 6880 * has chance to consume it. 6881 * 6882 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM 6883 * interception may have been caused by L1 interception. Per the SDM, 6884 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. 6885 * 6886 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. 6887 * unlike CR2 and DR6, the value is not a payload that is attached to 6888 * the #NM exception. 6889 */ 6890 if (is_xfd_nm_fault(vcpu)) 6891 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6892 } 6893 6894 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 6895 { 6896 /* if exit due to PF check for async PF */ 6897 if (is_page_fault(intr_info)) 6898 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6899 /* if exit due to NM, handle before interrupts are enabled */ 6900 else if (is_nm_fault(intr_info)) 6901 handle_nm_fault_irqoff(vcpu); 6902 /* Handle machine checks before interrupts are enabled */ 6903 else if (is_machine_check(intr_info)) 6904 kvm_machine_check(); 6905 } 6906 6907 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 6908 u32 intr_info) 6909 { 6910 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6911 6912 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 6913 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6914 return; 6915 6916 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 6917 if (cpu_feature_enabled(X86_FEATURE_FRED)) 6918 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 6919 else 6920 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 6921 kvm_after_interrupt(vcpu); 6922 6923 vcpu->arch.at_instruction_boundary = true; 6924 } 6925 6926 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 6927 { 6928 if (to_vt(vcpu)->emulation_required) 6929 return; 6930 6931 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6932 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 6933 else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) 6934 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 6935 } 6936 6937 /* 6938 * The kvm parameter can be NULL (module initialization, or invocation before 6939 * VM creation). Be sure to check the kvm parameter before using it. 6940 */ 6941 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 6942 { 6943 switch (index) { 6944 case MSR_IA32_SMBASE: 6945 if (!IS_ENABLED(CONFIG_KVM_SMM)) 6946 return false; 6947 /* 6948 * We cannot do SMM unless we can run the guest in big 6949 * real mode. 
6950 */ 6951 return enable_unrestricted_guest || emulate_invalid_guest_state; 6952 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 6953 return nested; 6954 case MSR_AMD64_VIRT_SPEC_CTRL: 6955 case MSR_AMD64_TSC_RATIO: 6956 /* This is AMD only. */ 6957 return false; 6958 default: 6959 return true; 6960 } 6961 } 6962 6963 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 6964 { 6965 u32 exit_intr_info; 6966 bool unblock_nmi; 6967 u8 vector; 6968 bool idtv_info_valid; 6969 6970 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6971 6972 if (enable_vnmi) { 6973 if (vmx->loaded_vmcs->nmi_known_unmasked) 6974 return; 6975 6976 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 6977 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 6978 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 6979 /* 6980 * SDM 3: 27.7.1.2 (September 2008) 6981 * Re-set bit "block by NMI" before VM entry if vmexit caused by 6982 * a guest IRET fault. 6983 * SDM 3: 23.2.2 (September 2008) 6984 * Bit 12 is undefined in any of the following cases: 6985 * If the VM exit sets the valid bit in the IDT-vectoring 6986 * information field. 6987 * If the VM exit is due to a double fault. 6988 */ 6989 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 6990 vector != DF_VECTOR && !idtv_info_valid) 6991 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6992 GUEST_INTR_STATE_NMI); 6993 else 6994 vmx->loaded_vmcs->nmi_known_unmasked = 6995 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 6996 & GUEST_INTR_STATE_NMI); 6997 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 6998 vmx->loaded_vmcs->vnmi_blocked_time += 6999 ktime_to_ns(ktime_sub(ktime_get(), 7000 vmx->loaded_vmcs->entry_time)); 7001 } 7002 7003 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7004 u32 idt_vectoring_info, 7005 int instr_len_field, 7006 int error_code_field) 7007 { 7008 u8 vector; 7009 int type; 7010 bool idtv_info_valid; 7011 7012 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7013 7014 vcpu->arch.nmi_injected = false; 7015 kvm_clear_exception_queue(vcpu); 7016 kvm_clear_interrupt_queue(vcpu); 7017 7018 if (!idtv_info_valid) 7019 return; 7020 7021 kvm_make_request(KVM_REQ_EVENT, vcpu); 7022 7023 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7024 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7025 7026 switch (type) { 7027 case INTR_TYPE_NMI_INTR: 7028 vcpu->arch.nmi_injected = true; 7029 /* 7030 * SDM 3: 27.7.1.2 (September 2008) 7031 * Clear bit "block by NMI" before VM entry if a NMI 7032 * delivery faulted. 
7033 */ 7034 vmx_set_nmi_mask(vcpu, false); 7035 break; 7036 case INTR_TYPE_SOFT_EXCEPTION: 7037 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7038 fallthrough; 7039 case INTR_TYPE_HARD_EXCEPTION: { 7040 u32 error_code = 0; 7041 7042 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) 7043 error_code = vmcs_read32(error_code_field); 7044 7045 kvm_requeue_exception(vcpu, vector, 7046 idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, 7047 error_code); 7048 break; 7049 } 7050 case INTR_TYPE_SOFT_INTR: 7051 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7052 fallthrough; 7053 case INTR_TYPE_EXT_INTR: 7054 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7055 break; 7056 default: 7057 break; 7058 } 7059 } 7060 7061 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7062 { 7063 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7064 VM_EXIT_INSTRUCTION_LEN, 7065 IDT_VECTORING_ERROR_CODE); 7066 } 7067 7068 void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7069 { 7070 __vmx_complete_interrupts(vcpu, 7071 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7072 VM_ENTRY_INSTRUCTION_LEN, 7073 VM_ENTRY_EXCEPTION_ERROR_CODE); 7074 7075 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7076 } 7077 7078 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7079 { 7080 int i, nr_msrs; 7081 struct perf_guest_switch_msr *msrs; 7082 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7083 7084 pmu->host_cross_mapped_mask = 0; 7085 if (pmu->pebs_enable & pmu->global_ctrl) 7086 intel_pmu_cross_mapped_check(pmu); 7087 7088 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ 7089 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7090 if (!msrs) 7091 return; 7092 7093 for (i = 0; i < nr_msrs; i++) 7094 if (msrs[i].host == msrs[i].guest) 7095 clear_atomic_switch_msr(vmx, msrs[i].msr); 7096 else 7097 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7098 msrs[i].host, false); 7099 } 7100 7101 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7102 { 7103 struct vcpu_vmx *vmx = to_vmx(vcpu); 7104 u64 tscl; 7105 u32 delta_tsc; 7106 7107 if (force_immediate_exit) { 7108 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7109 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7110 } else if (vmx->hv_deadline_tsc != -1) { 7111 tscl = rdtsc(); 7112 if (vmx->hv_deadline_tsc > tscl) 7113 /* set_hv_timer ensures the delta fits in 32-bits */ 7114 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7115 cpu_preemption_timer_multi); 7116 else 7117 delta_tsc = 0; 7118 7119 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7120 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7121 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7122 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7123 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7124 } 7125 } 7126 7127 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7128 { 7129 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7130 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7131 vmcs_writel(HOST_RSP, host_rsp); 7132 } 7133 } 7134 7135 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7136 unsigned int flags) 7137 { 7138 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7139 7140 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7141 return; 7142 7143 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7144 vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); 7145 7146 /* 7147 * If the guest/host SPEC_CTRL values differ, restore 
the host value.
7148 *
7149 * For legacy IBRS, the IBRS bit always needs to be written after
7150 * transitioning from a less privileged predictor mode, regardless of
7151 * whether the guest/host values differ.
7152 */
7153 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
7154 vmx->spec_ctrl != hostval)
7155 native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
7156
7157 barrier_nospec();
7158 }
7159
7160 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
7161 bool force_immediate_exit)
7162 {
7163 /*
7164 * If L2 is active, some VMX preemption timer exits can still be handled
7165 * in the fastpath; all other exits must use the slow path.
7166 */
7167 if (is_guest_mode(vcpu) &&
7168 vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
7169 return EXIT_FASTPATH_NONE;
7170
7171 switch (vmx_get_exit_reason(vcpu).basic) {
7172 case EXIT_REASON_MSR_WRITE:
7173 return handle_fastpath_set_msr_irqoff(vcpu);
7174 case EXIT_REASON_PREEMPTION_TIMER:
7175 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
7176 case EXIT_REASON_HLT:
7177 return handle_fastpath_hlt(vcpu);
7178 default:
7179 return EXIT_FASTPATH_NONE;
7180 }
7181 }
7182
7183 noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
7184 {
7185 if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
7186 !is_nmi(vmx_get_intr_info(vcpu)))
7187 return;
7188
7189 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
7190 if (cpu_feature_enabled(X86_FEATURE_FRED))
7191 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
7192 else
7193 vmx_do_nmi_irqoff();
7194 kvm_after_interrupt(vcpu);
7195 }
7196
7197 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7198 unsigned int flags)
7199 {
7200 struct vcpu_vmx *vmx = to_vmx(vcpu);
7201
7202 guest_state_enter_irqoff();
7203
7204 /*
7205 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
7206 * mitigation for MDS is done late in VMentry and is still
7207 * executed in spite of L1D Flush. This is because an extra VERW
7208 * should not matter much after the big hammer L1D Flush.
7209 *
7210 * cpu_buf_vm_clear is used when the system is not vulnerable to MDS/TAA,
7211 * but is affected by MMIO Stale Data. In such cases the mitigation is only
7212 * needed against an MMIO-capable guest.
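 *
 * In short, in the code below the L1D flush (which already includes a
 * CPU buffer clear) takes precedence; the standalone buffer clear runs
 * only when the L1D flush is disabled, cpu_buf_vm_clear is enabled and
 * the VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO flag is set.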
7213 */ 7214 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7215 vmx_l1d_flush(vcpu); 7216 else if (static_branch_unlikely(&cpu_buf_vm_clear) && 7217 (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) 7218 x86_clear_cpu_buffers(); 7219 7220 vmx_disable_fb_clear(vmx); 7221 7222 if (vcpu->arch.cr2 != native_read_cr2()) 7223 native_write_cr2(vcpu->arch.cr2); 7224 7225 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7226 flags); 7227 7228 vcpu->arch.cr2 = native_read_cr2(); 7229 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7230 7231 vmx->idt_vectoring_info = 0; 7232 7233 vmx_enable_fb_clear(vmx); 7234 7235 if (unlikely(vmx->fail)) { 7236 vmx->vt.exit_reason.full = 0xdead; 7237 goto out; 7238 } 7239 7240 vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7241 if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) 7242 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7243 7244 vmx_handle_nmi(vcpu); 7245 7246 out: 7247 guest_state_exit_irqoff(); 7248 } 7249 7250 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 7251 { 7252 bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; 7253 struct vcpu_vmx *vmx = to_vmx(vcpu); 7254 unsigned long cr3, cr4; 7255 7256 /* Record the guest's net vcpu time for enforced NMI injections. */ 7257 if (unlikely(!enable_vnmi && 7258 vmx->loaded_vmcs->soft_vnmi_blocked)) 7259 vmx->loaded_vmcs->entry_time = ktime_get(); 7260 7261 /* 7262 * Don't enter VMX if guest state is invalid, let the exit handler 7263 * start emulation until we arrive back to a valid state. Synthesize a 7264 * consistency check VM-Exit due to invalid guest state and bail. 7265 */ 7266 if (unlikely(vmx->vt.emulation_required)) { 7267 vmx->fail = 0; 7268 7269 vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; 7270 vmx->vt.exit_reason.failed_vmentry = 1; 7271 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7272 vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; 7273 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7274 vmx->vt.exit_intr_info = 0; 7275 return EXIT_FASTPATH_NONE; 7276 } 7277 7278 trace_kvm_entry(vcpu, force_immediate_exit); 7279 7280 if (vmx->ple_window_dirty) { 7281 vmx->ple_window_dirty = false; 7282 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7283 } 7284 7285 /* 7286 * We did this in prepare_switch_to_guest, because it needs to 7287 * be within srcu_read_lock. 7288 */ 7289 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7290 7291 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7292 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7293 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7294 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7295 vcpu->arch.regs_dirty = 0; 7296 7297 if (run_flags & KVM_RUN_LOAD_GUEST_DR6) 7298 set_debugreg(vcpu->arch.dr6, 6); 7299 7300 if (run_flags & KVM_RUN_LOAD_DEBUGCTL) 7301 vmx_reload_guest_debugctl(vcpu); 7302 7303 /* 7304 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7305 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7306 * it switches back to the current->mm, which can occur in KVM context 7307 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7308 * toggles a static key while handling a VM-Exit. 
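 *
 * Comparing against the cached host_state.cr3/cr4 below avoids the
 * VMWRITEs in the common case where neither register has changed since
 * the last VM-Enter.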
7309 */
7310 cr3 = __get_current_cr3_fast();
7311 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7312 vmcs_writel(HOST_CR3, cr3);
7313 vmx->loaded_vmcs->host_state.cr3 = cr3;
7314 }
7315
7316 cr4 = cr4_read_shadow();
7317 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7318 vmcs_writel(HOST_CR4, cr4);
7319 vmx->loaded_vmcs->host_state.cr4 = cr4;
7320 }
7321
7322 /* When single-stepping over STI and MOV SS, we must clear the
7323 * corresponding interruptibility bits in the guest state. Otherwise
7324 * vmentry fails as it then expects bit 14 (BS) to be set in the pending
7325 * debug exceptions field, but that's not correct for the guest debugging
7326 * case. */
7327 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7328 vmx_set_interrupt_shadow(vcpu, 0);
7329
7330 kvm_load_guest_xsave_state(vcpu);
7331
7332 pt_guest_enter(vmx);
7333
7334 atomic_switch_perf_msrs(vmx);
7335 if (intel_pmu_lbr_is_enabled(vcpu))
7336 vmx_passthrough_lbr_msrs(vcpu);
7337
7338 if (enable_preemption_timer)
7339 vmx_update_hv_timer(vcpu, force_immediate_exit);
7340 else if (force_immediate_exit)
7341 smp_send_reschedule(vcpu->cpu);
7342
7343 kvm_wait_lapic_expire(vcpu);
7344
7345 /* The actual VMENTER/EXIT is in the .noinstr.text section. */
7346 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
7347
7348 /* All fields are clean at this point */
7349 if (kvm_is_using_evmcs()) {
7350 current_evmcs->hv_clean_fields |=
7351 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7352
7353 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7354 }
7355
7356 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7357 if (vcpu->arch.host_debugctl)
7358 update_debugctlmsr(vcpu->arch.host_debugctl);
7359
7360 #ifndef CONFIG_X86_64
7361 /*
7362 * The sysexit path does not restore ds/es, so we must set them to
7363 * a reasonable value ourselves.
7364 *
7365 * We can't defer this to vmx_prepare_switch_to_host() since that
7366 * function may be executed in interrupt context, which saves and
7367 * restores segments around it, nullifying its effect.
7368 */
7369 loadsegment(ds, __USER_DS);
7370 loadsegment(es, __USER_DS);
7371 #endif
7372
7373 pt_guest_exit(vmx);
7374
7375 kvm_load_host_xsave_state(vcpu);
7376
7377 if (is_guest_mode(vcpu)) {
7378 /*
7379 * Track VMLAUNCH/VMRESUME that have made it past guest state
7380 * checking.
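 * A failed VM-Enter is not counted, and nested_run_pending is cleared
 * below in either case.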
7381 */ 7382 if (vmx->nested.nested_run_pending && 7383 !vmx_get_exit_reason(vcpu).failed_vmentry) 7384 ++vcpu->stat.nested_run; 7385 7386 vmx->nested.nested_run_pending = 0; 7387 } 7388 7389 if (unlikely(vmx->fail)) 7390 return EXIT_FASTPATH_NONE; 7391 7392 if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7393 kvm_machine_check(); 7394 7395 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7396 7397 if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) 7398 return EXIT_FASTPATH_NONE; 7399 7400 vmx->loaded_vmcs->launched = 1; 7401 7402 vmx_recover_nmi_blocking(vmx); 7403 vmx_complete_interrupts(vmx); 7404 7405 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); 7406 } 7407 7408 void vmx_vcpu_free(struct kvm_vcpu *vcpu) 7409 { 7410 struct vcpu_vmx *vmx = to_vmx(vcpu); 7411 7412 if (enable_pml) 7413 vmx_destroy_pml_buffer(vmx); 7414 free_vpid(vmx->vpid); 7415 nested_vmx_free_vcpu(vcpu); 7416 free_loaded_vmcs(vmx->loaded_vmcs); 7417 free_page((unsigned long)vmx->ve_info); 7418 } 7419 7420 int vmx_vcpu_create(struct kvm_vcpu *vcpu) 7421 { 7422 struct vmx_uret_msr *tsx_ctrl; 7423 struct vcpu_vmx *vmx; 7424 int i, err; 7425 7426 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 7427 vmx = to_vmx(vcpu); 7428 7429 INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); 7430 7431 err = -ENOMEM; 7432 7433 vmx->vpid = allocate_vpid(); 7434 7435 /* 7436 * If PML is turned on, failure on enabling PML just results in failure 7437 * of creating the vcpu, therefore we can simplify PML logic (by 7438 * avoiding dealing with cases, such as enabling PML partially on vcpus 7439 * for the guest), etc. 7440 */ 7441 if (enable_pml) { 7442 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7443 if (!vmx->pml_pg) 7444 goto free_vpid; 7445 } 7446 7447 for (i = 0; i < kvm_nr_uret_msrs; ++i) 7448 vmx->guest_uret_msrs[i].mask = -1ull; 7449 if (boot_cpu_has(X86_FEATURE_RTM)) { 7450 /* 7451 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 7452 * Keep the host value unchanged to avoid changing CPUID bits 7453 * under the host kernel's feet. 7454 */ 7455 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7456 if (tsx_ctrl) 7457 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 7458 } 7459 7460 err = alloc_loaded_vmcs(&vmx->vmcs01); 7461 if (err < 0) 7462 goto free_pml; 7463 7464 /* 7465 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a 7466 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the 7467 * feature only for vmcs01, KVM currently isn't equipped to realize any 7468 * performance benefits from enabling it for vmcs02. 7469 */ 7470 if (kvm_is_using_evmcs() && 7471 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7472 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7473 7474 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7475 } 7476 7477 vmx->loaded_vmcs = &vmx->vmcs01; 7478 7479 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7480 err = kvm_alloc_apic_access_page(vcpu->kvm); 7481 if (err) 7482 goto free_vmcs; 7483 } 7484 7485 if (enable_ept && !enable_unrestricted_guest) { 7486 err = init_rmode_identity_map(vcpu->kvm); 7487 if (err) 7488 goto free_vmcs; 7489 } 7490 7491 err = -ENOMEM; 7492 if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { 7493 struct page *page; 7494 7495 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); 7496 7497 /* ve_info must be page aligned. 
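 * The area is handed to hardware by physical address via the VMCS, hence
 * the dedicated zeroed page below rather than a plain kmalloc().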
*/ 7498 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7499 if (!page) 7500 goto free_vmcs; 7501 7502 vmx->ve_info = page_to_virt(page); 7503 } 7504 7505 if (vmx_can_use_ipiv(vcpu)) 7506 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7507 __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); 7508 7509 return 0; 7510 7511 free_vmcs: 7512 free_loaded_vmcs(vmx->loaded_vmcs); 7513 free_pml: 7514 vmx_destroy_pml_buffer(vmx); 7515 free_vpid: 7516 free_vpid(vmx->vpid); 7517 return err; 7518 } 7519 7520 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7521 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7522 7523 int vmx_vm_init(struct kvm *kvm) 7524 { 7525 if (!ple_gap) 7526 kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); 7527 7528 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7529 switch (l1tf_mitigation) { 7530 case L1TF_MITIGATION_OFF: 7531 case L1TF_MITIGATION_FLUSH_NOWARN: 7532 /* 'I explicitly don't care' is set */ 7533 break; 7534 case L1TF_MITIGATION_AUTO: 7535 case L1TF_MITIGATION_FLUSH: 7536 case L1TF_MITIGATION_FLUSH_NOSMT: 7537 case L1TF_MITIGATION_FULL: 7538 /* 7539 * Warn upon starting the first VM in a potentially 7540 * insecure environment. 7541 */ 7542 if (sched_smt_active()) 7543 pr_warn_once(L1TF_MSG_SMT); 7544 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7545 pr_warn_once(L1TF_MSG_L1D); 7546 break; 7547 case L1TF_MITIGATION_FULL_FORCE: 7548 /* Flush is enforced */ 7549 break; 7550 } 7551 } 7552 7553 if (enable_pml) 7554 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; 7555 return 0; 7556 } 7557 7558 static inline bool vmx_ignore_guest_pat(struct kvm *kvm) 7559 { 7560 /* 7561 * Non-coherent DMA devices need the guest to flush CPU properly. 7562 * In that case it is not possible to map all guest RAM as WB, so 7563 * always trust guest PAT. 7564 */ 7565 return !kvm_arch_has_noncoherent_dma(kvm) && 7566 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); 7567 } 7568 7569 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7570 { 7571 /* 7572 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7573 * with cacheable accesses will result in Machine Checks. 7574 */ 7575 if (is_mmio) 7576 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7577 7578 /* Force WB if ignoring guest PAT */ 7579 if (vmx_ignore_guest_pat(vcpu->kvm)) 7580 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7581 7582 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7583 } 7584 7585 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7586 { 7587 /* 7588 * These bits in the secondary execution controls field 7589 * are dynamic, the others are mostly based on the hypervisor 7590 * architecture and the guest's CPUID. Do not touch the 7591 * dynamic bits. 7592 */ 7593 u32 mask = 7594 SECONDARY_EXEC_SHADOW_VMCS | 7595 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7596 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7597 SECONDARY_EXEC_DESC; 7598 7599 u32 cur_ctl = secondary_exec_controls_get(vmx); 7600 7601 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7602 } 7603 7604 /* 7605 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. 
Only set bits 7606 * (indicating "allowed-1") if they are supported in the guest's CPUID. 7607 */ 7608 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7609 { 7610 struct vcpu_vmx *vmx = to_vmx(vcpu); 7611 struct kvm_cpuid_entry2 *entry; 7612 7613 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7614 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7615 7616 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7617 if (entry && (entry->_reg & (_cpuid_mask))) \ 7618 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7619 } while (0) 7620 7621 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7622 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7623 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7624 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7625 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7626 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7627 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7628 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7629 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7630 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7631 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7632 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7633 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7634 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7635 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7636 7637 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7638 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7639 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7640 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7641 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7642 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7643 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7644 7645 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7646 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); 7647 7648 #undef cr4_fixed1_update 7649 } 7650 7651 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7652 { 7653 struct vcpu_vmx *vmx = to_vmx(vcpu); 7654 struct kvm_cpuid_entry2 *best = NULL; 7655 int i; 7656 7657 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7658 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7659 if (!best) 7660 return; 7661 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7662 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7663 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7664 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7665 } 7666 7667 /* Get the number of configurable Address Ranges for filtering */ 7668 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7669 PT_CAP_num_address_ranges); 7670 7671 /* Initialize and clear the no dependency bits */ 7672 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7673 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7674 RTIT_CTL_BRANCH_EN); 7675 7676 /* 7677 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7678 * will inject an #GP 7679 */ 7680 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7681 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7682 7683 /* 7684 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7685 * PSBFreq can be set 7686 */ 7687 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7688 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7689 RTIT_CTL_CYC_THRESH | 
RTIT_CTL_PSB_FREQ); 7690 7691 /* 7692 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7693 */ 7694 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7695 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 7696 RTIT_CTL_MTC_RANGE); 7697 7698 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7699 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7700 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7701 RTIT_CTL_PTW_EN); 7702 7703 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7704 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7705 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7706 7707 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7708 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7709 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7710 7711 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7712 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7713 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7714 7715 /* unmask address range configure area */ 7716 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7717 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7718 } 7719 7720 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7721 { 7722 struct vcpu_vmx *vmx = to_vmx(vcpu); 7723 7724 /* 7725 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7726 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7727 * set if and only if XSAVE is supported. 7728 */ 7729 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) 7730 guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); 7731 7732 vmx_setup_uret_msrs(vmx); 7733 7734 if (cpu_has_secondary_exec_ctrls()) 7735 vmcs_set_secondary_exec_control(vmx, 7736 vmx_secondary_exec_control(vmx)); 7737 7738 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7739 vmx->msr_ia32_feature_control_valid_bits |= 7740 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7741 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7742 else 7743 vmx->msr_ia32_feature_control_valid_bits &= 7744 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7745 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7746 7747 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7748 nested_vmx_cr_fixed1_bits_update(vcpu); 7749 7750 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7751 guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) 7752 update_intel_pt_cfg(vcpu); 7753 7754 if (boot_cpu_has(X86_FEATURE_RTM)) { 7755 struct vmx_uret_msr *msr; 7756 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7757 if (msr) { 7758 bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); 7759 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); 7760 } 7761 } 7762 7763 set_cr4_guest_host_mask(vmx); 7764 7765 vmx_write_encls_bitmap(vcpu, NULL); 7766 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) 7767 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7768 else 7769 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7770 7771 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 7772 vmx->msr_ia32_feature_control_valid_bits |= 7773 FEAT_CTL_SGX_LC_ENABLED; 7774 else 7775 vmx->msr_ia32_feature_control_valid_bits &= 7776 ~FEAT_CTL_SGX_LC_ENABLED; 7777 7778 /* Recalc MSR interception to account for feature changes. */ 7779 vmx_recalc_msr_intercepts(vcpu); 7780 7781 /* Refresh #PF interception to account for MAXPHYADDR changes. 
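 * E.g. with allow_smaller_maxphyaddr, when the guest's MAXPHYADDR is
 * smaller than the host's, #PF must be intercepted so KVM can emulate
 * faults on guest-reserved physical address bits.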
*/ 7782 vmx_update_exception_bitmap(vcpu); 7783 } 7784 7785 static __init u64 vmx_get_perf_capabilities(void) 7786 { 7787 u64 perf_cap = PMU_CAP_FW_WRITES; 7788 u64 host_perf_cap = 0; 7789 7790 if (!enable_pmu) 7791 return 0; 7792 7793 if (boot_cpu_has(X86_FEATURE_PDCM)) 7794 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7795 7796 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7797 x86_perf_get_lbr(&vmx_lbr_caps); 7798 7799 /* 7800 * KVM requires LBR callstack support, as the overhead due to 7801 * context switching LBRs without said support is too high. 7802 * See intel_pmu_create_guest_lbr_event() for more info. 7803 */ 7804 if (!vmx_lbr_caps.has_callstack) 7805 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 7806 else if (vmx_lbr_caps.nr) 7807 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7808 } 7809 7810 if (vmx_pebs_supported()) { 7811 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7812 7813 /* 7814 * Disallow adaptive PEBS as it is functionally broken, can be 7815 * used by the guest to read *host* LBRs, and can be used to 7816 * bypass userspace event filters. To correctly and safely 7817 * support adaptive PEBS, KVM needs to: 7818 * 7819 * 1. Account for the ADAPTIVE flag when (re)programming fixed 7820 * counters. 7821 * 7822 * 2. Gain support from perf (or take direct control of counter 7823 * programming) to support events without adaptive PEBS 7824 * enabled for the hardware counter. 7825 * 7826 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 7827 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 7828 * 7829 * 4. Document which PMU events are effectively exposed to the 7830 * guest via adaptive PEBS, and make adaptive PEBS mutually 7831 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 7832 */ 7833 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 7834 } 7835 7836 return perf_cap; 7837 } 7838 7839 static __init void vmx_set_cpu_caps(void) 7840 { 7841 kvm_set_cpu_caps(); 7842 7843 /* CPUID 0x1 */ 7844 if (nested) 7845 kvm_cpu_cap_set(X86_FEATURE_VMX); 7846 7847 /* CPUID 0x7 */ 7848 if (kvm_mpx_supported()) 7849 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7850 if (!cpu_has_vmx_invpcid()) 7851 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7852 if (vmx_pt_mode_is_host_guest()) 7853 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7854 if (vmx_pebs_supported()) { 7855 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7856 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7857 } 7858 7859 if (!enable_pmu) 7860 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7861 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 7862 7863 if (!enable_sgx) { 7864 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7865 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7866 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7867 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7868 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 7869 } 7870 7871 if (vmx_umip_emulated()) 7872 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7873 7874 /* CPUID 0xD.1 */ 7875 kvm_caps.supported_xss = 0; 7876 if (!cpu_has_vmx_xsaves()) 7877 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7878 7879 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7880 if (!cpu_has_vmx_rdtscp()) { 7881 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7882 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7883 } 7884 7885 if (cpu_has_vmx_waitpkg()) 7886 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7887 } 7888 7889 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, 7890 struct x86_instruction_info *info, 7891 unsigned long *exit_qualification) 7892 { 7893 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7894 unsigned short port; 7895 int size; 7896 bool 
imm; 7897 7898 /* 7899 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 7900 * VM-exits depend on the 'unconditional IO exiting' VM-execution 7901 * control. 7902 * 7903 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 7904 */ 7905 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7906 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 7907 7908 if (info->intercept == x86_intercept_in || 7909 info->intercept == x86_intercept_ins) { 7910 port = info->src_val; 7911 size = info->dst_bytes; 7912 imm = info->src_type == OP_IMM; 7913 } else { 7914 port = info->dst_val; 7915 size = info->src_bytes; 7916 imm = info->dst_type == OP_IMM; 7917 } 7918 7919 7920 *exit_qualification = ((unsigned long)port << 16) | (size - 1); 7921 7922 if (info->intercept == x86_intercept_ins || 7923 info->intercept == x86_intercept_outs) 7924 *exit_qualification |= BIT(4); 7925 7926 if (info->rep_prefix) 7927 *exit_qualification |= BIT(5); 7928 7929 if (imm) 7930 *exit_qualification |= BIT(6); 7931 7932 return nested_vmx_check_io_bitmaps(vcpu, port, size); 7933 } 7934 7935 int vmx_check_intercept(struct kvm_vcpu *vcpu, 7936 struct x86_instruction_info *info, 7937 enum x86_intercept_stage stage, 7938 struct x86_exception *exception) 7939 { 7940 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7941 unsigned long exit_qualification = 0; 7942 u32 vm_exit_reason; 7943 u64 exit_insn_len; 7944 7945 switch (info->intercept) { 7946 case x86_intercept_rdpid: 7947 /* 7948 * RDPID causes #UD if not enabled through secondary execution 7949 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to 7950 * TSC_AUX is NOT subject to interception, i.e. checking only 7951 * the dedicated execution control is architecturally correct. 7952 */ 7953 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 7954 exception->vector = UD_VECTOR; 7955 exception->error_code_valid = false; 7956 return X86EMUL_PROPAGATE_FAULT; 7957 } 7958 return X86EMUL_CONTINUE; 7959 7960 case x86_intercept_in: 7961 case x86_intercept_ins: 7962 case x86_intercept_out: 7963 case x86_intercept_outs: 7964 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 7965 return X86EMUL_CONTINUE; 7966 7967 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 7968 break; 7969 7970 case x86_intercept_lgdt: 7971 case x86_intercept_lidt: 7972 case x86_intercept_lldt: 7973 case x86_intercept_ltr: 7974 case x86_intercept_sgdt: 7975 case x86_intercept_sidt: 7976 case x86_intercept_sldt: 7977 case x86_intercept_str: 7978 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 7979 return X86EMUL_CONTINUE; 7980 7981 if (info->intercept == x86_intercept_lldt || 7982 info->intercept == x86_intercept_ltr || 7983 info->intercept == x86_intercept_sldt || 7984 info->intercept == x86_intercept_str) 7985 vm_exit_reason = EXIT_REASON_LDTR_TR; 7986 else 7987 vm_exit_reason = EXIT_REASON_GDTR_IDTR; 7988 /* 7989 * FIXME: Decode the ModR/M to generate the correct exit 7990 * qualification for memory operands. 7991 */ 7992 break; 7993 7994 case x86_intercept_hlt: 7995 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 7996 return X86EMUL_CONTINUE; 7997 7998 vm_exit_reason = EXIT_REASON_HLT; 7999 break; 8000 8001 case x86_intercept_pause: 8002 /* 8003 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8004 * with vanilla NOPs in the emulator. Apply the interception 8005 * check only to actual PAUSE instructions. Don't check 8006 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8007 * exit, i.e. 
KVM is within its rights to allow L2 to execute 8008 * the PAUSE. 8009 */ 8010 if ((info->rep_prefix != REPE_PREFIX) || 8011 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) 8012 return X86EMUL_CONTINUE; 8013 8014 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; 8015 break; 8016 8017 /* TODO: check more intercepts... */ 8018 default: 8019 return X86EMUL_UNHANDLEABLE; 8020 } 8021 8022 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); 8023 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) 8024 return X86EMUL_UNHANDLEABLE; 8025 8026 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, 8027 exit_insn_len); 8028 return X86EMUL_INTERCEPTED; 8029 } 8030 8031 #ifdef CONFIG_X86_64 8032 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8033 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8034 u64 divisor, u64 *result) 8035 { 8036 u64 low = a << shift, high = a >> (64 - shift); 8037 8038 /* To avoid the overflow on divq */ 8039 if (high >= divisor) 8040 return 1; 8041 8042 /* Low hold the result, high hold rem which is discarded */ 8043 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8044 "rm" (divisor), "0" (low), "1" (high)); 8045 *result = low; 8046 8047 return 0; 8048 } 8049 8050 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8051 bool *expired) 8052 { 8053 struct vcpu_vmx *vmx; 8054 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8055 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8056 8057 vmx = to_vmx(vcpu); 8058 tscl = rdtsc(); 8059 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8060 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8061 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8062 ktimer->timer_advance_ns); 8063 8064 if (delta_tsc > lapic_timer_advance_cycles) 8065 delta_tsc -= lapic_timer_advance_cycles; 8066 else 8067 delta_tsc = 0; 8068 8069 /* Convert to host delta tsc if tsc scaling is enabled */ 8070 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8071 delta_tsc && u64_shl_div_u64(delta_tsc, 8072 kvm_caps.tsc_scaling_ratio_frac_bits, 8073 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8074 return -ERANGE; 8075 8076 /* 8077 * If the delta tsc can't fit in the 32 bit after the multi shift, 8078 * we can't use the preemption timer. 8079 * It's possible that it fits on later vmentries, but checking 8080 * on every vmentry is costly so we just use an hrtimer. 8081 */ 8082 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8083 return -ERANGE; 8084 8085 vmx->hv_deadline_tsc = tscl + delta_tsc; 8086 *expired = !delta_tsc; 8087 return 0; 8088 } 8089 8090 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8091 { 8092 to_vmx(vcpu)->hv_deadline_tsc = -1; 8093 } 8094 #endif 8095 8096 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8097 { 8098 struct vcpu_vmx *vmx = to_vmx(vcpu); 8099 8100 if (WARN_ON_ONCE(!enable_pml)) 8101 return; 8102 8103 if (is_guest_mode(vcpu)) { 8104 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8105 return; 8106 } 8107 8108 /* 8109 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8110 * code, but in that case another update request will be made and so 8111 * the guest will never run with a stale PML value. 
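 * The SECONDARY_EXEC_ENABLE_PML control is simply toggled below based on
 * whether any memslot currently has dirty logging enabled.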
8112 */ 8113 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8114 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8115 else 8116 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8117 } 8118 8119 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8120 { 8121 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8122 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8123 FEAT_CTL_LMCE_ENABLED; 8124 else 8125 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8126 ~FEAT_CTL_LMCE_ENABLED; 8127 } 8128 8129 #ifdef CONFIG_KVM_SMM 8130 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8131 { 8132 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8133 if (to_vmx(vcpu)->nested.nested_run_pending) 8134 return -EBUSY; 8135 return !is_smm(vcpu); 8136 } 8137 8138 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8139 { 8140 struct vcpu_vmx *vmx = to_vmx(vcpu); 8141 8142 /* 8143 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8144 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8145 * SMI and RSM only modify state that is saved and restored via SMRAM. 8146 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8147 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 8148 */ 8149 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8150 if (vmx->nested.smm.guest_mode) 8151 nested_vmx_vmexit(vcpu, -1, 0, 0); 8152 8153 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8154 vmx->nested.vmxon = false; 8155 vmx_clear_hlt(vcpu); 8156 return 0; 8157 } 8158 8159 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8160 { 8161 struct vcpu_vmx *vmx = to_vmx(vcpu); 8162 int ret; 8163 8164 if (vmx->nested.smm.vmxon) { 8165 vmx->nested.vmxon = true; 8166 vmx->nested.smm.vmxon = false; 8167 } 8168 8169 if (vmx->nested.smm.guest_mode) { 8170 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8171 if (ret) 8172 return ret; 8173 8174 vmx->nested.nested_run_pending = 1; 8175 vmx->nested.smm.guest_mode = false; 8176 } 8177 return 0; 8178 } 8179 8180 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8181 { 8182 /* RSM will cause a vmexit anyway. */ 8183 } 8184 #endif 8185 8186 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8187 { 8188 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8189 } 8190 8191 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8192 { 8193 if (is_guest_mode(vcpu)) { 8194 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8195 8196 if (hrtimer_try_to_cancel(timer) == 1) 8197 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8198 } 8199 } 8200 8201 void vmx_hardware_unsetup(void) 8202 { 8203 kvm_set_posted_intr_wakeup_handler(NULL); 8204 8205 if (nested) 8206 nested_vmx_hardware_unsetup(); 8207 8208 free_kvm_area(); 8209 } 8210 8211 void vmx_vm_destroy(struct kvm *kvm) 8212 { 8213 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8214 8215 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8216 } 8217 8218 /* 8219 * Note, the SDM states that the linear address is masked *after* the modified 8220 * canonicality check, whereas KVM masks (untags) the address and then performs 8221 * a "normal" canonicality check. Functionally, the two methods are identical, 8222 * and when the masking occurs relative to the canonicality check isn't visible 8223 * to software, i.e. KVM's behavior doesn't violate the SDM. 
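 *
 * Illustrative example (assumed values, not from the SDM): with LAM_U57
 * active, bit 56 is effectively sign-extended into bits 62:57 while bit 63
 * is preserved, so the tagged user pointer 0x7e00000012345678 untags to
 * 0x0000000012345678, whereas a kernel pointer keeps bit 63 set.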
8224 */ 8225 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8226 { 8227 int lam_bit; 8228 unsigned long cr3_bits; 8229 8230 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8231 return gva; 8232 8233 if (!is_64_bit_mode(vcpu)) 8234 return gva; 8235 8236 /* 8237 * Bit 63 determines if the address should be treated as user address 8238 * or a supervisor address. 8239 */ 8240 if (!(gva & BIT_ULL(63))) { 8241 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8242 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8243 return gva; 8244 8245 /* LAM_U48 is ignored if LAM_U57 is set. */ 8246 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8247 } else { 8248 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8249 return gva; 8250 8251 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8252 } 8253 8254 /* 8255 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8256 * Bit 63 is retained from the raw virtual address so that untagging 8257 * doesn't change a user access to a supervisor access, and vice versa. 8258 */ 8259 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8260 } 8261 8262 static unsigned int vmx_handle_intel_pt_intr(void) 8263 { 8264 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8265 8266 /* '0' on failure so that the !PT case can use a RET0 static call. */ 8267 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8268 return 0; 8269 8270 kvm_make_request(KVM_REQ_PMI, vcpu); 8271 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8272 (unsigned long *)&vcpu->arch.pmu.global_status); 8273 return 1; 8274 } 8275 8276 static __init void vmx_setup_user_return_msrs(void) 8277 { 8278 8279 /* 8280 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8281 * will emulate SYSCALL in legacy mode if the vendor string in guest 8282 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8283 * support this emulation, MSR_STAR is included in the list for i386, 8284 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8285 * into hardware and is here purely for emulation purposes. 8286 */ 8287 const u32 vmx_uret_msrs_list[] = { 8288 #ifdef CONFIG_X86_64 8289 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8290 #endif 8291 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8292 MSR_IA32_TSX_CTRL, 8293 }; 8294 int i; 8295 8296 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8297 8298 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8299 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8300 } 8301 8302 static void __init vmx_setup_me_spte_mask(void) 8303 { 8304 u64 me_mask = 0; 8305 8306 /* 8307 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8308 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8309 * boot_cpu_data.x86_phys_bits holds the actual physical address 8310 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8311 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8312 */ 8313 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8314 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8315 kvm_host.maxphyaddr - 1); 8316 8317 /* 8318 * Unlike SME, host kernel doesn't support setting up any 8319 * MKTME KeyID on Intel platforms. No memory encryption 8320 * bits should be included into the SPTE. 
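 *
 * For example (hypothetical widths): with boot_cpu_data.x86_phys_bits == 46
 * and kvm_host.maxphyaddr == 52, me_mask covers bits 51:46, i.e. the KeyID
 * bits are treated as reserved by the MMU rather than as an encryption mask.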
8321 */ 8322 kvm_mmu_set_me_spte_mask(0, me_mask); 8323 } 8324 8325 __init int vmx_hardware_setup(void) 8326 { 8327 unsigned long host_bndcfgs; 8328 struct desc_ptr dt; 8329 int r; 8330 8331 store_idt(&dt); 8332 host_idt_base = dt.address; 8333 8334 vmx_setup_user_return_msrs(); 8335 8336 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8337 return -EIO; 8338 8339 if (boot_cpu_has(X86_FEATURE_NX)) 8340 kvm_enable_efer_bits(EFER_NX); 8341 8342 if (boot_cpu_has(X86_FEATURE_MPX)) { 8343 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); 8344 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8345 } 8346 8347 if (!cpu_has_vmx_mpx()) 8348 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8349 XFEATURE_MASK_BNDCSR); 8350 8351 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8352 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8353 enable_vpid = 0; 8354 8355 if (!cpu_has_vmx_ept() || 8356 !cpu_has_vmx_ept_4levels() || 8357 !cpu_has_vmx_ept_mt_wb() || 8358 !cpu_has_vmx_invept_global()) 8359 enable_ept = 0; 8360 8361 /* NX support is required for shadow paging. */ 8362 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8363 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8364 return -EOPNOTSUPP; 8365 } 8366 8367 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8368 enable_ept_ad_bits = 0; 8369 8370 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8371 enable_unrestricted_guest = 0; 8372 8373 if (!cpu_has_vmx_flexpriority()) 8374 flexpriority_enabled = 0; 8375 8376 if (!cpu_has_virtual_nmis()) 8377 enable_vnmi = 0; 8378 8379 #ifdef CONFIG_X86_SGX_KVM 8380 if (!cpu_has_vmx_encls_vmexit()) 8381 enable_sgx = false; 8382 #endif 8383 8384 /* 8385 * set_apic_access_page_addr() is used to reload apic access 8386 * page upon invalidation. No need to do anything if not 8387 * using the APIC_ACCESS_ADDR VMCS field. 8388 */ 8389 if (!flexpriority_enabled) 8390 vt_x86_ops.set_apic_access_page_addr = NULL; 8391 8392 if (!cpu_has_vmx_tpr_shadow()) 8393 vt_x86_ops.update_cr8_intercept = NULL; 8394 8395 #if IS_ENABLED(CONFIG_HYPERV) 8396 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8397 && enable_ept) { 8398 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8399 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8400 } 8401 #endif 8402 8403 if (!cpu_has_vmx_ple()) { 8404 ple_gap = 0; 8405 ple_window = 0; 8406 ple_window_grow = 0; 8407 ple_window_max = 0; 8408 ple_window_shrink = 0; 8409 } 8410 8411 if (!cpu_has_vmx_apicv()) 8412 enable_apicv = 0; 8413 if (!enable_apicv) 8414 vt_x86_ops.sync_pir_to_irr = NULL; 8415 8416 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8417 enable_ipiv = false; 8418 8419 if (cpu_has_vmx_tsc_scaling()) 8420 kvm_caps.has_tsc_control = true; 8421 8422 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8423 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8424 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8425 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8426 8427 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8428 8429 if (enable_ept) 8430 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8431 cpu_has_vmx_ept_execute_only()); 8432 else 8433 vt_x86_ops.get_mt_mask = NULL; 8434 8435 /* 8436 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8437 * bits to shadow_zero_check. 
8438 */ 8439 vmx_setup_me_spte_mask(); 8440 8441 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8442 ept_caps_to_lpage_level(vmx_capability.ept)); 8443 8444 /* 8445 * Only enable PML when hardware supports PML feature, and both EPT 8446 * and EPT A/D bit features are enabled -- PML depends on them to work. 8447 */ 8448 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8449 enable_pml = 0; 8450 8451 if (!cpu_has_vmx_preemption_timer()) 8452 enable_preemption_timer = false; 8453 8454 if (enable_preemption_timer) { 8455 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8456 8457 cpu_preemption_timer_multi = 8458 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8459 8460 if (tsc_khz) 8461 use_timer_freq = (u64)tsc_khz * 1000; 8462 use_timer_freq >>= cpu_preemption_timer_multi; 8463 8464 /* 8465 * KVM "disables" the preemption timer by setting it to its max 8466 * value. Don't use the timer if it might cause spurious exits 8467 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8468 */ 8469 if (use_timer_freq > 0xffffffffu / 10) 8470 enable_preemption_timer = false; 8471 } 8472 8473 if (!enable_preemption_timer) { 8474 vt_x86_ops.set_hv_timer = NULL; 8475 vt_x86_ops.cancel_hv_timer = NULL; 8476 } 8477 8478 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8479 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8480 8481 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8482 return -EINVAL; 8483 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8484 pt_mode = PT_MODE_SYSTEM; 8485 if (pt_mode == PT_MODE_HOST_GUEST) 8486 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8487 else 8488 vt_init_ops.handle_intel_pt_intr = NULL; 8489 8490 setup_default_sgx_lepubkeyhash(); 8491 8492 if (nested) { 8493 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8494 8495 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8496 if (r) 8497 return r; 8498 } 8499 8500 vmx_set_cpu_caps(); 8501 8502 r = alloc_kvm_area(); 8503 if (r && nested) 8504 nested_vmx_hardware_unsetup(); 8505 8506 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8507 8508 /* 8509 * On Intel CPUs that lack self-snoop feature, letting the guest control 8510 * memory types may result in unexpected behavior. So always ignore guest 8511 * PAT on those CPUs and map VM as writeback, not allowing userspace to 8512 * disable the quirk. 8513 * 8514 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is 8515 * supported, UC is slow enough to cause issues with some older guests (e.g. 8516 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to 8517 * map the video RAM, causing wayland desktop to fail to get started 8518 * correctly). To avoid breaking those older guests that rely on KVM to force 8519 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the 8520 * safer (for performance) default behavior. 8521 * 8522 * On top of this, non-coherent DMA devices need the guest to flush CPU 8523 * caches properly. This also requires honoring guest PAT, and is forced 8524 * independent of the quirk in vmx_ignore_guest_pat(). 
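 *
 * Net effect of the code below: without self-snoop the quirk is removed
 * from the supported set, so userspace cannot disable it and guest PAT is
 * always ignored (barring non-coherent DMA); with self-snoop it is merely
 * the default behavior.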
8525 */ 8526 if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) 8527 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8528 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8529 return r; 8530 } 8531 8532 static void vmx_cleanup_l1d_flush(void) 8533 { 8534 if (vmx_l1d_flush_pages) { 8535 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8536 vmx_l1d_flush_pages = NULL; 8537 } 8538 /* Restore state so sysfs ignores VMX */ 8539 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8540 } 8541 8542 void vmx_exit(void) 8543 { 8544 allow_smaller_maxphyaddr = false; 8545 8546 vmx_cleanup_l1d_flush(); 8547 8548 kvm_x86_vendor_exit(); 8549 } 8550 8551 int __init vmx_init(void) 8552 { 8553 int r, cpu; 8554 8555 KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); 8556 8557 if (!kvm_is_vmx_supported()) 8558 return -EOPNOTSUPP; 8559 8560 /* 8561 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8562 * to unwind if a later step fails. 8563 */ 8564 hv_init_evmcs(); 8565 8566 r = kvm_x86_vendor_init(&vt_init_ops); 8567 if (r) 8568 return r; 8569 8570 /* 8571 * Must be called after common x86 init so enable_ept is properly set 8572 * up. Hand the parameter mitigation value in which was stored in 8573 * the pre module init parser. If no parameter was given, it will 8574 * contain 'auto' which will be turned into the default 'cond' 8575 * mitigation mode. 8576 */ 8577 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8578 if (r) 8579 goto err_l1d_flush; 8580 8581 for_each_possible_cpu(cpu) { 8582 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8583 8584 pi_init_cpu(cpu); 8585 } 8586 8587 vmx_check_vmcs12_offsets(); 8588 8589 /* 8590 * Shadow paging doesn't have a (further) performance penalty 8591 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8592 * by default 8593 */ 8594 if (!enable_ept) 8595 allow_smaller_maxphyaddr = true; 8596 8597 return 0; 8598 8599 err_l1d_flush: 8600 kvm_x86_vendor_exit(); 8601 return r; 8602 } 8603