1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 #include <linux/entry-kvm.h> 32 33 #include <asm/apic.h> 34 #include <asm/asm.h> 35 #include <asm/cpu.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/fred.h> 42 #include <asm/idtentry.h> 43 #include <asm/io.h> 44 #include <asm/irq_remapping.h> 45 #include <asm/reboot.h> 46 #include <asm/perf_event.h> 47 #include <asm/mmu_context.h> 48 #include <asm/mshyperv.h> 49 #include <asm/msr.h> 50 #include <asm/mwait.h> 51 #include <asm/spec-ctrl.h> 52 #include <asm/vmx.h> 53 54 #include <trace/events/ipi.h> 55 56 #include "capabilities.h" 57 #include "cpuid.h" 58 #include "hyperv.h" 59 #include "kvm_onhyperv.h" 60 #include "irq.h" 61 #include "kvm_cache_regs.h" 62 #include "lapic.h" 63 #include "mmu.h" 64 #include "nested.h" 65 #include "pmu.h" 66 #include "sgx.h" 67 #include "trace.h" 68 #include "vmcs.h" 69 #include "vmcs12.h" 70 #include "vmx.h" 71 #include "x86.h" 72 #include "x86_ops.h" 73 #include "smm.h" 74 #include "vmx_onhyperv.h" 75 #include "posted_intr.h" 76 
77 MODULE_AUTHOR("Qumranet"); 78 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); 79 MODULE_LICENSE("GPL"); 80 81 #ifdef MODULE 82 static const struct x86_cpu_id vmx_cpu_id[] = { 83 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 84 {} 85 }; 86 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 87 #endif 88 89 bool __read_mostly enable_vpid = 1; 90 module_param_named(vpid, enable_vpid, bool, 0444); 91 92 static bool __read_mostly enable_vnmi = 1; 93 module_param_named(vnmi, enable_vnmi, bool, 0444); 94 95 bool __read_mostly flexpriority_enabled = 1; 96 module_param_named(flexpriority, flexpriority_enabled, bool, 0444); 97 98 bool __read_mostly enable_ept = 1; 99 module_param_named(ept, enable_ept, bool, 0444); 100 101 bool __read_mostly enable_unrestricted_guest = 1; 102 module_param_named(unrestricted_guest, 103 enable_unrestricted_guest, bool, 0444); 104 105 bool __read_mostly enable_ept_ad_bits = 1; 106 module_param_named(eptad, enable_ept_ad_bits, bool, 0444); 107 108 static bool __read_mostly emulate_invalid_guest_state = true; 109 module_param(emulate_invalid_guest_state, bool, 0444); 110 111 static bool __read_mostly fasteoi = 1; 112 module_param(fasteoi, bool, 0444); 113 114 module_param(enable_apicv, bool, 0444); 115 116 bool __read_mostly enable_ipiv = true; 117 module_param(enable_ipiv, bool, 0444); 118 119 /* 120 * If nested=1, nested virtualization is supported, i.e., guests may use 121 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 122 * use VMX instructions. 
123 */ 124 static bool __read_mostly nested = 1; 125 module_param(nested, bool, 0444); 126 127 bool __read_mostly enable_pml = 1; 128 module_param_named(pml, enable_pml, bool, 0444); 129 130 static bool __read_mostly error_on_inconsistent_vmcs_config = true; 131 module_param(error_on_inconsistent_vmcs_config, bool, 0444); 132 133 static bool __read_mostly dump_invalid_vmcs = 0; 134 module_param(dump_invalid_vmcs, bool, 0644); 135 136 #define MSR_BITMAP_MODE_X2APIC 1 137 #define MSR_BITMAP_MODE_X2APIC_APICV 2 138 139 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 140 141 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ 142 static int __read_mostly cpu_preemption_timer_multi; 143 static bool __read_mostly enable_preemption_timer = 1; 144 #ifdef CONFIG_X86_64 145 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 146 #endif 147 148 extern bool __read_mostly allow_smaller_maxphyaddr; 149 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 150 151 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 152 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 153 #define KVM_VM_CR0_ALWAYS_ON \ 154 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 155 156 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 157 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 158 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 159 160 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 161 162 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 163 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 164 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 165 RTIT_STATUS_BYTECNT)) 166 167 /* 168 * List of MSRs that can be directly passed to the guest. 169 * In addition to these x2apic, PT and LBR MSRs are handled specially. 
170 */ 171 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { 172 MSR_IA32_SPEC_CTRL, 173 MSR_IA32_PRED_CMD, 174 MSR_IA32_FLUSH_CMD, 175 MSR_IA32_TSC, 176 #ifdef CONFIG_X86_64 177 MSR_FS_BASE, 178 MSR_GS_BASE, 179 MSR_KERNEL_GS_BASE, 180 MSR_IA32_XFD, 181 MSR_IA32_XFD_ERR, 182 #endif 183 MSR_IA32_SYSENTER_CS, 184 MSR_IA32_SYSENTER_ESP, 185 MSR_IA32_SYSENTER_EIP, 186 MSR_CORE_C1_RES, 187 MSR_CORE_C3_RESIDENCY, 188 MSR_CORE_C6_RESIDENCY, 189 MSR_CORE_C7_RESIDENCY, 190 }; 191 192 /* 193 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 194 * ple_gap: upper bound on the amount of time between two successive 195 * executions of PAUSE in a loop. Also indicate if ple enabled. 196 * According to test, this time is usually smaller than 128 cycles. 197 * ple_window: upper bound on the amount of time a guest is allowed to execute 198 * in a PAUSE loop. Tests indicate that most spinlocks are held for 199 * less than 2^12 cycles 200 * Time is measured based on a counter that runs at the same rate as the TSC, 201 * refer SDM volume 3b section 21.6.13 & 22.1.3. 202 */ 203 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 204 module_param(ple_gap, uint, 0444); 205 206 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 207 module_param(ple_window, uint, 0444); 208 209 /* Default doubles per-vcpu window every exit. */ 210 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 211 module_param(ple_window_grow, uint, 0444); 212 213 /* Default resets per-vcpu window every exit to ple_window. */ 214 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 215 module_param(ple_window_shrink, uint, 0444); 216 217 /* Default is to compute the maximum so we can never overflow. 
*/ 218 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 219 module_param(ple_window_max, uint, 0444); 220 221 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 222 int __read_mostly pt_mode = PT_MODE_SYSTEM; 223 #ifdef CONFIG_BROKEN 224 module_param(pt_mode, int, S_IRUGO); 225 #endif 226 227 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 228 229 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 230 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 231 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 232 233 /* Storage for pre module init parameter parsing */ 234 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 235 236 static const struct { 237 const char *option; 238 bool for_parse; 239 } vmentry_l1d_param[] = { 240 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 241 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 242 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 243 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 244 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 245 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 246 }; 247 248 #define L1D_CACHE_ORDER 4 249 static void *vmx_l1d_flush_pages; 250 251 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 252 { 253 struct page *page; 254 unsigned int i; 255 256 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 257 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 258 return 0; 259 } 260 261 if (!enable_ept) { 262 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 263 return 0; 264 } 265 266 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 267 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 268 return 0; 269 } 270 271 /* If set to auto use the default l1tf mitigation method */ 272 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 273 switch (l1tf_mitigation) { 274 case L1TF_MITIGATION_OFF: 275 l1tf = VMENTER_L1D_FLUSH_NEVER; 276 break; 277 case L1TF_MITIGATION_AUTO: 278 case 
L1TF_MITIGATION_FLUSH_NOWARN: 279 case L1TF_MITIGATION_FLUSH: 280 case L1TF_MITIGATION_FLUSH_NOSMT: 281 l1tf = VMENTER_L1D_FLUSH_COND; 282 break; 283 case L1TF_MITIGATION_FULL: 284 case L1TF_MITIGATION_FULL_FORCE: 285 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 286 break; 287 } 288 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 289 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 290 } 291 292 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 293 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 294 /* 295 * This allocation for vmx_l1d_flush_pages is not tied to a VM 296 * lifetime and so should not be charged to a memcg. 297 */ 298 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 299 if (!page) 300 return -ENOMEM; 301 vmx_l1d_flush_pages = page_address(page); 302 303 /* 304 * Initialize each page with a different pattern in 305 * order to protect against KSM in the nested 306 * virtualization case. 307 */ 308 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 309 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 310 PAGE_SIZE); 311 } 312 } 313 314 l1tf_vmx_mitigation = l1tf; 315 316 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 317 static_branch_enable(&vmx_l1d_should_flush); 318 else 319 static_branch_disable(&vmx_l1d_should_flush); 320 321 if (l1tf == VMENTER_L1D_FLUSH_COND) 322 static_branch_enable(&vmx_l1d_flush_cond); 323 else 324 static_branch_disable(&vmx_l1d_flush_cond); 325 return 0; 326 } 327 328 static int vmentry_l1d_flush_parse(const char *s) 329 { 330 unsigned int i; 331 332 if (s) { 333 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 334 if (vmentry_l1d_param[i].for_parse && 335 sysfs_streq(s, vmentry_l1d_param[i].option)) 336 return i; 337 } 338 } 339 return -EINVAL; 340 } 341 342 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 343 { 344 int l1tf, ret; 345 346 l1tf = vmentry_l1d_flush_parse(s); 347 if (l1tf < 0) 348 return l1tf; 349 350 if (!boot_cpu_has(X86_BUG_L1TF)) 351 return 0; 352 353 /* 354 * Has vmx_init() run already? 
If not then this is the pre init 355 * parameter parsing. In that case just store the value and let 356 * vmx_init() do the proper setup after enable_ept has been 357 * established. 358 */ 359 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 360 vmentry_l1d_flush_param = l1tf; 361 return 0; 362 } 363 364 mutex_lock(&vmx_l1d_flush_mutex); 365 ret = vmx_setup_l1d_flush(l1tf); 366 mutex_unlock(&vmx_l1d_flush_mutex); 367 return ret; 368 } 369 370 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 371 { 372 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 373 return sysfs_emit(s, "???\n"); 374 375 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 376 } 377 378 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 379 { 380 u64 msr; 381 382 if (!vmx->disable_fb_clear) 383 return; 384 385 msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); 386 msr |= FB_CLEAR_DIS; 387 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); 388 /* Cache the MSR value to avoid reading it later */ 389 vmx->msr_ia32_mcu_opt_ctrl = msr; 390 } 391 392 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 393 { 394 if (!vmx->disable_fb_clear) 395 return; 396 397 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 398 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 399 } 400 401 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 402 { 403 /* 404 * Disable VERW's behavior of clearing CPU buffers for the guest if the 405 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 406 * the mitigation. Disabling the clearing behavior provides a 407 * performance boost for guests that aren't aware that manually clearing 408 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 409 * and VM-Exit. 
410 */ 411 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 412 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 413 !boot_cpu_has_bug(X86_BUG_MDS) && 414 !boot_cpu_has_bug(X86_BUG_TAA); 415 416 /* 417 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 418 * at VMEntry. Skip the MSR read/write when a guest has no use case to 419 * execute VERW. 420 */ 421 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 422 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 423 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 424 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 425 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 426 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 427 vmx->disable_fb_clear = false; 428 } 429 430 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 431 .set = vmentry_l1d_flush_set, 432 .get = vmentry_l1d_flush_get, 433 }; 434 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 435 436 static u32 vmx_segment_access_rights(struct kvm_segment *var); 437 438 void vmx_vmexit(void); 439 440 #define vmx_insn_failed(fmt...) 
\ 441 do { \ 442 WARN_ONCE(1, fmt); \ 443 pr_warn_ratelimited(fmt); \ 444 } while (0) 445 446 noinline void vmread_error(unsigned long field) 447 { 448 vmx_insn_failed("vmread failed: field=%lx\n", field); 449 } 450 451 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 452 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 453 { 454 if (fault) { 455 kvm_spurious_fault(); 456 } else { 457 instrumentation_begin(); 458 vmread_error(field); 459 instrumentation_end(); 460 } 461 } 462 #endif 463 464 noinline void vmwrite_error(unsigned long field, unsigned long value) 465 { 466 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 467 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 468 } 469 470 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 471 { 472 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 473 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 474 } 475 476 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 477 { 478 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 479 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 480 } 481 482 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 483 { 484 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 485 ext, vpid, gva); 486 } 487 488 noinline void invept_error(unsigned long ext, u64 eptp) 489 { 490 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 491 } 492 493 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 494 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 495 /* 496 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 497 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
498 */ 499 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 500 501 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 502 static DEFINE_SPINLOCK(vmx_vpid_lock); 503 504 struct vmcs_config vmcs_config __ro_after_init; 505 struct vmx_capability vmx_capability __ro_after_init; 506 507 #define VMX_SEGMENT_FIELD(seg) \ 508 [VCPU_SREG_##seg] = { \ 509 .selector = GUEST_##seg##_SELECTOR, \ 510 .base = GUEST_##seg##_BASE, \ 511 .limit = GUEST_##seg##_LIMIT, \ 512 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 513 } 514 515 static const struct kvm_vmx_segment_field { 516 unsigned selector; 517 unsigned base; 518 unsigned limit; 519 unsigned ar_bytes; 520 } kvm_vmx_segment_fields[] = { 521 VMX_SEGMENT_FIELD(CS), 522 VMX_SEGMENT_FIELD(DS), 523 VMX_SEGMENT_FIELD(ES), 524 VMX_SEGMENT_FIELD(FS), 525 VMX_SEGMENT_FIELD(GS), 526 VMX_SEGMENT_FIELD(SS), 527 VMX_SEGMENT_FIELD(TR), 528 VMX_SEGMENT_FIELD(LDTR), 529 }; 530 531 532 static unsigned long host_idt_base; 533 534 #if IS_ENABLED(CONFIG_HYPERV) 535 static bool __read_mostly enlightened_vmcs = true; 536 module_param(enlightened_vmcs, bool, 0444); 537 538 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 539 { 540 struct hv_enlightened_vmcs *evmcs; 541 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 542 543 if (partition_assist_page == INVALID_PAGE) 544 return -ENOMEM; 545 546 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 547 548 evmcs->partition_assist_page = partition_assist_page; 549 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 550 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 551 552 return 0; 553 } 554 555 static __init void hv_init_evmcs(void) 556 { 557 int cpu; 558 559 if (!enlightened_vmcs) 560 return; 561 562 /* 563 * Enlightened VMCS usage should be recommended and the host needs 564 * to support eVMCS v1 or above. 
565 */ 566 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 567 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 568 KVM_EVMCS_VERSION) { 569 570 /* Check that we have assist pages on all online CPUs */ 571 for_each_online_cpu(cpu) { 572 if (!hv_get_vp_assist_page(cpu)) { 573 enlightened_vmcs = false; 574 break; 575 } 576 } 577 578 if (enlightened_vmcs) { 579 pr_info("Using Hyper-V Enlightened VMCS\n"); 580 static_branch_enable(&__kvm_is_using_evmcs); 581 } 582 583 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 584 vt_x86_ops.enable_l2_tlb_flush 585 = hv_enable_l2_tlb_flush; 586 } else { 587 enlightened_vmcs = false; 588 } 589 } 590 591 static void hv_reset_evmcs(void) 592 { 593 struct hv_vp_assist_page *vp_ap; 594 595 if (!kvm_is_using_evmcs()) 596 return; 597 598 /* 599 * KVM should enable eVMCS if and only if all CPUs have a VP assist 600 * page, and should reject CPU onlining if eVMCS is enabled the CPU 601 * doesn't have a VP assist page allocated. 602 */ 603 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 604 if (WARN_ON_ONCE(!vp_ap)) 605 return; 606 607 /* 608 * Reset everything to support using non-enlightened VMCS access later 609 * (e.g. when we reload the module with enlightened_vmcs=0) 610 */ 611 vp_ap->nested_control.features.directhypercall = 0; 612 vp_ap->current_nested_vmcs = 0; 613 vp_ap->enlighten_vmentry = 0; 614 } 615 616 #else /* IS_ENABLED(CONFIG_HYPERV) */ 617 static void hv_init_evmcs(void) {} 618 static void hv_reset_evmcs(void) {} 619 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 620 621 /* 622 * Comment's format: document - errata name - stepping - processor name. 
623 * Refer from 624 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 625 */ 626 static u32 vmx_preemption_cpu_tfms[] = { 627 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 628 0x000206E6, 629 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 630 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 631 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 632 0x00020652, 633 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 634 0x00020655, 635 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 636 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 637 /* 638 * 320767.pdf - AAP86 - B1 - 639 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 640 */ 641 0x000106E5, 642 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 643 0x000106A0, 644 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 645 0x000106A1, 646 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 647 0x000106A4, 648 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 649 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 650 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 651 0x000106A5, 652 /* Xeon E3-1220 V2 */ 653 0x000306A8, 654 }; 655 656 static inline bool cpu_has_broken_vmx_preemption_timer(void) 657 { 658 u32 eax = cpuid_eax(0x00000001), i; 659 660 /* Clear the reserved bits */ 661 eax &= ~(0x3U << 14 | 0xfU << 28); 662 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 663 if (eax == vmx_preemption_cpu_tfms[i]) 664 return true; 665 666 return false; 667 } 668 669 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 670 { 671 return flexpriority_enabled && lapic_in_kernel(vcpu); 672 } 673 674 static int vmx_get_passthrough_msr_slot(u32 msr) 675 { 676 int i; 677 678 switch (msr) { 679 case 0x800 ... 0x8ff: 680 /* x2APIC MSRs. 
These are handled in vmx_update_msr_bitmap_x2apic() */ 681 return -ENOENT; 682 case MSR_IA32_RTIT_STATUS: 683 case MSR_IA32_RTIT_OUTPUT_BASE: 684 case MSR_IA32_RTIT_OUTPUT_MASK: 685 case MSR_IA32_RTIT_CR3_MATCH: 686 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 687 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 688 case MSR_LBR_SELECT: 689 case MSR_LBR_TOS: 690 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 691 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 692 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 693 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 694 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 695 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ 696 return -ENOENT; 697 } 698 699 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 700 if (vmx_possible_passthrough_msrs[i] == msr) 701 return i; 702 } 703 704 WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 705 return -ENOENT; 706 } 707 708 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 709 { 710 int i; 711 712 i = kvm_find_user_return_msr(msr); 713 if (i >= 0) 714 return &vmx->guest_uret_msrs[i]; 715 return NULL; 716 } 717 718 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 719 struct vmx_uret_msr *msr, u64 data) 720 { 721 unsigned int slot = msr - vmx->guest_uret_msrs; 722 int ret = 0; 723 724 if (msr->load_into_hardware) { 725 preempt_disable(); 726 ret = kvm_set_user_return_msr(slot, data, msr->mask); 727 preempt_enable(); 728 } 729 if (!ret) 730 msr->data = data; 731 return ret; 732 } 733 734 /* 735 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 736 * 737 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 738 * atomically track post-VMXON state, e.g. this may be called in NMI context. 739 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 
740 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 741 * magically in RM, VM86, compat mode, or at CPL>0. 742 */ 743 static int kvm_cpu_vmxoff(void) 744 { 745 asm goto("1: vmxoff\n\t" 746 _ASM_EXTABLE(1b, %l[fault]) 747 ::: "cc", "memory" : fault); 748 749 cr4_clear_bits(X86_CR4_VMXE); 750 return 0; 751 752 fault: 753 cr4_clear_bits(X86_CR4_VMXE); 754 return -EIO; 755 } 756 757 void vmx_emergency_disable_virtualization_cpu(void) 758 { 759 int cpu = raw_smp_processor_id(); 760 struct loaded_vmcs *v; 761 762 kvm_rebooting = true; 763 764 /* 765 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 766 * set in task context. If this races with VMX is disabled by an NMI, 767 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 768 * kvm_rebooting set. 769 */ 770 if (!(__read_cr4() & X86_CR4_VMXE)) 771 return; 772 773 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 774 loaded_vmcss_on_cpu_link) 775 vmcs_clear(v->vmcs); 776 777 kvm_cpu_vmxoff(); 778 } 779 780 static void __loaded_vmcs_clear(void *arg) 781 { 782 struct loaded_vmcs *loaded_vmcs = arg; 783 int cpu = raw_smp_processor_id(); 784 785 if (loaded_vmcs->cpu != cpu) 786 return; /* vcpu migration can race with cpu offline */ 787 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 788 per_cpu(current_vmcs, cpu) = NULL; 789 790 vmcs_clear(loaded_vmcs->vmcs); 791 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 792 vmcs_clear(loaded_vmcs->shadow_vmcs); 793 794 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 795 796 /* 797 * Ensure all writes to loaded_vmcs, including deleting it from its 798 * current percpu list, complete before setting loaded_vmcs->cpu to 799 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 800 * and add loaded_vmcs to its percpu list before it's deleted from this 801 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
802 */ 803 smp_wmb(); 804 805 loaded_vmcs->cpu = -1; 806 loaded_vmcs->launched = 0; 807 } 808 809 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 810 { 811 int cpu = loaded_vmcs->cpu; 812 813 if (cpu != -1) 814 smp_call_function_single(cpu, 815 __loaded_vmcs_clear, loaded_vmcs, 1); 816 } 817 818 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 819 unsigned field) 820 { 821 bool ret; 822 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 823 824 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 825 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 826 vmx->segment_cache.bitmask = 0; 827 } 828 ret = vmx->segment_cache.bitmask & mask; 829 vmx->segment_cache.bitmask |= mask; 830 return ret; 831 } 832 833 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 834 { 835 u16 *p = &vmx->segment_cache.seg[seg].selector; 836 837 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 838 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 839 return *p; 840 } 841 842 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 843 { 844 ulong *p = &vmx->segment_cache.seg[seg].base; 845 846 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 847 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 848 return *p; 849 } 850 851 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 852 { 853 u32 *p = &vmx->segment_cache.seg[seg].limit; 854 855 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 856 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 857 return *p; 858 } 859 860 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 861 { 862 u32 *p = &vmx->segment_cache.seg[seg].ar; 863 864 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 865 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 866 return *p; 867 } 868 869 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 870 { 871 u32 eb; 872 873 eb = (1u << PF_VECTOR) | 
(1u << UD_VECTOR) | (1u << MC_VECTOR) | 874 (1u << DB_VECTOR) | (1u << AC_VECTOR); 875 /* 876 * #VE isn't used for VMX. To test against unexpected changes 877 * related to #VE for VMX, intercept unexpected #VE and warn on it. 878 */ 879 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 880 eb |= 1u << VE_VECTOR; 881 /* 882 * Guest access to VMware backdoor ports could legitimately 883 * trigger #GP because of TSS I/O permission bitmap. 884 * We intercept those #GP and allow access to them anyway 885 * as VMware does. 886 */ 887 if (enable_vmware_backdoor) 888 eb |= (1u << GP_VECTOR); 889 if ((vcpu->guest_debug & 890 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 891 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 892 eb |= 1u << BP_VECTOR; 893 if (to_vmx(vcpu)->rmode.vm86_active) 894 eb = ~0; 895 if (!vmx_need_pf_intercept(vcpu)) 896 eb &= ~(1u << PF_VECTOR); 897 898 /* When we are running a nested L2 guest and L1 specified for it a 899 * certain exception bitmap, we must trap the same exceptions and pass 900 * them to L1. When running L2, we will only handle the exceptions 901 * specified above if L1 did not want them. 902 */ 903 if (is_guest_mode(vcpu)) 904 eb |= get_vmcs12(vcpu)->exception_bitmap; 905 else { 906 int mask = 0, match = 0; 907 908 if (enable_ept && (eb & (1u << PF_VECTOR))) { 909 /* 910 * If EPT is enabled, #PF is currently only intercepted 911 * if MAXPHYADDR is smaller on the guest than on the 912 * host. In that case we only care about present, 913 * non-reserved faults. For vmcs02, however, PFEC_MASK 914 * and PFEC_MATCH are set in prepare_vmcs02_rare. 915 */ 916 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 917 match = PFERR_PRESENT_MASK; 918 } 919 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 920 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 921 } 922 923 /* 924 * Disabling xfd interception indicates that dynamic xfeatures 925 * might be used in the guest. Always trap #NM in this case 926 * to save guest xfd_err timely. 
*/
	if (vcpu->arch.xfd_no_write_intercept)
		eb |= (1u << NM_VECTOR);

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check whether writes to @msr are intercepted according to the MSR bitmap
 * of the currently loaded VMCS.  If MSR bitmaps are not in use at all, every
 * write is reported as intercepted.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
		return true;

	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}

/*
 * Build the VMX_RUN_* flags for the next run of the vCPU:
 *  - VMX_RUN_VMRESUME if the loaded VMCS has already been launched,
 *  - VMX_RUN_SAVE_SPEC_CTRL if guest writes to SPEC_CTRL are not intercepted.
 */
unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
{
	unsigned int flags = 0;

	if (vmx->loaded_vmcs->launched)
		flags |= VMX_RUN_VMRESUME;

	/*
	 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
	 * to change it directly without causing a vmexit.  In that case read
	 * it after vmexit and store it in vmx->spec_ctrl.
	 */
	if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
		flags |= VMX_RUN_SAVE_SPEC_CTRL;

	return flags;
}

/* Clear the given VM-entry control bit(s) and VM-exit control bit(s). */
static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

/*
 * Look up @msr in the load/store list @m.  Returns the index of the matching
 * entry, or -ENOENT if the MSR is not in the list.
 */
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i) {
		if (m->val[i].index == msr)
			return i;
	}
	return -ENOENT;
}

/*
 * Stop switching @msr across VM-entry/VM-exit.
 *
 * EFER and PERF_GLOBAL_CTRL are handled by clearing their dedicated
 * VM-entry/VM-exit controls when the corresponding cpu_has_load_*() helper
 * reports them.  Otherwise the MSR is removed from the guest (VM-entry load)
 * and host (VM-exit load) autoload lists, if present in each.
 */
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	/* Delete by moving the last entry into the vacated slot. */
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (i < 0)
		return;

	/* Same swap-with-last removal for the host list. */
	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}

/*
 * Switch an MSR via dedicated VMCS fields: write the guest value, write the
 * host value, and set the VM-entry/VM-exit control bits that enable the
 * loads.  The host value is not written when the host field is
 * HOST_IA32_EFER.
 * NOTE(review): HOST_IA32_EFER is presumably programmed elsewhere, which
 * would explain skipping it here -- confirm against the VMCS setup code.
 */
static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

/*
 * Arrange for @msr to be loaded with @guest_val on VM-entry and, unless
 * @entry_only is set, with @host_val on VM-exit.
 *
 * EFER and PERF_GLOBAL_CTRL use their dedicated VMCS fields/controls when the
 * corresponding cpu_has_load_*() helper reports them.  Everything else goes
 * through the guest/host autoload lists; an existing entry for @msr is
 * updated in place, otherwise a new entry is appended.  If a list that needs
 * a new entry is already full, a one-time warning is printed and nothing is
 * changed.
 */
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	/* j stays 0 for entry_only, so the host-list capacity check is skipped. */
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	/* Bail before touching either list if a required new slot won't fit. */
	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

/*
 * Decide how the guest's EFER is switched and program the chosen mechanism.
 *
 * Returns true if EFER is to be handled through its user-return MSR slot
 * (whose data/mask are filled in here).  Returns false if EFER is switched
 * atomically on VM-entry/VM-exit instead, or if there is no user-return MSR
 * slot for EFER.
 */
static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available.  */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		/* No switching is needed when guest and host values match. */
		if (guest_efer != kvm_host.efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, kvm_host.efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	/* Ignored bits take the host's value; the mask excludes them. */
	guest_efer &= ~ignore_bits;
	guest_efer |= kvm_host.efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table.  KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 *
 * Returns 0 for a selector with no bits set outside the RPL field.  For a
 * selector that references the LDT, the descriptor table is located by
 * recursing on the LDT selector.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

/*
 * True if PT is in host-guest mode and TraceEn is clear in the guest's
 * RTIT_CTL value.
 */
static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

/*
 * Write the PT context @ctx into the RTIT MSRs: status, output base/mask,
 * CR3 match, and the first @addr_range ADDRn_A/ADDRn_B pairs.
 */
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		/* The A/B MSRs of range i sit at offset 2 * i from range 0. */
		wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

/*
 * Read the RTIT MSRs into the PT context @ctx; the mirror image of
 * pt_load_msr().
 */
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

/*
 * Prepare PT state for running the guest.  Does nothing in system mode.
 * Otherwise the host's RTIT_CTL is always saved; the remaining host PT MSRs
 * are swapped for the guest's only when the guest has TraceEn set.
 */
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		/* RTIT_CTL is zeroed before the other PT MSRs are swapped. */
		wrmsrq(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}

/*
 * Restore host PT state after running the guest.  Does nothing in system
 * mode.  Counterpart of pt_guest_enter().
 */
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit.  Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

/*
 * Record the host FS/GS selectors and bases in @host and in the VMCS host
 * state fields, writing the VMCS only for values that differ from what is
 * cached in @host.
 *
 * A selector with any of its low three bits set is written to the VMCS as 0;
 * the cached copy in @host keeps the real selector value.
 */
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

/*
 * Get the CPU ready to run the guest:
 *  - load the guest's user-return MSRs if they are not loaded yet,
 *  - sync vmcs12 to the shadow VMCS if that is pending,
 *  - unless guest state is already loaded, save the host's segment state
 *    and record the host FS/GS selectors and bases in the VMCS.
 */
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_uret_msrs_loaded) {
		vmx->guest_uret_msrs_loaded = true;
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (!vmx->guest_uret_msrs[i].load_into_hardware)
				continue;

			kvm_set_user_return_msr(i,
						vmx->guest_uret_msrs[i].data,
						vmx->guest_uret_msrs[i].mask);
		}
	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	/* The host value was captured above; install the guest's value. */
	wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

/*
 * Restore the host state saved by vmx_prepare_switch_to_guest().  Does
 * nothing if guest state is not currently loaded.  On return both
 * guest_state_loaded and guest_uret_msrs_loaded are false.
 */
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	/* Capture the guest's current value before the host's is restored. */
	rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	/*
	 * Selectors with any of the low three bits set were written to the
	 * VMCS as 0 (see vmx_set_host_fs_gs()), so reload them by hand.
	 */
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
/*
 * Return the guest's KERNEL_GS_BASE.  If guest state is currently loaded,
 * the cached copy is first refreshed from the hardware MSR; preemption is
 * disabled around the check and the MSR read.
 */
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

/*
 * Set the guest's KERNEL_GS_BASE.  The cached copy is always updated; the
 * hardware MSR is written as well if guest state is currently loaded.
 */
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrq(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

/*
 * Recompute the vCPU's PLE window via __grow_ple_window().  If the value
 * changed, mark it dirty and emit a tracepoint.
 */
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old, ple_window,
					    ple_window_grow,
					    ple_window_max);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

/*
 * Recompute the vCPU's PLE window via __shrink_ple_window().  If the value
 * changed, mark it dirty and emit a tracepoint.
 */
static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old, ple_window,
					      ple_window_shrink,
					      ple_window);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

/*
 * Make vmx->loaded_vmcs the current VMCS on @cpu.
 *
 * If the VMCS was last loaded on a different CPU, it is cleared and added to
 * @cpu's list of loaded VMCSes, a TLB flush is requested, and the per-CPU
 * host state fields (TR base, GDTR base and, where configured, SYSENTER_ESP)
 * are rewritten.
 *
 * @buddy: if non-NULL, the loaded_vmcs expected to have been current on @cpu
 *	   before this call; a mismatch triggers a one-time WARN.  With a
 *	   matching @buddy the IBPB is skipped.
 */
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
			struct loaded_vmcs *buddy)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list.  Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);

		/*
		 * No indirect branch prediction barrier needed when switching
		 * the active VMCS within a vCPU, unless IBRS is advertised to
		 * the vCPU.  To minimize the number of IBPBs executed, KVM
		 * performs IBPB on nested VM-Exit (a single nested transition
		 * may switch the active VMCS multiple times).
		 */
		if (static_branch_likely(&switch_vcpu_ibpb) &&
		    (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)))
			indirect_branch_prediction_barrier();
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/* Shrink the PLE window of a vCPU that is being scheduled back in. */
	if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

	vmx_vcpu_pi_load(vcpu, cpu);
}

/*
 * Run the posted-interrupt "put" hook for the vCPU, then restore host state
 * via vmx_prepare_switch_to_host().
 */
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

/*
 * True if emulate_invalid_guest_state is set and the vCPU's guest state is
 * not valid according to vmx_guest_state_valid().
 */
bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

/*
 * Return the guest's RFLAGS.  The value is read from the VMCS and cached in
 * vmx->rflags the first time it is needed; later calls return the cache.
 * While vm86_active is set, only the RMODE_GUEST_OWNED_EFLAGS_BITS come from
 * the VMCS value; the remaining bits come from rmode.save_rflags.
 */
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}

/*
 * Set the guest's RFLAGS, updating both the cached copy and the VMCS.
 * While vm86_active is set, the requested value is remembered in
 * rmode.save_rflags and the VMCS value additionally gets IOPL and VM set.
 * If the VM flag changed, emulation_required is re-evaluated.
 */
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	/*
	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
	 * is an unrestricted guest in order to mark L2 as needing emulation
	 * if L1 runs L2 as a restricted guest.
	 */
	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = vmx_emulation_required(vcpu);
}

/* True if the guest's RFLAGS.IF is set. */
bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

/*
 * Translate the STI / MOV-SS blocking bits of the VMCS interruptibility
 * state into a KVM_X86_SHADOW_INT_* mask.
 */
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

/*
 * Set the STI / MOV-SS blocking bits of the VMCS interruptibility state from
 * a KVM_X86_SHADOW_INT_* @mask.  At most one of the two bits is set: MOV-SS
 * wins if @mask requests both.  The VMCS is written only if the value
 * actually changes.
 */
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

/*
 * Validate @data as a new value for the guest's IA32_RTIT_CTL.
 * Returns 0 if the value is acceptable, 1 if the write must be rejected.
 */
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
	    (data & RTIT_CTL_TRACEEN) &&
	    data != vmx->pt_desc.guest.ctl)
		return 1;

	/*
	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
	 * and FabricEn would cause #GP, if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
	    !(data & RTIT_CTL_FABRIC_EN) &&
	    !intel_pt_validate_cap(vmx->pt_desc.caps,
				   PT_CAP_single_range_output))
		return 1;

	/*
	 * MTCFreq, CycThresh and PSBFreq encodings check: any MSR write that
	 * utilizes encodings marked reserved will cause a #GP fault.  Each
	 * capability value is used as a bitmap of the allowed encodings.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
			RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
			RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
			RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * Using an ADDRx_CFG field whose address range is not supported, or
	 * an encoding greater than 2, will cause a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
		return 1;

	return 0;
}

/*
 * Decide whether the instruction that caused the current exit may be
 * emulated.  Returns X86EMUL_CONTINUE if so; otherwise an X86EMUL_* code
 * describing why not (a #UD has been queued in the enclave case).
 */
int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
				  void *insn, int insn_len)
{
	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
	 * not point at the failing instruction, and even if it did, the code
	 * stream is inaccessible.  Inject #UD instead of exiting to userspace
	 * so that guest userspace can't DoS the guest simply by triggering
	 * emulation (enclaves are CPL3 only).
	 */
	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* Check that emulation is possible during event vectoring */
	if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    !kvm_can_emulate_event_vectoring(emul_type))
		return X86EMUL_UNHANDLEABLE_VECTORING;

	return X86EMUL_CONTINUE;
}

/*
 * Advance the guest's RIP past the instruction that caused the VM-Exit and
 * clear the interrupt shadow.  Returns 1 on success, 0 if the instruction
 * had to be skipped through the emulator and that failed.
 */
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
	unsigned long rip, orig_rip;
	u32 instr_len;

	/*
	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
	 * set when EPT misconfig occurs.  In practice, real hardware updates
	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
	 * (namely Hyper-V) don't set it due to it being undefined behavior,
	 * i.e. we end up advancing IP with some random value.
	 */
	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

		/*
		 * Emulating an enclave's instructions isn't supported as KVM
		 * cannot access the enclave's memory or its true RIP, e.g. the
		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
		 * the RIP that actually triggered the VM-Exit.  But, because
		 * most instructions that cause VM-Exit will #UD in an enclave,
		 * most instruction-based VM-Exits simply do not occur.
		 *
		 * There are a few exceptions, notably the debug instructions
		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
		 * and generate #DB/#BP as expected, which KVM might intercept.
		 * But again, the CPU does the dirty work and saves an instr
		 * length of zero so VMMs don't shoot themselves in the foot.
		 * WARN if KVM tries to skip a non-zero length instruction on
		 * a VM-Exit from an enclave.
		 */
		if (!instr_len)
			goto rip_updated;

		WARN_ONCE(exit_reason.enclave_mode,
			  "skipping instruction after SGX enclave VM-Exit");

		orig_rip = kvm_rip_read(vcpu);
		rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
		/*
		 * We need to mask out the high 32 bits of RIP if not in 64-bit
		 * mode, but just finding out that we are in 64-bit mode is
		 * quite expensive.  Only do it if there was a carry.
		 */
		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
			rip = (u32)rip;
#endif
		kvm_rip_write(vcpu, rip);
	} else {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	}

rip_updated:
	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);

	return 1;
}

/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.  Does nothing when the vCPU is not in guest mode.
 */
void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!is_guest_mode(vcpu))
		return;

	/*
	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
	 * TSS T-bit traps and ICEBP (INT1).  KVM doesn't emulate T-bit traps
	 * or ICEBP (in the emulator proper), and skipping of ICEBP after an
	 * intercepted #DB deliberately avoids single-step #DB and MTF updates
	 * as ICEBP is higher priority than both.  As instruction emulation is
	 * completed at this point (i.e. KVM is at the instruction boundary),
	 * any #DB exception pending delivery must be a debug-trap of lower
	 * priority than MTF.  Record the pending MTF state to be delivered in
	 * vmx_check_nested_events().
	 */
	if (nested_cpu_has_mtf(vmcs12) &&
	    (!vcpu->arch.exception.pending ||
	     vcpu->arch.exception.vector == DB_VECTOR) &&
	    (!vcpu->arch.exception_vmexit.pending ||
	     vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
		vmx->nested.mtf_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	} else {
		vmx->nested.mtf_pending = false;
	}
}

/*
 * Update the pending-MTF state, then skip the instruction.  Returns the
 * result of skip_emulated_instruction().
 */
int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	vmx_update_emulated_instruction(vcpu);
	return skip_emulated_instruction(vcpu);
}

/*
 * Move the guest's activity state from HLT back to ACTIVE, if HLT-in-guest
 * is enabled for the VM and the activity state is currently HLT.
 */
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/*
	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
	 * explicitly skip the instruction because if the HLT state is set,
	 * then the instruction is already executing and RIP has already been
	 * advanced.
	 */
	if (kvm_hlt_in_guest(vcpu->kvm) &&
			vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

/*
 * Inject the exception queued in vcpu->arch.exception into the guest.
 * While vm86_active is set, the exception is delivered through
 * kvm_inject_realmode_interrupt(); otherwise it is programmed into the
 * VM-entry interruption-information field.
 */
void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_deliver_exception_payload(vcpu, ex);

	if (ex->has_error_code) {
		/*
		 * Despite the error code being architecturally defined as 32
		 * bits, and the VMCS field being 32 bits, Intel CPUs and thus
		 * VMX don't actually support setting bits 31:16.  Hardware
		 * will (should) never provide a bogus error code, but AMD CPUs
		 * do generate error codes with bits 31:16 set, and so KVM's
		 * ABI lets userspace shove in arbitrary 32-bit values.  Drop
		 * the upper bits to avoid VM-Fail, losing information that
		 * doesn't really exist is preferable to killing the VM.
		 */
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		/* Only soft exceptions advance EIP past the instruction. */
		int inc_eip = 0;
		if (kvm_exception_is_soft(ex->vector))
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->emulation_required);

	if (kvm_exception_is_soft(ex->vector)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	vmx_clear_hlt(vcpu);
}

/*
 * Set the load_into_hardware flag of the user-return MSR @msr.  Silently
 * does nothing if the vCPU has no user-return entry for @msr.
 */
static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
			       bool load_into_hardware)
{
	struct vmx_uret_msr *uret_msr;

	uret_msr = vmx_find_uret_msr(vmx, msr);
	if (!uret_msr)
		return;

	uret_msr->load_into_hardware = load_into_hardware;
}

/*
 * Configuring user return MSRs to automatically save, load, and restore MSRs
 * that need to be shoved into hardware when running the guest.  Note, omitting
 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
 * loaded into hardware when running the guest.
 */
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
	bool load_syscall_msrs;

	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
			    (vmx->vcpu.arch.efer & EFER_SCE);

	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
	/* update_transition_efer() reports whether EFER uses its uret slot. */
	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
			   guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
			   guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));

	/*
	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
	 * kernel and old userspace.  If those guests run on a tsx=off host, do
	 * allow guests to use TSX_CTRL, but don't change the value in hardware
	 * so that TSX remains always disabled.
	 */
	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));

	/*
	 * The set of MSRs to load may have changed, reload MSRs before the
	 * next VM-Enter.
	 */
	vmx->guest_uret_msrs_loaded = false;
}

/*
 * Return the TSC offset from vmcs12 if it enables TSC offsetting,
 * otherwise 0.
 */
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
		return vmcs12->tsc_offset;

	return 0;
}

/*
 * Return the TSC multiplier from vmcs12 if it enables both TSC offsetting
 * and TSC scaling, otherwise the default scaling ratio.
 */
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
		return vmcs12->tsc_multiplier;

	return kvm_caps.default_tsc_scaling_ratio;
}

/* Write the vCPU's current TSC offset into the VMCS. */
void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}

/* Write the vCPU's current TSC scaling ratio into the VMCS. */
void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}

/*
 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
 * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
 * backwards compatibility even though KVM doesn't support emulating SMX.  And
 * because userspace set "VMX in SMX", the guest must also be allowed to set it,
 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
 */
#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED			 | \
					FEAT_CTL_VMX_ENABLED_INSIDE_SMX	 | \
					FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
					FEAT_CTL_SGX_LC_ENABLED		 | \
					FEAT_CTL_SGX_ENABLED		 | \
					FEAT_CTL_LMCE_ENABLED)

/*
 * Check whether @msr->data is an acceptable value for IA32_FEATURE_CONTROL.
 * A guest-initiated write is rejected once the MSR is locked, and is limited
 * to the vCPU's valid bits; a host-initiated write may set any bit in
 * KVM_SUPPORTED_FEATURE_CONTROL.
 */
static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
						    struct msr_data *msr)
{
	uint64_t valid_bits;

	/*
	 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
	 * exposed to the guest.
	 */
	WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
		     ~KVM_SUPPORTED_FEATURE_CONTROL);

	if (!msr->host_initiated &&
	    (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
		return false;

	if (msr->host_initiated)
		valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
	else
		valid_bits = vmx->msr_ia32_feature_control_valid_bits;

	return !(msr->data & ~valid_bits);
}

/*
 * Read a feature MSR.  Only the emulated VMX MSR range is handled here, and
 * only when nested virtualization is enabled (returns 1 otherwise); any
 * other MSR yields KVM_MSR_RET_UNSUPPORTED.
 */
int vmx_get_feature_msr(u32 msr, u64 *data)
{
	switch (msr) {
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!nested)
			return 1;
		return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
	default:
		return KVM_MSR_RET_UNSUPPORTED;
	}
}

/*
 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	u32 index;

	switch (msr_info->index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		msr_info->data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		msr_info->data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_info);
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		goto find_uret_msr;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		msr_info->data = vmx->msr_ia32_umwait_control;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		msr_info->data = to_vmx(vcpu)->spec_ctrl;
		break;
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if (!msr_info->host_initiated &&
		    !(vmx->msr_ia32_feature_control &
		      FEAT_CTL_LMCE_ENABLED))
			return 1;
		msr_info->data = vcpu->arch.mcg_ext_ctl;
		break;
	case MSR_IA32_FEAT_CTL:
		msr_info->data = vmx->msr_ia32_feature_control;
		break;
	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
		if (!msr_info->host_initiated &&
		    !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
			return 1;
		msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
			[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
		break;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return 1;
		if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
				    &msr_info->data))
			return 1;
#ifdef CONFIG_KVM_HYPERV
		/*
		 * Enlightened VMCS v1 doesn't have certain VMCS fields but
		 * instead of just ignoring the features, different Hyper-V
		 * versions are either trying to use them and fail or do some
		 * sanity checking and refuse to boot. Filter all unsupported
		 * features out.
		 */
		if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
			nested_evmcs_filter_control_msr(vcpu, msr_info->index,
							&msr_info->data);
#endif
		break;
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.ctl;
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!vmx_pt_mode_is_host_guest())
			return 1;
		msr_info->data = vmx->pt_desc.guest.status;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!vmx_pt_mode_is_host_guest() ||
			!intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cr3_filtering))
			return 1;
		msr_info->data = vmx->pt_desc.guest.cr3_match;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!vmx_pt_mode_is_host_guest() ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_base;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!vmx_pt_mode_is_host_guest() ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_mask;
		break;
	case MSR_IA32_RTIT_ADDR0_A ...
MSR_IA32_RTIT_ADDR3_B: 2147 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2148 if (!vmx_pt_mode_is_host_guest() || 2149 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2150 return 1; 2151 if (index % 2) 2152 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2153 else 2154 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2155 break; 2156 case MSR_IA32_DEBUGCTLMSR: 2157 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2158 break; 2159 default: 2160 find_uret_msr: 2161 msr = vmx_find_uret_msr(vmx, msr_info->index); 2162 if (msr) { 2163 msr_info->data = msr->data; 2164 break; 2165 } 2166 return kvm_get_msr_common(vcpu, msr_info); 2167 } 2168 2169 return 0; 2170 } 2171 2172 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2173 u64 data) 2174 { 2175 #ifdef CONFIG_X86_64 2176 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2177 return (u32)data; 2178 #endif 2179 return (unsigned long)data; 2180 } 2181 2182 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2183 { 2184 u64 debugctl = 0; 2185 2186 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2187 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2188 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2189 2190 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2191 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2192 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2193 2194 return debugctl; 2195 } 2196 2197 /* 2198 * Writes msr value into the appropriate "register". 2199 * Returns 0 on success, non-0 otherwise. 2200 * Assumes vcpu_load() was already called. 
2201 */ 2202 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2203 { 2204 struct vcpu_vmx *vmx = to_vmx(vcpu); 2205 struct vmx_uret_msr *msr; 2206 int ret = 0; 2207 u32 msr_index = msr_info->index; 2208 u64 data = msr_info->data; 2209 u32 index; 2210 2211 switch (msr_index) { 2212 case MSR_EFER: 2213 ret = kvm_set_msr_common(vcpu, msr_info); 2214 break; 2215 #ifdef CONFIG_X86_64 2216 case MSR_FS_BASE: 2217 vmx_segment_cache_clear(vmx); 2218 vmcs_writel(GUEST_FS_BASE, data); 2219 break; 2220 case MSR_GS_BASE: 2221 vmx_segment_cache_clear(vmx); 2222 vmcs_writel(GUEST_GS_BASE, data); 2223 break; 2224 case MSR_KERNEL_GS_BASE: 2225 vmx_write_guest_kernel_gs_base(vmx, data); 2226 break; 2227 case MSR_IA32_XFD: 2228 ret = kvm_set_msr_common(vcpu, msr_info); 2229 /* 2230 * Always intercepting WRMSR could incur non-negligible 2231 * overhead given xfd might be changed frequently in 2232 * guest context switch. Disable write interception 2233 * upon the first write with a non-zero value (indicating 2234 * potential usage on dynamic xfeatures). Also update 2235 * exception bitmap to trap #NM for proper virtualization 2236 * of guest xfd_err. 
2237 */ 2238 if (!ret && data) { 2239 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2240 MSR_TYPE_RW); 2241 vcpu->arch.xfd_no_write_intercept = true; 2242 vmx_update_exception_bitmap(vcpu); 2243 } 2244 break; 2245 #endif 2246 case MSR_IA32_SYSENTER_CS: 2247 if (is_guest_mode(vcpu)) 2248 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2249 vmcs_write32(GUEST_SYSENTER_CS, data); 2250 break; 2251 case MSR_IA32_SYSENTER_EIP: 2252 if (is_guest_mode(vcpu)) { 2253 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2254 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2255 } 2256 vmcs_writel(GUEST_SYSENTER_EIP, data); 2257 break; 2258 case MSR_IA32_SYSENTER_ESP: 2259 if (is_guest_mode(vcpu)) { 2260 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2261 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2262 } 2263 vmcs_writel(GUEST_SYSENTER_ESP, data); 2264 break; 2265 case MSR_IA32_DEBUGCTLMSR: { 2266 u64 invalid; 2267 2268 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2269 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2270 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2271 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2272 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2273 } 2274 2275 if (invalid) 2276 return 1; 2277 2278 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2279 VM_EXIT_SAVE_DEBUG_CONTROLS) 2280 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2281 2282 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2283 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2284 (data & DEBUGCTLMSR_LBR)) 2285 intel_pmu_create_guest_lbr_event(vcpu); 2286 return 0; 2287 } 2288 case MSR_IA32_BNDCFGS: 2289 if (!kvm_mpx_supported() || 2290 (!msr_info->host_initiated && 2291 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2292 return 1; 2293 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2294 (data & MSR_IA32_BNDCFGS_RSVD)) 2295 return 1; 2296 2297 if (is_guest_mode(vcpu) && 2298 ((vmx->nested.msrs.entry_ctls_high & 
VM_ENTRY_LOAD_BNDCFGS) || 2299 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2300 get_vmcs12(vcpu)->guest_bndcfgs = data; 2301 2302 vmcs_write64(GUEST_BNDCFGS, data); 2303 break; 2304 case MSR_IA32_UMWAIT_CONTROL: 2305 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2306 return 1; 2307 2308 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2309 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2310 return 1; 2311 2312 vmx->msr_ia32_umwait_control = data; 2313 break; 2314 case MSR_IA32_SPEC_CTRL: 2315 if (!msr_info->host_initiated && 2316 !guest_has_spec_ctrl_msr(vcpu)) 2317 return 1; 2318 2319 if (kvm_spec_ctrl_test_value(data)) 2320 return 1; 2321 2322 vmx->spec_ctrl = data; 2323 if (!data) 2324 break; 2325 2326 /* 2327 * For non-nested: 2328 * When it's written (to non-zero) for the first time, pass 2329 * it through. 2330 * 2331 * For nested: 2332 * The handling of the MSR bitmap for L2 guests is done in 2333 * nested_vmx_prepare_msr_bitmap. We should not touch the 2334 * vmcs02.msr_bitmap here since it gets completely overwritten 2335 * in the merging. We update the vmcs01 here for L1 as well 2336 * since it will end up touching the MSR anyway now. 
2337 */ 2338 vmx_disable_intercept_for_msr(vcpu, 2339 MSR_IA32_SPEC_CTRL, 2340 MSR_TYPE_RW); 2341 break; 2342 case MSR_IA32_TSX_CTRL: 2343 if (!msr_info->host_initiated && 2344 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2345 return 1; 2346 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2347 return 1; 2348 goto find_uret_msr; 2349 case MSR_IA32_CR_PAT: 2350 ret = kvm_set_msr_common(vcpu, msr_info); 2351 if (ret) 2352 break; 2353 2354 if (is_guest_mode(vcpu) && 2355 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2356 get_vmcs12(vcpu)->guest_ia32_pat = data; 2357 2358 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2359 vmcs_write64(GUEST_IA32_PAT, data); 2360 break; 2361 case MSR_IA32_MCG_EXT_CTL: 2362 if ((!msr_info->host_initiated && 2363 !(to_vmx(vcpu)->msr_ia32_feature_control & 2364 FEAT_CTL_LMCE_ENABLED)) || 2365 (data & ~MCG_EXT_CTL_LMCE_EN)) 2366 return 1; 2367 vcpu->arch.mcg_ext_ctl = data; 2368 break; 2369 case MSR_IA32_FEAT_CTL: 2370 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2371 return 1; 2372 2373 vmx->msr_ia32_feature_control = data; 2374 if (msr_info->host_initiated && data == 0) 2375 vmx_leave_nested(vcpu); 2376 2377 /* SGX may be enabled/disabled by guest's firmware */ 2378 vmx_write_encls_bitmap(vcpu, NULL); 2379 break; 2380 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2381 /* 2382 * On real hardware, the LE hash MSRs are writable before 2383 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2384 * at which point SGX related bits in IA32_FEATURE_CONTROL 2385 * become writable. 2386 * 2387 * KVM does not emulate SGX activation for simplicity, so 2388 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2389 * is unlocked. This is technically not architectural 2390 * behavior, but it's close enough. 
2391 */ 2392 if (!msr_info->host_initiated && 2393 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2394 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2395 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2396 return 1; 2397 vmx->msr_ia32_sgxlepubkeyhash 2398 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2399 break; 2400 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2401 if (!msr_info->host_initiated) 2402 return 1; /* they are read-only */ 2403 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2404 return 1; 2405 return vmx_set_vmx_msr(vcpu, msr_index, data); 2406 case MSR_IA32_RTIT_CTL: 2407 if (!vmx_pt_mode_is_host_guest() || 2408 vmx_rtit_ctl_check(vcpu, data) || 2409 vmx->nested.vmxon) 2410 return 1; 2411 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2412 vmx->pt_desc.guest.ctl = data; 2413 pt_update_intercept_for_msr(vcpu); 2414 break; 2415 case MSR_IA32_RTIT_STATUS: 2416 if (!pt_can_write_msr(vmx)) 2417 return 1; 2418 if (data & MSR_IA32_RTIT_STATUS_MASK) 2419 return 1; 2420 vmx->pt_desc.guest.status = data; 2421 break; 2422 case MSR_IA32_RTIT_CR3_MATCH: 2423 if (!pt_can_write_msr(vmx)) 2424 return 1; 2425 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2426 PT_CAP_cr3_filtering)) 2427 return 1; 2428 vmx->pt_desc.guest.cr3_match = data; 2429 break; 2430 case MSR_IA32_RTIT_OUTPUT_BASE: 2431 if (!pt_can_write_msr(vmx)) 2432 return 1; 2433 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2434 PT_CAP_topa_output) && 2435 !intel_pt_validate_cap(vmx->pt_desc.caps, 2436 PT_CAP_single_range_output)) 2437 return 1; 2438 if (!pt_output_base_valid(vcpu, data)) 2439 return 1; 2440 vmx->pt_desc.guest.output_base = data; 2441 break; 2442 case MSR_IA32_RTIT_OUTPUT_MASK: 2443 if (!pt_can_write_msr(vmx)) 2444 return 1; 2445 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2446 PT_CAP_topa_output) && 2447 !intel_pt_validate_cap(vmx->pt_desc.caps, 2448 PT_CAP_single_range_output)) 2449 return 1; 2450 vmx->pt_desc.guest.output_mask = data; 2451 break; 
2452 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2453 if (!pt_can_write_msr(vmx)) 2454 return 1; 2455 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2456 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2457 return 1; 2458 if (is_noncanonical_msr_address(data, vcpu)) 2459 return 1; 2460 if (index % 2) 2461 vmx->pt_desc.guest.addr_b[index / 2] = data; 2462 else 2463 vmx->pt_desc.guest.addr_a[index / 2] = data; 2464 break; 2465 case MSR_IA32_PERF_CAPABILITIES: 2466 if (data & PMU_CAP_LBR_FMT) { 2467 if ((data & PMU_CAP_LBR_FMT) != 2468 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2469 return 1; 2470 if (!cpuid_model_is_consistent(vcpu)) 2471 return 1; 2472 } 2473 if (data & PERF_CAP_PEBS_FORMAT) { 2474 if ((data & PERF_CAP_PEBS_MASK) != 2475 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2476 return 1; 2477 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2478 return 1; 2479 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2480 return 1; 2481 if (!cpuid_model_is_consistent(vcpu)) 2482 return 1; 2483 } 2484 ret = kvm_set_msr_common(vcpu, msr_info); 2485 break; 2486 2487 default: 2488 find_uret_msr: 2489 msr = vmx_find_uret_msr(vmx, msr_index); 2490 if (msr) 2491 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2492 else 2493 ret = kvm_set_msr_common(vcpu, msr_info); 2494 } 2495 2496 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2497 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2498 vmx_update_fb_clear_dis(vcpu, vmx); 2499 2500 return ret; 2501 } 2502 2503 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2504 { 2505 unsigned long guest_owned_bits; 2506 2507 kvm_register_mark_available(vcpu, reg); 2508 2509 switch (reg) { 2510 case VCPU_REGS_RSP: 2511 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2512 break; 2513 case VCPU_REGS_RIP: 2514 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2515 break; 2516 case VCPU_EXREG_PDPTR: 2517 if (enable_ept) 2518 ept_save_pdptrs(vcpu); 2519 break; 2520 case 
VCPU_EXREG_CR0: 2521 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2522 2523 vcpu->arch.cr0 &= ~guest_owned_bits; 2524 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2525 break; 2526 case VCPU_EXREG_CR3: 2527 /* 2528 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2529 * CR3 is loaded into hardware, not the guest's CR3. 2530 */ 2531 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2532 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2533 break; 2534 case VCPU_EXREG_CR4: 2535 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2536 2537 vcpu->arch.cr4 &= ~guest_owned_bits; 2538 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2539 break; 2540 default: 2541 KVM_BUG_ON(1, vcpu->kvm); 2542 break; 2543 } 2544 } 2545 2546 /* 2547 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2548 * directly instead of going through cpu_has(), to ensure KVM is trapping 2549 * ENCLS whenever it's supported in hardware. It does not matter whether 2550 * the host OS supports or has enabled SGX. 2551 */ 2552 static bool cpu_has_sgx(void) 2553 { 2554 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2555 } 2556 2557 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2558 { 2559 u32 vmx_msr_low, vmx_msr_high; 2560 u32 ctl = ctl_min | ctl_opt; 2561 2562 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2563 2564 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2565 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2566 2567 /* Ensure minimum (required) set of control bits are supported. 
*/ 2568 if (ctl_min & ~ctl) 2569 return -EIO; 2570 2571 *result = ctl; 2572 return 0; 2573 } 2574 2575 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2576 { 2577 u64 allowed; 2578 2579 rdmsrq(msr, allowed); 2580 2581 return ctl_opt & allowed; 2582 } 2583 2584 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ 2585 ({ \ 2586 int i, r = 0; \ 2587 \ 2588 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ 2589 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ 2590 \ 2591 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ 2592 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ 2593 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ 2594 \ 2595 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ 2596 continue; \ 2597 \ 2598 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ 2599 "entry = %llx (%llx), exit = %llx (%llx)\n", \ 2600 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ 2601 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ 2602 \ 2603 if (error_on_inconsistent_vmcs_config) \ 2604 r = -EIO; \ 2605 \ 2606 entry_controls &= ~n_ctrl; \ 2607 exit_controls &= ~x_ctrl; \ 2608 } \ 2609 r; \ 2610 }) 2611 2612 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2613 struct vmx_capability *vmx_cap) 2614 { 2615 u32 _pin_based_exec_control = 0; 2616 u32 _cpu_based_exec_control = 0; 2617 u32 _cpu_based_2nd_exec_control = 0; 2618 u64 _cpu_based_3rd_exec_control = 0; 2619 u32 _vmexit_control = 0; 2620 u32 _vmentry_control = 0; 2621 u64 basic_msr; 2622 u64 misc_msr; 2623 2624 /* 2625 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2626 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2627 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
2628 */ 2629 struct { 2630 u32 entry_control; 2631 u32 exit_control; 2632 } const vmcs_entry_exit_pairs[] = { 2633 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2634 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2635 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2636 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2637 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2638 }; 2639 2640 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2641 2642 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2643 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2644 MSR_IA32_VMX_PROCBASED_CTLS, 2645 &_cpu_based_exec_control)) 2646 return -EIO; 2647 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2648 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2649 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2650 MSR_IA32_VMX_PROCBASED_CTLS2, 2651 &_cpu_based_2nd_exec_control)) 2652 return -EIO; 2653 } 2654 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2655 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2656 2657 #ifndef CONFIG_X86_64 2658 if (!(_cpu_based_2nd_exec_control & 2659 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2660 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2661 #endif 2662 2663 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2664 _cpu_based_2nd_exec_control &= ~( 2665 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2666 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2667 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2668 2669 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2670 &vmx_cap->ept, &vmx_cap->vpid); 2671 2672 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2673 vmx_cap->ept) { 2674 pr_warn_once("EPT CAP should not exist if not support " 2675 "1-setting enable EPT VM-execution control\n"); 2676 2677 if (error_on_inconsistent_vmcs_config) 2678 return -EIO; 2679 2680 vmx_cap->ept = 0; 2681 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2682 
} 2683 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2684 vmx_cap->vpid) { 2685 pr_warn_once("VPID CAP should not exist if not support " 2686 "1-setting enable VPID VM-execution control\n"); 2687 2688 if (error_on_inconsistent_vmcs_config) 2689 return -EIO; 2690 2691 vmx_cap->vpid = 0; 2692 } 2693 2694 if (!cpu_has_sgx()) 2695 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2696 2697 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2698 _cpu_based_3rd_exec_control = 2699 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2700 MSR_IA32_VMX_PROCBASED_CTLS3); 2701 2702 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2703 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2704 MSR_IA32_VMX_EXIT_CTLS, 2705 &_vmexit_control)) 2706 return -EIO; 2707 2708 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2709 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2710 MSR_IA32_VMX_PINBASED_CTLS, 2711 &_pin_based_exec_control)) 2712 return -EIO; 2713 2714 if (cpu_has_broken_vmx_preemption_timer()) 2715 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2716 if (!(_cpu_based_2nd_exec_control & 2717 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2718 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2719 2720 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2721 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2722 MSR_IA32_VMX_ENTRY_CTLS, 2723 &_vmentry_control)) 2724 return -EIO; 2725 2726 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, 2727 _vmentry_control, _vmexit_control)) 2728 return -EIO; 2729 2730 /* 2731 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2732 * can't be used due to an errata where VM Exit may incorrectly clear 2733 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2734 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 
2735 */ 2736 switch (boot_cpu_data.x86_vfm) { 2737 case INTEL_NEHALEM_EP: /* AAK155 */ 2738 case INTEL_NEHALEM: /* AAP115 */ 2739 case INTEL_WESTMERE: /* AAT100 */ 2740 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2741 case INTEL_NEHALEM_EX: /* BA97 */ 2742 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2743 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2744 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2745 "does not work properly. Using workaround\n"); 2746 break; 2747 default: 2748 break; 2749 } 2750 2751 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 2752 2753 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2754 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2755 return -EIO; 2756 2757 #ifdef CONFIG_X86_64 2758 /* 2759 * KVM expects to be able to shove all legal physical addresses into 2760 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2761 * 0 for processors that support Intel 64 architecture". 2762 */ 2763 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2764 return -EIO; 2765 #endif 2766 2767 /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ 2768 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2769 return -EIO; 2770 2771 rdmsrq(MSR_IA32_VMX_MISC, misc_msr); 2772 2773 vmcs_conf->basic = basic_msr; 2774 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2775 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2776 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2777 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2778 vmcs_conf->vmexit_ctrl = _vmexit_control; 2779 vmcs_conf->vmentry_ctrl = _vmentry_control; 2780 vmcs_conf->misc = misc_msr; 2781 2782 #if IS_ENABLED(CONFIG_HYPERV) 2783 if (enlightened_vmcs) 2784 evmcs_sanitize_exec_ctrls(vmcs_conf); 2785 #endif 2786 2787 return 0; 2788 } 2789 2790 static bool __kvm_is_vmx_supported(void) 2791 { 2792 int cpu = smp_processor_id(); 2793 2794 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2795 pr_err("VMX not supported by CPU %d\n", cpu); 2796 return false; 2797 } 2798 2799 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2800 !this_cpu_has(X86_FEATURE_VMX)) { 2801 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2802 return false; 2803 } 2804 2805 return true; 2806 } 2807 2808 static bool kvm_is_vmx_supported(void) 2809 { 2810 bool supported; 2811 2812 migrate_disable(); 2813 supported = __kvm_is_vmx_supported(); 2814 migrate_enable(); 2815 2816 return supported; 2817 } 2818 2819 int vmx_check_processor_compat(void) 2820 { 2821 int cpu = raw_smp_processor_id(); 2822 struct vmcs_config vmcs_conf; 2823 struct vmx_capability vmx_cap; 2824 2825 if (!__kvm_is_vmx_supported()) 2826 return -EIO; 2827 2828 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2829 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2830 return -EIO; 2831 } 2832 if (nested) 2833 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2834 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2835 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2836 return -EIO; 2837 } 2838 return 0; 
2839 } 2840 2841 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2842 { 2843 u64 msr; 2844 2845 cr4_set_bits(X86_CR4_VMXE); 2846 2847 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2848 _ASM_EXTABLE(1b, %l[fault]) 2849 : : [vmxon_pointer] "m"(vmxon_pointer) 2850 : : fault); 2851 return 0; 2852 2853 fault: 2854 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2855 rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 2856 cr4_clear_bits(X86_CR4_VMXE); 2857 2858 return -EFAULT; 2859 } 2860 2861 int vmx_enable_virtualization_cpu(void) 2862 { 2863 int cpu = raw_smp_processor_id(); 2864 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2865 int r; 2866 2867 if (cr4_read_shadow() & X86_CR4_VMXE) 2868 return -EBUSY; 2869 2870 /* 2871 * This can happen if we hot-added a CPU but failed to allocate 2872 * VP assist page for it. 2873 */ 2874 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2875 return -EFAULT; 2876 2877 intel_pt_handle_vmx(1); 2878 2879 r = kvm_cpu_vmxon(phys_addr); 2880 if (r) { 2881 intel_pt_handle_vmx(0); 2882 return r; 2883 } 2884 2885 return 0; 2886 } 2887 2888 static void vmclear_local_loaded_vmcss(void) 2889 { 2890 int cpu = raw_smp_processor_id(); 2891 struct loaded_vmcs *v, *n; 2892 2893 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2894 loaded_vmcss_on_cpu_link) 2895 __loaded_vmcs_clear(v); 2896 } 2897 2898 void vmx_disable_virtualization_cpu(void) 2899 { 2900 vmclear_local_loaded_vmcss(); 2901 2902 if (kvm_cpu_vmxoff()) 2903 kvm_spurious_fault(); 2904 2905 hv_reset_evmcs(); 2906 2907 intel_pt_handle_vmx(0); 2908 } 2909 2910 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2911 { 2912 int node = cpu_to_node(cpu); 2913 struct page *pages; 2914 struct vmcs *vmcs; 2915 2916 pages = __alloc_pages_node(node, flags, 0); 2917 if (!pages) 2918 return NULL; 2919 vmcs = page_address(pages); 2920 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2921 2922 /* KVM supports Enlightened VMCS v1 only */ 2923 
if (kvm_is_using_evmcs()) 2924 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2925 else 2926 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2927 2928 if (shadow) 2929 vmcs->hdr.shadow_vmcs = 1; 2930 return vmcs; 2931 } 2932 2933 void free_vmcs(struct vmcs *vmcs) 2934 { 2935 free_page((unsigned long)vmcs); 2936 } 2937 2938 /* 2939 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2940 */ 2941 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2942 { 2943 if (!loaded_vmcs->vmcs) 2944 return; 2945 loaded_vmcs_clear(loaded_vmcs); 2946 free_vmcs(loaded_vmcs->vmcs); 2947 loaded_vmcs->vmcs = NULL; 2948 if (loaded_vmcs->msr_bitmap) 2949 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2950 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2951 } 2952 2953 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2954 { 2955 loaded_vmcs->vmcs = alloc_vmcs(false); 2956 if (!loaded_vmcs->vmcs) 2957 return -ENOMEM; 2958 2959 vmcs_clear(loaded_vmcs->vmcs); 2960 2961 loaded_vmcs->shadow_vmcs = NULL; 2962 loaded_vmcs->hv_timer_soft_disabled = false; 2963 loaded_vmcs->cpu = -1; 2964 loaded_vmcs->launched = 0; 2965 2966 if (cpu_has_vmx_msr_bitmap()) { 2967 loaded_vmcs->msr_bitmap = (unsigned long *) 2968 __get_free_page(GFP_KERNEL_ACCOUNT); 2969 if (!loaded_vmcs->msr_bitmap) 2970 goto out_vmcs; 2971 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2972 } 2973 2974 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2975 memset(&loaded_vmcs->controls_shadow, 0, 2976 sizeof(struct vmcs_controls_shadow)); 2977 2978 return 0; 2979 2980 out_vmcs: 2981 free_loaded_vmcs(loaded_vmcs); 2982 return -ENOMEM; 2983 } 2984 2985 static void free_kvm_area(void) 2986 { 2987 int cpu; 2988 2989 for_each_possible_cpu(cpu) { 2990 free_vmcs(per_cpu(vmxarea, cpu)); 2991 per_cpu(vmxarea, cpu) = NULL; 2992 } 2993 } 2994 2995 static __init int alloc_kvm_area(void) 2996 { 2997 int cpu; 2998 2999 for_each_possible_cpu(cpu) { 3000 struct vmcs *vmcs; 
3001 3002 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 3003 if (!vmcs) { 3004 free_kvm_area(); 3005 return -ENOMEM; 3006 } 3007 3008 /* 3009 * When eVMCS is enabled, alloc_vmcs_cpu() sets 3010 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 3011 * revision_id reported by MSR_IA32_VMX_BASIC. 3012 * 3013 * However, even though not explicitly documented by 3014 * TLFS, VMXArea passed as VMXON argument should 3015 * still be marked with revision_id reported by 3016 * physical CPU. 3017 */ 3018 if (kvm_is_using_evmcs()) 3019 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3020 3021 per_cpu(vmxarea, cpu) = vmcs; 3022 } 3023 return 0; 3024 } 3025 3026 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3027 struct kvm_segment *save) 3028 { 3029 if (!emulate_invalid_guest_state) { 3030 /* 3031 * CS and SS RPL should be equal during guest entry according 3032 * to VMX spec, but in reality it is not always so. Since vcpu 3033 * is in the middle of the transition from real mode to 3034 * protected mode it is safe to assume that RPL 0 is a good 3035 * default value. 3036 */ 3037 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3038 save->selector &= ~SEGMENT_RPL_MASK; 3039 save->dpl = save->selector & SEGMENT_RPL_MASK; 3040 save->s = 1; 3041 } 3042 __vmx_set_segment(vcpu, save, seg); 3043 } 3044 3045 static void enter_pmode(struct kvm_vcpu *vcpu) 3046 { 3047 unsigned long flags; 3048 struct vcpu_vmx *vmx = to_vmx(vcpu); 3049 3050 /* 3051 * Update real mode segment cache. It may be not up-to-date if segment 3052 * register was written while vcpu was in a guest mode. 
3053 */ 3054 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3055 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3056 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3057 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3058 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3059 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3060 3061 vmx->rmode.vm86_active = 0; 3062 3063 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3064 3065 flags = vmcs_readl(GUEST_RFLAGS); 3066 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3067 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3068 vmcs_writel(GUEST_RFLAGS, flags); 3069 3070 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3071 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3072 3073 vmx_update_exception_bitmap(vcpu); 3074 3075 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3076 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3077 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3078 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3079 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3080 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3081 } 3082 3083 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3084 { 3085 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3086 struct kvm_segment var = *save; 3087 3088 var.dpl = 0x3; 3089 if (seg == VCPU_SREG_CS) 3090 var.type = 0x3; 3091 3092 if (!emulate_invalid_guest_state) { 3093 var.selector = var.base >> 4; 3094 var.base = var.base & 0xffff0; 3095 var.limit = 0xffff; 3096 var.g = 0; 3097 var.db = 0; 3098 var.present = 1; 3099 var.s = 1; 3100 var.l = 0; 3101 var.unusable = 0; 3102 var.type = 0x3; 3103 var.avl = 0; 3104 if (save->base & 0xf) 3105 pr_warn_once("segment 
base is not paragraph aligned " 3106 "when entering protected mode (seg=%d)", seg); 3107 } 3108 3109 vmcs_write16(sf->selector, var.selector); 3110 vmcs_writel(sf->base, var.base); 3111 vmcs_write32(sf->limit, var.limit); 3112 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3113 } 3114 3115 static void enter_rmode(struct kvm_vcpu *vcpu) 3116 { 3117 unsigned long flags; 3118 struct vcpu_vmx *vmx = to_vmx(vcpu); 3119 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3120 3121 /* 3122 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3123 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3124 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3125 * should VM-Fail and KVM should reject userspace attempts to stuff 3126 * CR0.PG=0 when L2 is active. 3127 */ 3128 WARN_ON_ONCE(is_guest_mode(vcpu)); 3129 3130 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3131 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3132 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3133 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3134 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3135 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3136 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3137 3138 vmx->rmode.vm86_active = 1; 3139 3140 vmx_segment_cache_clear(vmx); 3141 3142 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3143 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3144 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3145 3146 flags = vmcs_readl(GUEST_RFLAGS); 3147 vmx->rmode.save_rflags = flags; 3148 3149 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3150 3151 vmcs_writel(GUEST_RFLAGS, flags); 3152 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3153 vmx_update_exception_bitmap(vcpu); 3154 3155 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3156 
	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
}

/*
 * Record the guest's EFER and update the dependent VMX state.
 *
 * On 64-bit builds the "IA-32e mode" VM-entry control is made to follow
 * EFER.LMA.  On 32-bit builds EFER.LMA must never be set; if it is, the VM
 * is flagged via KVM_BUG_ON() and 1 is returned.  Returns 0 otherwise,
 * including when the vCPU has no user-return MSR slot for EFER.
 */
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Nothing to do if hardware doesn't support EFER. */
	if (!vmx_find_uret_msr(vmx, MSR_EFER))
		return 0;

	vcpu->arch.efer = efer;
#ifdef CONFIG_X86_64
	if (efer & EFER_LMA)
		vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
	else
		vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
#else
	if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
		return 1;
#endif

	vmx_setup_uret_msrs(vmx);
	return 0;
}

#ifdef CONFIG_X86_64

/*
 * Activate long mode: make sure TR has the busy 64-bit TSS type, then set
 * EFER.LMA through vmx_set_efer().
 */
static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	vmx_segment_cache_clear(to_vmx(vcpu));

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
				     __func__);
		/* Replace only the type field, keep the other AR bits. */
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
			     | VMX_AR_TYPE_BUSY_64_TSS);
	}
	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}

/* Deactivate long mode by clearing EFER.LMA through vmx_set_efer(). */
static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}

#endif

/*
 * Flush all TLB entries for this vCPU: a global EPT sync when EPT is in
 * use, otherwise a VPID-based sync when VPID is enabled.
 */
void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
	 * the CPU is not required to invalidate guest-physical mappings on
	 * VM-Entry, even if VPID is disabled.
Guest-physical mappings are 3220 * associated with the root EPT structure and not any particular VPID 3221 * (INVVPID also isn't required to invalidate guest-physical mappings). 3222 */ 3223 if (enable_ept) { 3224 ept_sync_global(); 3225 } else if (enable_vpid) { 3226 if (cpu_has_vmx_invvpid_global()) { 3227 vpid_sync_vcpu_global(); 3228 } else { 3229 vpid_sync_vcpu_single(vmx->vpid); 3230 vpid_sync_vcpu_single(vmx->nested.vpid02); 3231 } 3232 } 3233 } 3234 3235 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3236 { 3237 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3238 return nested_get_vpid02(vcpu); 3239 return to_vmx(vcpu)->vpid; 3240 } 3241 3242 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3243 { 3244 struct kvm_mmu *mmu = vcpu->arch.mmu; 3245 u64 root_hpa = mmu->root.hpa; 3246 3247 /* No flush required if the current context is invalid. */ 3248 if (!VALID_PAGE(root_hpa)) 3249 return; 3250 3251 if (enable_ept) 3252 ept_sync_context(construct_eptp(vcpu, root_hpa, 3253 mmu->root_role.level)); 3254 else 3255 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3256 } 3257 3258 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3259 { 3260 /* 3261 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3262 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3263 */ 3264 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3265 } 3266 3267 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3268 { 3269 /* 3270 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3271 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3272 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3273 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3274 * i.e. no explicit INVVPID is necessary. 
	 */
	vpid_sync_context(vmx_get_current_vpid(vcpu));
}

/*
 * Write the cached PDPTRs of the walk MMU into the VMCS.  Does nothing
 * unless the PDPTR register cache is dirty, and only writes when the guest
 * uses PAE paging.
 */
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
		return;

	if (is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
	}
}

/*
 * Read the four guest PDPTRs from the VMCS into the walk MMU's cache and
 * mark the PDPTR register as available.  Warns once and bails if the guest
 * is not using PAE paging.
 */
void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
		return;

	mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
	mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
	mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
	mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);

	kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}

/* Both CR3 intercepts (load and store), always toggled together below. */
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
			  CPU_BASED_CR3_STORE_EXITING)

/*
 * Check whether @cr0 is acceptable.  While L2 is active the nested guest
 * rules apply, after VMXON the nested host rules apply, otherwise any value
 * is accepted here.
 */
bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (is_guest_mode(vcpu))
		return nested_guest_cr0_valid(vcpu, cr0);

	if (to_vmx(vcpu)->nested.vmxon)
		return nested_host_cr0_valid(vcpu, cr0);

	return true;
}

/*
 * Install a new guest CR0.
 *
 * @cr0 is the guest-visible value: it is written to CR0_READ_SHADOW and
 * vcpu->arch.cr0.  hw_cr0 is the value actually loaded into GUEST_CR0 after
 * the always-off bits are stripped and the always-on bits are forced.
 * Without unrestricted guest, changes of CR0.PE additionally switch the
 * vCPU into or out of vm86-based real mode.
 */
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr0, old_cr0_pg;
	u32 tmp;

	/* Sampled before vcpu->arch.cr0 is overwritten below. */
	old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);

	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
	if (enable_unrestricted_guest)
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
	else {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
		if (!enable_ept)
			hw_cr0 |= X86_CR0_WP;

		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
			enter_pmode(vcpu);

		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
			enter_rmode(vcpu);
	}

	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0, hw_cr0);
	vcpu->arch.cr0 = cr0;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);

#ifdef CONFIG_X86_64
	/* With EFER.LME set, toggling CR0.PG enters or leaves long mode. */
	if (vcpu->arch.efer & EFER_LME) {
		if (!old_cr0_pg && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	if (enable_ept && !enable_unrestricted_guest) {
		/*
		 * Ensure KVM has an up-to-date snapshot of the guest's CR3.  If
		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
		 * KVM's CR3 is installed.
		 */
		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);

		/*
		 * When running with EPT but not unrestricted guest, KVM must
		 * intercept CR3 accesses when paging is _disabled_.  This is
		 * necessary because restricted guests can't actually run with
		 * paging disabled, and so KVM stuffs its own CR3 in order to
		 * run the guest with identity mapped page tables.
		 *
		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
		 * update, it may be stale with respect to CR3 interception,
		 * e.g. after nested VM-Enter.
		 *
		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
		 * stores to forward them to L1, even if KVM does not need to
		 * intercept them to preserve its identity mapped page tables.
		 */
		if (!(cr0 & X86_CR0_PG)) {
			exec_controls_setbit(vmx, CR3_EXITING_BITS);
		} else if (!is_guest_mode(vcpu)) {
			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
		} else {
			tmp = exec_controls_get(vmx);
			tmp &= ~CR3_EXITING_BITS;
			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
			exec_controls_set(vmx, tmp);
		}

		/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
		if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));

		/*
		 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
		 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
		 */
		if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
			kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
	}

	/* depends on vcpu->arch.cr0 to be set to a new value */
	vmx->emulation_required = vmx_emulation_required(vcpu);
}

/* Return 5 if the CPU supports 5-level EPT, 4 otherwise. */
static int vmx_get_max_ept_level(void)
{
	if (cpu_has_vmx_ept_5levels())
		return 5;
	return 4;
}

/*
 * Build an EPT pointer for @root_hpa: write-back memory type, a page-walk
 * length matching @root_level, and the A/D enable bit when
 * enable_ept_ad_bits is set and, while L2 is active, nested EPT A/D is
 * enabled as well.
 */
u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	u64 eptp = VMX_EPTP_MT_WB;

	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (enable_ept_ad_bits &&
	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
		eptp |= VMX_EPTP_AD_ENABLE_BIT;
	eptp |= root_hpa;

	return eptp;
}

/*
 * Load a new MMU root.  With EPT the root goes into EPT_POINTER and
 * GUEST_CR3 is only rewritten when needed; without EPT the root (plus PCID
 * and LAM bits) is written to GUEST_CR3 directly.
 */
void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
	struct kvm *kvm = vcpu->kvm;
	bool update_guest_cr3 = true;
	unsigned long guest_cr3;
	u64 eptp;

	if (enable_ept) {
		eptp = construct_eptp(vcpu, root_hpa, root_level);
		vmcs_write64(EPT_POINTER, eptp);

		hv_track_root_tdp(vcpu, root_hpa);

		if (!enable_unrestricted_guest && !is_paging(vcpu))
			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
		else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
			guest_cr3 = vcpu->arch.cr3;
		else /* vmcs.GUEST_CR3 is already up-to-date.
		     */
			update_guest_cr3 = false;
		vmx_ept_load_pdptrs(vcpu);
	} else {
		guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) |
			    kvm_get_active_cr3_lam_bits(vcpu);
	}

	if (update_guest_cr3)
		vmcs_writel(GUEST_CR3, guest_cr3);
}

/*
 * Check whether @cr4 is acceptable: CR4.VMXE may not be set while in SMM,
 * and after VMXON the value must also pass nested_cr4_valid().
 */
bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	/*
	 * We operate under the default treatment of SMM, so VMX cannot be
	 * enabled under SMM.  Note, whether or not VMXE is allowed at all,
	 * i.e. is a reserved bit, is handled by common x86 code.
	 */
	if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
		return false;

	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
		return false;

	return true;
}

/*
 * Install a new guest CR4.
 *
 * @cr4 is the guest-visible value: it is written to CR4_READ_SHADOW and
 * vcpu->arch.cr4.  hw_cr4 is the value actually loaded into GUEST_CR4; it
 * takes MCE from the host, has the mode-dependent always-on bits forced,
 * and is further adjusted for UMIP emulation and for running a non-paging
 * guest without unrestricted guest support.
 */
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long hw_cr4;

	/*
	 * Pass through host's Machine Check Enable value to hw_cr4, which
	 * is in force while we are in guest mode.  Do not let guests control
	 * this bit, even if host CR4.MCE == 0.
	 */
	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
	if (enable_unrestricted_guest)
		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
	else if (vmx->rmode.vm86_active)
		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
	else
		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;

	/*
	 * When UMIP is emulated, a guest CR4.UMIP=1 is not loaded into
	 * hardware; descriptor-table exiting is enabled instead.  It is only
	 * disabled again if L1 does not want it for L2.
	 */
	if (vmx_umip_emulated()) {
		if (cr4 & X86_CR4_UMIP) {
			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
			hw_cr4 &= ~X86_CR4_UMIP;
		} else if (!is_guest_mode(vcpu) ||
			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
		}
	}

	vcpu->arch.cr4 = cr4;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);

	if (!enable_unrestricted_guest) {
		if (enable_ept) {
			if (!is_paging(vcpu)) {
				hw_cr4 &= ~X86_CR4_PAE;
				hw_cr4 |= X86_CR4_PSE;
			} else if (!(cr4 & X86_CR4_PAE)) {
				hw_cr4 &= ~X86_CR4_PAE;
			}
		}

		/*
		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
		 * to be manually disabled when guest switches to non-paging
		 * mode.
		 *
		 * If !enable_unrestricted_guest, the CPU is always running
		 * with CR0.PG=1 and CR4 needs to be modified.
		 * If enable_unrestricted_guest, the CPU automatically
		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
		 */
		if (!is_paging(vcpu))
			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}

	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, hw_cr4);

	/* A change of OSXSAVE or PKE marks the dynamic CPUID bits dirty. */
	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		vcpu->arch.cpuid_dynamic_bits_dirty = true;
}

/*
 * Read segment register @seg into @var.
 *
 * In vm86 mode (LDTR excepted) the copy saved in rmode.segs is returned;
 * for segments other than TR whose current selector differs from the saved
 * one, base and selector are refreshed from the guest state.  Otherwise all
 * fields are read from the guest state and the access-rights word is
 * unpacked.
 */
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 ar;

	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
		*var = vmx->rmode.segs[seg];
		if (seg == VCPU_SREG_TR
		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
			return;
		var->base = vmx_read_guest_seg_base(vmx, seg);
		var->selector = vmx_read_guest_seg_selector(vmx, seg);
		return;
	}
	var->base = vmx_read_guest_seg_base(vmx, seg);
	var->limit = vmx_read_guest_seg_limit(vmx, seg);
	var->selector = vmx_read_guest_seg_selector(vmx, seg);
	ar = vmx_read_guest_seg_ar(vmx, seg);
	var->unusable = (ar >> 16) & 1;
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
	/*
	 * Some userspaces do not preserve unusable property. Since usable
	 * segment has to be present according to VMX spec we can use present
	 * property to amend userspace bug by making unusable segment always
	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
	 * segment as unusable.
3570 */ 3571 var->present = !var->unusable; 3572 var->avl = (ar >> 12) & 1; 3573 var->l = (ar >> 13) & 1; 3574 var->db = (ar >> 14) & 1; 3575 var->g = (ar >> 15) & 1; 3576 } 3577 3578 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3579 { 3580 struct kvm_segment s; 3581 3582 if (to_vmx(vcpu)->rmode.vm86_active) { 3583 vmx_get_segment(vcpu, &s, seg); 3584 return s.base; 3585 } 3586 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3587 } 3588 3589 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3590 { 3591 struct vcpu_vmx *vmx = to_vmx(vcpu); 3592 int ar; 3593 3594 if (unlikely(vmx->rmode.vm86_active)) 3595 return 0; 3596 3597 if (no_cache) 3598 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3599 else 3600 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3601 return VMX_AR_DPL(ar); 3602 } 3603 3604 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3605 { 3606 return __vmx_get_cpl(vcpu, false); 3607 } 3608 3609 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3610 { 3611 return __vmx_get_cpl(vcpu, true); 3612 } 3613 3614 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3615 { 3616 u32 ar; 3617 3618 ar = var->type & 15; 3619 ar |= (var->s & 1) << 4; 3620 ar |= (var->dpl & 3) << 5; 3621 ar |= (var->present & 1) << 7; 3622 ar |= (var->avl & 1) << 12; 3623 ar |= (var->l & 1) << 13; 3624 ar |= (var->db & 1) << 14; 3625 ar |= (var->g & 1) << 15; 3626 ar |= (var->unusable || !var->present) << 16; 3627 3628 return ar; 3629 } 3630 3631 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3632 { 3633 struct vcpu_vmx *vmx = to_vmx(vcpu); 3634 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3635 3636 vmx_segment_cache_clear(vmx); 3637 3638 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3639 vmx->rmode.segs[seg] = *var; 3640 if (seg == VCPU_SREG_TR) 3641 vmcs_write16(sf->selector, var->selector); 3642 else if (var->s) 3643 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3644 return; 3645 } 3646 3647 
vmcs_writel(sf->base, var->base); 3648 vmcs_write32(sf->limit, var->limit); 3649 vmcs_write16(sf->selector, var->selector); 3650 3651 /* 3652 * Fix the "Accessed" bit in AR field of segment registers for older 3653 * qemu binaries. 3654 * IA32 arch specifies that at the time of processor reset the 3655 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3656 * is setting it to 0 in the userland code. This causes invalid guest 3657 * state vmexit when "unrestricted guest" mode is turned on. 3658 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3659 * tree. Newer qemu binaries with that qemu fix would not need this 3660 * kvm hack. 3661 */ 3662 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3663 var->type |= 0x1; /* Accessed */ 3664 3665 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3666 } 3667 3668 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3669 { 3670 __vmx_set_segment(vcpu, var, seg); 3671 3672 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3673 } 3674 3675 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3676 { 3677 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3678 3679 *db = (ar >> 14) & 1; 3680 *l = (ar >> 13) & 1; 3681 } 3682 3683 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3684 { 3685 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3686 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3687 } 3688 3689 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3690 { 3691 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3692 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3693 } 3694 3695 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3696 { 3697 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3698 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3699 } 3700 3701 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3702 { 3703 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3704 vmcs_writel(GUEST_GDTR_BASE, 
dt->address); 3705 } 3706 3707 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3708 { 3709 struct kvm_segment var; 3710 u32 ar; 3711 3712 vmx_get_segment(vcpu, &var, seg); 3713 var.dpl = 0x3; 3714 if (seg == VCPU_SREG_CS) 3715 var.type = 0x3; 3716 ar = vmx_segment_access_rights(&var); 3717 3718 if (var.base != (var.selector << 4)) 3719 return false; 3720 if (var.limit != 0xffff) 3721 return false; 3722 if (ar != 0xf3) 3723 return false; 3724 3725 return true; 3726 } 3727 3728 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3729 { 3730 struct kvm_segment cs; 3731 unsigned int cs_rpl; 3732 3733 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3734 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3735 3736 if (cs.unusable) 3737 return false; 3738 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3739 return false; 3740 if (!cs.s) 3741 return false; 3742 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3743 if (cs.dpl > cs_rpl) 3744 return false; 3745 } else { 3746 if (cs.dpl != cs_rpl) 3747 return false; 3748 } 3749 if (!cs.present) 3750 return false; 3751 3752 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3753 return true; 3754 } 3755 3756 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3757 { 3758 struct kvm_segment ss; 3759 unsigned int ss_rpl; 3760 3761 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3762 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3763 3764 if (ss.unusable) 3765 return true; 3766 if (ss.type != 3 && ss.type != 7) 3767 return false; 3768 if (!ss.s) 3769 return false; 3770 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3771 return false; 3772 if (!ss.present) 3773 return false; 3774 3775 return true; 3776 } 3777 3778 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3779 { 3780 struct kvm_segment var; 3781 unsigned int rpl; 3782 3783 vmx_get_segment(vcpu, &var, seg); 3784 rpl = var.selector & SEGMENT_RPL_MASK; 3785 3786 if (var.unusable) 3787 return true; 3788 if (!var.s) 
3789 return false; 3790 if (!var.present) 3791 return false; 3792 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3793 if (var.dpl < rpl) /* DPL < RPL */ 3794 return false; 3795 } 3796 3797 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3798 * rights flags 3799 */ 3800 return true; 3801 } 3802 3803 static bool tr_valid(struct kvm_vcpu *vcpu) 3804 { 3805 struct kvm_segment tr; 3806 3807 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3808 3809 if (tr.unusable) 3810 return false; 3811 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3812 return false; 3813 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3814 return false; 3815 if (!tr.present) 3816 return false; 3817 3818 return true; 3819 } 3820 3821 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3822 { 3823 struct kvm_segment ldtr; 3824 3825 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3826 3827 if (ldtr.unusable) 3828 return true; 3829 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3830 return false; 3831 if (ldtr.type != 2) 3832 return false; 3833 if (!ldtr.present) 3834 return false; 3835 3836 return true; 3837 } 3838 3839 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3840 { 3841 struct kvm_segment cs, ss; 3842 3843 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3844 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3845 3846 return ((cs.selector & SEGMENT_RPL_MASK) == 3847 (ss.selector & SEGMENT_RPL_MASK)); 3848 } 3849 3850 /* 3851 * Check if guest state is valid. Returns true if valid, false if 3852 * not. 
3853 * We assume that registers are always usable 3854 */ 3855 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3856 { 3857 /* real mode guest state checks */ 3858 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3859 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3860 return false; 3861 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3862 return false; 3863 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3864 return false; 3865 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3866 return false; 3867 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3868 return false; 3869 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3870 return false; 3871 } else { 3872 /* protected mode guest state checks */ 3873 if (!cs_ss_rpl_check(vcpu)) 3874 return false; 3875 if (!code_segment_valid(vcpu)) 3876 return false; 3877 if (!stack_segment_valid(vcpu)) 3878 return false; 3879 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3880 return false; 3881 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3882 return false; 3883 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3884 return false; 3885 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3886 return false; 3887 if (!tr_valid(vcpu)) 3888 return false; 3889 if (!ldtr_valid(vcpu)) 3890 return false; 3891 } 3892 /* TODO: 3893 * - Add checks on RIP 3894 * - Add checks on RFLAGS 3895 */ 3896 3897 return true; 3898 } 3899 3900 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3901 { 3902 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3903 u16 data; 3904 int i; 3905 3906 for (i = 0; i < 3; i++) { 3907 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3908 return -EFAULT; 3909 } 3910 3911 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3912 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3913 return -EFAULT; 3914 3915 data = ~0; 3916 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3917 return -EFAULT; 3918 3919 return 0; 3920 } 3921 3922 static int 
init_rmode_identity_map(struct kvm *kvm) 3923 { 3924 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3925 int i, r = 0; 3926 void __user *uaddr; 3927 u32 tmp; 3928 3929 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 3930 mutex_lock(&kvm->slots_lock); 3931 3932 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3933 goto out; 3934 3935 if (!kvm_vmx->ept_identity_map_addr) 3936 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3937 3938 uaddr = __x86_set_memory_region(kvm, 3939 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3940 kvm_vmx->ept_identity_map_addr, 3941 PAGE_SIZE); 3942 if (IS_ERR(uaddr)) { 3943 r = PTR_ERR(uaddr); 3944 goto out; 3945 } 3946 3947 /* Set up identity-mapping pagetable for EPT in real mode */ 3948 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3949 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3950 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3951 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3952 r = -EFAULT; 3953 goto out; 3954 } 3955 } 3956 kvm_vmx->ept_identity_pagetable_done = true; 3957 3958 out: 3959 mutex_unlock(&kvm->slots_lock); 3960 return r; 3961 } 3962 3963 static void seg_setup(int seg) 3964 { 3965 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3966 unsigned int ar; 3967 3968 vmcs_write16(sf->selector, 0); 3969 vmcs_writel(sf->base, 0); 3970 vmcs_write32(sf->limit, 0xffff); 3971 ar = 0x93; 3972 if (seg == VCPU_SREG_CS) 3973 ar |= 0x08; /* code segment */ 3974 3975 vmcs_write32(sf->ar_bytes, ar); 3976 } 3977 3978 int allocate_vpid(void) 3979 { 3980 int vpid; 3981 3982 if (!enable_vpid) 3983 return 0; 3984 spin_lock(&vmx_vpid_lock); 3985 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3986 if (vpid < VMX_NR_VPIDS) 3987 __set_bit(vpid, vmx_vpid_bitmap); 3988 else 3989 vpid = 0; 3990 spin_unlock(&vmx_vpid_lock); 3991 return vpid; 3992 } 3993 3994 void free_vpid(int vpid) 3995 { 3996 if (!enable_vpid || vpid == 0) 3997 return; 3998 spin_lock(&vmx_vpid_lock); 
	__clear_bit(vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
}

/*
 * Note that the vmcs01 MSR bitmap is about to change: drop the eVMCS
 * "MSR bitmap clean" flag when that enlightenment is in use, and force a
 * recalculation of the nested MSR bitmap.
 */
static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
	/*
	 * When KVM is a nested hypervisor on top of Hyper-V and uses
	 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
	 * bitmap has changed.
	 */
	if (kvm_is_using_evmcs()) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		if (evmcs->hv_enlightenments_control.msr_bitmap)
			evmcs->hv_clean_fields &=
				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
	}

	vmx->nested.force_msr_bitmap_recalc = true;
}

/*
 * Stop intercepting accesses of @type (MSR_TYPE_R and/or MSR_TYPE_W) to
 * @msr in the vmcs01 MSR bitmap, except for access types the userspace MSR
 * filter disallows, which are intercepted instead.
 */
void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
	int idx;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	vmx_msr_bitmap_l01_changed(vmx);

	/*
	 * Mark the desired intercept state in shadow bitmap, this is needed
	 * for resync when the MSR filters change.
	 */
	idx = vmx_get_passthrough_msr_slot(msr);
	if (idx >= 0) {
		if (type & MSR_TYPE_R)
			clear_bit(idx, vmx->shadow_msr_intercept.read);
		if (type & MSR_TYPE_W)
			clear_bit(idx, vmx->shadow_msr_intercept.write);
	}

	/*
	 * If the MSR filter denies an access type, intercept it and drop it
	 * from @type so that it is not passed through below.
	 */
	if ((type & MSR_TYPE_R) &&
	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
		vmx_set_msr_bitmap_read(msr_bitmap, msr);
		type &= ~MSR_TYPE_R;
	}

	if ((type & MSR_TYPE_W) &&
	    !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
		vmx_set_msr_bitmap_write(msr_bitmap, msr);
		type &= ~MSR_TYPE_W;
	}

	if (type & MSR_TYPE_R)
		vmx_clear_msr_bitmap_read(msr_bitmap, msr);

	if (type & MSR_TYPE_W)
		vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}

/*
 * Intercept accesses of @type (MSR_TYPE_R and/or MSR_TYPE_W) to @msr in
 * the vmcs01 MSR bitmap and record that in the shadow intercept bitmaps.
 */
void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
	int idx;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	vmx_msr_bitmap_l01_changed(vmx);

	/*
	 * Mark the desired intercept state in shadow bitmap, this is needed
	 * for resync when the MSR filter changes.
	 */
	idx = vmx_get_passthrough_msr_slot(msr);
	if (idx >= 0) {
		if (type & MSR_TYPE_R)
			set_bit(idx, vmx->shadow_msr_intercept.read);
		if (type & MSR_TYPE_W)
			set_bit(idx, vmx->shadow_msr_intercept.write);
	}

	if (type & MSR_TYPE_R)
		vmx_set_msr_bitmap_read(msr_bitmap, msr);

	if (type & MSR_TYPE_W)
		vmx_set_msr_bitmap_write(msr_bitmap, msr);
}

/*
 * Recompute the x2APIC portion of the vmcs01 MSR bitmap.  The work is
 * skipped when the mode derived from the secondary execution controls and
 * the APICv state matches the cached x2apic_msr_bitmap_mode.
 */
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
	/*
	 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
	 * of the MSR bitmap.  KVM emulates APIC registers up through 0x3f0,
	 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
	 */
	const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
	const int write_idx = read_idx + (0x800 / sizeof(u64));
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
	u8 mode;

	if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
		return;

	if (cpu_has_secondary_exec_ctrls() &&
	    (secondary_exec_controls_get(vmx) &
	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		mode = MSR_BITMAP_MODE_X2APIC;
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	} else {
		mode = 0;
	}

	if (mode == vmx->x2apic_msr_bitmap_mode)
		return;

	vmx->x2apic_msr_bitmap_mode = mode;

	/*
	 * Reset the bitmap for MSRs 0x800 - 0x83f.  Leave AMD's uber-extended
	 * registers (0x840 and above) intercepted, KVM doesn't support them.
	 * Intercept all writes by default and poke holes as needed.  Pass
	 * through reads for all valid registers by default in x2APIC+APICv
	 * mode, only the current timer count needs on-demand emulation by KVM.
	 */
	if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
		msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
	else
		msr_bitmap[read_idx] = ~0ull;
	msr_bitmap[write_idx] = ~0ull;

	/*
	 * TPR reads and writes can be virtualized even if virtual interrupt
	 * delivery is not in use.
	 */
	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
				  !(mode & MSR_BITMAP_MODE_X2APIC));

	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
		vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
		vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
		if (enable_ipiv)
			vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
	}
}

/*
 * Update the intercept state of the Intel PT MSRs (status, output base and
 * mask, CR3 match and every address-range pair) from the guest's RTIT_CTL:
 * @flag is true while TraceEn is clear.
 */
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/* NOTE(review): presumably "true" means intercept - confirm against vmx_set_intercept_for_msr(). */
	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
	u32 i;

	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
	/* The ADDRn_A/ADDRn_B MSRs of consecutive ranges are two apart. */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
	}
}

/*
 * Re-apply the pass-through state of every possibly passed-through MSR
 * after the userspace MSR filter has changed.
 */
void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 i;

	if (!cpu_has_vmx_msr_bitmap())
		return;

	/*
	 * Redo intercept permissions for MSRs that KVM is passing through to
	 * the guest.  Disabling interception will check the new MSR filter and
	 * ensure that KVM enables interception if userspace wants to filter
	 * the MSR.  MSRs that KVM is already intercepting don't need to be
	 * refreshed since KVM is going to intercept them regardless of what
	 * userspace wants.
4184 */ 4185 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4186 u32 msr = vmx_possible_passthrough_msrs[i]; 4187 4188 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4189 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4190 4191 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4192 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4193 } 4194 4195 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4196 if (vmx_pt_mode_is_host_guest()) 4197 pt_update_intercept_for_msr(vcpu); 4198 } 4199 4200 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 4201 int pi_vec) 4202 { 4203 #ifdef CONFIG_SMP 4204 if (vcpu->mode == IN_GUEST_MODE) { 4205 /* 4206 * The vector of the virtual has already been set in the PIR. 4207 * Send a notification event to deliver the virtual interrupt 4208 * unless the vCPU is the currently running vCPU, i.e. the 4209 * event is being sent from a fastpath VM-Exit handler, in 4210 * which case the PIR will be synced to the vIRR before 4211 * re-entering the guest. 4212 * 4213 * When the target is not the running vCPU, the following 4214 * possibilities emerge: 4215 * 4216 * Case 1: vCPU stays in non-root mode. Sending a notification 4217 * event posts the interrupt to the vCPU. 4218 * 4219 * Case 2: vCPU exits to root mode and is still runnable. The 4220 * PIR will be synced to the vIRR before re-entering the guest. 4221 * Sending a notification event is ok as the host IRQ handler 4222 * will ignore the spurious event. 4223 * 4224 * Case 3: vCPU exits to root mode and is blocked. vcpu_block() 4225 * has already synced PIR to vIRR and never blocks the vCPU if 4226 * the vIRR is not empty. Therefore, a blocked vCPU here does 4227 * not wait for any requested interrupts in PIR, and sending a 4228 * notification event also results in a benign, spurious event. 
4229 */ 4230 4231 if (vcpu != kvm_get_running_vcpu()) 4232 __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 4233 return; 4234 } 4235 #endif 4236 /* 4237 * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 4238 * otherwise do nothing as KVM will grab the highest priority pending 4239 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 4240 */ 4241 kvm_vcpu_wake_up(vcpu); 4242 } 4243 4244 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4245 int vector) 4246 { 4247 struct vcpu_vmx *vmx = to_vmx(vcpu); 4248 4249 /* 4250 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4251 * and freed, and must not be accessed outside of vcpu->mutex. The 4252 * vCPU's cached PI NV is valid if and only if posted interrupts 4253 * enabled in its vmcs12, i.e. checking the vector also checks that 4254 * L1 has enabled posted interrupts for L2. 4255 */ 4256 if (is_guest_mode(vcpu) && 4257 vector == vmx->nested.posted_intr_nv) { 4258 /* 4259 * If a posted intr is not recognized by hardware, 4260 * we will accomplish it in the next vmentry. 4261 */ 4262 vmx->nested.pi_pending = true; 4263 kvm_make_request(KVM_REQ_EVENT, vcpu); 4264 4265 /* 4266 * This pairs with the smp_mb_*() after setting vcpu->mode in 4267 * vcpu_enter_guest() to guarantee the vCPU sees the event 4268 * request if triggering a posted interrupt "fails" because 4269 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4270 * the smb_wmb() in kvm_make_request() only ensures everything 4271 * done before making the request is visible when the request 4272 * is visible, it doesn't ensure ordering between the store to 4273 * vcpu->requests and the load from vcpu->mode. 4274 */ 4275 smp_mb__after_atomic(); 4276 4277 /* the PIR and ON have been set by L1. */ 4278 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4279 return 0; 4280 } 4281 return -1; 4282 } 4283 /* 4284 * Send interrupt to vcpu via posted interrupt way. 4285 * 1. 
If target vcpu is running(non-root mode), send posted interrupt 4286 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4287 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4288 * interrupt from PIR in next vmentry. 4289 */ 4290 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4291 { 4292 struct vcpu_vmx *vmx = to_vmx(vcpu); 4293 int r; 4294 4295 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4296 if (!r) 4297 return 0; 4298 4299 /* Note, this is called iff the local APIC is in-kernel. */ 4300 if (!vcpu->arch.apic->apicv_active) 4301 return -1; 4302 4303 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4304 return 0; 4305 4306 /* If a previous notification has sent the IPI, nothing to do. */ 4307 if (pi_test_and_set_on(&vmx->pi_desc)) 4308 return 0; 4309 4310 /* 4311 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() 4312 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is 4313 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4314 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 4315 */ 4316 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4317 return 0; 4318 } 4319 4320 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4321 int trig_mode, int vector) 4322 { 4323 struct kvm_vcpu *vcpu = apic->vcpu; 4324 4325 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4326 kvm_lapic_set_irr(vector, apic); 4327 kvm_make_request(KVM_REQ_EVENT, vcpu); 4328 kvm_vcpu_kick(vcpu); 4329 } else { 4330 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4331 trig_mode, vector); 4332 } 4333 } 4334 4335 /* 4336 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4337 * will not change in the lifetime of the guest. 4338 * Note that host-state that does change is set elsewhere. E.g., host-state 4339 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 
4340 */ 4341 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4342 { 4343 u32 low32, high32; 4344 unsigned long tmpl; 4345 unsigned long cr0, cr3, cr4; 4346 4347 cr0 = read_cr0(); 4348 WARN_ON(cr0 & X86_CR0_TS); 4349 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4350 4351 /* 4352 * Save the most likely value for this task's CR3 in the VMCS. 4353 * We can't use __get_current_cr3_fast() because we're not atomic. 4354 */ 4355 cr3 = __read_cr3(); 4356 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4357 vmx->loaded_vmcs->host_state.cr3 = cr3; 4358 4359 /* Save the most likely value for this task's CR4 in the VMCS. */ 4360 cr4 = cr4_read_shadow(); 4361 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4362 vmx->loaded_vmcs->host_state.cr4 = cr4; 4363 4364 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4365 #ifdef CONFIG_X86_64 4366 /* 4367 * Load null selectors, so we can avoid reloading them in 4368 * vmx_prepare_switch_to_host(), in case userspace uses 4369 * the null selectors too (the expected case). 4370 */ 4371 vmcs_write16(HOST_DS_SELECTOR, 0); 4372 vmcs_write16(HOST_ES_SELECTOR, 0); 4373 #else 4374 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4375 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4376 #endif 4377 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4378 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4379 4380 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4381 4382 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4383 4384 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4385 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4386 4387 /* 4388 * SYSENTER is used for 32-bit system calls on either 32-bit or 4389 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4390 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4391 * have already done so!). 
4392 */ 4393 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4394 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4395 4396 rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); 4397 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4398 4399 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4400 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4401 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4402 } 4403 4404 if (cpu_has_load_ia32_efer()) 4405 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4406 } 4407 4408 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4409 { 4410 struct kvm_vcpu *vcpu = &vmx->vcpu; 4411 4412 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4413 ~vcpu->arch.cr4_guest_rsvd_bits; 4414 if (!enable_ept) { 4415 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4416 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4417 } 4418 if (is_guest_mode(&vmx->vcpu)) 4419 vcpu->arch.cr4_guest_owned_bits &= 4420 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4421 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4422 } 4423 4424 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4425 { 4426 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4427 4428 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4429 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4430 4431 if (!enable_vnmi) 4432 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4433 4434 if (!enable_preemption_timer) 4435 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4436 4437 return pin_based_exec_ctrl; 4438 } 4439 4440 static u32 vmx_vmentry_ctrl(void) 4441 { 4442 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4443 4444 if (vmx_pt_mode_is_system()) 4445 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4446 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4447 /* 4448 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 
4449 */ 4450 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4451 VM_ENTRY_LOAD_IA32_EFER | 4452 VM_ENTRY_IA32E_MODE); 4453 4454 return vmentry_ctrl; 4455 } 4456 4457 static u32 vmx_vmexit_ctrl(void) 4458 { 4459 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4460 4461 /* 4462 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4463 * nested virtualization and thus allowed to be set in vmcs12. 4464 */ 4465 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4466 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4467 4468 if (vmx_pt_mode_is_system()) 4469 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4470 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4471 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4472 return vmexit_ctrl & 4473 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4474 } 4475 4476 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4477 { 4478 struct vcpu_vmx *vmx = to_vmx(vcpu); 4479 4480 if (is_guest_mode(vcpu)) { 4481 vmx->nested.update_vmcs01_apicv_status = true; 4482 return; 4483 } 4484 4485 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4486 4487 if (kvm_vcpu_apicv_active(vcpu)) { 4488 secondary_exec_controls_setbit(vmx, 4489 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4490 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4491 if (enable_ipiv) 4492 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4493 } else { 4494 secondary_exec_controls_clearbit(vmx, 4495 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4496 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4497 if (enable_ipiv) 4498 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4499 } 4500 4501 vmx_update_msr_bitmap_x2apic(vcpu); 4502 } 4503 4504 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4505 { 4506 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4507 4508 /* 4509 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4510 * vmcs12 and propagated to vmcs02 when set in vmcs12. 
4511 */ 4512 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4513 CPU_BASED_USE_IO_BITMAPS | 4514 CPU_BASED_MONITOR_TRAP_FLAG | 4515 CPU_BASED_PAUSE_EXITING); 4516 4517 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4518 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4519 CPU_BASED_NMI_WINDOW_EXITING); 4520 4521 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4522 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4523 4524 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4525 exec_control &= ~CPU_BASED_TPR_SHADOW; 4526 4527 #ifdef CONFIG_X86_64 4528 if (exec_control & CPU_BASED_TPR_SHADOW) 4529 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4530 CPU_BASED_CR8_STORE_EXITING); 4531 else 4532 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4533 CPU_BASED_CR8_LOAD_EXITING; 4534 #endif 4535 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4536 if (enable_ept) 4537 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4538 CPU_BASED_CR3_STORE_EXITING | 4539 CPU_BASED_INVLPG_EXITING); 4540 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4541 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4542 CPU_BASED_MONITOR_EXITING); 4543 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4544 exec_control &= ~CPU_BASED_HLT_EXITING; 4545 return exec_control; 4546 } 4547 4548 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4549 { 4550 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4551 4552 /* 4553 * IPI virtualization relies on APICv. Disable IPI virtualization if 4554 * APICv is inhibited. 4555 */ 4556 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4557 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4558 4559 return exec_control; 4560 } 4561 4562 /* 4563 * Adjust a single secondary execution control bit to intercept/allow an 4564 * instruction in the guest. This is usually done based on whether or not a 4565 * feature has been exposed to the guest in order to correctly emulate faults. 
4566 */ 4567 static inline void 4568 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4569 u32 control, bool enabled, bool exiting) 4570 { 4571 /* 4572 * If the control is for an opt-in feature, clear the control if the 4573 * feature is not exposed to the guest, i.e. not enabled. If the 4574 * control is opt-out, i.e. an exiting control, clear the control if 4575 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4576 * disabled for the associated instruction. Note, the caller is 4577 * responsible presetting exec_control to set all supported bits. 4578 */ 4579 if (enabled == exiting) 4580 *exec_control &= ~control; 4581 4582 /* 4583 * Update the nested MSR settings so that a nested VMM can/can't set 4584 * controls for features that are/aren't exposed to the guest. 4585 */ 4586 if (nested && 4587 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4588 /* 4589 * All features that can be added or removed to VMX MSRs must 4590 * be supported in the first place for nested virtualization. 4591 */ 4592 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4593 enabled = false; 4594 4595 if (enabled) 4596 vmx->nested.msrs.secondary_ctls_high |= control; 4597 else 4598 vmx->nested.msrs.secondary_ctls_high &= ~control; 4599 } 4600 } 4601 4602 /* 4603 * Wrapper macro for the common case of adjusting a secondary execution control 4604 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4605 * verifies that the control is actually supported by KVM and hardware. 
4606 */ 4607 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4608 ({ \ 4609 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4610 bool __enabled; \ 4611 \ 4612 if (cpu_has_vmx_##name()) { \ 4613 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ 4614 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4615 __enabled, exiting); \ 4616 } \ 4617 }) 4618 4619 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4620 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4621 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4622 4623 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4624 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4625 4626 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4627 { 4628 struct kvm_vcpu *vcpu = &vmx->vcpu; 4629 4630 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4631 4632 if (vmx_pt_mode_is_system()) 4633 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4634 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4635 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4636 if (vmx->vpid == 0) 4637 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4638 if (!enable_ept) { 4639 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4640 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4641 enable_unrestricted_guest = 0; 4642 } 4643 if (!enable_unrestricted_guest) 4644 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4645 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4646 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4647 if (!kvm_vcpu_apicv_active(vcpu)) 4648 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4649 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4650 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4651 4652 /* 4653 * KVM doesn't support VMFUNC for L1, but the control is set in 
KVM's 4654 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4655 */ 4656 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4657 4658 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4659 * in vmx_set_cr4. */ 4660 exec_control &= ~SECONDARY_EXEC_DESC; 4661 4662 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4663 (handle_vmptrld). 4664 We can NOT enable shadow_vmcs here because we don't have yet 4665 a current VMCS12 4666 */ 4667 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4668 4669 /* 4670 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4671 * it needs to be set here when dirty logging is already active, e.g. 4672 * if this vCPU was created after dirty logging was enabled. 4673 */ 4674 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4675 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4676 4677 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4678 4679 /* 4680 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4681 * feature is exposed to the guest. This creates a virtualization hole 4682 * if both are supported in hardware but only one is exposed to the 4683 * guest, but letting the guest execute RDTSCP or RDPID when either one 4684 * is advertised is preferable to emulating the advertised instruction 4685 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 
4686 */ 4687 if (cpu_has_vmx_rdtscp()) { 4688 bool rdpid_or_rdtscp_enabled = 4689 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4690 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4691 4692 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4693 SECONDARY_EXEC_ENABLE_RDTSCP, 4694 rdpid_or_rdtscp_enabled, false); 4695 } 4696 4697 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4698 4699 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4700 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4701 4702 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4703 ENABLE_USR_WAIT_PAUSE, false); 4704 4705 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4706 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4707 4708 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4709 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4710 4711 return exec_control; 4712 } 4713 4714 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4715 { 4716 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4717 } 4718 4719 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4720 { 4721 struct page *pages; 4722 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4723 4724 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4725 return 0; 4726 4727 if (kvm_vmx->pid_table) 4728 return 0; 4729 4730 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4731 vmx_get_pid_table_order(kvm)); 4732 if (!pages) 4733 return -ENOMEM; 4734 4735 kvm_vmx->pid_table = (void *)page_address(pages); 4736 return 0; 4737 } 4738 4739 int vmx_vcpu_precreate(struct kvm *kvm) 4740 { 4741 return vmx_alloc_ipiv_pid_table(kvm); 4742 } 4743 4744 #define VMX_XSS_EXIT_BITMAP 0 4745 4746 static void init_vmcs(struct vcpu_vmx *vmx) 4747 { 4748 struct kvm *kvm = vmx->vcpu.kvm; 4749 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4750 4751 if (nested) 4752 nested_vmx_set_vmcs_shadowing_bitmap(); 4753 4754 if (cpu_has_vmx_msr_bitmap()) 4755 vmcs_write64(MSR_BITMAP, 
__pa(vmx->vmcs01.msr_bitmap)); 4756 4757 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4758 4759 /* Control */ 4760 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4761 4762 exec_controls_set(vmx, vmx_exec_control(vmx)); 4763 4764 if (cpu_has_secondary_exec_ctrls()) { 4765 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4766 if (vmx->ve_info) 4767 vmcs_write64(VE_INFORMATION_ADDRESS, 4768 __pa(vmx->ve_info)); 4769 } 4770 4771 if (cpu_has_tertiary_exec_ctrls()) 4772 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4773 4774 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4775 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4776 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4777 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4778 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4779 4780 vmcs_write16(GUEST_INTR_STATUS, 0); 4781 4782 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4783 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4784 } 4785 4786 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4787 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4788 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4789 } 4790 4791 if (!kvm_pause_in_guest(kvm)) { 4792 vmcs_write32(PLE_GAP, ple_gap); 4793 vmx->ple_window = ple_window; 4794 vmx->ple_window_dirty = true; 4795 } 4796 4797 if (kvm_notify_vmexit_enabled(kvm)) 4798 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4799 4800 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4801 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4802 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4803 4804 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4805 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4806 vmx_set_constant_host_state(vmx); 4807 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4808 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4809 4810 if (cpu_has_vmx_vmfunc()) 4811 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4812 4813 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4814 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 
0); 4815 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4816 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4817 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4818 4819 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4820 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4821 4822 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4823 4824 /* 22.2.1, 20.8.1 */ 4825 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4826 4827 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4828 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4829 4830 set_cr4_guest_host_mask(vmx); 4831 4832 if (vmx->vpid != 0) 4833 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4834 4835 if (cpu_has_vmx_xsaves()) 4836 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4837 4838 if (enable_pml) { 4839 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4840 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 4841 } 4842 4843 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4844 4845 if (vmx_pt_mode_is_host_guest()) { 4846 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4847 /* Bit[6~0] are forced to 1, writes are ignored. 
*/ 4848 vmx->pt_desc.guest.output_mask = 0x7F; 4849 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4850 } 4851 4852 vmcs_write32(GUEST_SYSENTER_CS, 0); 4853 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4854 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4855 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4856 4857 if (cpu_has_vmx_tpr_shadow()) { 4858 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4859 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4860 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4861 __pa(vmx->vcpu.arch.apic->regs)); 4862 vmcs_write32(TPR_THRESHOLD, 0); 4863 } 4864 4865 vmx_setup_uret_msrs(vmx); 4866 } 4867 4868 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4869 { 4870 struct vcpu_vmx *vmx = to_vmx(vcpu); 4871 4872 init_vmcs(vmx); 4873 4874 if (nested && 4875 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4876 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4877 4878 vcpu_setup_sgx_lepubkeyhash(vcpu); 4879 4880 vmx->nested.posted_intr_nv = -1; 4881 vmx->nested.vmxon_ptr = INVALID_GPA; 4882 vmx->nested.current_vmptr = INVALID_GPA; 4883 4884 #ifdef CONFIG_KVM_HYPERV 4885 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4886 #endif 4887 4888 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4889 vcpu->arch.microcode_version = 0x100000000ULL; 4890 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4891 4892 /* 4893 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4894 * or POSTED_INTR_WAKEUP_VECTOR. 
4895 */ 4896 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 4897 __pi_set_sn(&vmx->pi_desc); 4898 } 4899 4900 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4901 { 4902 struct vcpu_vmx *vmx = to_vmx(vcpu); 4903 4904 if (!init_event) 4905 __vmx_vcpu_reset(vcpu); 4906 4907 vmx->rmode.vm86_active = 0; 4908 vmx->spec_ctrl = 0; 4909 4910 vmx->msr_ia32_umwait_control = 0; 4911 4912 vmx->hv_deadline_tsc = -1; 4913 kvm_set_cr8(vcpu, 0); 4914 4915 seg_setup(VCPU_SREG_CS); 4916 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4917 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4918 4919 seg_setup(VCPU_SREG_DS); 4920 seg_setup(VCPU_SREG_ES); 4921 seg_setup(VCPU_SREG_FS); 4922 seg_setup(VCPU_SREG_GS); 4923 seg_setup(VCPU_SREG_SS); 4924 4925 vmcs_write16(GUEST_TR_SELECTOR, 0); 4926 vmcs_writel(GUEST_TR_BASE, 0); 4927 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4928 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4929 4930 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4931 vmcs_writel(GUEST_LDTR_BASE, 0); 4932 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4933 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4934 4935 vmcs_writel(GUEST_GDTR_BASE, 0); 4936 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4937 4938 vmcs_writel(GUEST_IDTR_BASE, 0); 4939 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4940 4941 vmx_segment_cache_clear(vmx); 4942 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4943 4944 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4945 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4946 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4947 if (kvm_mpx_supported()) 4948 vmcs_write64(GUEST_BNDCFGS, 0); 4949 4950 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4951 4952 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4953 4954 vpid_sync_context(vmx->vpid); 4955 4956 vmx_update_fb_clear_dis(vcpu, vmx); 4957 } 4958 4959 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4960 { 4961 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4962 } 4963 4964 void vmx_enable_nmi_window(struct 
kvm_vcpu *vcpu) 4965 { 4966 if (!enable_vnmi || 4967 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4968 vmx_enable_irq_window(vcpu); 4969 return; 4970 } 4971 4972 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4973 } 4974 4975 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4976 { 4977 struct vcpu_vmx *vmx = to_vmx(vcpu); 4978 uint32_t intr; 4979 int irq = vcpu->arch.interrupt.nr; 4980 4981 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4982 4983 ++vcpu->stat.irq_injections; 4984 if (vmx->rmode.vm86_active) { 4985 int inc_eip = 0; 4986 if (vcpu->arch.interrupt.soft) 4987 inc_eip = vcpu->arch.event_exit_inst_len; 4988 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4989 return; 4990 } 4991 intr = irq | INTR_INFO_VALID_MASK; 4992 if (vcpu->arch.interrupt.soft) { 4993 intr |= INTR_TYPE_SOFT_INTR; 4994 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4995 vmx->vcpu.arch.event_exit_inst_len); 4996 } else 4997 intr |= INTR_TYPE_EXT_INTR; 4998 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4999 5000 vmx_clear_hlt(vcpu); 5001 } 5002 5003 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 5004 { 5005 struct vcpu_vmx *vmx = to_vmx(vcpu); 5006 5007 if (!enable_vnmi) { 5008 /* 5009 * Tracking the NMI-blocked state in software is built upon 5010 * finding the next open IRQ window. This, in turn, depends on 5011 * well-behaving guests: They have to keep IRQs disabled at 5012 * least as long as the NMI handler runs. Otherwise we may 5013 * cause NMI nesting, maybe breaking the guest. But as this is 5014 * highly unlikely, we can live with the residual risk. 
5015 */ 5016 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 5017 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5018 } 5019 5020 ++vcpu->stat.nmi_injections; 5021 vmx->loaded_vmcs->nmi_known_unmasked = false; 5022 5023 if (vmx->rmode.vm86_active) { 5024 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 5025 return; 5026 } 5027 5028 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 5029 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 5030 5031 vmx_clear_hlt(vcpu); 5032 } 5033 5034 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 5035 { 5036 struct vcpu_vmx *vmx = to_vmx(vcpu); 5037 bool masked; 5038 5039 if (!enable_vnmi) 5040 return vmx->loaded_vmcs->soft_vnmi_blocked; 5041 if (vmx->loaded_vmcs->nmi_known_unmasked) 5042 return false; 5043 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 5044 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5045 return masked; 5046 } 5047 5048 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5049 { 5050 struct vcpu_vmx *vmx = to_vmx(vcpu); 5051 5052 if (!enable_vnmi) { 5053 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 5054 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5055 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5056 } 5057 } else { 5058 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5059 if (masked) 5060 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5061 GUEST_INTR_STATE_NMI); 5062 else 5063 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5064 GUEST_INTR_STATE_NMI); 5065 } 5066 } 5067 5068 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5069 { 5070 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5071 return false; 5072 5073 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5074 return true; 5075 5076 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5077 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5078 GUEST_INTR_STATE_NMI)); 5079 } 5080 5081 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5082 { 5083 if (to_vmx(vcpu)->nested.nested_run_pending) 5084 return -EBUSY; 5085 
5086 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5087 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5088 return -EBUSY; 5089 5090 return !vmx_nmi_blocked(vcpu); 5091 } 5092 5093 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5094 { 5095 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5096 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5097 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5098 } 5099 5100 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5101 { 5102 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5103 return false; 5104 5105 return __vmx_interrupt_blocked(vcpu); 5106 } 5107 5108 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5109 { 5110 if (to_vmx(vcpu)->nested.nested_run_pending) 5111 return -EBUSY; 5112 5113 /* 5114 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5115 * e.g. if the IRQ arrived asynchronously after checking nested events. 5116 */ 5117 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5118 return -EBUSY; 5119 5120 return !vmx_interrupt_blocked(vcpu); 5121 } 5122 5123 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5124 { 5125 void __user *ret; 5126 5127 if (enable_unrestricted_guest) 5128 return 0; 5129 5130 mutex_lock(&kvm->slots_lock); 5131 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5132 PAGE_SIZE * 3); 5133 mutex_unlock(&kvm->slots_lock); 5134 5135 if (IS_ERR(ret)) 5136 return PTR_ERR(ret); 5137 5138 to_kvm_vmx(kvm)->tss_addr = addr; 5139 5140 return init_rmode_tss(kvm, ret); 5141 } 5142 5143 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5144 { 5145 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5146 return 0; 5147 } 5148 5149 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5150 { 5151 switch (vec) { 5152 case BP_VECTOR: 5153 /* 5154 * Update instruction length as we may reinject the exception 5155 * from user space while in guest debugging 
mode. 5156 */ 5157 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5158 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5159 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5160 return false; 5161 fallthrough; 5162 case DB_VECTOR: 5163 return !(vcpu->guest_debug & 5164 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5165 case DE_VECTOR: 5166 case OF_VECTOR: 5167 case BR_VECTOR: 5168 case UD_VECTOR: 5169 case DF_VECTOR: 5170 case SS_VECTOR: 5171 case GP_VECTOR: 5172 case MF_VECTOR: 5173 return true; 5174 } 5175 return false; 5176 } 5177 5178 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5179 int vec, u32 err_code) 5180 { 5181 /* 5182 * Instruction with address size override prefix opcode 0x67 5183 * Cause the #SS fault with 0 error code in VM86 mode. 5184 */ 5185 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5186 if (kvm_emulate_instruction(vcpu, 0)) { 5187 if (vcpu->arch.halt_request) { 5188 vcpu->arch.halt_request = 0; 5189 return kvm_emulate_halt_noskip(vcpu); 5190 } 5191 return 1; 5192 } 5193 return 0; 5194 } 5195 5196 /* 5197 * Forward all other exceptions that are valid in real mode. 5198 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5199 * the required debugging infrastructure rework. 5200 */ 5201 kvm_queue_exception(vcpu, vec); 5202 return 1; 5203 } 5204 5205 static int handle_machine_check(struct kvm_vcpu *vcpu) 5206 { 5207 /* handled by vmx_vcpu_run() */ 5208 return 1; 5209 } 5210 5211 /* 5212 * If the host has split lock detection disabled, then #AC is 5213 * unconditionally injected into the guest, which is the pre split lock 5214 * detection behaviour. 
5215 * 5216 * If the host has split lock detection enabled then #AC is 5217 * only injected into the guest when: 5218 * - Guest CPL == 3 (user mode) 5219 * - Guest has #AC detection enabled in CR0 5220 * - Guest EFLAGS has AC bit set 5221 */ 5222 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5223 { 5224 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5225 return true; 5226 5227 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5228 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5229 } 5230 5231 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) 5232 { 5233 return vcpu->arch.guest_fpu.fpstate->xfd && 5234 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5235 } 5236 5237 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5238 { 5239 struct vcpu_vmx *vmx = to_vmx(vcpu); 5240 struct kvm_run *kvm_run = vcpu->run; 5241 u32 intr_info, ex_no, error_code; 5242 unsigned long cr2, dr6; 5243 u32 vect_info; 5244 5245 vect_info = vmx->idt_vectoring_info; 5246 intr_info = vmx_get_intr_info(vcpu); 5247 5248 /* 5249 * Machine checks are handled by handle_exception_irqoff(), or by 5250 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5251 * vmx_vcpu_enter_exit(). 5252 */ 5253 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5254 return 1; 5255 5256 /* 5257 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5258 * This ensures the nested_vmx check is not skipped so vmexit can 5259 * be reflected to L1 (when it intercepts #NM) before reaching this 5260 * point. 5261 */ 5262 if (is_nm_fault(intr_info)) { 5263 kvm_queue_exception_p(vcpu, NM_VECTOR, 5264 is_xfd_nm_fault(vcpu) ? 
vcpu->arch.guest_fpu.xfd_err : 0); 5265 return 1; 5266 } 5267 5268 if (is_invalid_opcode(intr_info)) 5269 return handle_ud(vcpu); 5270 5271 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5272 struct vmx_ve_information *ve_info = vmx->ve_info; 5273 5274 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5275 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5276 dump_vmcs(vcpu); 5277 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5278 return 1; 5279 } 5280 5281 error_code = 0; 5282 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5283 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5284 5285 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5286 WARN_ON_ONCE(!enable_vmware_backdoor); 5287 5288 /* 5289 * VMware backdoor emulation on #GP interception only handles 5290 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5291 * error code on #GP. 5292 */ 5293 if (error_code) { 5294 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5295 return 1; 5296 } 5297 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5298 } 5299 5300 /* 5301 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5302 * MMIO, it is better to report an internal error. 5303 * See the comments in vmx_handle_exit. 
5304 */ 5305 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5306 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5307 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5308 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5309 vcpu->run->internal.ndata = 4; 5310 vcpu->run->internal.data[0] = vect_info; 5311 vcpu->run->internal.data[1] = intr_info; 5312 vcpu->run->internal.data[2] = error_code; 5313 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5314 return 0; 5315 } 5316 5317 if (is_page_fault(intr_info)) { 5318 cr2 = vmx_get_exit_qual(vcpu); 5319 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5320 /* 5321 * EPT will cause page fault only if we need to 5322 * detect illegal GPAs. 5323 */ 5324 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5325 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5326 return 1; 5327 } else 5328 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5329 } 5330 5331 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5332 5333 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5334 return handle_rmode_exception(vcpu, ex_no, error_code); 5335 5336 switch (ex_no) { 5337 case DB_VECTOR: 5338 dr6 = vmx_get_exit_qual(vcpu); 5339 if (!(vcpu->guest_debug & 5340 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5341 /* 5342 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5343 * instruction. ICEBP generates a trap-like #DB, but 5344 * despite its interception control being tied to #DB, 5345 * is an instruction intercept, i.e. the VM-Exit occurs 5346 * on the ICEBP itself. Use the inner "skip" helper to 5347 * avoid single-step #DB and MTF updates, as ICEBP is 5348 * higher priority. Note, skipping ICEBP still clears 5349 * STI and MOVSS blocking. 5350 * 5351 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5352 * if single-step is enabled in RFLAGS and STI or MOVSS 5353 * blocking is active, as the CPU doesn't set the bit 5354 * on VM-Exit due to #DB interception. 
VM-Entry has a 5355 * consistency check that a single-step #DB is pending 5356 * in this scenario as the previous instruction cannot 5357 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5358 * don't modify RFLAGS), therefore the one instruction 5359 * delay when activating single-step breakpoints must 5360 * have already expired. Note, the CPU sets/clears BS 5361 * as appropriate for all other VM-Exits types. 5362 */ 5363 if (is_icebp(intr_info)) 5364 WARN_ON(!skip_emulated_instruction(vcpu)); 5365 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5366 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5367 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5368 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5369 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5370 5371 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5372 return 1; 5373 } 5374 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5375 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5376 fallthrough; 5377 case BP_VECTOR: 5378 /* 5379 * Update instruction length as we may reinject #BP from 5380 * user space while in guest debugging mode. Reading it for 5381 * #DB as well causes no harm, it is not used in that case. 5382 */ 5383 vmx->vcpu.arch.event_exit_inst_len = 5384 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5385 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5386 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5387 kvm_run->debug.arch.exception = ex_no; 5388 break; 5389 case AC_VECTOR: 5390 if (vmx_guest_inject_ac(vcpu)) { 5391 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5392 return 1; 5393 } 5394 5395 /* 5396 * Handle split lock. Depending on detection mode this will 5397 * either warn and disable split lock detection for this 5398 * task or force SIGBUS on it. 
5399 */ 5400 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5401 return 1; 5402 fallthrough; 5403 default: 5404 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5405 kvm_run->ex.exception = ex_no; 5406 kvm_run->ex.error_code = error_code; 5407 break; 5408 } 5409 return 0; 5410 } 5411 5412 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5413 { 5414 ++vcpu->stat.irq_exits; 5415 return 1; 5416 } 5417 5418 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5419 { 5420 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5421 vcpu->mmio_needed = 0; 5422 return 0; 5423 } 5424 5425 static int handle_io(struct kvm_vcpu *vcpu) 5426 { 5427 unsigned long exit_qualification; 5428 int size, in, string; 5429 unsigned port; 5430 5431 exit_qualification = vmx_get_exit_qual(vcpu); 5432 string = (exit_qualification & 16) != 0; 5433 5434 ++vcpu->stat.io_exits; 5435 5436 if (string) 5437 return kvm_emulate_instruction(vcpu, 0); 5438 5439 port = exit_qualification >> 16; 5440 size = (exit_qualification & 7) + 1; 5441 in = (exit_qualification & 8) != 0; 5442 5443 return kvm_fast_pio(vcpu, size, port, in); 5444 } 5445 5446 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5447 { 5448 /* 5449 * Patch in the VMCALL instruction: 5450 */ 5451 hypercall[0] = 0x0f; 5452 hypercall[1] = 0x01; 5453 hypercall[2] = 0xc1; 5454 } 5455 5456 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5457 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5458 { 5459 if (is_guest_mode(vcpu)) { 5460 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5461 unsigned long orig_val = val; 5462 5463 /* 5464 * We get here when L2 changed cr0 in a way that did not change 5465 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5466 * but did change L0 shadowed bits. So we first calculate the 5467 * effective cr0 value that L1 would like to write into the 5468 * hardware. 
It consists of the L2-owned bits from the new 5469 * value combined with the L1-owned bits from L1's guest_cr0. 5470 */ 5471 val = (val & ~vmcs12->cr0_guest_host_mask) | 5472 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5473 5474 if (kvm_set_cr0(vcpu, val)) 5475 return 1; 5476 vmcs_writel(CR0_READ_SHADOW, orig_val); 5477 return 0; 5478 } else { 5479 return kvm_set_cr0(vcpu, val); 5480 } 5481 } 5482 5483 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5484 { 5485 if (is_guest_mode(vcpu)) { 5486 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5487 unsigned long orig_val = val; 5488 5489 /* analogously to handle_set_cr0 */ 5490 val = (val & ~vmcs12->cr4_guest_host_mask) | 5491 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5492 if (kvm_set_cr4(vcpu, val)) 5493 return 1; 5494 vmcs_writel(CR4_READ_SHADOW, orig_val); 5495 return 0; 5496 } else 5497 return kvm_set_cr4(vcpu, val); 5498 } 5499 5500 static int handle_desc(struct kvm_vcpu *vcpu) 5501 { 5502 /* 5503 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5504 * and other code needs to be updated if UMIP can be guest owned. 
5505 */ 5506 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5507 5508 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5509 return kvm_emulate_instruction(vcpu, 0); 5510 } 5511 5512 static int handle_cr(struct kvm_vcpu *vcpu) 5513 { 5514 unsigned long exit_qualification, val; 5515 int cr; 5516 int reg; 5517 int err; 5518 int ret; 5519 5520 exit_qualification = vmx_get_exit_qual(vcpu); 5521 cr = exit_qualification & 15; 5522 reg = (exit_qualification >> 8) & 15; 5523 switch ((exit_qualification >> 4) & 3) { 5524 case 0: /* mov to cr */ 5525 val = kvm_register_read(vcpu, reg); 5526 trace_kvm_cr_write(cr, val); 5527 switch (cr) { 5528 case 0: 5529 err = handle_set_cr0(vcpu, val); 5530 return kvm_complete_insn_gp(vcpu, err); 5531 case 3: 5532 WARN_ON_ONCE(enable_unrestricted_guest); 5533 5534 err = kvm_set_cr3(vcpu, val); 5535 return kvm_complete_insn_gp(vcpu, err); 5536 case 4: 5537 err = handle_set_cr4(vcpu, val); 5538 return kvm_complete_insn_gp(vcpu, err); 5539 case 8: { 5540 u8 cr8_prev = kvm_get_cr8(vcpu); 5541 u8 cr8 = (u8)val; 5542 err = kvm_set_cr8(vcpu, cr8); 5543 ret = kvm_complete_insn_gp(vcpu, err); 5544 if (lapic_in_kernel(vcpu)) 5545 return ret; 5546 if (cr8_prev <= cr8) 5547 return ret; 5548 /* 5549 * TODO: we might be squashing a 5550 * KVM_GUESTDBG_SINGLESTEP-triggered 5551 * KVM_EXIT_DEBUG here. 
5552 */ 5553 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5554 return 0; 5555 } 5556 } 5557 break; 5558 case 2: /* clts */ 5559 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5560 return -EIO; 5561 case 1: /*mov from cr*/ 5562 switch (cr) { 5563 case 3: 5564 WARN_ON_ONCE(enable_unrestricted_guest); 5565 5566 val = kvm_read_cr3(vcpu); 5567 kvm_register_write(vcpu, reg, val); 5568 trace_kvm_cr_read(cr, val); 5569 return kvm_skip_emulated_instruction(vcpu); 5570 case 8: 5571 val = kvm_get_cr8(vcpu); 5572 kvm_register_write(vcpu, reg, val); 5573 trace_kvm_cr_read(cr, val); 5574 return kvm_skip_emulated_instruction(vcpu); 5575 } 5576 break; 5577 case 3: /* lmsw */ 5578 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5579 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5580 kvm_lmsw(vcpu, val); 5581 5582 return kvm_skip_emulated_instruction(vcpu); 5583 default: 5584 break; 5585 } 5586 vcpu->run->exit_reason = 0; 5587 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5588 (int)(exit_qualification >> 4) & 3, cr); 5589 return 0; 5590 } 5591 5592 static int handle_dr(struct kvm_vcpu *vcpu) 5593 { 5594 unsigned long exit_qualification; 5595 int dr, dr7, reg; 5596 int err = 1; 5597 5598 exit_qualification = vmx_get_exit_qual(vcpu); 5599 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5600 5601 /* First, if DR does not exist, trigger UD */ 5602 if (!kvm_require_dr(vcpu, dr)) 5603 return 1; 5604 5605 if (vmx_get_cpl(vcpu) > 0) 5606 goto out; 5607 5608 dr7 = vmcs_readl(GUEST_DR7); 5609 if (dr7 & DR7_GD) { 5610 /* 5611 * As the vm-exit takes precedence over the debug trap, we 5612 * need to emulate the latter, either for the host or the 5613 * guest debugging itself. 
5614 */ 5615 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5616 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5617 vcpu->run->debug.arch.dr7 = dr7; 5618 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5619 vcpu->run->debug.arch.exception = DB_VECTOR; 5620 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5621 return 0; 5622 } else { 5623 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5624 return 1; 5625 } 5626 } 5627 5628 if (vcpu->guest_debug == 0) { 5629 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5630 5631 /* 5632 * No more DR vmexits; force a reload of the debug registers 5633 * and reenter on this instruction. The next vmexit will 5634 * retrieve the full state of the debug registers. 5635 */ 5636 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5637 return 1; 5638 } 5639 5640 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5641 if (exit_qualification & TYPE_MOV_FROM_DR) { 5642 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5643 err = 0; 5644 } else { 5645 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5646 } 5647 5648 out: 5649 return kvm_complete_insn_gp(vcpu, err); 5650 } 5651 5652 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5653 { 5654 get_debugreg(vcpu->arch.db[0], 0); 5655 get_debugreg(vcpu->arch.db[1], 1); 5656 get_debugreg(vcpu->arch.db[2], 2); 5657 get_debugreg(vcpu->arch.db[3], 3); 5658 get_debugreg(vcpu->arch.dr6, 6); 5659 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5660 5661 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5662 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5663 5664 /* 5665 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5666 * a stale dr6 from the guest. 
5667 */ 5668 set_debugreg(DR6_RESERVED, 6); 5669 } 5670 5671 void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5672 { 5673 lockdep_assert_irqs_disabled(); 5674 set_debugreg(vcpu->arch.dr6, 6); 5675 } 5676 5677 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5678 { 5679 vmcs_writel(GUEST_DR7, val); 5680 } 5681 5682 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5683 { 5684 kvm_apic_update_ppr(vcpu); 5685 return 1; 5686 } 5687 5688 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5689 { 5690 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5691 5692 kvm_make_request(KVM_REQ_EVENT, vcpu); 5693 5694 ++vcpu->stat.irq_window_exits; 5695 return 1; 5696 } 5697 5698 static int handle_invlpg(struct kvm_vcpu *vcpu) 5699 { 5700 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5701 5702 kvm_mmu_invlpg(vcpu, exit_qualification); 5703 return kvm_skip_emulated_instruction(vcpu); 5704 } 5705 5706 static int handle_apic_access(struct kvm_vcpu *vcpu) 5707 { 5708 if (likely(fasteoi)) { 5709 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5710 int access_type, offset; 5711 5712 access_type = exit_qualification & APIC_ACCESS_TYPE; 5713 offset = exit_qualification & APIC_ACCESS_OFFSET; 5714 /* 5715 * Sane guest uses MOV to write EOI, with written value 5716 * not cared. So make a short-circuit here by avoiding 5717 * heavy instruction emulation. 
5718 */ 5719 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5720 (offset == APIC_EOI)) { 5721 kvm_lapic_set_eoi(vcpu); 5722 return kvm_skip_emulated_instruction(vcpu); 5723 } 5724 } 5725 return kvm_emulate_instruction(vcpu, 0); 5726 } 5727 5728 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5729 { 5730 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5731 int vector = exit_qualification & 0xff; 5732 5733 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5734 kvm_apic_set_eoi_accelerated(vcpu, vector); 5735 return 1; 5736 } 5737 5738 static int handle_apic_write(struct kvm_vcpu *vcpu) 5739 { 5740 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5741 5742 /* 5743 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5744 * hardware has done any necessary aliasing, offset adjustments, etc... 5745 * for the access. I.e. the correct value has already been written to 5746 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5747 * retrieve the register value and emulate the access. 
5748 */ 5749 u32 offset = exit_qualification & 0xff0; 5750 5751 kvm_apic_write_nodecode(vcpu, offset); 5752 return 1; 5753 } 5754 5755 static int handle_task_switch(struct kvm_vcpu *vcpu) 5756 { 5757 struct vcpu_vmx *vmx = to_vmx(vcpu); 5758 unsigned long exit_qualification; 5759 bool has_error_code = false; 5760 u32 error_code = 0; 5761 u16 tss_selector; 5762 int reason, type, idt_v, idt_index; 5763 5764 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5765 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5766 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5767 5768 exit_qualification = vmx_get_exit_qual(vcpu); 5769 5770 reason = (u32)exit_qualification >> 30; 5771 if (reason == TASK_SWITCH_GATE && idt_v) { 5772 switch (type) { 5773 case INTR_TYPE_NMI_INTR: 5774 vcpu->arch.nmi_injected = false; 5775 vmx_set_nmi_mask(vcpu, true); 5776 break; 5777 case INTR_TYPE_EXT_INTR: 5778 case INTR_TYPE_SOFT_INTR: 5779 kvm_clear_interrupt_queue(vcpu); 5780 break; 5781 case INTR_TYPE_HARD_EXCEPTION: 5782 if (vmx->idt_vectoring_info & 5783 VECTORING_INFO_DELIVER_CODE_MASK) { 5784 has_error_code = true; 5785 error_code = 5786 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5787 } 5788 fallthrough; 5789 case INTR_TYPE_SOFT_EXCEPTION: 5790 kvm_clear_exception_queue(vcpu); 5791 break; 5792 default: 5793 break; 5794 } 5795 } 5796 tss_selector = exit_qualification; 5797 5798 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5799 type != INTR_TYPE_EXT_INTR && 5800 type != INTR_TYPE_NMI_INTR)) 5801 WARN_ON(!skip_emulated_instruction(vcpu)); 5802 5803 /* 5804 * TODO: What about debug traps on tss switch? 5805 * Are we supposed to inject them and update dr6? 5806 */ 5807 return kvm_task_switch(vcpu, tss_selector, 5808 type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, 5809 reason, has_error_code, error_code); 5810 } 5811 5812 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5813 { 5814 unsigned long exit_qualification; 5815 gpa_t gpa; 5816 u64 error_code; 5817 5818 exit_qualification = vmx_get_exit_qual(vcpu); 5819 5820 /* 5821 * EPT violation happened while executing iret from NMI, 5822 * "blocked by NMI" bit has to be set before next VM entry. 5823 * There are errata that may cause this bit to not be set: 5824 * AAK134, BY25. 5825 */ 5826 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5827 enable_vnmi && 5828 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5829 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5830 5831 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5832 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5833 5834 /* Is it a read fault? */ 5835 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5836 ? PFERR_USER_MASK : 0; 5837 /* Is it a write fault? */ 5838 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5839 ? PFERR_WRITE_MASK : 0; 5840 /* Is it a fetch fault? */ 5841 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5842 ? PFERR_FETCH_MASK : 0; 5843 /* ept page table entry is present? */ 5844 error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) 5845 ? PFERR_PRESENT_MASK : 0; 5846 5847 if (error_code & EPT_VIOLATION_GVA_IS_VALID) 5848 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 5849 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5850 5851 /* 5852 * Check that the GPA doesn't exceed physical memory limits, as that is 5853 * a guest page fault. We have to emulate the instruction here, because 5854 * if the illegal address is that of a paging structure, then 5855 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5856 * would also use advanced VM-exit information for EPT violations to 5857 * reconstruct the page fault error code. 
5858 */ 5859 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 5860 return kvm_emulate_instruction(vcpu, 0); 5861 5862 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5863 } 5864 5865 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5866 { 5867 gpa_t gpa; 5868 5869 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5870 return 1; 5871 5872 /* 5873 * A nested guest cannot optimize MMIO vmexits, because we have an 5874 * nGPA here instead of the required GPA. 5875 */ 5876 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5877 if (!is_guest_mode(vcpu) && 5878 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5879 trace_kvm_fast_mmio(gpa); 5880 return kvm_skip_emulated_instruction(vcpu); 5881 } 5882 5883 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5884 } 5885 5886 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5887 { 5888 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5889 return -EIO; 5890 5891 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5892 ++vcpu->stat.nmi_window_exits; 5893 kvm_make_request(KVM_REQ_EVENT, vcpu); 5894 5895 return 1; 5896 } 5897 5898 /* 5899 * Returns true if emulation is required (due to the vCPU having invalid state 5900 * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the 5901 * current vCPU state. 5902 */ 5903 static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) 5904 { 5905 struct vcpu_vmx *vmx = to_vmx(vcpu); 5906 5907 if (!vmx->emulation_required) 5908 return false; 5909 5910 /* 5911 * It is architecturally impossible for emulation to be required when a 5912 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if 5913 * guest state is invalid and unrestricted guest is disabled, i.e. KVM 5914 * should synthesize VM-Fail instead emulation L2 code. This path is 5915 * only reachable if userspace modifies L2 guest state after KVM has 5916 * performed the nested VM-Enter consistency checks. 
5917 */ 5918 if (vmx->nested.nested_run_pending) 5919 return true; 5920 5921 /* 5922 * KVM only supports emulating exceptions if the vCPU is in Real Mode. 5923 * If emulation is required, KVM can't perform a successful VM-Enter to 5924 * inject the exception. 5925 */ 5926 return !vmx->rmode.vm86_active && 5927 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5928 } 5929 5930 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5931 { 5932 struct vcpu_vmx *vmx = to_vmx(vcpu); 5933 bool intr_window_requested; 5934 unsigned count = 130; 5935 5936 intr_window_requested = exec_controls_get(vmx) & 5937 CPU_BASED_INTR_WINDOW_EXITING; 5938 5939 while (vmx->emulation_required && count-- != 0) { 5940 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5941 return handle_interrupt_window(&vmx->vcpu); 5942 5943 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5944 return 1; 5945 5946 if (!kvm_emulate_instruction(vcpu, 0)) 5947 return 0; 5948 5949 if (vmx_unhandleable_emulation_required(vcpu)) { 5950 kvm_prepare_emulation_failure_exit(vcpu); 5951 return 0; 5952 } 5953 5954 if (vcpu->arch.halt_request) { 5955 vcpu->arch.halt_request = 0; 5956 return kvm_emulate_halt_noskip(vcpu); 5957 } 5958 5959 /* 5960 * Note, return 1 and not 0, vcpu_run() will invoke 5961 * xfer_to_guest_mode() which will create a proper return 5962 * code. 5963 */ 5964 if (__xfer_to_guest_mode_work_pending()) 5965 return 1; 5966 } 5967 5968 return 1; 5969 } 5970 5971 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5972 { 5973 if (vmx_unhandleable_emulation_required(vcpu)) { 5974 kvm_prepare_emulation_failure_exit(vcpu); 5975 return 0; 5976 } 5977 5978 return 1; 5979 } 5980 5981 /* 5982 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5983 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 
5984 */ 5985 static int handle_pause(struct kvm_vcpu *vcpu) 5986 { 5987 if (!kvm_pause_in_guest(vcpu->kvm)) 5988 grow_ple_window(vcpu); 5989 5990 /* 5991 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5992 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5993 * never set PAUSE_EXITING and just set PLE if supported, 5994 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 5995 */ 5996 kvm_vcpu_on_spin(vcpu, true); 5997 return kvm_skip_emulated_instruction(vcpu); 5998 } 5999 6000 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 6001 { 6002 return 1; 6003 } 6004 6005 static int handle_invpcid(struct kvm_vcpu *vcpu) 6006 { 6007 u32 vmx_instruction_info; 6008 unsigned long type; 6009 gva_t gva; 6010 struct { 6011 u64 pcid; 6012 u64 gla; 6013 } operand; 6014 int gpr_index; 6015 6016 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 6017 kvm_queue_exception(vcpu, UD_VECTOR); 6018 return 1; 6019 } 6020 6021 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6022 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6023 type = kvm_register_read(vcpu, gpr_index); 6024 6025 /* According to the Intel instruction reference, the memory operand 6026 * is read even if it isn't needed (e.g., for type==all) 6027 */ 6028 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6029 vmx_instruction_info, false, 6030 sizeof(operand), &gva)) 6031 return 1; 6032 6033 return kvm_handle_invpcid(vcpu, type, gva); 6034 } 6035 6036 static int handle_pml_full(struct kvm_vcpu *vcpu) 6037 { 6038 unsigned long exit_qualification; 6039 6040 trace_kvm_pml_full(vcpu->vcpu_id); 6041 6042 exit_qualification = vmx_get_exit_qual(vcpu); 6043 6044 /* 6045 * PML buffer FULL happened while executing iret from NMI, 6046 * "blocked by NMI" bit has to be set before next VM entry. 
6047 */ 6048 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 6049 enable_vnmi && 6050 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 6051 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6052 GUEST_INTR_STATE_NMI); 6053 6054 /* 6055 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 6056 * here.., and there's no userspace involvement needed for PML. 6057 */ 6058 return 1; 6059 } 6060 6061 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 6062 bool force_immediate_exit) 6063 { 6064 struct vcpu_vmx *vmx = to_vmx(vcpu); 6065 6066 /* 6067 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 6068 * due to the timer expiring while it was "soft" disabled, just eat the 6069 * exit and re-enter the guest. 6070 */ 6071 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 6072 return EXIT_FASTPATH_REENTER_GUEST; 6073 6074 /* 6075 * If the timer expired because KVM used it to force an immediate exit, 6076 * then mission accomplished. 6077 */ 6078 if (force_immediate_exit) 6079 return EXIT_FASTPATH_EXIT_HANDLED; 6080 6081 /* 6082 * If L2 is active, go down the slow path as emulating the guest timer 6083 * expiration likely requires synthesizing a nested VM-Exit. 6084 */ 6085 if (is_guest_mode(vcpu)) 6086 return EXIT_FASTPATH_NONE; 6087 6088 kvm_lapic_expired_hv_timer(vcpu); 6089 return EXIT_FASTPATH_REENTER_GUEST; 6090 } 6091 6092 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6093 { 6094 /* 6095 * This non-fastpath handler is reached if and only if the preemption 6096 * timer was being used to emulate a guest timer while L2 is active. 6097 * All other scenarios are supposed to be handled in the fastpath. 6098 */ 6099 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6100 kvm_lapic_expired_hv_timer(vcpu); 6101 return 1; 6102 } 6103 6104 /* 6105 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6106 * are overwritten by nested_vmx_hardware_setup() when nested=1. 
6107 */ 6108 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6109 { 6110 kvm_queue_exception(vcpu, UD_VECTOR); 6111 return 1; 6112 } 6113 6114 #ifndef CONFIG_X86_SGX_KVM 6115 static int handle_encls(struct kvm_vcpu *vcpu) 6116 { 6117 /* 6118 * SGX virtualization is disabled. There is no software enable bit for 6119 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6120 * the guest from executing ENCLS (when SGX is supported by hardware). 6121 */ 6122 kvm_queue_exception(vcpu, UD_VECTOR); 6123 return 1; 6124 } 6125 #endif /* CONFIG_X86_SGX_KVM */ 6126 6127 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6128 { 6129 /* 6130 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6131 * VM-Exits. Unconditionally set the flag here and leave the handling to 6132 * vmx_handle_exit(). 6133 */ 6134 to_vmx(vcpu)->exit_reason.bus_lock_detected = true; 6135 return 1; 6136 } 6137 6138 static int handle_notify(struct kvm_vcpu *vcpu) 6139 { 6140 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6141 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6142 6143 ++vcpu->stat.notify_window_exits; 6144 6145 /* 6146 * Notify VM exit happened while executing iret from NMI, 6147 * "blocked by NMI" bit has to be set before next VM entry. 6148 */ 6149 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6150 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6151 GUEST_INTR_STATE_NMI); 6152 6153 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6154 context_invalid) { 6155 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6156 vcpu->run->notify.flags = context_invalid ? 6157 KVM_NOTIFY_CONTEXT_INVALID : 0; 6158 return 0; 6159 } 6160 6161 return 1; 6162 } 6163 6164 /* 6165 * The exit handlers return 1 if the exit was handled fully and guest execution 6166 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6167 * to be done to userspace and return 0. 
6168 */ 6169 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6170 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6171 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6172 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6173 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6174 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6175 [EXIT_REASON_CR_ACCESS] = handle_cr, 6176 [EXIT_REASON_DR_ACCESS] = handle_dr, 6177 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6178 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6179 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6180 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6181 [EXIT_REASON_HLT] = kvm_emulate_halt, 6182 [EXIT_REASON_INVD] = kvm_emulate_invd, 6183 [EXIT_REASON_INVLPG] = handle_invlpg, 6184 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6185 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6186 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6187 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6188 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6189 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6190 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6191 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6192 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6193 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6194 [EXIT_REASON_VMON] = handle_vmx_instruction, 6195 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6196 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6197 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6198 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6199 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6200 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6201 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6202 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6203 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6204 [EXIT_REASON_LDTR_TR] = handle_desc, 6205 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6206 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6207 
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6208 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6209 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6210 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6211 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6212 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6213 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6214 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6215 [EXIT_REASON_PML_FULL] = handle_pml_full, 6216 [EXIT_REASON_INVPCID] = handle_invpcid, 6217 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6218 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6219 [EXIT_REASON_ENCLS] = handle_encls, 6220 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6221 [EXIT_REASON_NOTIFY] = handle_notify, 6222 }; 6223 6224 static const int kvm_vmx_max_exit_handlers = 6225 ARRAY_SIZE(kvm_vmx_exit_handlers); 6226 6227 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6228 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6229 { 6230 struct vcpu_vmx *vmx = to_vmx(vcpu); 6231 6232 *reason = vmx->exit_reason.full; 6233 *info1 = vmx_get_exit_qual(vcpu); 6234 if (!(vmx->exit_reason.failed_vmentry)) { 6235 *info2 = vmx->idt_vectoring_info; 6236 *intr_info = vmx_get_intr_info(vcpu); 6237 if (is_exception_with_error_code(*intr_info)) 6238 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6239 else 6240 *error_code = 0; 6241 } else { 6242 *info2 = 0; 6243 *intr_info = 0; 6244 *error_code = 0; 6245 } 6246 } 6247 6248 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) 6249 { 6250 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 6251 if (is_exception_with_error_code(*intr_info)) 6252 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); 6253 else 6254 *error_code = 0; 6255 } 6256 6257 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6258 { 6259 if (vmx->pml_pg) { 6260 __free_page(vmx->pml_pg); 6261 vmx->pml_pg = NULL; 6262 } 6263 } 6264 6265 static void 
vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6266 { 6267 struct vcpu_vmx *vmx = to_vmx(vcpu); 6268 u16 pml_idx, pml_tail_index; 6269 u64 *pml_buf; 6270 int i; 6271 6272 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6273 6274 /* Do nothing if PML buffer is empty */ 6275 if (pml_idx == PML_HEAD_INDEX) 6276 return; 6277 /* 6278 * PML index always points to the next available PML buffer entity 6279 * unless PML log has just overflowed. 6280 */ 6281 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; 6282 6283 /* 6284 * PML log is written backwards: the CPU first writes the entry 511 6285 * then the entry 510, and so on. 6286 * 6287 * Read the entries in the same order they were written, to ensure that 6288 * the dirty ring is filled in the same order the CPU wrote them. 6289 */ 6290 pml_buf = page_address(vmx->pml_pg); 6291 6292 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { 6293 u64 gpa; 6294 6295 gpa = pml_buf[i]; 6296 WARN_ON(gpa & (PAGE_SIZE - 1)); 6297 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6298 } 6299 6300 /* reset PML index */ 6301 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 6302 } 6303 6304 static void vmx_dump_sel(char *name, uint32_t sel) 6305 { 6306 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6307 name, vmcs_read16(sel), 6308 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6309 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6310 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6311 } 6312 6313 static void vmx_dump_dtsel(char *name, uint32_t limit) 6314 { 6315 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6316 name, vmcs_read32(limit), 6317 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6318 } 6319 6320 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6321 { 6322 unsigned int i; 6323 struct vmx_msr_entry *e; 6324 6325 pr_err("MSR %s:\n", name); 6326 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6327 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 
6328 } 6329 6330 void dump_vmcs(struct kvm_vcpu *vcpu) 6331 { 6332 struct vcpu_vmx *vmx = to_vmx(vcpu); 6333 u32 vmentry_ctl, vmexit_ctl; 6334 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6335 u64 tertiary_exec_control; 6336 unsigned long cr4; 6337 int efer_slot; 6338 6339 if (!dump_invalid_vmcs) { 6340 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6341 return; 6342 } 6343 6344 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6345 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6346 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6347 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6348 cr4 = vmcs_readl(GUEST_CR4); 6349 6350 if (cpu_has_secondary_exec_ctrls()) 6351 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6352 else 6353 secondary_exec_control = 0; 6354 6355 if (cpu_has_tertiary_exec_ctrls()) 6356 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6357 else 6358 tertiary_exec_control = 0; 6359 6360 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6361 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6362 pr_err("*** Guest State ***\n"); 6363 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6364 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6365 vmcs_readl(CR0_GUEST_HOST_MASK)); 6366 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6367 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6368 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6369 if (cpu_has_vmx_ept()) { 6370 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6371 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6372 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 6373 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6374 } 6375 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6376 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6377 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6378 vmcs_readl(GUEST_RFLAGS), 
vmcs_readl(GUEST_DR7)); 6379 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6380 vmcs_readl(GUEST_SYSENTER_ESP), 6381 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6382 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6383 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6384 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6385 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6386 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6387 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6388 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6389 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6390 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6391 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6392 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6393 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6394 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6395 else if (efer_slot >= 0) 6396 pr_err("EFER= 0x%016llx (autoload)\n", 6397 vmx->msr_autoload.guest.val[efer_slot].value); 6398 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6399 pr_err("EFER= 0x%016llx (effective)\n", 6400 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6401 else 6402 pr_err("EFER= 0x%016llx (effective)\n", 6403 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6404 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6405 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6406 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6407 vmcs_read64(GUEST_IA32_DEBUGCTL), 6408 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6409 if (cpu_has_load_perf_global_ctrl() && 6410 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6411 pr_err("PerfGlobCtl = 0x%016llx\n", 6412 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6413 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6414 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6415 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6416 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6417 vmcs_read32(GUEST_ACTIVITY_STATE)); 6418 if (secondary_exec_control & 
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6419 pr_err("InterruptStatus = %04x\n", 6420 vmcs_read16(GUEST_INTR_STATUS)); 6421 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6422 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6423 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6424 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6425 6426 pr_err("*** Host State ***\n"); 6427 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6428 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6429 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6430 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6431 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6432 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6433 vmcs_read16(HOST_TR_SELECTOR)); 6434 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6435 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6436 vmcs_readl(HOST_TR_BASE)); 6437 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6438 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6439 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6440 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6441 vmcs_readl(HOST_CR4)); 6442 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6443 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6444 vmcs_read32(HOST_IA32_SYSENTER_CS), 6445 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6446 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6447 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6448 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6449 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6450 if (cpu_has_load_perf_global_ctrl() && 6451 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6452 pr_err("PerfGlobCtl = 0x%016llx\n", 6453 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6454 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6455 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6456 6457 pr_err("*** Control State ***\n"); 6458 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 
6459 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6460 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6461 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6462 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6463 vmcs_read32(EXCEPTION_BITMAP), 6464 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6465 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6466 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6467 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6468 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6469 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6470 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6471 vmcs_read32(VM_EXIT_INTR_INFO), 6472 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6473 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6474 pr_err(" reason=%08x qualification=%016lx\n", 6475 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6476 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6477 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6478 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6479 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6480 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6481 pr_err("TSC Multiplier = 0x%016llx\n", 6482 vmcs_read64(TSC_MULTIPLIER)); 6483 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6484 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6485 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6486 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6487 } 6488 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6489 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6490 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6491 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6492 } 6493 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6494 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6495 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 
6496 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6497 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6498 pr_err("PLE Gap=%08x Window=%08x\n", 6499 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6500 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6501 pr_err("Virtual processor ID = 0x%04x\n", 6502 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6503 if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { 6504 struct vmx_ve_information *ve_info = vmx->ve_info; 6505 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); 6506 6507 /* 6508 * If KVM is dumping the VMCS, then something has gone wrong 6509 * already. Derefencing an address from the VMCS, which could 6510 * very well be corrupted, is a terrible idea. The virtual 6511 * address is known so use it. 6512 */ 6513 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, 6514 ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); 6515 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", 6516 ve_info->exit_reason, ve_info->delivery, 6517 ve_info->exit_qualification, 6518 ve_info->guest_linear_address, 6519 ve_info->guest_physical_address, ve_info->eptp_index); 6520 } 6521 } 6522 6523 /* 6524 * The guest has exited. See if we can fix it or if we need userspace 6525 * assistance. 6526 */ 6527 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6528 { 6529 struct vcpu_vmx *vmx = to_vmx(vcpu); 6530 union vmx_exit_reason exit_reason = vmx->exit_reason; 6531 u32 vectoring_info = vmx->idt_vectoring_info; 6532 u16 exit_handler_index; 6533 6534 /* 6535 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 6536 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 6537 * querying dirty_bitmap, we only need to kick all vcpus out of guest 6538 * mode as if vcpus is in root mode, the PML buffer must has been 6539 * flushed already. Note, PML is never enabled in hardware while 6540 * running L2. 
6541 */ 6542 if (enable_pml && !is_guest_mode(vcpu)) 6543 vmx_flush_pml_buffer(vcpu); 6544 6545 /* 6546 * KVM should never reach this point with a pending nested VM-Enter. 6547 * More specifically, short-circuiting VM-Entry to emulate L2 due to 6548 * invalid guest state should never happen as that means KVM knowingly 6549 * allowed a nested VM-Enter with an invalid vmcs12. More below. 6550 */ 6551 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) 6552 return -EIO; 6553 6554 if (is_guest_mode(vcpu)) { 6555 /* 6556 * PML is never enabled when running L2, bail immediately if a 6557 * PML full exit occurs as something is horribly wrong. 6558 */ 6559 if (exit_reason.basic == EXIT_REASON_PML_FULL) 6560 goto unexpected_vmexit; 6561 6562 /* 6563 * The host physical addresses of some pages of guest memory 6564 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 6565 * Page). The CPU may write to these pages via their host 6566 * physical address while L2 is running, bypassing any 6567 * address-translation-based dirty tracking (e.g. EPT write 6568 * protection). 6569 * 6570 * Mark them dirty on every exit from L2 to prevent them from 6571 * getting out of sync with dirty tracking. 6572 */ 6573 nested_mark_vmcs12_pages_dirty(vcpu); 6574 6575 /* 6576 * Synthesize a triple fault if L2 state is invalid. In normal 6577 * operation, nested VM-Enter rejects any attempt to enter L2 6578 * with invalid state. However, those checks are skipped if 6579 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If 6580 * L2 state is invalid, it means either L1 modified SMRAM state 6581 * or userspace provided bad state. Synthesize TRIPLE_FAULT as 6582 * doing so is architecturally allowed in the RSM case, and is 6583 * the least awful solution for the userspace case without 6584 * risking false positives. 
6585 */ 6586 if (vmx->emulation_required) { 6587 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 6588 return 1; 6589 } 6590 6591 if (nested_vmx_reflect_vmexit(vcpu)) 6592 return 1; 6593 } 6594 6595 /* If guest state is invalid, start emulating. L2 is handled above. */ 6596 if (vmx->emulation_required) 6597 return handle_invalid_guest_state(vcpu); 6598 6599 if (exit_reason.failed_vmentry) { 6600 dump_vmcs(vcpu); 6601 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6602 vcpu->run->fail_entry.hardware_entry_failure_reason 6603 = exit_reason.full; 6604 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6605 return 0; 6606 } 6607 6608 if (unlikely(vmx->fail)) { 6609 dump_vmcs(vcpu); 6610 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6611 vcpu->run->fail_entry.hardware_entry_failure_reason 6612 = vmcs_read32(VM_INSTRUCTION_ERROR); 6613 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6614 return 0; 6615 } 6616 6617 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6618 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && 6619 exit_reason.basic != EXIT_REASON_EPT_VIOLATION && 6620 exit_reason.basic != EXIT_REASON_PML_FULL && 6621 exit_reason.basic != EXIT_REASON_APIC_ACCESS && 6622 exit_reason.basic != EXIT_REASON_TASK_SWITCH && 6623 exit_reason.basic != EXIT_REASON_NOTIFY && 6624 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { 6625 kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); 6626 return 0; 6627 } 6628 6629 if (unlikely(!enable_vnmi && 6630 vmx->loaded_vmcs->soft_vnmi_blocked)) { 6631 if (!vmx_interrupt_blocked(vcpu)) { 6632 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6633 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && 6634 vcpu->arch.nmi_pending) { 6635 /* 6636 * This CPU don't support us in finding the end of an 6637 * NMI-blocked window if the guest runs with IRQs 6638 * disabled. So we pull the trigger after 1 s of 6639 * futile waiting, but inform the user about this. 
6640 */ 6641 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6642 "state on VCPU %d after 1 s timeout\n", 6643 __func__, vcpu->vcpu_id); 6644 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6645 } 6646 } 6647 6648 if (exit_fastpath != EXIT_FASTPATH_NONE) 6649 return 1; 6650 6651 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6652 goto unexpected_vmexit; 6653 #ifdef CONFIG_MITIGATION_RETPOLINE 6654 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6655 return kvm_emulate_wrmsr(vcpu); 6656 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6657 return handle_preemption_timer(vcpu); 6658 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6659 return handle_interrupt_window(vcpu); 6660 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6661 return handle_external_interrupt(vcpu); 6662 else if (exit_reason.basic == EXIT_REASON_HLT) 6663 return kvm_emulate_halt(vcpu); 6664 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6665 return handle_ept_misconfig(vcpu); 6666 #endif 6667 6668 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6669 kvm_vmx_max_exit_handlers); 6670 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6671 goto unexpected_vmexit; 6672 6673 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6674 6675 unexpected_vmexit: 6676 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 6677 exit_reason.full); 6678 dump_vmcs(vcpu); 6679 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6680 vcpu->run->internal.suberror = 6681 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 6682 vcpu->run->internal.ndata = 2; 6683 vcpu->run->internal.data[0] = exit_reason.full; 6684 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 6685 return 0; 6686 } 6687 6688 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6689 { 6690 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6691 6692 /* 6693 * Exit to user space when bus lock detected to inform that there is 6694 * a bus lock in guest. 
6695 */ 6696 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { 6697 if (ret > 0) 6698 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6699 6700 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6701 return 0; 6702 } 6703 return ret; 6704 } 6705 6706 /* 6707 * Software based L1D cache flush which is used when microcode providing 6708 * the cache control MSR is not loaded. 6709 * 6710 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6711 * flush it is required to read in 64 KiB because the replacement algorithm 6712 * is not exactly LRU. This could be sized at runtime via topology 6713 * information but as all relevant affected CPUs have 32KiB L1D cache size 6714 * there is no point in doing so. 6715 */ 6716 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6717 { 6718 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6719 6720 /* 6721 * This code is only executed when the flush mode is 'cond' or 6722 * 'always' 6723 */ 6724 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6725 bool flush_l1d; 6726 6727 /* 6728 * Clear the per-vcpu flush bit, it gets set again if the vCPU 6729 * is reloaded, i.e. if the vCPU is scheduled out or if KVM 6730 * exits to userspace, or if KVM reaches one of the unsafe 6731 * VMEXIT handlers, e.g. if KVM calls into the emulator. 6732 */ 6733 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6734 vcpu->arch.l1tf_flush_l1d = false; 6735 6736 /* 6737 * Clear the per-cpu flush bit, it gets set again from 6738 * the interrupt handlers. 
6739 */ 6740 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6741 kvm_clear_cpu_l1tf_flush_l1d(); 6742 6743 if (!flush_l1d) 6744 return; 6745 } 6746 6747 vcpu->stat.l1d_flush++; 6748 6749 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6750 native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6751 return; 6752 } 6753 6754 asm volatile( 6755 /* First ensure the pages are in the TLB */ 6756 "xorl %%eax, %%eax\n" 6757 ".Lpopulate_tlb:\n\t" 6758 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6759 "addl $4096, %%eax\n\t" 6760 "cmpl %%eax, %[size]\n\t" 6761 "jne .Lpopulate_tlb\n\t" 6762 "xorl %%eax, %%eax\n\t" 6763 "cpuid\n\t" 6764 /* Now fill the cache */ 6765 "xorl %%eax, %%eax\n" 6766 ".Lfill_cache:\n" 6767 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6768 "addl $64, %%eax\n\t" 6769 "cmpl %%eax, %[size]\n\t" 6770 "jne .Lfill_cache\n\t" 6771 "lfence\n" 6772 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6773 [size] "r" (size) 6774 : "eax", "ebx", "ecx", "edx"); 6775 } 6776 6777 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6778 { 6779 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6780 int tpr_threshold; 6781 6782 if (is_guest_mode(vcpu) && 6783 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6784 return; 6785 6786 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6787 if (is_guest_mode(vcpu)) 6788 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6789 else 6790 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6791 } 6792 6793 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6794 { 6795 struct vcpu_vmx *vmx = to_vmx(vcpu); 6796 u32 sec_exec_control; 6797 6798 if (!lapic_in_kernel(vcpu)) 6799 return; 6800 6801 if (!flexpriority_enabled && 6802 !cpu_has_vmx_virtualize_x2apic_mode()) 6803 return; 6804 6805 /* Postpone execution until vmcs01 is the current VMCS. 
*/ 6806 if (is_guest_mode(vcpu)) { 6807 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6808 return; 6809 } 6810 6811 sec_exec_control = secondary_exec_controls_get(vmx); 6812 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6813 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6814 6815 switch (kvm_get_apic_mode(vcpu)) { 6816 case LAPIC_MODE_INVALID: 6817 WARN_ONCE(true, "Invalid local APIC state"); 6818 break; 6819 case LAPIC_MODE_DISABLED: 6820 break; 6821 case LAPIC_MODE_XAPIC: 6822 if (flexpriority_enabled) { 6823 sec_exec_control |= 6824 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6825 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6826 6827 /* 6828 * Flush the TLB, reloading the APIC access page will 6829 * only do so if its physical address has changed, but 6830 * the guest may have inserted a non-APIC mapping into 6831 * the TLB while the APIC access page was disabled. 6832 */ 6833 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6834 } 6835 break; 6836 case LAPIC_MODE_X2APIC: 6837 if (cpu_has_vmx_virtualize_x2apic_mode()) 6838 sec_exec_control |= 6839 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6840 break; 6841 } 6842 secondary_exec_controls_set(vmx, sec_exec_control); 6843 6844 vmx_update_msr_bitmap_x2apic(vcpu); 6845 } 6846 6847 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6848 { 6849 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6850 struct kvm *kvm = vcpu->kvm; 6851 struct kvm_memslots *slots = kvm_memslots(kvm); 6852 struct kvm_memory_slot *slot; 6853 struct page *refcounted_page; 6854 unsigned long mmu_seq; 6855 kvm_pfn_t pfn; 6856 bool writable; 6857 6858 /* Defer reload until vmcs01 is the current VMCS. 
*/ 6859 if (is_guest_mode(vcpu)) { 6860 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6861 return; 6862 } 6863 6864 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6865 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6866 return; 6867 6868 /* 6869 * Explicitly grab the memslot using KVM's internal slot ID to ensure 6870 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 6871 * be impossible for userspace to create a memslot for the APIC when 6872 * APICv is enabled, but paranoia won't hurt in this case. 6873 */ 6874 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6875 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6876 return; 6877 6878 /* 6879 * Ensure that the mmu_notifier sequence count is read before KVM 6880 * retrieves the pfn from the primary MMU. Note, the memslot is 6881 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6882 * in kvm_mmu_invalidate_end(). 6883 */ 6884 mmu_seq = kvm->mmu_invalidate_seq; 6885 smp_rmb(); 6886 6887 /* 6888 * No need to retry if the memslot does not exist or is invalid. KVM 6889 * controls the APIC-access page memslot, and only deletes the memslot 6890 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6891 */ 6892 pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); 6893 if (is_error_noslot_pfn(pfn)) 6894 return; 6895 6896 read_lock(&vcpu->kvm->mmu_lock); 6897 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) 6898 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6899 else 6900 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6901 6902 /* 6903 * Do not pin the APIC access page in memory so that it can be freely 6904 * migrated, the MMU notifier will call us again if it is migrated or 6905 * swapped out. KVM backs the memslot with anonymous memory, the pfn 6906 * should always point at a refcounted page (if the pfn is valid). 
6907 */ 6908 if (!WARN_ON_ONCE(!refcounted_page)) 6909 kvm_release_page_clean(refcounted_page); 6910 6911 /* 6912 * No need for a manual TLB flush at this point, KVM has already done a 6913 * flush if there were SPTEs pointing at the previous page. 6914 */ 6915 read_unlock(&vcpu->kvm->mmu_lock); 6916 } 6917 6918 void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 6919 { 6920 u16 status; 6921 u8 old; 6922 6923 /* 6924 * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI 6925 * is only relevant for if and only if Virtual Interrupt Delivery is 6926 * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's 6927 * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested 6928 * VM-Exit, otherwise L1 with run with a stale SVI. 6929 */ 6930 if (is_guest_mode(vcpu)) { 6931 /* 6932 * KVM is supposed to forward intercepted L2 EOIs to L1 if VID 6933 * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. 6934 * Note, userspace can stuff state while L2 is active; assert 6935 * that VID is disabled if and only if the vCPU is in KVM_RUN 6936 * to avoid false positives if userspace is setting APIC state. 
6937 */ 6938 WARN_ON_ONCE(vcpu->wants_to_run && 6939 nested_cpu_has_vid(get_vmcs12(vcpu))); 6940 to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; 6941 return; 6942 } 6943 6944 if (max_isr == -1) 6945 max_isr = 0; 6946 6947 status = vmcs_read16(GUEST_INTR_STATUS); 6948 old = status >> 8; 6949 if (max_isr != old) { 6950 status &= 0xff; 6951 status |= max_isr << 8; 6952 vmcs_write16(GUEST_INTR_STATUS, status); 6953 } 6954 } 6955 6956 static void vmx_set_rvi(int vector) 6957 { 6958 u16 status; 6959 u8 old; 6960 6961 if (vector == -1) 6962 vector = 0; 6963 6964 status = vmcs_read16(GUEST_INTR_STATUS); 6965 old = (u8)status & 0xff; 6966 if ((u8)vector != old) { 6967 status &= ~0xff; 6968 status |= (u8)vector; 6969 vmcs_write16(GUEST_INTR_STATUS, status); 6970 } 6971 } 6972 6973 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6974 { 6975 struct vcpu_vmx *vmx = to_vmx(vcpu); 6976 int max_irr; 6977 bool got_posted_interrupt; 6978 6979 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6980 return -EIO; 6981 6982 if (pi_test_on(&vmx->pi_desc)) { 6983 pi_clear_on(&vmx->pi_desc); 6984 /* 6985 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6986 * But on x86 this is just a compiler barrier anyway. 6987 */ 6988 smp_mb__after_atomic(); 6989 got_posted_interrupt = 6990 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6991 } else { 6992 max_irr = kvm_lapic_find_highest_irr(vcpu); 6993 got_posted_interrupt = false; 6994 } 6995 6996 /* 6997 * Newly recognized interrupts are injected via either virtual interrupt 6998 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6999 * disabled in two cases: 7000 * 7001 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 7002 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 7003 * VM-Exit to L1. 
If L1 doesn't want to exit, the interrupt is injected 7004 * into L2, but KVM doesn't use virtual interrupt delivery to inject 7005 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 7006 * 7007 * 2) If APICv is disabled for this vCPU, assigned devices may still 7008 * attempt to post interrupts. The posted interrupt vector will cause 7009 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 7010 */ 7011 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 7012 vmx_set_rvi(max_irr); 7013 else if (got_posted_interrupt) 7014 kvm_make_request(KVM_REQ_EVENT, vcpu); 7015 7016 return max_irr; 7017 } 7018 7019 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 7020 { 7021 if (!kvm_vcpu_apicv_active(vcpu)) 7022 return; 7023 7024 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 7025 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 7026 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 7027 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 7028 } 7029 7030 void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) 7031 { 7032 struct vcpu_vmx *vmx = to_vmx(vcpu); 7033 7034 pi_clear_on(&vmx->pi_desc); 7035 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 7036 } 7037 7038 void vmx_do_interrupt_irqoff(unsigned long entry); 7039 void vmx_do_nmi_irqoff(void); 7040 7041 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 7042 { 7043 /* 7044 * Save xfd_err to guest_fpu before interrupt is enabled, so the 7045 * MSR value is not clobbered by the host activity before the guest 7046 * has chance to consume it. 7047 * 7048 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM 7049 * interception may have been caused by L1 interception. Per the SDM, 7050 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. 7051 * 7052 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. 7053 * unlike CR2 and DR6, the value is not a payload that is attached to 7054 * the #NM exception. 
7055 */ 7056 if (is_xfd_nm_fault(vcpu)) 7057 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 7058 } 7059 7060 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 7061 { 7062 /* if exit due to PF check for async PF */ 7063 if (is_page_fault(intr_info)) 7064 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 7065 /* if exit due to NM, handle before interrupts are enabled */ 7066 else if (is_nm_fault(intr_info)) 7067 handle_nm_fault_irqoff(vcpu); 7068 /* Handle machine checks before interrupts are enabled */ 7069 else if (is_machine_check(intr_info)) 7070 kvm_machine_check(); 7071 } 7072 7073 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 7074 u32 intr_info) 7075 { 7076 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 7077 7078 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 7079 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 7080 return; 7081 7082 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7083 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7084 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7085 else 7086 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7087 kvm_after_interrupt(vcpu); 7088 7089 vcpu->arch.at_instruction_boundary = true; 7090 } 7091 7092 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7093 { 7094 struct vcpu_vmx *vmx = to_vmx(vcpu); 7095 7096 if (vmx->emulation_required) 7097 return; 7098 7099 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7100 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7101 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) 7102 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7103 } 7104 7105 /* 7106 * The kvm parameter can be NULL (module initialization, or invocation before 7107 * VM creation). Be sure to check the kvm parameter before using it. 
7108 */ 7109 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7110 { 7111 switch (index) { 7112 case MSR_IA32_SMBASE: 7113 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7114 return false; 7115 /* 7116 * We cannot do SMM unless we can run the guest in big 7117 * real mode. 7118 */ 7119 return enable_unrestricted_guest || emulate_invalid_guest_state; 7120 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7121 return nested; 7122 case MSR_AMD64_VIRT_SPEC_CTRL: 7123 case MSR_AMD64_TSC_RATIO: 7124 /* This is AMD only. */ 7125 return false; 7126 default: 7127 return true; 7128 } 7129 } 7130 7131 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7132 { 7133 u32 exit_intr_info; 7134 bool unblock_nmi; 7135 u8 vector; 7136 bool idtv_info_valid; 7137 7138 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7139 7140 if (enable_vnmi) { 7141 if (vmx->loaded_vmcs->nmi_known_unmasked) 7142 return; 7143 7144 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7145 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7146 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7147 /* 7148 * SDM 3: 27.7.1.2 (September 2008) 7149 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7150 * a guest IRET fault. 7151 * SDM 3: 23.2.2 (September 2008) 7152 * Bit 12 is undefined in any of the following cases: 7153 * If the VM exit sets the valid bit in the IDT-vectoring 7154 * information field. 7155 * If the VM exit is due to a double fault. 
7156 */ 7157 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7158 vector != DF_VECTOR && !idtv_info_valid) 7159 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7160 GUEST_INTR_STATE_NMI); 7161 else 7162 vmx->loaded_vmcs->nmi_known_unmasked = 7163 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7164 & GUEST_INTR_STATE_NMI); 7165 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7166 vmx->loaded_vmcs->vnmi_blocked_time += 7167 ktime_to_ns(ktime_sub(ktime_get(), 7168 vmx->loaded_vmcs->entry_time)); 7169 } 7170 7171 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7172 u32 idt_vectoring_info, 7173 int instr_len_field, 7174 int error_code_field) 7175 { 7176 u8 vector; 7177 int type; 7178 bool idtv_info_valid; 7179 7180 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7181 7182 vcpu->arch.nmi_injected = false; 7183 kvm_clear_exception_queue(vcpu); 7184 kvm_clear_interrupt_queue(vcpu); 7185 7186 if (!idtv_info_valid) 7187 return; 7188 7189 kvm_make_request(KVM_REQ_EVENT, vcpu); 7190 7191 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7192 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7193 7194 switch (type) { 7195 case INTR_TYPE_NMI_INTR: 7196 vcpu->arch.nmi_injected = true; 7197 /* 7198 * SDM 3: 27.7.1.2 (September 2008) 7199 * Clear bit "block by NMI" before VM entry if a NMI 7200 * delivery faulted. 
7201 */ 7202 vmx_set_nmi_mask(vcpu, false); 7203 break; 7204 case INTR_TYPE_SOFT_EXCEPTION: 7205 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7206 fallthrough; 7207 case INTR_TYPE_HARD_EXCEPTION: { 7208 u32 error_code = 0; 7209 7210 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) 7211 error_code = vmcs_read32(error_code_field); 7212 7213 kvm_requeue_exception(vcpu, vector, 7214 idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, 7215 error_code); 7216 break; 7217 } 7218 case INTR_TYPE_SOFT_INTR: 7219 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7220 fallthrough; 7221 case INTR_TYPE_EXT_INTR: 7222 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7223 break; 7224 default: 7225 break; 7226 } 7227 } 7228 7229 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7230 { 7231 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7232 VM_EXIT_INSTRUCTION_LEN, 7233 IDT_VECTORING_ERROR_CODE); 7234 } 7235 7236 void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7237 { 7238 __vmx_complete_interrupts(vcpu, 7239 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7240 VM_ENTRY_INSTRUCTION_LEN, 7241 VM_ENTRY_EXCEPTION_ERROR_CODE); 7242 7243 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7244 } 7245 7246 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7247 { 7248 int i, nr_msrs; 7249 struct perf_guest_switch_msr *msrs; 7250 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7251 7252 pmu->host_cross_mapped_mask = 0; 7253 if (pmu->pebs_enable & pmu->global_ctrl) 7254 intel_pmu_cross_mapped_check(pmu); 7255 7256 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
*/ 7257 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7258 if (!msrs) 7259 return; 7260 7261 for (i = 0; i < nr_msrs; i++) 7262 if (msrs[i].host == msrs[i].guest) 7263 clear_atomic_switch_msr(vmx, msrs[i].msr); 7264 else 7265 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7266 msrs[i].host, false); 7267 } 7268 7269 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7270 { 7271 struct vcpu_vmx *vmx = to_vmx(vcpu); 7272 u64 tscl; 7273 u32 delta_tsc; 7274 7275 if (force_immediate_exit) { 7276 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7277 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7278 } else if (vmx->hv_deadline_tsc != -1) { 7279 tscl = rdtsc(); 7280 if (vmx->hv_deadline_tsc > tscl) 7281 /* set_hv_timer ensures the delta fits in 32-bits */ 7282 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7283 cpu_preemption_timer_multi); 7284 else 7285 delta_tsc = 0; 7286 7287 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7288 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7289 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7290 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7291 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7292 } 7293 } 7294 7295 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7296 { 7297 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7298 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7299 vmcs_writel(HOST_RSP, host_rsp); 7300 } 7301 } 7302 7303 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7304 unsigned int flags) 7305 { 7306 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7307 7308 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7309 return; 7310 7311 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7312 vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); 7313 7314 /* 7315 * If the guest/host SPEC_CTRL values differ, restore the host value. 
/*
 * Try to handle the VM-Exit in the fastpath.  Returns the handler's
 * fastpath_t result, or EXIT_FASTPATH_NONE when the exit reason has no
 * fastpath handler (or is not eligible while L2 is active).
 */
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
					     bool force_immediate_exit)
{
	/*
	 * If L2 is active, only VMX preemption timer exits are candidates for
	 * the fastpath; all other exits must use the slow path.
	 */
	if (is_guest_mode(vcpu) &&
	    to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
		return EXIT_FASTPATH_NONE;

	switch (to_vmx(vcpu)->exit_reason.basic) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_set_msr_irqoff(vcpu);
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
	case EXIT_REASON_HLT:
		return handle_fastpath_hlt(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}

/*
 * The innermost VM-Enter/VM-Exit sequence, bracketed by
 * guest_state_enter_irqoff() and guest_state_exit_irqoff().
 *
 * @flags is passed through to __vmx_vcpu_run().  On return, vmx->fail,
 * vmx->exit_reason and vmx->idt_vectoring_info describe the outcome, and an
 * NMI that caused the exit has already been forwarded to the host handler.
 */
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					unsigned int flags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	guest_state_enter_irqoff();

	/*
	 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
	 * mitigation for MDS is done late in VMentry and is still
	 * executed in spite of L1D Flush. This is because an extra VERW
	 * should not matter much after the big hammer L1D Flush.
	 *
	 * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA,
	 * and is affected by MMIO Stale Data. In such cases mitigation is only
	 * needed against an MMIO capable guest.
	 */
	if (static_branch_unlikely(&vmx_l1d_should_flush))
		vmx_l1d_flush(vcpu);
	else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
		 kvm_arch_has_assigned_device(vcpu->kvm))
		mds_clear_cpu_buffers();

	vmx_disable_fb_clear(vmx);

	/* Load the guest's CR2, skipping the write if it already matches. */
	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				   flags);

	/* Capture the guest's CR2 and invalidate the lazily-loaded registers. */
	vcpu->arch.cr2 = native_read_cr2();
	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;

	/* Default to "no vectoring info"; refreshed below on a real VM-Exit. */
	vmx->idt_vectoring_info = 0;

	vmx_enable_fb_clear(vmx);

	if (unlikely(vmx->fail)) {
		/* No VM-Exit occurred; store a recognizable bogus reason. */
		vmx->exit_reason.full = 0xdead;
		goto out;
	}

	vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
	if (likely(!vmx->exit_reason.failed_vmentry))
		vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	/* Forward an NMI that caused the exit to the host's NMI handler. */
	if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
	    is_nmi(vmx_get_intr_info(vcpu))) {
		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
		if (cpu_feature_enabled(X86_FEATURE_FRED))
			fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
		else
			vmx_do_nmi_irqoff();
		kvm_after_interrupt(vcpu);
	}

out:
	guest_state_exit_irqoff();
}
/*
 * Run the vCPU once: flush dirty state into the VMCS, enter the guest, and
 * perform the post-exit bookkeeping that must precede exit handling.
 *
 * @force_immediate_exit: request that the guest exit right after entry
 *
 * Returns EXIT_FASTPATH_NONE if VM-Enter was skipped or failed, otherwise
 * the result of vmx_exit_handlers_fastpath().
 */
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/*
	 * Don't enter VMX if guest state is invalid, let the exit handler
	 * start emulation until we arrive back to a valid state.  Synthesize a
	 * consistency check VM-Exit due to invalid guest state and bail.
	 */
	if (unlikely(vmx->emulation_required)) {
		vmx->fail = 0;

		vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
		vmx->exit_reason.failed_vmentry = 1;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
		vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
		vmx->exit_intr_info = 0;
		return EXIT_FASTPATH_NONE;
	}

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* Write back a changed PLE window. */
	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
	 */
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

	/* Flush dirty RSP/RIP to the VMCS, then mark all registers clean. */
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
	vcpu->arch.regs_dirty = 0;

	/*
	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
	 * it switches back to the current->mm, which can occur in KVM context
	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
	 * toggles a static key while handling a VM-Exit.
	 */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	/* Likewise refresh vmcs.HOST_CR4 if the shadowed CR4 has changed. */
	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/*
	 * When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state.  Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case.
	 */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	kvm_load_guest_xsave_state(vcpu);

	pt_guest_enter(vmx);

	atomic_switch_perf_msrs(vmx);
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	/*
	 * Arm the preemption timer (which also implements a forced immediate
	 * exit); without it, a forced exit is requested via a reschedule IPI
	 * to the vCPU's own CPU.
	 */
	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu, force_immediate_exit);
	else if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	kvm_wait_lapic_expire(vcpu);

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));

	/* All fields are clean at this point */
	if (kvm_is_using_evmcs()) {
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
	}

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vcpu->arch.host_debugctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restores segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	pt_guest_exit(vmx);

	kvm_load_host_xsave_state(vcpu);

	if (is_guest_mode(vcpu)) {
		/*
		 * Track VMLAUNCH/VMRESUME that have made it past guest state
		 * checking.
		 */
		if (vmx->nested.nested_run_pending &&
		    !vmx->exit_reason.failed_vmentry)
			++vcpu->stat.nested_run;

		vmx->nested.nested_run_pending = 0;
	}

	/* VM-Enter itself failed, there is no VM-Exit to process. */
	if (unlikely(vmx->fail))
		return EXIT_FASTPATH_NONE;

	if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
		kvm_machine_check();

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(vmx->exit_reason.failed_vmentry))
		return EXIT_FASTPATH_NONE;

	/* Entry succeeded: record that this VMCS has been launched. */
	vmx->loaded_vmcs->launched = 1;

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
/*
 * Release the per-vCPU VMX resources: the PML buffer (when PML is enabled),
 * the VPID, nested state, the loaded VMCS and the ve_info page.
 */
void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
	free_page((unsigned long)vmx->ve_info);
}

/*
 * Allocate and initialize the VMX-specific state of a new vCPU.
 *
 * Returns 0 on success.  On failure, returns -ENOMEM or the error reported
 * by the failing helper, after unwinding what was set up so far via the
 * free_* labels at the bottom.
 */
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vmx_uret_msr *tsx_ctrl;
	struct vcpu_vmx *vmx;
	int i, err;

	/* The embedded kvm_vcpu must be the first member of vcpu_vmx. */
	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	INIT_LIST_HEAD(&vmx->pi_wakeup_list);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest), etc.
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	/* By default, every bit of every user-return MSR is guest-visible. */
	for (i = 0; i < kvm_nr_uret_msrs; ++i)
		vmx->guest_uret_msrs[i].mask = -1ull;
	if (boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
		 * Keep the host value unchanged to avoid changing CPUID bits
		 * under the host kernel's feet.
		 */
		tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (tsx_ctrl)
			tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	/*
	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
	 * feature only for vmcs01, KVM currently isn't equipped to realize any
	 * performance benefits from enabling it for vmcs02.
	 */
	if (kvm_is_using_evmcs() &&
	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		evmcs->hv_enlightenments_control.msr_bitmap = 1;
	}

	/* The MSR bitmap starts with all ones */
	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

	/* MSRs for which interception is disabled from the start. */
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
#ifdef CONFIG_X86_64
	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
	if (kvm_cstate_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
	}

	vmx->loaded_vmcs = &vmx->vmcs01;

	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = kvm_alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	/* The helpers above may have left err at 0; re-arm it for the page allocation. */
	err = -ENOMEM;
	if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct page *page;

		BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);

		/* ve_info must be page aligned. */
		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!page)
			goto free_vmcs;

		vmx->ve_info = page_to_virt(page);
	}

	/* Publish this vCPU's PI descriptor in the VM's PID table for IPI virtualization. */
	if (vmx_can_use_ipiv(vcpu))
		WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
			   __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);

	return 0;

	/* Error unwinding, in reverse order of setup. */
free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}

/* One-time warnings printed by vmx_vm_init() when L1TF exposure is detected. */
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7697 7698 int vmx_vm_init(struct kvm *kvm) 7699 { 7700 if (!ple_gap) 7701 kvm->arch.pause_in_guest = true; 7702 7703 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7704 switch (l1tf_mitigation) { 7705 case L1TF_MITIGATION_OFF: 7706 case L1TF_MITIGATION_FLUSH_NOWARN: 7707 /* 'I explicitly don't care' is set */ 7708 break; 7709 case L1TF_MITIGATION_AUTO: 7710 case L1TF_MITIGATION_FLUSH: 7711 case L1TF_MITIGATION_FLUSH_NOSMT: 7712 case L1TF_MITIGATION_FULL: 7713 /* 7714 * Warn upon starting the first VM in a potentially 7715 * insecure environment. 7716 */ 7717 if (sched_smt_active()) 7718 pr_warn_once(L1TF_MSG_SMT); 7719 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7720 pr_warn_once(L1TF_MSG_L1D); 7721 break; 7722 case L1TF_MITIGATION_FULL_FORCE: 7723 /* Flush is enforced */ 7724 break; 7725 } 7726 } 7727 return 0; 7728 } 7729 7730 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7731 { 7732 /* 7733 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7734 * with cacheable accesses will result in Machine Checks. 7735 */ 7736 if (is_mmio) 7737 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7738 7739 /* 7740 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent 7741 * device attached. Letting the guest control memory types on Intel 7742 * CPUs may result in unexpected behavior, and so KVM's ABI is to trust 7743 * the guest to behave only as a last resort. 
7744 */ 7745 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) 7746 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7747 7748 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7749 } 7750 7751 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7752 { 7753 /* 7754 * These bits in the secondary execution controls field 7755 * are dynamic, the others are mostly based on the hypervisor 7756 * architecture and the guest's CPUID. Do not touch the 7757 * dynamic bits. 7758 */ 7759 u32 mask = 7760 SECONDARY_EXEC_SHADOW_VMCS | 7761 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7762 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7763 SECONDARY_EXEC_DESC; 7764 7765 u32 cur_ctl = secondary_exec_controls_get(vmx); 7766 7767 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7768 } 7769 7770 /* 7771 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 7772 * (indicating "allowed-1") if they are supported in the guest's CPUID. 
/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 *
 * CR0_FIXED1 is set to 0xffffffff unconditionally.  CR4_FIXED1 starts out as
 * X86_CR4_PCE and gains one bit per CPUID feature found below.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

/*
 * Allow @_cr4_mask in CR4_FIXED1 if @_cpuid_mask is set in register @_reg of
 * the current 'entry'.  A NULL entry (CPUID leaf not present) allows nothing.
 */
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
} while (0)

	/* CPUID 0x1.  Note, both CR4.VME and CR4.PVI are keyed off the VME bit. */
	entry = kvm_find_cpuid_entry(vcpu, 0x1);
	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

	/* CPUID 0x7, subleaf 0 */
	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));

	/* CPUID 0x7, subleaf 1 */
	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
	cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));

#undef cr4_fixed1_update
}
/*
 * Rebuild the vCPU's Intel PT descriptor from guest CPUID leaf 0x14: cache
 * the raw capability registers, the number of address ranges, and the mask
 * of RTIT_CTL bits that are NOT allowed for the guest (ctl_bitmask).
 *
 * If any of the PT_CPUID_LEAVES subleaves is missing, the function returns
 * right away, leaving the remaining descriptor fields untouched.
 */
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	/* Snapshot EAX..EDX of each subleaf, PT_CPUID_REGS_NUM slots per subleaf. */
	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
		if (!best)
			return;
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/*
	 * Start with every bit disallowed except those that have no CPUID
	 * dependency; capability-gated bits are cleared from the mask below.
	 */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
			RTIT_CTL_BRANCH_EN);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
	 * setting it will inject a #GP.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
					      RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
							RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* Unmask the 4-bit configuration field of each supported address range. */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
/*
 * Re-derive the VMX state that depends on guest CPUID after userspace has
 * changed it: guest capability fixups, secondary execution controls,
 * IA32_FEATURE_CONTROL valid bits, nested CR fixed bits, Intel PT
 * configuration, TSX_CTRL, MSR intercepts and the exception bitmap.
 */
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * XSAVES is effectively enabled if and only if XSAVE is also exposed
	 * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
	 * set if and only if XSAVE is supported.
	 */
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
		guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);

	vmx_setup_uret_msrs(vmx);

	if (cpu_has_secondary_exec_ctrls())
		vmcs_set_secondary_exec_control(vmx,
						vmx_secondary_exec_control(vmx));

	/* The VMX enable bits of IA32_FEATURE_CONTROL are valid only with VMX. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		nested_vmx_cr_fixed1_bits_update(vcpu);

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
			guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);

	/* Set the guest's TSX_CTRL to disable RTM when RTM isn't exposed to the guest. */
	if (boot_cpu_has(X86_FEATURE_RTM)) {
		struct vmx_uret_msr *msr;
		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (msr) {
			bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
		}
	}

	/*
	 * For each MSR below, interception is enabled exactly when the guest
	 * lacks the corresponding feature.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));

	if (boot_cpu_has(X86_FEATURE_IBPB))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
					  !guest_has_pred_cmd_msr(vcpu));

	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

	set_cr4_guest_host_mask(vmx);

	vmx_write_encls_bitmap(vcpu, NULL);
	/* The SGX bits of IA32_FEATURE_CONTROL follow the guest's SGX features. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_SGX_LC_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_SGX_LC_ENABLED;

	/* Refresh #PF interception to account for MAXPHYADDR changes. */
	vmx_update_exception_bitmap(vcpu);
}
0 : TSX_CTRL_RTM_DISABLE); 7926 } 7927 } 7928 7929 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7930 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7931 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 7932 7933 if (boot_cpu_has(X86_FEATURE_IBPB)) 7934 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 7935 !guest_has_pred_cmd_msr(vcpu)); 7936 7937 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7938 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7939 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7940 7941 set_cr4_guest_host_mask(vmx); 7942 7943 vmx_write_encls_bitmap(vcpu, NULL); 7944 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) 7945 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7946 else 7947 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7948 7949 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 7950 vmx->msr_ia32_feature_control_valid_bits |= 7951 FEAT_CTL_SGX_LC_ENABLED; 7952 else 7953 vmx->msr_ia32_feature_control_valid_bits &= 7954 ~FEAT_CTL_SGX_LC_ENABLED; 7955 7956 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7957 vmx_update_exception_bitmap(vcpu); 7958 } 7959 7960 static __init u64 vmx_get_perf_capabilities(void) 7961 { 7962 u64 perf_cap = PMU_CAP_FW_WRITES; 7963 u64 host_perf_cap = 0; 7964 7965 if (!enable_pmu) 7966 return 0; 7967 7968 if (boot_cpu_has(X86_FEATURE_PDCM)) 7969 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7970 7971 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7972 x86_perf_get_lbr(&vmx_lbr_caps); 7973 7974 /* 7975 * KVM requires LBR callstack support, as the overhead due to 7976 * context switching LBRs without said support is too high. 7977 * See intel_pmu_create_guest_lbr_event() for more info. 
7978 */ 7979 if (!vmx_lbr_caps.has_callstack) 7980 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 7981 else if (vmx_lbr_caps.nr) 7982 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7983 } 7984 7985 if (vmx_pebs_supported()) { 7986 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7987 7988 /* 7989 * Disallow adaptive PEBS as it is functionally broken, can be 7990 * used by the guest to read *host* LBRs, and can be used to 7991 * bypass userspace event filters. To correctly and safely 7992 * support adaptive PEBS, KVM needs to: 7993 * 7994 * 1. Account for the ADAPTIVE flag when (re)programming fixed 7995 * counters. 7996 * 7997 * 2. Gain support from perf (or take direct control of counter 7998 * programming) to support events without adaptive PEBS 7999 * enabled for the hardware counter. 8000 * 8001 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 8002 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 8003 * 8004 * 4. Document which PMU events are effectively exposed to the 8005 * guest via adaptive PEBS, and make adaptive PEBS mutually 8006 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
8007 */ 8008 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 8009 } 8010 8011 return perf_cap; 8012 } 8013 8014 static __init void vmx_set_cpu_caps(void) 8015 { 8016 kvm_set_cpu_caps(); 8017 8018 /* CPUID 0x1 */ 8019 if (nested) 8020 kvm_cpu_cap_set(X86_FEATURE_VMX); 8021 8022 /* CPUID 0x7 */ 8023 if (kvm_mpx_supported()) 8024 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 8025 if (!cpu_has_vmx_invpcid()) 8026 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 8027 if (vmx_pt_mode_is_host_guest()) 8028 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 8029 if (vmx_pebs_supported()) { 8030 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 8031 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 8032 } 8033 8034 if (!enable_pmu) 8035 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 8036 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 8037 8038 if (!enable_sgx) { 8039 kvm_cpu_cap_clear(X86_FEATURE_SGX); 8040 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 8041 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 8042 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 8043 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 8044 } 8045 8046 if (vmx_umip_emulated()) 8047 kvm_cpu_cap_set(X86_FEATURE_UMIP); 8048 8049 /* CPUID 0xD.1 */ 8050 kvm_caps.supported_xss = 0; 8051 if (!cpu_has_vmx_xsaves()) 8052 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 8053 8054 /* CPUID 0x80000001 and 0x7 (RDPID) */ 8055 if (!cpu_has_vmx_rdtscp()) { 8056 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 8057 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 8058 } 8059 8060 if (cpu_has_vmx_waitpkg()) 8061 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 8062 } 8063 8064 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, 8065 struct x86_instruction_info *info, 8066 unsigned long *exit_qualification) 8067 { 8068 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8069 unsigned short port; 8070 int size; 8071 bool imm; 8072 8073 /* 8074 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 8075 * VM-exits depend on the 'unconditional IO exiting' VM-execution 8076 * control. 
8077 * 8078 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 8079 */ 8080 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8081 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 8082 8083 if (info->intercept == x86_intercept_in || 8084 info->intercept == x86_intercept_ins) { 8085 port = info->src_val; 8086 size = info->dst_bytes; 8087 imm = info->src_type == OP_IMM; 8088 } else { 8089 port = info->dst_val; 8090 size = info->src_bytes; 8091 imm = info->dst_type == OP_IMM; 8092 } 8093 8094 8095 *exit_qualification = ((unsigned long)port << 16) | (size - 1); 8096 8097 if (info->intercept == x86_intercept_ins || 8098 info->intercept == x86_intercept_outs) 8099 *exit_qualification |= BIT(4); 8100 8101 if (info->rep_prefix) 8102 *exit_qualification |= BIT(5); 8103 8104 if (imm) 8105 *exit_qualification |= BIT(6); 8106 8107 return nested_vmx_check_io_bitmaps(vcpu, port, size); 8108 } 8109 8110 int vmx_check_intercept(struct kvm_vcpu *vcpu, 8111 struct x86_instruction_info *info, 8112 enum x86_intercept_stage stage, 8113 struct x86_exception *exception) 8114 { 8115 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8116 unsigned long exit_qualification = 0; 8117 u32 vm_exit_reason; 8118 u64 exit_insn_len; 8119 8120 switch (info->intercept) { 8121 case x86_intercept_rdpid: 8122 /* 8123 * RDPID causes #UD if not enabled through secondary execution 8124 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to 8125 * TSC_AUX is NOT subject to interception, i.e. checking only 8126 * the dedicated execution control is architecturally correct. 
8127 */ 8128 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8129 exception->vector = UD_VECTOR; 8130 exception->error_code_valid = false; 8131 return X86EMUL_PROPAGATE_FAULT; 8132 } 8133 return X86EMUL_CONTINUE; 8134 8135 case x86_intercept_in: 8136 case x86_intercept_ins: 8137 case x86_intercept_out: 8138 case x86_intercept_outs: 8139 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 8140 return X86EMUL_CONTINUE; 8141 8142 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 8143 break; 8144 8145 case x86_intercept_lgdt: 8146 case x86_intercept_lidt: 8147 case x86_intercept_lldt: 8148 case x86_intercept_ltr: 8149 case x86_intercept_sgdt: 8150 case x86_intercept_sidt: 8151 case x86_intercept_sldt: 8152 case x86_intercept_str: 8153 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8154 return X86EMUL_CONTINUE; 8155 8156 if (info->intercept == x86_intercept_lldt || 8157 info->intercept == x86_intercept_ltr || 8158 info->intercept == x86_intercept_sldt || 8159 info->intercept == x86_intercept_str) 8160 vm_exit_reason = EXIT_REASON_LDTR_TR; 8161 else 8162 vm_exit_reason = EXIT_REASON_GDTR_IDTR; 8163 /* 8164 * FIXME: Decode the ModR/M to generate the correct exit 8165 * qualification for memory operands. 8166 */ 8167 break; 8168 8169 case x86_intercept_hlt: 8170 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 8171 return X86EMUL_CONTINUE; 8172 8173 vm_exit_reason = EXIT_REASON_HLT; 8174 break; 8175 8176 case x86_intercept_pause: 8177 /* 8178 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8179 * with vanilla NOPs in the emulator. Apply the interception 8180 * check only to actual PAUSE instructions. Don't check 8181 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8182 * exit, i.e. KVM is within its rights to allow L2 to execute 8183 * the PAUSE. 
8184 */ 8185 if ((info->rep_prefix != REPE_PREFIX) || 8186 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) 8187 return X86EMUL_CONTINUE; 8188 8189 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; 8190 break; 8191 8192 /* TODO: check more intercepts... */ 8193 default: 8194 return X86EMUL_UNHANDLEABLE; 8195 } 8196 8197 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); 8198 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) 8199 return X86EMUL_UNHANDLEABLE; 8200 8201 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, 8202 exit_insn_len); 8203 return X86EMUL_INTERCEPTED; 8204 } 8205 8206 #ifdef CONFIG_X86_64 8207 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8208 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8209 u64 divisor, u64 *result) 8210 { 8211 u64 low = a << shift, high = a >> (64 - shift); 8212 8213 /* To avoid the overflow on divq */ 8214 if (high >= divisor) 8215 return 1; 8216 8217 /* Low hold the result, high hold rem which is discarded */ 8218 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8219 "rm" (divisor), "0" (low), "1" (high)); 8220 *result = low; 8221 8222 return 0; 8223 } 8224 8225 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8226 bool *expired) 8227 { 8228 struct vcpu_vmx *vmx; 8229 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8230 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8231 8232 vmx = to_vmx(vcpu); 8233 tscl = rdtsc(); 8234 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8235 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8236 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8237 ktimer->timer_advance_ns); 8238 8239 if (delta_tsc > lapic_timer_advance_cycles) 8240 delta_tsc -= lapic_timer_advance_cycles; 8241 else 8242 delta_tsc = 0; 8243 8244 /* Convert to host delta tsc if tsc scaling is enabled */ 8245 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8246 delta_tsc && 
u64_shl_div_u64(delta_tsc, 8247 kvm_caps.tsc_scaling_ratio_frac_bits, 8248 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8249 return -ERANGE; 8250 8251 /* 8252 * If the delta tsc can't fit in the 32 bit after the multi shift, 8253 * we can't use the preemption timer. 8254 * It's possible that it fits on later vmentries, but checking 8255 * on every vmentry is costly so we just use an hrtimer. 8256 */ 8257 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8258 return -ERANGE; 8259 8260 vmx->hv_deadline_tsc = tscl + delta_tsc; 8261 *expired = !delta_tsc; 8262 return 0; 8263 } 8264 8265 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8266 { 8267 to_vmx(vcpu)->hv_deadline_tsc = -1; 8268 } 8269 #endif 8270 8271 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8272 { 8273 struct vcpu_vmx *vmx = to_vmx(vcpu); 8274 8275 if (WARN_ON_ONCE(!enable_pml)) 8276 return; 8277 8278 if (is_guest_mode(vcpu)) { 8279 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8280 return; 8281 } 8282 8283 /* 8284 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8285 * code, but in that case another update request will be made and so 8286 * the guest will never run with a stale PML value. 
8287 */ 8288 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8289 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8290 else 8291 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8292 } 8293 8294 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8295 { 8296 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8297 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8298 FEAT_CTL_LMCE_ENABLED; 8299 else 8300 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8301 ~FEAT_CTL_LMCE_ENABLED; 8302 } 8303 8304 #ifdef CONFIG_KVM_SMM 8305 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8306 { 8307 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8308 if (to_vmx(vcpu)->nested.nested_run_pending) 8309 return -EBUSY; 8310 return !is_smm(vcpu); 8311 } 8312 8313 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8314 { 8315 struct vcpu_vmx *vmx = to_vmx(vcpu); 8316 8317 /* 8318 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8319 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8320 * SMI and RSM only modify state that is saved and restored via SMRAM. 8321 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8322 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 
8323 */ 8324 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8325 if (vmx->nested.smm.guest_mode) 8326 nested_vmx_vmexit(vcpu, -1, 0, 0); 8327 8328 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8329 vmx->nested.vmxon = false; 8330 vmx_clear_hlt(vcpu); 8331 return 0; 8332 } 8333 8334 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8335 { 8336 struct vcpu_vmx *vmx = to_vmx(vcpu); 8337 int ret; 8338 8339 if (vmx->nested.smm.vmxon) { 8340 vmx->nested.vmxon = true; 8341 vmx->nested.smm.vmxon = false; 8342 } 8343 8344 if (vmx->nested.smm.guest_mode) { 8345 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8346 if (ret) 8347 return ret; 8348 8349 vmx->nested.nested_run_pending = 1; 8350 vmx->nested.smm.guest_mode = false; 8351 } 8352 return 0; 8353 } 8354 8355 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8356 { 8357 /* RSM will cause a vmexit anyway. */ 8358 } 8359 #endif 8360 8361 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8362 { 8363 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8364 } 8365 8366 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8367 { 8368 if (is_guest_mode(vcpu)) { 8369 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8370 8371 if (hrtimer_try_to_cancel(timer) == 1) 8372 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8373 } 8374 } 8375 8376 void vmx_hardware_unsetup(void) 8377 { 8378 kvm_set_posted_intr_wakeup_handler(NULL); 8379 8380 if (nested) 8381 nested_vmx_hardware_unsetup(); 8382 8383 free_kvm_area(); 8384 } 8385 8386 void vmx_vm_destroy(struct kvm *kvm) 8387 { 8388 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8389 8390 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8391 } 8392 8393 /* 8394 * Note, the SDM states that the linear address is masked *after* the modified 8395 * canonicality check, whereas KVM masks (untags) the address and then performs 8396 * a "normal" canonicality check. 
Functionally, the two methods are identical, 8397 * and when the masking occurs relative to the canonicality check isn't visible 8398 * to software, i.e. KVM's behavior doesn't violate the SDM. 8399 */ 8400 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8401 { 8402 int lam_bit; 8403 unsigned long cr3_bits; 8404 8405 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8406 return gva; 8407 8408 if (!is_64_bit_mode(vcpu)) 8409 return gva; 8410 8411 /* 8412 * Bit 63 determines if the address should be treated as user address 8413 * or a supervisor address. 8414 */ 8415 if (!(gva & BIT_ULL(63))) { 8416 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8417 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8418 return gva; 8419 8420 /* LAM_U48 is ignored if LAM_U57 is set. */ 8421 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8422 } else { 8423 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8424 return gva; 8425 8426 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8427 } 8428 8429 /* 8430 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8431 * Bit 63 is retained from the raw virtual address so that untagging 8432 * doesn't change a user access to a supervisor access, and vice versa. 8433 */ 8434 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8435 } 8436 8437 static unsigned int vmx_handle_intel_pt_intr(void) 8438 { 8439 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8440 8441 /* '0' on failure so that the !PT case can use a RET0 static call. 
*/ 8442 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8443 return 0; 8444 8445 kvm_make_request(KVM_REQ_PMI, vcpu); 8446 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8447 (unsigned long *)&vcpu->arch.pmu.global_status); 8448 return 1; 8449 } 8450 8451 static __init void vmx_setup_user_return_msrs(void) 8452 { 8453 8454 /* 8455 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8456 * will emulate SYSCALL in legacy mode if the vendor string in guest 8457 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8458 * support this emulation, MSR_STAR is included in the list for i386, 8459 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8460 * into hardware and is here purely for emulation purposes. 8461 */ 8462 const u32 vmx_uret_msrs_list[] = { 8463 #ifdef CONFIG_X86_64 8464 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8465 #endif 8466 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8467 MSR_IA32_TSX_CTRL, 8468 }; 8469 int i; 8470 8471 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8472 8473 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8474 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8475 } 8476 8477 static void __init vmx_setup_me_spte_mask(void) 8478 { 8479 u64 me_mask = 0; 8480 8481 /* 8482 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8483 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8484 * boot_cpu_data.x86_phys_bits holds the actual physical address 8485 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8486 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8487 */ 8488 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8489 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8490 kvm_host.maxphyaddr - 1); 8491 8492 /* 8493 * Unlike SME, host kernel doesn't support setting up any 8494 * MKTME KeyID on Intel platforms. No memory encryption 8495 * bits should be included into the SPTE. 
8496 */ 8497 kvm_mmu_set_me_spte_mask(0, me_mask); 8498 } 8499 8500 __init int vmx_hardware_setup(void) 8501 { 8502 unsigned long host_bndcfgs; 8503 struct desc_ptr dt; 8504 int r; 8505 8506 store_idt(&dt); 8507 host_idt_base = dt.address; 8508 8509 vmx_setup_user_return_msrs(); 8510 8511 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8512 return -EIO; 8513 8514 if (boot_cpu_has(X86_FEATURE_NX)) 8515 kvm_enable_efer_bits(EFER_NX); 8516 8517 if (boot_cpu_has(X86_FEATURE_MPX)) { 8518 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); 8519 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8520 } 8521 8522 if (!cpu_has_vmx_mpx()) 8523 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8524 XFEATURE_MASK_BNDCSR); 8525 8526 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8527 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8528 enable_vpid = 0; 8529 8530 if (!cpu_has_vmx_ept() || 8531 !cpu_has_vmx_ept_4levels() || 8532 !cpu_has_vmx_ept_mt_wb() || 8533 !cpu_has_vmx_invept_global()) 8534 enable_ept = 0; 8535 8536 /* NX support is required for shadow paging. */ 8537 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8538 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8539 return -EOPNOTSUPP; 8540 } 8541 8542 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8543 enable_ept_ad_bits = 0; 8544 8545 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8546 enable_unrestricted_guest = 0; 8547 8548 if (!cpu_has_vmx_flexpriority()) 8549 flexpriority_enabled = 0; 8550 8551 if (!cpu_has_virtual_nmis()) 8552 enable_vnmi = 0; 8553 8554 #ifdef CONFIG_X86_SGX_KVM 8555 if (!cpu_has_vmx_encls_vmexit()) 8556 enable_sgx = false; 8557 #endif 8558 8559 /* 8560 * set_apic_access_page_addr() is used to reload apic access 8561 * page upon invalidation. No need to do anything if not 8562 * using the APIC_ACCESS_ADDR VMCS field. 
8563 */ 8564 if (!flexpriority_enabled) 8565 vt_x86_ops.set_apic_access_page_addr = NULL; 8566 8567 if (!cpu_has_vmx_tpr_shadow()) 8568 vt_x86_ops.update_cr8_intercept = NULL; 8569 8570 #if IS_ENABLED(CONFIG_HYPERV) 8571 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8572 && enable_ept) { 8573 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8574 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8575 } 8576 #endif 8577 8578 if (!cpu_has_vmx_ple()) { 8579 ple_gap = 0; 8580 ple_window = 0; 8581 ple_window_grow = 0; 8582 ple_window_max = 0; 8583 ple_window_shrink = 0; 8584 } 8585 8586 if (!cpu_has_vmx_apicv()) 8587 enable_apicv = 0; 8588 if (!enable_apicv) 8589 vt_x86_ops.sync_pir_to_irr = NULL; 8590 8591 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8592 enable_ipiv = false; 8593 8594 if (cpu_has_vmx_tsc_scaling()) 8595 kvm_caps.has_tsc_control = true; 8596 8597 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8598 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8599 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8600 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8601 8602 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8603 8604 if (enable_ept) 8605 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8606 cpu_has_vmx_ept_execute_only()); 8607 8608 /* 8609 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8610 * bits to shadow_zero_check. 8611 */ 8612 vmx_setup_me_spte_mask(); 8613 8614 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8615 ept_caps_to_lpage_level(vmx_capability.ept)); 8616 8617 /* 8618 * Only enable PML when hardware supports PML feature, and both EPT 8619 * and EPT A/D bit features are enabled -- PML depends on them to work. 
8620 */ 8621 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8622 enable_pml = 0; 8623 8624 if (!enable_pml) 8625 vt_x86_ops.cpu_dirty_log_size = 0; 8626 8627 if (!cpu_has_vmx_preemption_timer()) 8628 enable_preemption_timer = false; 8629 8630 if (enable_preemption_timer) { 8631 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8632 8633 cpu_preemption_timer_multi = 8634 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8635 8636 if (tsc_khz) 8637 use_timer_freq = (u64)tsc_khz * 1000; 8638 use_timer_freq >>= cpu_preemption_timer_multi; 8639 8640 /* 8641 * KVM "disables" the preemption timer by setting it to its max 8642 * value. Don't use the timer if it might cause spurious exits 8643 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8644 */ 8645 if (use_timer_freq > 0xffffffffu / 10) 8646 enable_preemption_timer = false; 8647 } 8648 8649 if (!enable_preemption_timer) { 8650 vt_x86_ops.set_hv_timer = NULL; 8651 vt_x86_ops.cancel_hv_timer = NULL; 8652 } 8653 8654 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8655 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8656 8657 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8658 return -EINVAL; 8659 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8660 pt_mode = PT_MODE_SYSTEM; 8661 if (pt_mode == PT_MODE_HOST_GUEST) 8662 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8663 else 8664 vt_init_ops.handle_intel_pt_intr = NULL; 8665 8666 setup_default_sgx_lepubkeyhash(); 8667 8668 if (nested) { 8669 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8670 8671 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8672 if (r) 8673 return r; 8674 } 8675 8676 vmx_set_cpu_caps(); 8677 8678 r = alloc_kvm_area(); 8679 if (r && nested) 8680 nested_vmx_hardware_unsetup(); 8681 8682 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8683 8684 return r; 8685 } 8686 8687 static void vmx_cleanup_l1d_flush(void) 8688 { 8689 if (vmx_l1d_flush_pages) { 8690 free_pages((unsigned 
long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8691 vmx_l1d_flush_pages = NULL; 8692 } 8693 /* Restore state so sysfs ignores VMX */ 8694 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8695 } 8696 8697 static void __vmx_exit(void) 8698 { 8699 allow_smaller_maxphyaddr = false; 8700 8701 vmx_cleanup_l1d_flush(); 8702 } 8703 8704 static void __exit vmx_exit(void) 8705 { 8706 kvm_exit(); 8707 __vmx_exit(); 8708 kvm_x86_vendor_exit(); 8709 8710 } 8711 module_exit(vmx_exit); 8712 8713 static int __init vmx_init(void) 8714 { 8715 int r, cpu; 8716 8717 if (!kvm_is_vmx_supported()) 8718 return -EOPNOTSUPP; 8719 8720 /* 8721 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8722 * to unwind if a later step fails. 8723 */ 8724 hv_init_evmcs(); 8725 8726 r = kvm_x86_vendor_init(&vt_init_ops); 8727 if (r) 8728 return r; 8729 8730 /* 8731 * Must be called after common x86 init so enable_ept is properly set 8732 * up. Hand the parameter mitigation value in which was stored in 8733 * the pre module init parser. If no parameter was given, it will 8734 * contain 'auto' which will be turned into the default 'cond' 8735 * mitigation mode. 8736 */ 8737 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8738 if (r) 8739 goto err_l1d_flush; 8740 8741 for_each_possible_cpu(cpu) { 8742 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8743 8744 pi_init_cpu(cpu); 8745 } 8746 8747 vmx_check_vmcs12_offsets(); 8748 8749 /* 8750 * Shadow paging doesn't have a (further) performance penalty 8751 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8752 * by default 8753 */ 8754 if (!enable_ept) 8755 allow_smaller_maxphyaddr = true; 8756 8757 /* 8758 * Common KVM initialization _must_ come last, after this, /dev/kvm is 8759 * exposed to userspace! 
8760 */ 8761 r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), 8762 THIS_MODULE); 8763 if (r) 8764 goto err_kvm_init; 8765 8766 return 0; 8767 8768 err_kvm_init: 8769 __vmx_exit(); 8770 err_l1d_flush: 8771 kvm_x86_vendor_exit(); 8772 return r; 8773 } 8774 module_init(vmx_init); 8775