1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 #include <linux/entry-kvm.h> 32 33 #include <asm/apic.h> 34 #include <asm/asm.h> 35 #include <asm/cpu.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/fred.h> 42 #include <asm/idtentry.h> 43 #include <asm/io.h> 44 #include <asm/irq_remapping.h> 45 #include <asm/reboot.h> 46 #include <asm/perf_event.h> 47 #include <asm/mmu_context.h> 48 #include <asm/mshyperv.h> 49 #include <asm/mwait.h> 50 #include <asm/spec-ctrl.h> 51 #include <asm/vmx.h> 52 53 #include <trace/events/ipi.h> 54 55 #include "capabilities.h" 56 #include "common.h" 57 #include "cpuid.h" 58 #include "hyperv.h" 59 #include "kvm_onhyperv.h" 60 #include "irq.h" 61 #include "kvm_cache_regs.h" 62 #include "lapic.h" 63 #include "mmu.h" 64 #include "nested.h" 65 #include "pmu.h" 66 #include "sgx.h" 67 #include "trace.h" 68 #include "vmcs.h" 69 #include "vmcs12.h" 70 #include "vmx.h" 71 #include "x86.h" 72 #include "x86_ops.h" 73 #include "smm.h" 74 #include "vmx_onhyperv.h" 75 #include "posted_intr.h" 76 
/* Module metadata. */
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
/*
 * CPU match table keyed on the VMX feature flag.  Only built when compiled
 * as a module and exported through MODULE_DEVICE_TABLE() (presumably so the
 * module can be auto-loaded on VMX-capable CPUs -- confirm against the
 * x86cpu modalias handling).
 */
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

/*
 * Feature knobs.  All parameters below default to enabled and are registered
 * with mode 0444, i.e. they are readable through sysfs but can only be set
 * at module load time.
 */
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, 0444);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, 0444);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, 0444);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, 0444);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);

/* enable_apicv itself is defined outside this file section. */
module_param(enable_apicv, bool, 0444);

bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

/* Unlike the knobs above, this one is registered 0644, i.e. root-writable at runtime. */
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/*
 * Guest_tsc -> host_tsc conversion requires 64-bit division, hence the
 * module parameter is only exposed on 64-bit kernels.
 */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

/* Defined elsewhere; only the module parameter is registered here. */
extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

/* CR0 bit sets named by whether they are always off or always on. */
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

/* CR4 counterparts, split by unrestricted-guest / protected / real mode. */
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/* Complement of the RTIT_STATUS bits listed below. */
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these x2apic, PT and LBR MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_FLUSH_CMD,
	MSR_IA32_TSC,
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
	MSR_IA32_XFD,
	MSR_IA32_XFD_ERR,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

/*
 * These two parameters configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to tests, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow.
 */
static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
#endif

struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;

/*
 * Static keys updated by vmx_setup_l1d_flush(): "should_flush" is enabled for
 * every mode other than NEVER, "flush_cond" only for the COND mode.
 */
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
/* Held around vmx_setup_l1d_flush() when the parameter is written at runtime. */
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

/*
 * Printable name of each flush state, indexed by enum vmx_l1d_flush_state.
 * Only entries with for_parse set are accepted as user input; the others
 * are reported on read only.
 */
static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

/* Allocation order (2^4 pages) of the buffer behind vmx_l1d_flush_pages. */
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

/*
 * Resolve the requested L1D flush mode and apply it.
 *
 * @l1tf: requested mode; VMENTER_L1D_FLUSH_AUTO derives the mode from the
 *        host-wide l1tf_mitigation setting.
 *
 * Records the result in l1tf_vmx_mitigation, allocates the flush buffer on
 * first use when flushing is needed and the CPU lacks X86_FEATURE_FLUSH_L1D,
 * and updates the two static keys above.
 *
 * Return: 0 on success, -ENOMEM if the flush buffer cannot be allocated.
 */
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	/* Nothing to mitigate: CPU unaffected, or the host says to skip it. */
	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		/* A forced full mitigation overrides any explicit request. */
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

/*
 * Map a user-supplied option string to its index in vmentry_l1d_param[].
 *
 * Return: the matching index, or -EINVAL if @s is NULL or matches no entry
 * that has for_parse set.
 */
static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

/*
 * .set handler of the vmentry_l1d_flush module parameter.
 *
 * Return: 0 on success, -EINVAL for an unknown option string, or the error
 * returned by vmx_setup_l1d_flush().
 */
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	/* A valid option is accepted but has no effect on unaffected CPUs. */
	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

/*
 * .get handler of the vmentry_l1d_flush module parameter: emits the name of
 * the current l1tf_vmx_mitigation state, or "???" (with a one-time warning)
 * if the state is outside vmentry_l1d_param[].
 */
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sysfs_emit(s, "???\n");

	return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

/*
 * If vmx->disable_fb_clear is set, set FB_CLEAR_DIS in MSR_IA32_MCU_OPT_CTRL
 * and remember the value written so vmx_enable_fb_clear() can undo it without
 * another MSR read.
 */
static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 msr;

	if (!vmx->disable_fb_clear)
		return;

	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
	msr |= FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
	/* Cache the MSR value to avoid reading it later */
	vmx->msr_ia32_mcu_opt_ctrl = msr;
}

/*
 * Counterpart of vmx_disable_fb_clear(): clear FB_CLEAR_DIS in the cached
 * value and write it back to MSR_IA32_MCU_OPT_CTRL.
 */
static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (!vmx->disable_fb_clear)
		return;

	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}

/*
 * Recompute vmx->disable_fb_clear from the host's mitigation state and the
 * ARCH_CAPABILITIES value exposed to the vCPU.
 */
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	/*
	 * Disable VERW's behavior of clearing CPU buffers for the guest if the
	 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
	 * the mitigation. Disabling the clearing behavior provides a
	 * performance boost for guests that aren't aware that manually clearing
	 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
	 * and VM-Exit.
	 */
	vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
				(kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
				!boot_cpu_has_bug(X86_BUG_MDS) &&
				!boot_cpu_has_bug(X86_BUG_TAA);

	/*
	 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
	 * at VMEntry. Skip the MSR read/write when a guest has no use case to
	 * execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	   ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	    (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	    (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	    (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	    (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}

/* Registered 0644: the flush mode can be changed at runtime via the .set hook. */
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static u32 vmx_segment_access_rights(struct kvm_segment *var);

void vmx_vmexit(void);

/*
 * Report a failed VMX instruction: a one-time WARN plus a rate-limited
 * pr_warn carrying the same message on every occurrence.
 */
#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

/* Out-of-line reporter for a failed VMREAD of @field. */
noinline void vmread_error(unsigned long field)
{
	vmx_insn_failed("vmread failed: field=%lx\n", field);
}

#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * VMREAD failure handler used when the compiler lacks asm-goto-with-outputs.
 * @fault selects between the spurious-fault path and the regular error
 * report; the latter is bracketed by instrumentation_begin()/end() because
 * this function is noinstr.
 */
noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
{
	if (fault) {
		kvm_spurious_fault();
	} else {
		instrumentation_begin();
		vmread_error(field);
		instrumentation_end();
	}
}
#endif

/* Out-of-line reporter for a failed VMWRITE; also logs VM_INSTRUCTION_ERROR. */
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

/* Out-of-line reporter for a failed VMCLEAR; also logs VM_INSTRUCTION_ERROR. */
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

/* Out-of-line reporter for a failed VMPTRLD; also logs VM_INSTRUCTION_ERROR. */
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

/* Out-of-line reporter for a failed INVVPID. */
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

/* Out-of-line reporter for a failed INVEPT. */
noinline void invept_error(unsigned long ext, u64 eptp)
{
	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability vmx_capability __ro_after_init;

/*
 * Build the kvm_vmx_segment_fields[] entry for segment register @seg from
 * the four GUEST_<seg>_* VMCS field encodings.
 */
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {                                   \
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,		   	\
		.limit = GUEST_##seg##_LIMIT,		   	\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
	}

/* VMCS field encodings of each guest segment register, indexed by VCPU_SREG_*. */
static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};


static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

/*
 * Enable the Hyper-V nested flush hypercall enlightenment in the vCPU's
 * currently loaded (enlightened) VMCS.
 *
 * Return: 0 on success, -ENOMEM if no partition assist page is available.
 */
static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);

	if (partition_assist_page == INVALID_PAGE)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page = partition_assist_page;
	/* The kvm pointer value doubles as the VM identifier. */
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

/*
 * Decide at init time whether to use the Hyper-V enlightened VMCS.  Clears
 * the enlightened_vmcs parameter if the host does not recommend/support it
 * or if any online CPU lacks a VP assist page; otherwise enables the
 * __kvm_is_using_evmcs static key.
 */
static __init void hv_init_evmcs(void)
{
	int cpu;

	if (!enlightened_vmcs)
		return;

	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above.
	 */
	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	     KVM_EVMCS_VERSION) {

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("Using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&__kvm_is_using_evmcs);
		}

		/*
		 * Note: the hook is installed whenever the host advertises
		 * direct flush, independent of the per-CPU check above.
		 */
		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vt_x86_ops.enable_l2_tlb_flush
				= hv_enable_l2_tlb_flush;
	} else {
		enlightened_vmcs = false;
	}
}

/*
 * Clear the eVMCS-related state in the current CPU's VP assist page.  No-op
 * when eVMCS is not in use.
 */
static void hv_reset_evmcs(void)
{
	struct hv_vp_assist_page *vp_ap;

	if (!kvm_is_using_evmcs())
		return;

	/*
	 * KVM should enable eVMCS if and only if all CPUs have a VP assist
	 * page, and should reject CPU onlining if eVMCS is enabled and the CPU
	 * doesn't have a VP assist page allocated.
	 */
	vp_ap = hv_get_vp_assist_page(smp_processor_id());
	if (WARN_ON_ONCE(!vp_ap))
		return;

	/*
	 * Reset everything to support using non-enlightened VMCS access later
	 * (e.g. when we reload the module with enlightened_vmcs=0)
	 */
	vp_ap->nested_control.features.directhypercall = 0;
	vp_ap->current_nested_vmcs = 0;
	vp_ap->enlighten_vmentry = 0;
}

#else /* IS_ENABLED(CONFIG_HYPERV) */
static void hv_init_evmcs(void) {}
static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment's format: document - errata name - stepping - processor name.
 * Refer from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
 /* Xeon E3-1220 V2 */
0x000306A8,
};

/*
 * Return true if CPUID.1:EAX (with bits 15:14 and 31:28 masked off) matches
 * one of the signatures in vmx_preemption_cpu_tfms[].
 */
static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

/*
 * Return true if flexpriority is enabled and the vCPU's local APIC is
 * emulated in the kernel.
 */
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

/*
 * Look up @msr in vmx_possible_passthrough_msrs[].
 *
 * Return: the array index, or -ENOENT.  x2APIC, PT and LBR MSRs return
 * -ENOENT silently because they are managed by dedicated helpers; any other
 * MSR that is missing from the table additionally triggers a WARN.
 */
static int vmx_get_passthrough_msr_slot(u32 msr)
{
	int i;

	switch (msr) {
	case 0x800 ... 0x8ff:
		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
		return -ENOENT;
	case MSR_IA32_RTIT_STATUS:
	case MSR_IA32_RTIT_OUTPUT_BASE:
	case MSR_IA32_RTIT_OUTPUT_MASK:
	case MSR_IA32_RTIT_CR3_MATCH:
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
	case MSR_LBR_SELECT:
	case MSR_LBR_TOS:
	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
		return -ENOENT;
	}

	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
		if (vmx_possible_passthrough_msrs[i] == msr)
			return i;
	}

	WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
	return -ENOENT;
}

/*
 * Return the vCPU's guest_uret_msrs[] entry for @msr, or NULL if
 * kvm_find_user_return_msr() does not know the MSR.
 */
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}

/*
 * Update the guest value of a user-return MSR.
 *
 * @msr must point into vmx->guest_uret_msrs[]; its slot number is derived
 * from the pointer difference.  If the MSR is currently loaded into hardware
 * the new value is written immediately (with preemption disabled); the cached
 * value is only updated when that write succeeds.
 *
 * Return: 0 on success, otherwise the error from kvm_set_user_return_msr().
 */
static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}

/*
 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
 *
 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 * atomically track post-VMXON state, e.g. this may be called in NMI context.
 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
739 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 740 * magically in RM, VM86, compat mode, or at CPL>0. 741 */ 742 static int kvm_cpu_vmxoff(void) 743 { 744 asm goto("1: vmxoff\n\t" 745 _ASM_EXTABLE(1b, %l[fault]) 746 ::: "cc", "memory" : fault); 747 748 cr4_clear_bits(X86_CR4_VMXE); 749 return 0; 750 751 fault: 752 cr4_clear_bits(X86_CR4_VMXE); 753 return -EIO; 754 } 755 756 void vmx_emergency_disable_virtualization_cpu(void) 757 { 758 int cpu = raw_smp_processor_id(); 759 struct loaded_vmcs *v; 760 761 kvm_rebooting = true; 762 763 /* 764 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 765 * set in task context. If this races with VMX is disabled by an NMI, 766 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 767 * kvm_rebooting set. 768 */ 769 if (!(__read_cr4() & X86_CR4_VMXE)) 770 return; 771 772 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 773 loaded_vmcss_on_cpu_link) 774 vmcs_clear(v->vmcs); 775 776 kvm_cpu_vmxoff(); 777 } 778 779 static void __loaded_vmcs_clear(void *arg) 780 { 781 struct loaded_vmcs *loaded_vmcs = arg; 782 int cpu = raw_smp_processor_id(); 783 784 if (loaded_vmcs->cpu != cpu) 785 return; /* vcpu migration can race with cpu offline */ 786 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 787 per_cpu(current_vmcs, cpu) = NULL; 788 789 vmcs_clear(loaded_vmcs->vmcs); 790 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 791 vmcs_clear(loaded_vmcs->shadow_vmcs); 792 793 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 794 795 /* 796 * Ensure all writes to loaded_vmcs, including deleting it from its 797 * current percpu list, complete before setting loaded_vmcs->cpu to 798 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 799 * and add loaded_vmcs to its percpu list before it's deleted from this 800 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
801 */ 802 smp_wmb(); 803 804 loaded_vmcs->cpu = -1; 805 loaded_vmcs->launched = 0; 806 } 807 808 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 809 { 810 int cpu = loaded_vmcs->cpu; 811 812 if (cpu != -1) 813 smp_call_function_single(cpu, 814 __loaded_vmcs_clear, loaded_vmcs, 1); 815 } 816 817 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 818 unsigned field) 819 { 820 bool ret; 821 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 822 823 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 824 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 825 vmx->segment_cache.bitmask = 0; 826 } 827 ret = vmx->segment_cache.bitmask & mask; 828 vmx->segment_cache.bitmask |= mask; 829 return ret; 830 } 831 832 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 833 { 834 u16 *p = &vmx->segment_cache.seg[seg].selector; 835 836 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 837 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 838 return *p; 839 } 840 841 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 842 { 843 ulong *p = &vmx->segment_cache.seg[seg].base; 844 845 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 846 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 847 return *p; 848 } 849 850 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 851 { 852 u32 *p = &vmx->segment_cache.seg[seg].limit; 853 854 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 855 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 856 return *p; 857 } 858 859 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 860 { 861 u32 *p = &vmx->segment_cache.seg[seg].ar; 862 863 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 864 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 865 return *p; 866 } 867 868 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 869 { 870 u32 eb; 871 872 eb = (1u << PF_VECTOR) | 
(1u << UD_VECTOR) | (1u << MC_VECTOR) | 873 (1u << DB_VECTOR) | (1u << AC_VECTOR); 874 /* 875 * #VE isn't used for VMX. To test against unexpected changes 876 * related to #VE for VMX, intercept unexpected #VE and warn on it. 877 */ 878 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 879 eb |= 1u << VE_VECTOR; 880 /* 881 * Guest access to VMware backdoor ports could legitimately 882 * trigger #GP because of TSS I/O permission bitmap. 883 * We intercept those #GP and allow access to them anyway 884 * as VMware does. 885 */ 886 if (enable_vmware_backdoor) 887 eb |= (1u << GP_VECTOR); 888 if ((vcpu->guest_debug & 889 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 890 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 891 eb |= 1u << BP_VECTOR; 892 if (to_vmx(vcpu)->rmode.vm86_active) 893 eb = ~0; 894 if (!vmx_need_pf_intercept(vcpu)) 895 eb &= ~(1u << PF_VECTOR); 896 897 /* When we are running a nested L2 guest and L1 specified for it a 898 * certain exception bitmap, we must trap the same exceptions and pass 899 * them to L1. When running L2, we will only handle the exceptions 900 * specified above if L1 did not want them. 901 */ 902 if (is_guest_mode(vcpu)) 903 eb |= get_vmcs12(vcpu)->exception_bitmap; 904 else { 905 int mask = 0, match = 0; 906 907 if (enable_ept && (eb & (1u << PF_VECTOR))) { 908 /* 909 * If EPT is enabled, #PF is currently only intercepted 910 * if MAXPHYADDR is smaller on the guest than on the 911 * host. In that case we only care about present, 912 * non-reserved faults. For vmcs02, however, PFEC_MASK 913 * and PFEC_MATCH are set in prepare_vmcs02_rare. 914 */ 915 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 916 match = PFERR_PRESENT_MASK; 917 } 918 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 919 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 920 } 921 922 /* 923 * Disabling xfd interception indicates that dynamic xfeatures 924 * might be used in the guest. Always trap #NM in this case 925 * to save guest xfd_err timely. 
926 */ 927 if (vcpu->arch.xfd_no_write_intercept) 928 eb |= (1u << NM_VECTOR); 929 930 vmcs_write32(EXCEPTION_BITMAP, eb); 931 } 932 933 /* 934 * Check if writes to the MSR are intercepted for the currently loaded MSR bitmap. Returns true without consulting the bitmap when CPU_BASED_USE_MSR_BITMAPS is not set in the exec controls. 935 */ 936 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 937 { 938 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 939 return true; 940 941 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 942 } 943 /* Build the flags for the next VM-Enter: VMX_RUN_VMRESUME if the loaded VMCS has already been launched, VMX_RUN_SAVE_SPEC_CTRL if guest writes to SPEC_CTRL are not intercepted. */ 944 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 945 { 946 unsigned int flags = 0; 947 948 if (vmx->loaded_vmcs->launched) 949 flags |= VMX_RUN_VMRESUME; 950 951 /* 952 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 953 * to change it directly without causing a vmexit. In that case read 954 * it after vmexit and store it in vmx->spec_ctrl. 955 */ 956 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 957 flags |= VMX_RUN_SAVE_SPEC_CTRL; 958 959 return flags; 960 } 961 /* Clear @entry in the VM-Entry controls and @exit in the VM-Exit controls. */ 962 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 963 unsigned long entry, unsigned long exit) 964 { 965 vm_entry_controls_clearbit(vmx, entry); 966 vm_exit_controls_clearbit(vmx, exit); 967 } 968 /* Return the index of @msr within the first m->nr entries of @m, or -ENOENT if @msr is not in the list. */ 969 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 970 { 971 unsigned int i; 972 973 for (i = 0; i < m->nr; ++i) { 974 if (m->val[i].index == msr) 975 return i; 976 } 977 return -ENOENT; 978 } 979 /* Stop switching @msr on VM-Entry/VM-Exit. EFER and PERF_GLOBAL_CTRL are handled by clearing their dedicated load controls when the CPU has them; any other case removes @msr from the guest and host autoload lists by moving the last entry into the freed slot and rewriting the load count. */ 980 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 981 { 982 int i; 983 struct msr_autoload *m = &vmx->msr_autoload; 984 985 switch (msr) { 986 case MSR_EFER: 987 if (cpu_has_load_ia32_efer()) { 988 clear_atomic_switch_msr_special(vmx, 989 VM_ENTRY_LOAD_IA32_EFER, 990 VM_EXIT_LOAD_IA32_EFER); 991 return; 992 } 993 break; 994 case MSR_CORE_PERF_GLOBAL_CTRL: 995 if (cpu_has_load_perf_global_ctrl()) { 996 clear_atomic_switch_msr_special(vmx, 997 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 998 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 999 return; 1000 } 1001 break; 1002 } 
1003 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1004 if (i < 0) 1005 goto skip_guest; 1006 --m->guest.nr; 1007 m->guest.val[i] = m->guest.val[m->guest.nr]; 1008 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1009 1010 skip_guest: 1011 i = vmx_find_loadstore_msr_slot(&m->host, msr); 1012 if (i < 0) 1013 return; 1014 1015 --m->host.nr; 1016 m->host.val[i] = m->host.val[m->host.nr]; 1017 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1018 } 1019 /* Write @guest_val to the @guest_val_vmcs field, write @host_val to @host_val_vmcs unless that field is HOST_IA32_EFER (which is never written here), then set @entry and @exit in the VM-Entry and VM-Exit controls. */ 1020 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1021 unsigned long entry, unsigned long exit, 1022 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1023 u64 guest_val, u64 host_val) 1024 { 1025 vmcs_write64(guest_val_vmcs, guest_val); 1026 if (host_val_vmcs != HOST_IA32_EFER) 1027 vmcs_write64(host_val_vmcs, host_val); 1028 vm_entry_controls_setbit(vmx, entry); 1029 vm_exit_controls_setbit(vmx, exit); 1030 } 1031 /* Arrange for @msr to be loaded with @guest_val on VM-Entry and, unless @entry_only is set, with @host_val on VM-Exit. EFER and PERF_GLOBAL_CTRL use their dedicated VMCS fields and controls when the CPU has them; everything else goes through the autoload lists, reusing an existing slot for @msr if there is one. If a new slot is needed and that list is already at MAX_NR_LOADSTORE_MSRS, a warning is printed once and the lists are left unchanged. */ 1032 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1033 u64 guest_val, u64 host_val, bool entry_only) 1034 { 1035 int i, j = 0; 1036 struct msr_autoload *m = &vmx->msr_autoload; 1037 1038 switch (msr) { 1039 case MSR_EFER: 1040 if (cpu_has_load_ia32_efer()) { 1041 add_atomic_switch_msr_special(vmx, 1042 VM_ENTRY_LOAD_IA32_EFER, 1043 VM_EXIT_LOAD_IA32_EFER, 1044 GUEST_IA32_EFER, 1045 HOST_IA32_EFER, 1046 guest_val, host_val); 1047 return; 1048 } 1049 break; 1050 case MSR_CORE_PERF_GLOBAL_CTRL: 1051 if (cpu_has_load_perf_global_ctrl()) { 1052 add_atomic_switch_msr_special(vmx, 1053 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1054 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1055 GUEST_IA32_PERF_GLOBAL_CTRL, 1056 HOST_IA32_PERF_GLOBAL_CTRL, 1057 guest_val, host_val); 1058 return; 1059 } 1060 break; 1061 case MSR_IA32_PEBS_ENABLE: 1062 /* PEBS needs a quiescent period after being disabled (to write 1063 * a record). Disabling PEBS through VMX MSR swapping doesn't 1064 * provide that period, so a CPU could write host's record into 1065 * guest's memory. 
1066 */ 1067 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 1068 } 1069 1070 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1071 if (!entry_only) 1072 j = vmx_find_loadstore_msr_slot(&m->host, msr); 1073 1074 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1075 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1076 printk_once(KERN_WARNING "Not enough msr switch entries. " 1077 "Can't add msr %x\n", msr); 1078 return; 1079 } 1080 if (i < 0) { 1081 i = m->guest.nr++; 1082 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1083 } 1084 m->guest.val[i].index = msr; 1085 m->guest.val[i].value = guest_val; 1086 1087 if (entry_only) 1088 return; 1089 1090 if (j < 0) { 1091 j = m->host.nr++; 1092 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1093 } 1094 m->host.val[j].index = msr; 1095 m->host.val[j].value = host_val; 1096 } 1097 /* Decide how EFER is switched for this vCPU. Returns false when EFER is switched atomically via add/clear_atomic_switch_msr() or when there is no user-return slot for MSR_EFER; returns true after filling in the data and mask of the user-return slot, in which case any atomic switch entry for EFER is removed. */ 1098 static bool update_transition_efer(struct vcpu_vmx *vmx) 1099 { 1100 u64 guest_efer = vmx->vcpu.arch.efer; 1101 u64 ignore_bits = 0; 1102 int i; 1103 1104 /* Shadow paging assumes NX to be available. */ 1105 if (!enable_ept) 1106 guest_efer |= EFER_NX; 1107 1108 /* 1109 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1110 */ 1111 ignore_bits |= EFER_SCE; 1112 #ifdef CONFIG_X86_64 1113 ignore_bits |= EFER_LMA | EFER_LME; 1114 /* SCE is meaningful only in long mode on Intel */ 1115 if (guest_efer & EFER_LMA) 1116 ignore_bits &= ~(u64)EFER_SCE; 1117 #endif 1118 1119 /* 1120 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1121 * On CPUs that support "load IA32_EFER", always switch EFER 1122 * atomically, since it's faster than switching it manually. 
1123 */ 1124 if (cpu_has_load_ia32_efer() || 1125 (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { 1126 if (!(guest_efer & EFER_LMA)) 1127 guest_efer &= ~EFER_LME; 1128 if (guest_efer != kvm_host.efer) 1129 add_atomic_switch_msr(vmx, MSR_EFER, 1130 guest_efer, kvm_host.efer, false); 1131 else 1132 clear_atomic_switch_msr(vmx, MSR_EFER); 1133 return false; 1134 } 1135 1136 i = kvm_find_user_return_msr(MSR_EFER); 1137 if (i < 0) 1138 return false; 1139 1140 clear_atomic_switch_msr(vmx, MSR_EFER); 1141 1142 guest_efer &= ~ignore_bits; 1143 guest_efer |= kvm_host.efer & ignore_bits; 1144 1145 vmx->guest_uret_msrs[i].data = guest_efer; 1146 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1147 1148 return true; 1149 } 1150 1151 #ifdef CONFIG_X86_32 1152 /* 1153 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1154 * VMCS rather than the segment table. KVM uses this helper to figure 1155 * out the current bases to poke them into the VMCS before entry. 1156 */ 1157 static unsigned long segment_base(u16 selector) 1158 { 1159 struct desc_struct *table; 1160 unsigned long v; 1161 1162 if (!(selector & ~SEGMENT_RPL_MASK)) 1163 return 0; 1164 1165 table = get_current_gdt_ro(); 1166 1167 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1168 u16 ldt_selector = kvm_read_ldt(); 1169 1170 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1171 return 0; 1172 1173 table = (struct desc_struct *)segment_base(ldt_selector); 1174 } 1175 v = get_desc_base(&table[selector >> 3]); 1176 return v; 1177 } 1178 #endif 1179 /* True only in host-guest PT mode and while RTIT_CTL_TRACEEN is clear in the guest RTIT_CTL value. */ 1180 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1181 { 1182 return vmx_pt_mode_is_host_guest() && 1183 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1184 } 1185 1186 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1187 { 1188 /* The base must be 128-byte aligned and a legal physical address. 
*/ 1189 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1190 } 1191 /* Write the PT context in @ctx to the RTIT status, output base, output mask and CR3 match MSRs, plus @addr_range ADDRn_A/ADDRn_B pairs. */ 1192 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1193 { 1194 u32 i; 1195 1196 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1197 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1198 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1199 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1200 for (i = 0; i < addr_range; i++) { 1201 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1202 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1203 } 1204 } 1205 /* Read the same set of RTIT MSRs into @ctx; counterpart of pt_load_msr(). */ 1206 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1207 { 1208 u32 i; 1209 1210 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1211 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1212 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1213 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1214 for (i = 0; i < addr_range; i++) { 1215 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1216 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1217 } 1218 } 1219 1220 static void pt_guest_enter(struct vcpu_vmx *vmx) 1221 { 1222 if (vmx_pt_mode_is_system()) 1223 return; 1224 1225 /* 1226 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1227 * Save host state before VM entry. 
1228 */ 1229 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1230 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1231 wrmsrl(MSR_IA32_RTIT_CTL, 0); 1232 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1233 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1234 } 1235 } 1236 1237 static void pt_guest_exit(struct vcpu_vmx *vmx) 1238 { 1239 if (vmx_pt_mode_is_system()) 1240 return; 1241 1242 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1243 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1244 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1245 } 1246 1247 /* 1248 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1249 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 1250 */ 1251 if (vmx->pt_desc.host.ctl) 1252 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1253 } 1254 /* Update the host FS/GS selectors and bases in the VMCS, writing only the fields whose value differs from the copy cached in @host. A selector with any of its low three bits set is written to the VMCS as 0, while the cache keeps the real selector. */ 1255 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1256 unsigned long fs_base, unsigned long gs_base) 1257 { 1258 if (unlikely(fs_sel != host->fs_sel)) { 1259 if (!(fs_sel & 7)) 1260 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1261 else 1262 vmcs_write16(HOST_FS_SELECTOR, 0); 1263 host->fs_sel = fs_sel; 1264 } 1265 if (unlikely(gs_sel != host->gs_sel)) { 1266 if (!(gs_sel & 7)) 1267 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1268 else 1269 vmcs_write16(HOST_GS_SELECTOR, 0); 1270 host->gs_sel = gs_sel; 1271 } 1272 if (unlikely(fs_base != host->fs_base)) { 1273 vmcs_writel(HOST_FS_BASE, fs_base); 1274 host->fs_base = fs_base; 1275 } 1276 if (unlikely(gs_base != host->gs_base)) { 1277 vmcs_writel(HOST_GS_BASE, gs_base); 1278 host->gs_base = gs_base; 1279 } 1280 } 1281 1282 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1283 { 1284 struct vcpu_vmx *vmx = to_vmx(vcpu); 1285 struct vcpu_vt *vt = to_vt(vcpu); 1286 struct vmcs_host_state *host_state; 1287 #ifdef CONFIG_X86_64 1288 int cpu = raw_smp_processor_id(); 1289 #endif 1290 unsigned long fs_base, 
gs_base; 1291 u16 fs_sel, gs_sel; 1292 int i; 1293 1294 /* 1295 * Note that guest MSRs to be saved/restored can also be changed 1296 * when guest state is loaded. This happens when guest transitions 1297 * to/from long-mode by setting MSR_EFER.LMA. 1298 */ 1299 if (!vmx->guest_uret_msrs_loaded) { 1300 vmx->guest_uret_msrs_loaded = true; 1301 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1302 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1303 continue; 1304 1305 kvm_set_user_return_msr(i, 1306 vmx->guest_uret_msrs[i].data, 1307 vmx->guest_uret_msrs[i].mask); 1308 } 1309 } 1310 1311 if (vmx->nested.need_vmcs12_to_shadow_sync) 1312 nested_sync_vmcs12_to_shadow(vcpu); 1313 1314 if (vt->guest_state_loaded) 1315 return; 1316 1317 host_state = &vmx->loaded_vmcs->host_state; 1318 1319 /* 1320 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1321 * allow segment selectors with cpl > 0 or ti == 1. 1322 */ 1323 host_state->ldt_sel = kvm_read_ldt(); 1324 1325 #ifdef CONFIG_X86_64 1326 savesegment(ds, host_state->ds_sel); 1327 savesegment(es, host_state->es_sel); 1328 1329 gs_base = cpu_kernelmode_gs_base(cpu); 1330 if (likely(is_64bit_mm(current->mm))) { 1331 current_save_fsgs(); 1332 fs_sel = current->thread.fsindex; 1333 gs_sel = current->thread.gsindex; 1334 fs_base = current->thread.fsbase; 1335 vt->msr_host_kernel_gs_base = current->thread.gsbase; 1336 } else { 1337 savesegment(fs, fs_sel); 1338 savesegment(gs, gs_sel); 1339 fs_base = read_msr(MSR_FS_BASE); 1340 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1341 } 1342 1343 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1344 #else 1345 savesegment(fs, fs_sel); 1346 savesegment(gs, gs_sel); 1347 fs_base = segment_base(fs_sel); 1348 gs_base = segment_base(gs_sel); 1349 #endif 1350 1351 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1352 vt->guest_state_loaded = true; 1353 } 1354 /* Undo vmx_prepare_switch_to_guest(): reload the host segment state recorded in the loaded VMCS host_state and, on 64-bit, swap MSR_KERNEL_GS_BASE back to the host value. Does nothing if guest state is not currently loaded. */ 1355 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1356 { 1357 struct 
vmcs_host_state *host_state; 1358 1359 if (!vmx->vt.guest_state_loaded) 1360 return; 1361 1362 host_state = &vmx->loaded_vmcs->host_state; 1363 1364 ++vmx->vcpu.stat.host_state_reload; 1365 1366 #ifdef CONFIG_X86_64 1367 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1368 #endif 1369 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1370 kvm_load_ldt(host_state->ldt_sel); 1371 #ifdef CONFIG_X86_64 1372 load_gs_index(host_state->gs_sel); 1373 #else 1374 loadsegment(gs, host_state->gs_sel); 1375 #endif 1376 } 1377 if (host_state->fs_sel & 7) 1378 loadsegment(fs, host_state->fs_sel); 1379 #ifdef CONFIG_X86_64 1380 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1381 loadsegment(ds, host_state->ds_sel); 1382 loadsegment(es, host_state->es_sel); 1383 } 1384 #endif 1385 invalidate_tss_limit(); 1386 #ifdef CONFIG_X86_64 1387 wrmsrl(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); 1388 #endif 1389 load_fixmap_gdt(raw_smp_processor_id()); 1390 vmx->vt.guest_state_loaded = false; 1391 vmx->guest_uret_msrs_loaded = false; 1392 } 1393 1394 #ifdef CONFIG_X86_64 /* Return the guest KERNEL_GS_BASE value. If guest state is currently loaded, the cached copy is first refreshed from the MSR; preemption is disabled around the check and the MSR read. */ 1395 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1396 { 1397 preempt_disable(); 1398 if (vmx->vt.guest_state_loaded) 1399 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1400 preempt_enable(); 1401 return vmx->msr_guest_kernel_gs_base; 1402 } 1403 /* Set the guest KERNEL_GS_BASE value to @data: written to the MSR only if guest state is currently loaded, and always stored in the cached copy. */ 1404 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 1405 { 1406 preempt_disable(); 1407 if (vmx->vt.guest_state_loaded) 1408 wrmsrl(MSR_KERNEL_GS_BASE, data); 1409 preempt_enable(); 1410 vmx->msr_guest_kernel_gs_base = data; 1411 } 1412 #endif 1413 /* Recompute the PLE window of the vCPU via __grow_ple_window(); if the value changed, mark it dirty and emit the kvm_ple_window_update tracepoint. */ 1414 static void grow_ple_window(struct kvm_vcpu *vcpu) 1415 { 1416 struct vcpu_vmx *vmx = to_vmx(vcpu); 1417 unsigned int old = vmx->ple_window; 1418 1419 vmx->ple_window = __grow_ple_window(old, ple_window, 1420 ple_window_grow, 1421 ple_window_max); 1422 1423 if (vmx->ple_window != old) { 1424 vmx->ple_window_dirty = true; 1425 
trace_kvm_ple_window_update(vcpu->vcpu_id, 1426 vmx->ple_window, old); 1427 } 1428 } 1429 /* Recompute the PLE window of the vCPU via __shrink_ple_window(), using the ple_window module value as the lower bound argument; if the value changed, mark it dirty and emit the kvm_ple_window_update tracepoint. */ 1430 static void shrink_ple_window(struct kvm_vcpu *vcpu) 1431 { 1432 struct vcpu_vmx *vmx = to_vmx(vcpu); 1433 unsigned int old = vmx->ple_window; 1434 1435 vmx->ple_window = __shrink_ple_window(old, ple_window, 1436 ple_window_shrink, 1437 ple_window); 1438 1439 if (vmx->ple_window != old) { 1440 vmx->ple_window_dirty = true; 1441 trace_kvm_ple_window_update(vcpu->vcpu_id, 1442 vmx->ple_window, old); 1443 } 1444 } 1445 /* Make vmx->loaded_vmcs the current VMCS on @cpu. If loaded_vmcs->cpu differs from @cpu, the VMCS is first passed to loaded_vmcs_clear(), added to the loaded_vmcss_on_cpu list of @cpu, a TLB flush is requested and the per-CPU host fields (TR base, GDTR base, SYSENTER ESP) are rewritten. When @buddy is non-NULL and its VMCS is the one that was current before the switch, the IBPB is skipped. */ 1446 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, 1447 struct loaded_vmcs *buddy) 1448 { 1449 struct vcpu_vmx *vmx = to_vmx(vcpu); 1450 bool already_loaded = vmx->loaded_vmcs->cpu == cpu; 1451 struct vmcs *prev; 1452 1453 if (!already_loaded) { 1454 loaded_vmcs_clear(vmx->loaded_vmcs); 1455 local_irq_disable(); 1456 1457 /* 1458 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to 1459 * this cpu's percpu list, otherwise it may not yet be deleted 1460 * from its previous cpu's percpu list. Pairs with the 1461 * smp_wmb() in __loaded_vmcs_clear(). 1462 */ 1463 smp_rmb(); 1464 1465 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1466 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1467 local_irq_enable(); 1468 } 1469 1470 prev = per_cpu(current_vmcs, cpu); 1471 if (prev != vmx->loaded_vmcs->vmcs) { 1472 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1473 vmcs_load(vmx->loaded_vmcs->vmcs); 1474 1475 /* 1476 * No indirect branch prediction barrier needed when switching 1477 * the active VMCS within a vCPU, unless IBRS is advertised to 1478 * the vCPU. To minimize the number of IBPBs executed, KVM 1479 * performs IBPB on nested VM-Exit (a single nested transition 1480 * may switch the active VMCS multiple times). 
1481 */ 1482 if (static_branch_likely(&switch_vcpu_ibpb) && 1483 (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) 1484 indirect_branch_prediction_barrier(); 1485 } 1486 1487 if (!already_loaded) { 1488 void *gdt = get_current_gdt_ro(); 1489 1490 /* 1491 * Flush all EPTP/VPID contexts, the new pCPU may have stale 1492 * TLB entries from its previous association with the vCPU. 1493 */ 1494 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1495 1496 /* 1497 * Linux uses per-cpu TSS and GDT, so set these when switching 1498 * processors. See 22.2.4. 1499 */ 1500 vmcs_writel(HOST_TR_BASE, 1501 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1502 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1503 1504 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { 1505 /* 22.2.3 */ 1506 vmcs_writel(HOST_IA32_SYSENTER_ESP, 1507 (unsigned long)(cpu_entry_stack(cpu) + 1)); 1508 } 1509 1510 vmx->loaded_vmcs->cpu = cpu; 1511 } 1512 } 1513 1514 /* 1515 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1516 * vcpu mutex is already taken. 
1517 */ 1518 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1519 { 1520 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1521 shrink_ple_window(vcpu); 1522 1523 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1524 1525 vmx_vcpu_pi_load(vcpu, cpu); 1526 } 1527 1528 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1529 { 1530 vmx_vcpu_pi_put(vcpu); 1531 1532 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1533 } 1534 1535 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1536 { 1537 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1538 } 1539 /* Return the guest RFLAGS. On first use since the register was marked unavailable, GUEST_RFLAGS is read from the VMCS and cached in vmx->rflags; while vm86 mode is active, the bits outside RMODE_GUEST_OWNED_EFLAGS_BITS are taken from rmode.save_rflags instead of the VMCS. */ 1540 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1541 { 1542 struct vcpu_vmx *vmx = to_vmx(vcpu); 1543 unsigned long rflags, save_rflags; 1544 1545 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1546 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1547 rflags = vmcs_readl(GUEST_RFLAGS); 1548 if (vmx->rmode.vm86_active) { 1549 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1550 save_rflags = vmx->rmode.save_rflags; 1551 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1552 } 1553 vmx->rflags = rflags; 1554 } 1555 return vmx->rflags; 1556 } 1557 1558 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1559 { 1560 struct vcpu_vmx *vmx = to_vmx(vcpu); 1561 unsigned long old_rflags; 1562 1563 /* 1564 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1565 * is an unrestricted guest in order to mark L2 as needing emulation 1566 * if L1 runs L2 as a restricted guest. 
1567 */ 1568 if (is_unrestricted_guest(vcpu)) { 1569 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1570 vmx->rflags = rflags; 1571 vmcs_writel(GUEST_RFLAGS, rflags); 1572 return; 1573 } 1574 1575 old_rflags = vmx_get_rflags(vcpu); 1576 vmx->rflags = rflags; 1577 if (vmx->rmode.vm86_active) { 1578 vmx->rmode.save_rflags = rflags; 1579 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1580 } 1581 vmcs_writel(GUEST_RFLAGS, rflags); 1582 1583 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1584 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 1585 } 1586 1587 bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1588 { 1589 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1590 } 1591 /* Translate the STI and MOV-SS blocking bits of GUEST_INTERRUPTIBILITY_INFO into a KVM_X86_SHADOW_INT_* mask. */ 1592 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1593 { 1594 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1595 int ret = 0; 1596 1597 if (interruptibility & GUEST_INTR_STATE_STI) 1598 ret |= KVM_X86_SHADOW_INT_STI; 1599 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1600 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1601 1602 return ret; 1603 } 1604 /* Replace the STI/MOV-SS blocking bits of GUEST_INTERRUPTIBILITY_INFO according to @mask. At most one bit is set: MOV-SS takes precedence when @mask requests both. The VMCS field is written only if its value changes. */ 1605 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1606 { 1607 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1608 u32 interruptibility = interruptibility_old; 1609 1610 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1611 1612 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1613 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1614 else if (mask & KVM_X86_SHADOW_INT_STI) 1615 interruptibility |= GUEST_INTR_STATE_STI; 1616 1617 if ((interruptibility != interruptibility_old)) 1618 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1619 } 1620 /* Validate a value @data about to be written to the guest IA32_RTIT_CTL. Returns 0 if the value passes every check below, 1 if any check fails (the cases the comments below describe as causing #GP). */ 1621 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1622 { 1623 struct vcpu_vmx *vmx = to_vmx(vcpu); 1624 unsigned long value; 1625 1626 /* 1627 * Any MSR write that attempts to change bits marked reserved will 1628 * cause a #GP fault. 
1629 */ 1630 if (data & vmx->pt_desc.ctl_bitmask) 1631 return 1; 1632 1633 /* 1634 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1635 * result in a #GP unless the same write also clears TraceEn. 1636 */ 1637 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1638 (data & RTIT_CTL_TRACEEN) && 1639 data != vmx->pt_desc.guest.ctl) 1640 return 1; 1641 1642 /* 1643 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA 1644 * and FabricEn would cause #GP, if 1645 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1646 */ 1647 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1648 !(data & RTIT_CTL_FABRIC_EN) && 1649 !intel_pt_validate_cap(vmx->pt_desc.caps, 1650 PT_CAP_single_range_output)) 1651 return 1; 1652 1653 /* 1654 * MTCFreq, CycThresh and PSBFreq encodings check: any MSR write that 1655 * utilizes encodings marked reserved will cause a #GP fault. 1656 */ 1657 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1658 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1659 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1660 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1661 return 1; 1662 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1663 PT_CAP_cycle_thresholds); 1664 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1665 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1666 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1667 return 1; 1668 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1669 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1670 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1671 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1672 return 1; 1673 1674 /* 1675 * Setting ADDRx_CFG when it is reserved, or using an encoding > 2, 1676 * will cause a #GP fault. 
1677 */ 1678 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1679 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1680 return 1; 1681 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1682 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1683 return 1; 1684 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1685 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1686 return 1; 1687 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1688 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1689 return 1; 1690 1691 return 0; 1692 } 1693 1694 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1695 void *insn, int insn_len) 1696 { 1697 /* 1698 * Emulation of instructions in SGX enclaves is impossible as RIP does 1699 * not point at the failing instruction, and even if it did, the code 1700 * stream is inaccessible. Inject #UD instead of exiting to userspace 1701 * so that guest userspace can't DoS the guest simply by triggering 1702 * emulation (enclaves are CPL3 only). 1703 */ 1704 if (vmx_get_exit_reason(vcpu).enclave_mode) { 1705 kvm_queue_exception(vcpu, UD_VECTOR); 1706 return X86EMUL_PROPAGATE_FAULT; 1707 } 1708 1709 /* Check that emulation is possible during event vectoring */ 1710 if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 1711 !kvm_can_emulate_event_vectoring(emul_type)) 1712 return X86EMUL_UNHANDLEABLE_VECTORING; 1713 1714 return X86EMUL_CONTINUE; 1715 } 1716 /* Advance the guest RIP past the instruction that triggered the current VM-Exit and clear the interrupt shadow. Returns 1 on success, 0 if the emulator-based skip (EMULTYPE_SKIP) failed. */ 1717 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1718 { 1719 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 1720 unsigned long rip, orig_rip; 1721 u32 instr_len; 1722 1723 /* 1724 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1725 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1726 * set when EPT misconfig occurs. 
In practice, real hardware updates 1727 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1728 * (namely Hyper-V) don't set it due to it being undefined behavior, 1729 * i.e. we end up advancing IP with some random value. 1730 */ 1731 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1732 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1733 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1734 1735 /* 1736 * Emulating an enclave's instructions isn't supported as KVM 1737 * cannot access the enclave's memory or its true RIP, e.g. the 1738 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1739 * the RIP that actually triggered the VM-Exit. But, because 1740 * most instructions that cause VM-Exit will #UD in an enclave, 1741 * most instruction-based VM-Exits simply do not occur. 1742 * 1743 * There are a few exceptions, notably the debug instructions 1744 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1745 * and generate #DB/#BP as expected, which KVM might intercept. 1746 * But again, the CPU does the dirty work and saves an instr 1747 * length of zero so VMMs don't shoot themselves in the foot. 1748 * WARN if KVM tries to skip a non-zero length instruction on 1749 * a VM-Exit from an enclave. 1750 */ 1751 if (!instr_len) 1752 goto rip_updated; 1753 1754 WARN_ONCE(exit_reason.enclave_mode, 1755 "skipping instruction after SGX enclave VM-Exit"); 1756 1757 orig_rip = kvm_rip_read(vcpu); 1758 rip = orig_rip + instr_len; 1759 #ifdef CONFIG_X86_64 1760 /* 1761 * We need to mask out the high 32 bits of RIP if not in 64-bit 1762 * mode, but just finding out that we are in 64-bit mode is 1763 * quite expensive. Only do it if there was a carry. 
1764 */ 1765 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1766 rip = (u32)rip; 1767 #endif 1768 kvm_rip_write(vcpu, rip); 1769 } else { 1770 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1771 return 0; 1772 } 1773 1774 rip_updated: 1775 /* skipping an emulated instruction also counts */ 1776 vmx_set_interrupt_shadow(vcpu, 0); 1777 1778 return 1; 1779 } 1780 1781 /* 1782 * Recognizes a pending MTF VM-exit and records the nested state for later 1783 * delivery. 1784 */ 1785 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1786 { 1787 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1788 struct vcpu_vmx *vmx = to_vmx(vcpu); 1789 1790 if (!is_guest_mode(vcpu)) 1791 return; 1792 1793 /* 1794 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1795 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1796 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1797 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1798 * as ICEBP is higher priority than both. As instruction emulation is 1799 * completed at this point (i.e. KVM is at the instruction boundary), 1800 * any #DB exception pending delivery must be a debug-trap of lower 1801 * priority than MTF. Record the pending MTF state to be delivered in 1802 * vmx_check_nested_events(). 
1803 */ 1804 if (nested_cpu_has_mtf(vmcs12) && 1805 (!vcpu->arch.exception.pending || 1806 vcpu->arch.exception.vector == DB_VECTOR) && 1807 (!vcpu->arch.exception_vmexit.pending || 1808 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1809 vmx->nested.mtf_pending = true; 1810 kvm_make_request(KVM_REQ_EVENT, vcpu); 1811 } else { 1812 vmx->nested.mtf_pending = false; 1813 } 1814 } 1815 1816 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1817 { 1818 vmx_update_emulated_instruction(vcpu); 1819 return skip_emulated_instruction(vcpu); 1820 } 1821 1822 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1823 { 1824 /* 1825 * Ensure that we clear the HLT state in the VMCS. We don't need to 1826 * explicitly skip the instruction because if the HLT state is set, 1827 * then the instruction is already executing and RIP has already been 1828 * advanced. 1829 */ 1830 if (kvm_hlt_in_guest(vcpu->kvm) && 1831 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1832 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1833 } 1834 1835 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1836 { 1837 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1838 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1839 struct vcpu_vmx *vmx = to_vmx(vcpu); 1840 1841 kvm_deliver_exception_payload(vcpu, ex); 1842 1843 if (ex->has_error_code) { 1844 /* 1845 * Despite the error code being architecturally defined as 32 1846 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1847 * VMX don't actually support setting bits 31:16. Hardware 1848 * will (should) never provide a bogus error code, but AMD CPUs 1849 * do generate error codes with bits 31:16 set, and so KVM's 1850 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1851 * the upper bits to avoid VM-Fail, losing information that 1852 * doesn't really exist is preferable to killing the VM. 
1853 */ 1854 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1855 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1856 } 1857 1858 if (vmx->rmode.vm86_active) { 1859 int inc_eip = 0; 1860 if (kvm_exception_is_soft(ex->vector)) 1861 inc_eip = vcpu->arch.event_exit_inst_len; 1862 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1863 return; 1864 } 1865 1866 WARN_ON_ONCE(vmx->vt.emulation_required); 1867 1868 if (kvm_exception_is_soft(ex->vector)) { 1869 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1870 vmx->vcpu.arch.event_exit_inst_len); 1871 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1872 } else 1873 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1874 1875 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1876 1877 vmx_clear_hlt(vcpu); 1878 } 1879 /* Set the load_into_hardware flag of the user-return slot for @msr; does nothing if vmx_find_uret_msr() finds no slot for @msr. */ 1880 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1881 bool load_into_hardware) 1882 { 1883 struct vmx_uret_msr *uret_msr; 1884 1885 uret_msr = vmx_find_uret_msr(vmx, msr); 1886 if (!uret_msr) 1887 return; 1888 1889 uret_msr->load_into_hardware = load_into_hardware; 1890 } 1891 1892 /* 1893 * Configuring user return MSRs to automatically save, load, and restore MSRs 1894 * that need to be shoved into hardware when running the guest. Note, omitting 1895 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1896 * loaded into hardware when running the guest. 1897 */ 1898 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1899 { 1900 #ifdef CONFIG_X86_64 1901 bool load_syscall_msrs; 1902 1903 /* 1904 * The SYSCALL MSRs are only needed on long mode guests, and only 1905 * when EFER.SCE is set. 
1906 */ 1907 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1908 (vmx->vcpu.arch.efer & EFER_SCE); 1909 1910 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1911 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1912 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1913 #endif 1914 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1915 1916 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1917 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1918 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1919 1920 /* 1921 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1922 * kernel and old userspace. If those guests run on a tsx=off host, do 1923 * allow guests to use TSX_CTRL, but don't change the value in hardware 1924 * so that TSX remains always disabled. 1925 */ 1926 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1927 1928 /* 1929 * The set of MSRs to load may have changed, reload MSRs before the 1930 * next VM-Enter. 
1931 */ 1932 vmx->guest_uret_msrs_loaded = false; 1933 } 1934 /* Return the TSC offset from vmcs12 if vmcs12 enables TSC offsetting, 0 otherwise. */ 1935 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1936 { 1937 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1938 1939 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1940 return vmcs12->tsc_offset; 1941 1942 return 0; 1943 } 1944 /* Return the TSC multiplier from vmcs12 if vmcs12 enables both TSC offsetting and TSC scaling, otherwise the default scaling ratio from kvm_caps. */ 1945 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1946 { 1947 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1948 1949 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1950 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1951 return vmcs12->tsc_multiplier; 1952 1953 return kvm_caps.default_tsc_scaling_ratio; 1954 } 1955 1956 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1957 { 1958 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1959 } 1960 1961 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1962 { 1963 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1964 } 1965 1966 /* 1967 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1968 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1969 * backwards compatibility even though KVM doesn't support emulating SMX. And 1970 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1971 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1972 */ 1973 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1974 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1975 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1976 FEAT_CTL_SGX_LC_ENABLED | \ 1977 FEAT_CTL_SGX_ENABLED | \ 1978 FEAT_CTL_LMCE_ENABLED) 1979 /* Check a write to the feature control MSR. A write that is not host-initiated is rejected once FEAT_CTL_LOCKED is set in the current value; otherwise the new value must not set bits outside the valid mask, which is KVM_SUPPORTED_FEATURE_CONTROL for host-initiated writes and the per-vCPU valid bits for the rest. */ 1980 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1981 struct msr_data *msr) 1982 { 1983 uint64_t valid_bits; 1984 1985 /* 1986 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1987 * exposed to the guest. 
1988 */ 1989 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1990 ~KVM_SUPPORTED_FEATURE_CONTROL); 1991 1992 if (!msr->host_initiated && 1993 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1994 return false; 1995 1996 if (msr->host_initiated) 1997 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 1998 else 1999 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 2000 2001 return !(msr->data & ~valid_bits); 2002 } 2003 /* Read a VMX feature MSR from vmcs_config.nested. Returns 1 if the nested module parameter is off, and KVM_MSR_RET_UNSUPPORTED for any MSR outside the emulated VMX MSR range. */ 2004 int vmx_get_feature_msr(u32 msr, u64 *data) 2005 { 2006 switch (msr) { 2007 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2008 if (!nested) 2009 return 1; 2010 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2011 default: 2012 return KVM_MSR_RET_UNSUPPORTED; 2013 } 2014 } 2015 2016 /* 2017 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 2018 * Returns 0 on success, non-0 otherwise. 2019 * Assumes vcpu_load() was already called. 2020 */ 2021 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2022 { 2023 struct vcpu_vmx *vmx = to_vmx(vcpu); 2024 struct vmx_uret_msr *msr; 2025 u32 index; 2026 2027 switch (msr_info->index) { 2028 #ifdef CONFIG_X86_64 2029 case MSR_FS_BASE: 2030 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2031 break; 2032 case MSR_GS_BASE: 2033 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2034 break; 2035 case MSR_KERNEL_GS_BASE: 2036 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2037 break; 2038 #endif 2039 case MSR_EFER: 2040 return kvm_get_msr_common(vcpu, msr_info); 2041 case MSR_IA32_TSX_CTRL: 2042 if (!msr_info->host_initiated && 2043 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2044 return 1; 2045 goto find_uret_msr; 2046 case MSR_IA32_UMWAIT_CONTROL: 2047 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2048 return 1; 2049 2050 msr_info->data = vmx->msr_ia32_umwait_control; 2051 break; 2052 case MSR_IA32_SPEC_CTRL: 2053 if (!msr_info->host_initiated && 2054 !guest_has_spec_ctrl_msr(vcpu)) 2055 return 1; 2056 2057 msr_info->data = 
to_vmx(vcpu)->spec_ctrl; 2058 break; 2059 case MSR_IA32_SYSENTER_CS: 2060 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2061 break; 2062 case MSR_IA32_SYSENTER_EIP: 2063 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2064 break; 2065 case MSR_IA32_SYSENTER_ESP: 2066 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2067 break; 2068 case MSR_IA32_BNDCFGS: 2069 if (!kvm_mpx_supported() || 2070 (!msr_info->host_initiated && 2071 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2072 return 1; 2073 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2074 break; 2075 case MSR_IA32_MCG_EXT_CTL: 2076 if (!msr_info->host_initiated && 2077 !(vmx->msr_ia32_feature_control & 2078 FEAT_CTL_LMCE_ENABLED)) 2079 return 1; 2080 msr_info->data = vcpu->arch.mcg_ext_ctl; 2081 break; 2082 case MSR_IA32_FEAT_CTL: 2083 msr_info->data = vmx->msr_ia32_feature_control; 2084 break; 2085 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2086 if (!msr_info->host_initiated && 2087 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 2088 return 1; 2089 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2090 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2091 break; 2092 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2093 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2094 return 1; 2095 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2096 &msr_info->data)) 2097 return 1; 2098 #ifdef CONFIG_KVM_HYPERV 2099 /* 2100 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2101 * instead of just ignoring the features, different Hyper-V 2102 * versions are either trying to use them and fail or do some 2103 * sanity checking and refuse to boot. Filter all unsupported 2104 * features out. 
2105 */ 2106 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) 2107 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2108 &msr_info->data); 2109 #endif 2110 break; 2111 case MSR_IA32_RTIT_CTL: 2112 if (!vmx_pt_mode_is_host_guest()) 2113 return 1; 2114 msr_info->data = vmx->pt_desc.guest.ctl; 2115 break; 2116 case MSR_IA32_RTIT_STATUS: 2117 if (!vmx_pt_mode_is_host_guest()) 2118 return 1; 2119 msr_info->data = vmx->pt_desc.guest.status; 2120 break; 2121 case MSR_IA32_RTIT_CR3_MATCH: 2122 if (!vmx_pt_mode_is_host_guest() || 2123 !intel_pt_validate_cap(vmx->pt_desc.caps, 2124 PT_CAP_cr3_filtering)) 2125 return 1; 2126 msr_info->data = vmx->pt_desc.guest.cr3_match; 2127 break; 2128 case MSR_IA32_RTIT_OUTPUT_BASE: 2129 if (!vmx_pt_mode_is_host_guest() || 2130 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2131 PT_CAP_topa_output) && 2132 !intel_pt_validate_cap(vmx->pt_desc.caps, 2133 PT_CAP_single_range_output))) 2134 return 1; 2135 msr_info->data = vmx->pt_desc.guest.output_base; 2136 break; 2137 case MSR_IA32_RTIT_OUTPUT_MASK: 2138 if (!vmx_pt_mode_is_host_guest() || 2139 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2140 PT_CAP_topa_output) && 2141 !intel_pt_validate_cap(vmx->pt_desc.caps, 2142 PT_CAP_single_range_output))) 2143 return 1; 2144 msr_info->data = vmx->pt_desc.guest.output_mask; 2145 break; 2146 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2147 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2148 if (!vmx_pt_mode_is_host_guest() || 2149 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2150 return 1; 2151 if (index % 2) 2152 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2153 else 2154 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2155 break; 2156 case MSR_IA32_DEBUGCTLMSR: 2157 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2158 break; 2159 default: 2160 find_uret_msr: 2161 msr = vmx_find_uret_msr(vmx, msr_info->index); 2162 if (msr) { 2163 msr_info->data = msr->data; 2164 break; 2165 } 2166 return kvm_get_msr_common(vcpu, msr_info); 2167 } 2168 2169 return 0; 2170 } 2171 2172 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2173 u64 data) 2174 { 2175 #ifdef CONFIG_X86_64 2176 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2177 return (u32)data; 2178 #endif 2179 return (unsigned long)data; 2180 } 2181 2182 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2183 { 2184 u64 debugctl = 0; 2185 2186 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2187 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2188 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2189 2190 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2191 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2192 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2193 2194 return debugctl; 2195 } 2196 2197 /* 2198 * Writes msr value into the appropriate "register". 2199 * Returns 0 on success, non-0 otherwise. 2200 * Assumes vcpu_load() was already called. 
2201 */ 2202 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2203 { 2204 struct vcpu_vmx *vmx = to_vmx(vcpu); 2205 struct vmx_uret_msr *msr; 2206 int ret = 0; 2207 u32 msr_index = msr_info->index; 2208 u64 data = msr_info->data; 2209 u32 index; 2210 2211 switch (msr_index) { 2212 case MSR_EFER: 2213 ret = kvm_set_msr_common(vcpu, msr_info); 2214 break; 2215 #ifdef CONFIG_X86_64 2216 case MSR_FS_BASE: 2217 vmx_segment_cache_clear(vmx); 2218 vmcs_writel(GUEST_FS_BASE, data); 2219 break; 2220 case MSR_GS_BASE: 2221 vmx_segment_cache_clear(vmx); 2222 vmcs_writel(GUEST_GS_BASE, data); 2223 break; 2224 case MSR_KERNEL_GS_BASE: 2225 vmx_write_guest_kernel_gs_base(vmx, data); 2226 break; 2227 case MSR_IA32_XFD: 2228 ret = kvm_set_msr_common(vcpu, msr_info); 2229 /* 2230 * Always intercepting WRMSR could incur non-negligible 2231 * overhead given xfd might be changed frequently in 2232 * guest context switch. Disable write interception 2233 * upon the first write with a non-zero value (indicating 2234 * potential usage on dynamic xfeatures). Also update 2235 * exception bitmap to trap #NM for proper virtualization 2236 * of guest xfd_err. 
2237 */ 2238 if (!ret && data) { 2239 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2240 MSR_TYPE_RW); 2241 vcpu->arch.xfd_no_write_intercept = true; 2242 vmx_update_exception_bitmap(vcpu); 2243 } 2244 break; 2245 #endif 2246 case MSR_IA32_SYSENTER_CS: 2247 if (is_guest_mode(vcpu)) 2248 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2249 vmcs_write32(GUEST_SYSENTER_CS, data); 2250 break; 2251 case MSR_IA32_SYSENTER_EIP: 2252 if (is_guest_mode(vcpu)) { 2253 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2254 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2255 } 2256 vmcs_writel(GUEST_SYSENTER_EIP, data); 2257 break; 2258 case MSR_IA32_SYSENTER_ESP: 2259 if (is_guest_mode(vcpu)) { 2260 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2261 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2262 } 2263 vmcs_writel(GUEST_SYSENTER_ESP, data); 2264 break; 2265 case MSR_IA32_DEBUGCTLMSR: { 2266 u64 invalid; 2267 2268 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2269 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2270 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2271 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2272 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2273 } 2274 2275 if (invalid) 2276 return 1; 2277 2278 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2279 VM_EXIT_SAVE_DEBUG_CONTROLS) 2280 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2281 2282 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2283 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2284 (data & DEBUGCTLMSR_LBR)) 2285 intel_pmu_create_guest_lbr_event(vcpu); 2286 return 0; 2287 } 2288 case MSR_IA32_BNDCFGS: 2289 if (!kvm_mpx_supported() || 2290 (!msr_info->host_initiated && 2291 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2292 return 1; 2293 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2294 (data & MSR_IA32_BNDCFGS_RSVD)) 2295 return 1; 2296 2297 if (is_guest_mode(vcpu) && 2298 ((vmx->nested.msrs.entry_ctls_high & 
VM_ENTRY_LOAD_BNDCFGS) || 2299 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2300 get_vmcs12(vcpu)->guest_bndcfgs = data; 2301 2302 vmcs_write64(GUEST_BNDCFGS, data); 2303 break; 2304 case MSR_IA32_UMWAIT_CONTROL: 2305 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2306 return 1; 2307 2308 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2309 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2310 return 1; 2311 2312 vmx->msr_ia32_umwait_control = data; 2313 break; 2314 case MSR_IA32_SPEC_CTRL: 2315 if (!msr_info->host_initiated && 2316 !guest_has_spec_ctrl_msr(vcpu)) 2317 return 1; 2318 2319 if (kvm_spec_ctrl_test_value(data)) 2320 return 1; 2321 2322 vmx->spec_ctrl = data; 2323 if (!data) 2324 break; 2325 2326 /* 2327 * For non-nested: 2328 * When it's written (to non-zero) for the first time, pass 2329 * it through. 2330 * 2331 * For nested: 2332 * The handling of the MSR bitmap for L2 guests is done in 2333 * nested_vmx_prepare_msr_bitmap. We should not touch the 2334 * vmcs02.msr_bitmap here since it gets completely overwritten 2335 * in the merging. We update the vmcs01 here for L1 as well 2336 * since it will end up touching the MSR anyway now. 
2337 */ 2338 vmx_disable_intercept_for_msr(vcpu, 2339 MSR_IA32_SPEC_CTRL, 2340 MSR_TYPE_RW); 2341 break; 2342 case MSR_IA32_TSX_CTRL: 2343 if (!msr_info->host_initiated && 2344 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2345 return 1; 2346 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2347 return 1; 2348 goto find_uret_msr; 2349 case MSR_IA32_CR_PAT: 2350 ret = kvm_set_msr_common(vcpu, msr_info); 2351 if (ret) 2352 break; 2353 2354 if (is_guest_mode(vcpu) && 2355 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2356 get_vmcs12(vcpu)->guest_ia32_pat = data; 2357 2358 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2359 vmcs_write64(GUEST_IA32_PAT, data); 2360 break; 2361 case MSR_IA32_MCG_EXT_CTL: 2362 if ((!msr_info->host_initiated && 2363 !(to_vmx(vcpu)->msr_ia32_feature_control & 2364 FEAT_CTL_LMCE_ENABLED)) || 2365 (data & ~MCG_EXT_CTL_LMCE_EN)) 2366 return 1; 2367 vcpu->arch.mcg_ext_ctl = data; 2368 break; 2369 case MSR_IA32_FEAT_CTL: 2370 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2371 return 1; 2372 2373 vmx->msr_ia32_feature_control = data; 2374 if (msr_info->host_initiated && data == 0) 2375 vmx_leave_nested(vcpu); 2376 2377 /* SGX may be enabled/disabled by guest's firmware */ 2378 vmx_write_encls_bitmap(vcpu, NULL); 2379 break; 2380 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2381 /* 2382 * On real hardware, the LE hash MSRs are writable before 2383 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2384 * at which point SGX related bits in IA32_FEATURE_CONTROL 2385 * become writable. 2386 * 2387 * KVM does not emulate SGX activation for simplicity, so 2388 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2389 * is unlocked. This is technically not architectural 2390 * behavior, but it's close enough. 
2391 */ 2392 if (!msr_info->host_initiated && 2393 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2394 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2395 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2396 return 1; 2397 vmx->msr_ia32_sgxlepubkeyhash 2398 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2399 break; 2400 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2401 if (!msr_info->host_initiated) 2402 return 1; /* they are read-only */ 2403 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2404 return 1; 2405 return vmx_set_vmx_msr(vcpu, msr_index, data); 2406 case MSR_IA32_RTIT_CTL: 2407 if (!vmx_pt_mode_is_host_guest() || 2408 vmx_rtit_ctl_check(vcpu, data) || 2409 vmx->nested.vmxon) 2410 return 1; 2411 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2412 vmx->pt_desc.guest.ctl = data; 2413 pt_update_intercept_for_msr(vcpu); 2414 break; 2415 case MSR_IA32_RTIT_STATUS: 2416 if (!pt_can_write_msr(vmx)) 2417 return 1; 2418 if (data & MSR_IA32_RTIT_STATUS_MASK) 2419 return 1; 2420 vmx->pt_desc.guest.status = data; 2421 break; 2422 case MSR_IA32_RTIT_CR3_MATCH: 2423 if (!pt_can_write_msr(vmx)) 2424 return 1; 2425 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2426 PT_CAP_cr3_filtering)) 2427 return 1; 2428 vmx->pt_desc.guest.cr3_match = data; 2429 break; 2430 case MSR_IA32_RTIT_OUTPUT_BASE: 2431 if (!pt_can_write_msr(vmx)) 2432 return 1; 2433 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2434 PT_CAP_topa_output) && 2435 !intel_pt_validate_cap(vmx->pt_desc.caps, 2436 PT_CAP_single_range_output)) 2437 return 1; 2438 if (!pt_output_base_valid(vcpu, data)) 2439 return 1; 2440 vmx->pt_desc.guest.output_base = data; 2441 break; 2442 case MSR_IA32_RTIT_OUTPUT_MASK: 2443 if (!pt_can_write_msr(vmx)) 2444 return 1; 2445 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2446 PT_CAP_topa_output) && 2447 !intel_pt_validate_cap(vmx->pt_desc.caps, 2448 PT_CAP_single_range_output)) 2449 return 1; 2450 vmx->pt_desc.guest.output_mask = data; 2451 break; 
2452 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2453 if (!pt_can_write_msr(vmx)) 2454 return 1; 2455 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2456 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2457 return 1; 2458 if (is_noncanonical_msr_address(data, vcpu)) 2459 return 1; 2460 if (index % 2) 2461 vmx->pt_desc.guest.addr_b[index / 2] = data; 2462 else 2463 vmx->pt_desc.guest.addr_a[index / 2] = data; 2464 break; 2465 case MSR_IA32_PERF_CAPABILITIES: 2466 if (data & PMU_CAP_LBR_FMT) { 2467 if ((data & PMU_CAP_LBR_FMT) != 2468 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2469 return 1; 2470 if (!cpuid_model_is_consistent(vcpu)) 2471 return 1; 2472 } 2473 if (data & PERF_CAP_PEBS_FORMAT) { 2474 if ((data & PERF_CAP_PEBS_MASK) != 2475 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2476 return 1; 2477 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2478 return 1; 2479 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2480 return 1; 2481 if (!cpuid_model_is_consistent(vcpu)) 2482 return 1; 2483 } 2484 ret = kvm_set_msr_common(vcpu, msr_info); 2485 break; 2486 2487 default: 2488 find_uret_msr: 2489 msr = vmx_find_uret_msr(vmx, msr_index); 2490 if (msr) 2491 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2492 else 2493 ret = kvm_set_msr_common(vcpu, msr_info); 2494 } 2495 2496 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2497 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2498 vmx_update_fb_clear_dis(vcpu, vmx); 2499 2500 return ret; 2501 } 2502 2503 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2504 { 2505 unsigned long guest_owned_bits; 2506 2507 kvm_register_mark_available(vcpu, reg); 2508 2509 switch (reg) { 2510 case VCPU_REGS_RSP: 2511 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2512 break; 2513 case VCPU_REGS_RIP: 2514 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2515 break; 2516 case VCPU_EXREG_PDPTR: 2517 if (enable_ept) 2518 ept_save_pdptrs(vcpu); 2519 break; 2520 case 
VCPU_EXREG_CR0: 2521 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2522 2523 vcpu->arch.cr0 &= ~guest_owned_bits; 2524 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2525 break; 2526 case VCPU_EXREG_CR3: 2527 /* 2528 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2529 * CR3 is loaded into hardware, not the guest's CR3. 2530 */ 2531 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2532 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2533 break; 2534 case VCPU_EXREG_CR4: 2535 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2536 2537 vcpu->arch.cr4 &= ~guest_owned_bits; 2538 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2539 break; 2540 default: 2541 KVM_BUG_ON(1, vcpu->kvm); 2542 break; 2543 } 2544 } 2545 2546 /* 2547 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2548 * directly instead of going through cpu_has(), to ensure KVM is trapping 2549 * ENCLS whenever it's supported in hardware. It does not matter whether 2550 * the host OS supports or has enabled SGX. 2551 */ 2552 static bool cpu_has_sgx(void) 2553 { 2554 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2555 } 2556 2557 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2558 { 2559 u32 vmx_msr_low, vmx_msr_high; 2560 u32 ctl = ctl_min | ctl_opt; 2561 2562 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2563 2564 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2565 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2566 2567 /* Ensure minimum (required) set of control bits are supported. 
*/ 2568 if (ctl_min & ~ctl) 2569 return -EIO; 2570 2571 *result = ctl; 2572 return 0; 2573 } 2574 2575 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2576 { 2577 u64 allowed; 2578 2579 rdmsrl(msr, allowed); 2580 2581 return ctl_opt & allowed; 2582 } 2583 2584 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ 2585 ({ \ 2586 int i, r = 0; \ 2587 \ 2588 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ 2589 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ 2590 \ 2591 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ 2592 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ 2593 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ 2594 \ 2595 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ 2596 continue; \ 2597 \ 2598 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ 2599 "entry = %llx (%llx), exit = %llx (%llx)\n", \ 2600 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ 2601 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ 2602 \ 2603 if (error_on_inconsistent_vmcs_config) \ 2604 r = -EIO; \ 2605 \ 2606 entry_controls &= ~n_ctrl; \ 2607 exit_controls &= ~x_ctrl; \ 2608 } \ 2609 r; \ 2610 }) 2611 2612 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2613 struct vmx_capability *vmx_cap) 2614 { 2615 u32 _pin_based_exec_control = 0; 2616 u32 _cpu_based_exec_control = 0; 2617 u32 _cpu_based_2nd_exec_control = 0; 2618 u64 _cpu_based_3rd_exec_control = 0; 2619 u32 _vmexit_control = 0; 2620 u32 _vmentry_control = 0; 2621 u64 basic_msr; 2622 u64 misc_msr; 2623 2624 /* 2625 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2626 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2627 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
2628 */ 2629 struct { 2630 u32 entry_control; 2631 u32 exit_control; 2632 } const vmcs_entry_exit_pairs[] = { 2633 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2634 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2635 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2636 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2637 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2638 }; 2639 2640 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2641 2642 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2643 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2644 MSR_IA32_VMX_PROCBASED_CTLS, 2645 &_cpu_based_exec_control)) 2646 return -EIO; 2647 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2648 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2649 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2650 MSR_IA32_VMX_PROCBASED_CTLS2, 2651 &_cpu_based_2nd_exec_control)) 2652 return -EIO; 2653 } 2654 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2655 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2656 2657 #ifndef CONFIG_X86_64 2658 if (!(_cpu_based_2nd_exec_control & 2659 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2660 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2661 #endif 2662 2663 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2664 _cpu_based_2nd_exec_control &= ~( 2665 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2666 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2667 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2668 2669 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2670 &vmx_cap->ept, &vmx_cap->vpid); 2671 2672 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2673 vmx_cap->ept) { 2674 pr_warn_once("EPT CAP should not exist if not support " 2675 "1-setting enable EPT VM-execution control\n"); 2676 2677 if (error_on_inconsistent_vmcs_config) 2678 return -EIO; 2679 2680 vmx_cap->ept = 0; 2681 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2682 
} 2683 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2684 vmx_cap->vpid) { 2685 pr_warn_once("VPID CAP should not exist if not support " 2686 "1-setting enable VPID VM-execution control\n"); 2687 2688 if (error_on_inconsistent_vmcs_config) 2689 return -EIO; 2690 2691 vmx_cap->vpid = 0; 2692 } 2693 2694 if (!cpu_has_sgx()) 2695 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2696 2697 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2698 _cpu_based_3rd_exec_control = 2699 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2700 MSR_IA32_VMX_PROCBASED_CTLS3); 2701 2702 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2703 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2704 MSR_IA32_VMX_EXIT_CTLS, 2705 &_vmexit_control)) 2706 return -EIO; 2707 2708 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2709 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2710 MSR_IA32_VMX_PINBASED_CTLS, 2711 &_pin_based_exec_control)) 2712 return -EIO; 2713 2714 if (cpu_has_broken_vmx_preemption_timer()) 2715 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2716 if (!(_cpu_based_2nd_exec_control & 2717 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2718 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2719 2720 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2721 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2722 MSR_IA32_VMX_ENTRY_CTLS, 2723 &_vmentry_control)) 2724 return -EIO; 2725 2726 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, 2727 _vmentry_control, _vmexit_control)) 2728 return -EIO; 2729 2730 /* 2731 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2732 * can't be used due to an errata where VM Exit may incorrectly clear 2733 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2734 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 
2735 */ 2736 switch (boot_cpu_data.x86_vfm) { 2737 case INTEL_NEHALEM_EP: /* AAK155 */ 2738 case INTEL_NEHALEM: /* AAP115 */ 2739 case INTEL_WESTMERE: /* AAT100 */ 2740 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2741 case INTEL_NEHALEM_EX: /* BA97 */ 2742 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2743 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2744 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2745 "does not work properly. Using workaround\n"); 2746 break; 2747 default: 2748 break; 2749 } 2750 2751 rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); 2752 2753 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2754 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2755 return -EIO; 2756 2757 #ifdef CONFIG_X86_64 2758 /* 2759 * KVM expects to be able to shove all legal physical addresses into 2760 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2761 * 0 for processors that support Intel 64 architecture". 2762 */ 2763 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2764 return -EIO; 2765 #endif 2766 2767 /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ 2768 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2769 return -EIO; 2770 2771 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2772 2773 vmcs_conf->basic = basic_msr; 2774 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2775 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2776 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2777 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2778 vmcs_conf->vmexit_ctrl = _vmexit_control; 2779 vmcs_conf->vmentry_ctrl = _vmentry_control; 2780 vmcs_conf->misc = misc_msr; 2781 2782 #if IS_ENABLED(CONFIG_HYPERV) 2783 if (enlightened_vmcs) 2784 evmcs_sanitize_exec_ctrls(vmcs_conf); 2785 #endif 2786 2787 return 0; 2788 } 2789 2790 static bool __kvm_is_vmx_supported(void) 2791 { 2792 int cpu = smp_processor_id(); 2793 2794 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2795 pr_err("VMX not supported by CPU %d\n", cpu); 2796 return false; 2797 } 2798 2799 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2800 !this_cpu_has(X86_FEATURE_VMX)) { 2801 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2802 return false; 2803 } 2804 2805 return true; 2806 } 2807 2808 static bool kvm_is_vmx_supported(void) 2809 { 2810 bool supported; 2811 2812 migrate_disable(); 2813 supported = __kvm_is_vmx_supported(); 2814 migrate_enable(); 2815 2816 return supported; 2817 } 2818 2819 int vmx_check_processor_compat(void) 2820 { 2821 int cpu = raw_smp_processor_id(); 2822 struct vmcs_config vmcs_conf; 2823 struct vmx_capability vmx_cap; 2824 2825 if (!__kvm_is_vmx_supported()) 2826 return -EIO; 2827 2828 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2829 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2830 return -EIO; 2831 } 2832 if (nested) 2833 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2834 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2835 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2836 return -EIO; 2837 } 2838 return 0; 
2839 } 2840 2841 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2842 { 2843 u64 msr; 2844 2845 cr4_set_bits(X86_CR4_VMXE); 2846 2847 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2848 _ASM_EXTABLE(1b, %l[fault]) 2849 : : [vmxon_pointer] "m"(vmxon_pointer) 2850 : : fault); 2851 return 0; 2852 2853 fault: 2854 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2855 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 2856 cr4_clear_bits(X86_CR4_VMXE); 2857 2858 return -EFAULT; 2859 } 2860 2861 int vmx_enable_virtualization_cpu(void) 2862 { 2863 int cpu = raw_smp_processor_id(); 2864 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2865 int r; 2866 2867 if (cr4_read_shadow() & X86_CR4_VMXE) 2868 return -EBUSY; 2869 2870 /* 2871 * This can happen if we hot-added a CPU but failed to allocate 2872 * VP assist page for it. 2873 */ 2874 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2875 return -EFAULT; 2876 2877 intel_pt_handle_vmx(1); 2878 2879 r = kvm_cpu_vmxon(phys_addr); 2880 if (r) { 2881 intel_pt_handle_vmx(0); 2882 return r; 2883 } 2884 2885 return 0; 2886 } 2887 2888 static void vmclear_local_loaded_vmcss(void) 2889 { 2890 int cpu = raw_smp_processor_id(); 2891 struct loaded_vmcs *v, *n; 2892 2893 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2894 loaded_vmcss_on_cpu_link) 2895 __loaded_vmcs_clear(v); 2896 } 2897 2898 void vmx_disable_virtualization_cpu(void) 2899 { 2900 vmclear_local_loaded_vmcss(); 2901 2902 if (kvm_cpu_vmxoff()) 2903 kvm_spurious_fault(); 2904 2905 hv_reset_evmcs(); 2906 2907 intel_pt_handle_vmx(0); 2908 } 2909 2910 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2911 { 2912 int node = cpu_to_node(cpu); 2913 struct page *pages; 2914 struct vmcs *vmcs; 2915 2916 pages = __alloc_pages_node(node, flags, 0); 2917 if (!pages) 2918 return NULL; 2919 vmcs = page_address(pages); 2920 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2921 2922 /* KVM supports Enlightened VMCS v1 only */ 2923 
if (kvm_is_using_evmcs()) 2924 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2925 else 2926 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2927 2928 if (shadow) 2929 vmcs->hdr.shadow_vmcs = 1; 2930 return vmcs; 2931 } 2932 2933 void free_vmcs(struct vmcs *vmcs) 2934 { 2935 free_page((unsigned long)vmcs); 2936 } 2937 2938 /* 2939 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2940 */ 2941 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2942 { 2943 if (!loaded_vmcs->vmcs) 2944 return; 2945 loaded_vmcs_clear(loaded_vmcs); 2946 free_vmcs(loaded_vmcs->vmcs); 2947 loaded_vmcs->vmcs = NULL; 2948 if (loaded_vmcs->msr_bitmap) 2949 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2950 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2951 } 2952 2953 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2954 { 2955 loaded_vmcs->vmcs = alloc_vmcs(false); 2956 if (!loaded_vmcs->vmcs) 2957 return -ENOMEM; 2958 2959 vmcs_clear(loaded_vmcs->vmcs); 2960 2961 loaded_vmcs->shadow_vmcs = NULL; 2962 loaded_vmcs->hv_timer_soft_disabled = false; 2963 loaded_vmcs->cpu = -1; 2964 loaded_vmcs->launched = 0; 2965 2966 if (cpu_has_vmx_msr_bitmap()) { 2967 loaded_vmcs->msr_bitmap = (unsigned long *) 2968 __get_free_page(GFP_KERNEL_ACCOUNT); 2969 if (!loaded_vmcs->msr_bitmap) 2970 goto out_vmcs; 2971 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2972 } 2973 2974 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2975 memset(&loaded_vmcs->controls_shadow, 0, 2976 sizeof(struct vmcs_controls_shadow)); 2977 2978 return 0; 2979 2980 out_vmcs: 2981 free_loaded_vmcs(loaded_vmcs); 2982 return -ENOMEM; 2983 } 2984 2985 static void free_kvm_area(void) 2986 { 2987 int cpu; 2988 2989 for_each_possible_cpu(cpu) { 2990 free_vmcs(per_cpu(vmxarea, cpu)); 2991 per_cpu(vmxarea, cpu) = NULL; 2992 } 2993 } 2994 2995 static __init int alloc_kvm_area(void) 2996 { 2997 int cpu; 2998 2999 for_each_possible_cpu(cpu) { 3000 struct vmcs *vmcs; 
3001 3002 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 3003 if (!vmcs) { 3004 free_kvm_area(); 3005 return -ENOMEM; 3006 } 3007 3008 /* 3009 * When eVMCS is enabled, alloc_vmcs_cpu() sets 3010 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 3011 * revision_id reported by MSR_IA32_VMX_BASIC. 3012 * 3013 * However, even though not explicitly documented by 3014 * TLFS, VMXArea passed as VMXON argument should 3015 * still be marked with revision_id reported by 3016 * physical CPU. 3017 */ 3018 if (kvm_is_using_evmcs()) 3019 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3020 3021 per_cpu(vmxarea, cpu) = vmcs; 3022 } 3023 return 0; 3024 } 3025 3026 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3027 struct kvm_segment *save) 3028 { 3029 if (!emulate_invalid_guest_state) { 3030 /* 3031 * CS and SS RPL should be equal during guest entry according 3032 * to VMX spec, but in reality it is not always so. Since vcpu 3033 * is in the middle of the transition from real mode to 3034 * protected mode it is safe to assume that RPL 0 is a good 3035 * default value. 3036 */ 3037 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3038 save->selector &= ~SEGMENT_RPL_MASK; 3039 save->dpl = save->selector & SEGMENT_RPL_MASK; 3040 save->s = 1; 3041 } 3042 __vmx_set_segment(vcpu, save, seg); 3043 } 3044 3045 static void enter_pmode(struct kvm_vcpu *vcpu) 3046 { 3047 unsigned long flags; 3048 struct vcpu_vmx *vmx = to_vmx(vcpu); 3049 3050 /* 3051 * Update real mode segment cache. It may be not up-to-date if segment 3052 * register was written while vcpu was in a guest mode. 
3053 */ 3054 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3055 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3056 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3057 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3058 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3059 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3060 3061 vmx->rmode.vm86_active = 0; 3062 3063 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3064 3065 flags = vmcs_readl(GUEST_RFLAGS); 3066 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3067 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3068 vmcs_writel(GUEST_RFLAGS, flags); 3069 3070 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3071 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3072 3073 vmx_update_exception_bitmap(vcpu); 3074 3075 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3076 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3077 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3078 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3079 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3080 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3081 } 3082 3083 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3084 { 3085 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3086 struct kvm_segment var = *save; 3087 3088 var.dpl = 0x3; 3089 if (seg == VCPU_SREG_CS) 3090 var.type = 0x3; 3091 3092 if (!emulate_invalid_guest_state) { 3093 var.selector = var.base >> 4; 3094 var.base = var.base & 0xffff0; 3095 var.limit = 0xffff; 3096 var.g = 0; 3097 var.db = 0; 3098 var.present = 1; 3099 var.s = 1; 3100 var.l = 0; 3101 var.unusable = 0; 3102 var.type = 0x3; 3103 var.avl = 0; 3104 if (save->base & 0xf) 3105 pr_warn_once("segment 
base is not paragraph aligned " 3106 "when entering protected mode (seg=%d)", seg); 3107 } 3108 3109 vmcs_write16(sf->selector, var.selector); 3110 vmcs_writel(sf->base, var.base); 3111 vmcs_write32(sf->limit, var.limit); 3112 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3113 } 3114 3115 static void enter_rmode(struct kvm_vcpu *vcpu) 3116 { 3117 unsigned long flags; 3118 struct vcpu_vmx *vmx = to_vmx(vcpu); 3119 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3120 3121 /* 3122 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3123 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3124 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3125 * should VM-Fail and KVM should reject userspace attempts to stuff 3126 * CR0.PG=0 when L2 is active. 3127 */ 3128 WARN_ON_ONCE(is_guest_mode(vcpu)); 3129 3130 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3131 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3132 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3133 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3134 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3135 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3136 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3137 3138 vmx->rmode.vm86_active = 1; 3139 3140 vmx_segment_cache_clear(vmx); 3141 3142 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3143 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3144 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3145 3146 flags = vmcs_readl(GUEST_RFLAGS); 3147 vmx->rmode.save_rflags = flags; 3148 3149 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3150 3151 vmcs_writel(GUEST_RFLAGS, flags); 3152 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3153 vmx_update_exception_bitmap(vcpu); 3154 3155 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3156 
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3157 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3158 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3159 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3160 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3161 } 3162 3163 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3164 { 3165 struct vcpu_vmx *vmx = to_vmx(vcpu); 3166 3167 /* Nothing to do if hardware doesn't support EFER. */ 3168 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3169 return 0; 3170 3171 vcpu->arch.efer = efer; 3172 #ifdef CONFIG_X86_64 3173 if (efer & EFER_LMA) 3174 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3175 else 3176 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3177 #else 3178 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3179 return 1; 3180 #endif 3181 3182 vmx_setup_uret_msrs(vmx); 3183 return 0; 3184 } 3185 3186 #ifdef CONFIG_X86_64 3187 3188 static void enter_lmode(struct kvm_vcpu *vcpu) 3189 { 3190 u32 guest_tr_ar; 3191 3192 vmx_segment_cache_clear(to_vmx(vcpu)); 3193 3194 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3195 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3196 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3197 __func__); 3198 vmcs_write32(GUEST_TR_AR_BYTES, 3199 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3200 | VMX_AR_TYPE_BUSY_64_TSS); 3201 } 3202 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3203 } 3204 3205 static void exit_lmode(struct kvm_vcpu *vcpu) 3206 { 3207 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3208 } 3209 3210 #endif 3211 3212 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3213 { 3214 struct vcpu_vmx *vmx = to_vmx(vcpu); 3215 3216 /* 3217 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3218 * the CPU is not required to invalidate guest-physical mappings on 3219 * VM-Entry, even if VPID is disabled. 
Guest-physical mappings are 3220 * associated with the root EPT structure and not any particular VPID 3221 * (INVVPID also isn't required to invalidate guest-physical mappings). 3222 */ 3223 if (enable_ept) { 3224 ept_sync_global(); 3225 } else if (enable_vpid) { 3226 if (cpu_has_vmx_invvpid_global()) { 3227 vpid_sync_vcpu_global(); 3228 } else { 3229 vpid_sync_vcpu_single(vmx->vpid); 3230 vpid_sync_vcpu_single(vmx->nested.vpid02); 3231 } 3232 } 3233 } 3234 3235 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3236 { 3237 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3238 return nested_get_vpid02(vcpu); 3239 return to_vmx(vcpu)->vpid; 3240 } 3241 3242 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3243 { 3244 struct kvm_mmu *mmu = vcpu->arch.mmu; 3245 u64 root_hpa = mmu->root.hpa; 3246 3247 /* No flush required if the current context is invalid. */ 3248 if (!VALID_PAGE(root_hpa)) 3249 return; 3250 3251 if (enable_ept) 3252 ept_sync_context(construct_eptp(vcpu, root_hpa, 3253 mmu->root_role.level)); 3254 else 3255 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3256 } 3257 3258 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3259 { 3260 /* 3261 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3262 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3263 */ 3264 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3265 } 3266 3267 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3268 { 3269 /* 3270 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3271 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3272 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3273 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3274 * i.e. no explicit INVVPID is necessary. 
3275 */ 3276 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3277 } 3278 3279 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3280 { 3281 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3282 3283 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3284 return; 3285 3286 if (is_pae_paging(vcpu)) { 3287 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3288 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3289 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3290 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3291 } 3292 } 3293 3294 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3295 { 3296 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3297 3298 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3299 return; 3300 3301 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3302 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3303 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3304 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3305 3306 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3307 } 3308 3309 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3310 CPU_BASED_CR3_STORE_EXITING) 3311 3312 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3313 { 3314 if (is_guest_mode(vcpu)) 3315 return nested_guest_cr0_valid(vcpu, cr0); 3316 3317 if (to_vmx(vcpu)->nested.vmxon) 3318 return nested_host_cr0_valid(vcpu, cr0); 3319 3320 return true; 3321 } 3322 3323 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3324 { 3325 struct vcpu_vmx *vmx = to_vmx(vcpu); 3326 unsigned long hw_cr0, old_cr0_pg; 3327 u32 tmp; 3328 3329 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3330 3331 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3332 if (enable_unrestricted_guest) 3333 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3334 else { 3335 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3336 if (!enable_ept) 3337 hw_cr0 |= X86_CR0_WP; 3338 3339 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3340 enter_pmode(vcpu); 3341 3342 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3343 enter_rmode(vcpu); 3344 } 3345 3346 
vmcs_writel(CR0_READ_SHADOW, cr0); 3347 vmcs_writel(GUEST_CR0, hw_cr0); 3348 vcpu->arch.cr0 = cr0; 3349 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3350 3351 #ifdef CONFIG_X86_64 3352 if (vcpu->arch.efer & EFER_LME) { 3353 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3354 enter_lmode(vcpu); 3355 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3356 exit_lmode(vcpu); 3357 } 3358 #endif 3359 3360 if (enable_ept && !enable_unrestricted_guest) { 3361 /* 3362 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3363 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3364 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3365 * KVM's CR3 is installed. 3366 */ 3367 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3368 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3369 3370 /* 3371 * When running with EPT but not unrestricted guest, KVM must 3372 * intercept CR3 accesses when paging is _disabled_. This is 3373 * necessary because restricted guests can't actually run with 3374 * paging disabled, and so KVM stuffs its own CR3 in order to 3375 * run the guest when identity mapped page tables. 3376 * 3377 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3378 * update, it may be stale with respect to CR3 interception, 3379 * e.g. after nested VM-Enter. 3380 * 3381 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3382 * stores to forward them to L1, even if KVM does not need to 3383 * intercept them to preserve its identity mapped page tables. 3384 */ 3385 if (!(cr0 & X86_CR0_PG)) { 3386 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3387 } else if (!is_guest_mode(vcpu)) { 3388 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3389 } else { 3390 tmp = exec_controls_get(vmx); 3391 tmp &= ~CR3_EXITING_BITS; 3392 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3393 exec_controls_set(vmx, tmp); 3394 } 3395 3396 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. 
*/ 3397 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3398 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3399 3400 /* 3401 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3402 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3403 */ 3404 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3405 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3406 } 3407 3408 /* depends on vcpu->arch.cr0 to be set to a new value */ 3409 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 3410 } 3411 3412 static int vmx_get_max_ept_level(void) 3413 { 3414 if (cpu_has_vmx_ept_5levels()) 3415 return 5; 3416 return 4; 3417 } 3418 3419 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3420 { 3421 u64 eptp = VMX_EPTP_MT_WB; 3422 3423 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3424 3425 if (enable_ept_ad_bits && 3426 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3427 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3428 eptp |= root_hpa; 3429 3430 return eptp; 3431 } 3432 3433 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3434 { 3435 struct kvm *kvm = vcpu->kvm; 3436 bool update_guest_cr3 = true; 3437 unsigned long guest_cr3; 3438 u64 eptp; 3439 3440 if (enable_ept) { 3441 eptp = construct_eptp(vcpu, root_hpa, root_level); 3442 vmcs_write64(EPT_POINTER, eptp); 3443 3444 hv_track_root_tdp(vcpu, root_hpa); 3445 3446 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3447 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3448 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3449 guest_cr3 = vcpu->arch.cr3; 3450 else /* vmcs.GUEST_CR3 is already up-to-date. 
*/ 3451 update_guest_cr3 = false; 3452 vmx_ept_load_pdptrs(vcpu); 3453 } else { 3454 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3455 kvm_get_active_cr3_lam_bits(vcpu); 3456 } 3457 3458 if (update_guest_cr3) 3459 vmcs_writel(GUEST_CR3, guest_cr3); 3460 } 3461 3462 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3463 { 3464 /* 3465 * We operate under the default treatment of SMM, so VMX cannot be 3466 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3467 * i.e. is a reserved bit, is handled by common x86 code. 3468 */ 3469 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3470 return false; 3471 3472 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3473 return false; 3474 3475 return true; 3476 } 3477 3478 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3479 { 3480 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3481 struct vcpu_vmx *vmx = to_vmx(vcpu); 3482 unsigned long hw_cr4; 3483 3484 /* 3485 * Pass through host's Machine Check Enable value to hw_cr4, which 3486 * is in force while we are in guest mode. Do not let guests control 3487 * this bit, even if host CR4.MCE == 0. 
3488 */ 3489 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3490 if (enable_unrestricted_guest) 3491 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3492 else if (vmx->rmode.vm86_active) 3493 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3494 else 3495 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3496 3497 if (vmx_umip_emulated()) { 3498 if (cr4 & X86_CR4_UMIP) { 3499 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3500 hw_cr4 &= ~X86_CR4_UMIP; 3501 } else if (!is_guest_mode(vcpu) || 3502 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3503 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3504 } 3505 } 3506 3507 vcpu->arch.cr4 = cr4; 3508 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3509 3510 if (!enable_unrestricted_guest) { 3511 if (enable_ept) { 3512 if (!is_paging(vcpu)) { 3513 hw_cr4 &= ~X86_CR4_PAE; 3514 hw_cr4 |= X86_CR4_PSE; 3515 } else if (!(cr4 & X86_CR4_PAE)) { 3516 hw_cr4 &= ~X86_CR4_PAE; 3517 } 3518 } 3519 3520 /* 3521 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3522 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3523 * to be manually disabled when guest switches to non-paging 3524 * mode. 3525 * 3526 * If !enable_unrestricted_guest, the CPU is always running 3527 * with CR0.PG=1 and CR4 needs to be modified. 3528 * If enable_unrestricted_guest, the CPU automatically 3529 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 
3530 */ 3531 if (!is_paging(vcpu)) 3532 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3533 } 3534 3535 vmcs_writel(CR4_READ_SHADOW, cr4); 3536 vmcs_writel(GUEST_CR4, hw_cr4); 3537 3538 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3539 vcpu->arch.cpuid_dynamic_bits_dirty = true; 3540 } 3541 3542 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3543 { 3544 struct vcpu_vmx *vmx = to_vmx(vcpu); 3545 u32 ar; 3546 3547 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3548 *var = vmx->rmode.segs[seg]; 3549 if (seg == VCPU_SREG_TR 3550 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3551 return; 3552 var->base = vmx_read_guest_seg_base(vmx, seg); 3553 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3554 return; 3555 } 3556 var->base = vmx_read_guest_seg_base(vmx, seg); 3557 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3558 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3559 ar = vmx_read_guest_seg_ar(vmx, seg); 3560 var->unusable = (ar >> 16) & 1; 3561 var->type = ar & 15; 3562 var->s = (ar >> 4) & 1; 3563 var->dpl = (ar >> 5) & 3; 3564 /* 3565 * Some userspaces do not preserve unusable property. Since usable 3566 * segment has to be present according to VMX spec we can use present 3567 * property to amend userspace bug by making unusable segment always 3568 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3569 * segment as unusable. 
3570 */ 3571 var->present = !var->unusable; 3572 var->avl = (ar >> 12) & 1; 3573 var->l = (ar >> 13) & 1; 3574 var->db = (ar >> 14) & 1; 3575 var->g = (ar >> 15) & 1; 3576 } 3577 3578 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3579 { 3580 struct kvm_segment s; 3581 3582 if (to_vmx(vcpu)->rmode.vm86_active) { 3583 vmx_get_segment(vcpu, &s, seg); 3584 return s.base; 3585 } 3586 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3587 } 3588 3589 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3590 { 3591 struct vcpu_vmx *vmx = to_vmx(vcpu); 3592 int ar; 3593 3594 if (unlikely(vmx->rmode.vm86_active)) 3595 return 0; 3596 3597 if (no_cache) 3598 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3599 else 3600 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3601 return VMX_AR_DPL(ar); 3602 } 3603 3604 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3605 { 3606 return __vmx_get_cpl(vcpu, false); 3607 } 3608 3609 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3610 { 3611 return __vmx_get_cpl(vcpu, true); 3612 } 3613 3614 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3615 { 3616 u32 ar; 3617 3618 ar = var->type & 15; 3619 ar |= (var->s & 1) << 4; 3620 ar |= (var->dpl & 3) << 5; 3621 ar |= (var->present & 1) << 7; 3622 ar |= (var->avl & 1) << 12; 3623 ar |= (var->l & 1) << 13; 3624 ar |= (var->db & 1) << 14; 3625 ar |= (var->g & 1) << 15; 3626 ar |= (var->unusable || !var->present) << 16; 3627 3628 return ar; 3629 } 3630 3631 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3632 { 3633 struct vcpu_vmx *vmx = to_vmx(vcpu); 3634 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3635 3636 vmx_segment_cache_clear(vmx); 3637 3638 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3639 vmx->rmode.segs[seg] = *var; 3640 if (seg == VCPU_SREG_TR) 3641 vmcs_write16(sf->selector, var->selector); 3642 else if (var->s) 3643 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3644 return; 3645 } 3646 3647 
vmcs_writel(sf->base, var->base); 3648 vmcs_write32(sf->limit, var->limit); 3649 vmcs_write16(sf->selector, var->selector); 3650 3651 /* 3652 * Fix the "Accessed" bit in AR field of segment registers for older 3653 * qemu binaries. 3654 * IA32 arch specifies that at the time of processor reset the 3655 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3656 * is setting it to 0 in the userland code. This causes invalid guest 3657 * state vmexit when "unrestricted guest" mode is turned on. 3658 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3659 * tree. Newer qemu binaries with that qemu fix would not need this 3660 * kvm hack. 3661 */ 3662 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3663 var->type |= 0x1; /* Accessed */ 3664 3665 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3666 } 3667 3668 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3669 { 3670 __vmx_set_segment(vcpu, var, seg); 3671 3672 to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); 3673 } 3674 3675 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3676 { 3677 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3678 3679 *db = (ar >> 14) & 1; 3680 *l = (ar >> 13) & 1; 3681 } 3682 3683 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3684 { 3685 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3686 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3687 } 3688 3689 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3690 { 3691 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3692 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3693 } 3694 3695 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3696 { 3697 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3698 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3699 } 3700 3701 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3702 { 3703 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3704 vmcs_writel(GUEST_GDTR_BASE, 
dt->address); 3705 } 3706 3707 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3708 { 3709 struct kvm_segment var; 3710 u32 ar; 3711 3712 vmx_get_segment(vcpu, &var, seg); 3713 var.dpl = 0x3; 3714 if (seg == VCPU_SREG_CS) 3715 var.type = 0x3; 3716 ar = vmx_segment_access_rights(&var); 3717 3718 if (var.base != (var.selector << 4)) 3719 return false; 3720 if (var.limit != 0xffff) 3721 return false; 3722 if (ar != 0xf3) 3723 return false; 3724 3725 return true; 3726 } 3727 3728 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3729 { 3730 struct kvm_segment cs; 3731 unsigned int cs_rpl; 3732 3733 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3734 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3735 3736 if (cs.unusable) 3737 return false; 3738 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3739 return false; 3740 if (!cs.s) 3741 return false; 3742 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3743 if (cs.dpl > cs_rpl) 3744 return false; 3745 } else { 3746 if (cs.dpl != cs_rpl) 3747 return false; 3748 } 3749 if (!cs.present) 3750 return false; 3751 3752 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3753 return true; 3754 } 3755 3756 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3757 { 3758 struct kvm_segment ss; 3759 unsigned int ss_rpl; 3760 3761 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3762 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3763 3764 if (ss.unusable) 3765 return true; 3766 if (ss.type != 3 && ss.type != 7) 3767 return false; 3768 if (!ss.s) 3769 return false; 3770 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3771 return false; 3772 if (!ss.present) 3773 return false; 3774 3775 return true; 3776 } 3777 3778 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3779 { 3780 struct kvm_segment var; 3781 unsigned int rpl; 3782 3783 vmx_get_segment(vcpu, &var, seg); 3784 rpl = var.selector & SEGMENT_RPL_MASK; 3785 3786 if (var.unusable) 3787 return true; 3788 if (!var.s) 
3789 return false; 3790 if (!var.present) 3791 return false; 3792 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3793 if (var.dpl < rpl) /* DPL < RPL */ 3794 return false; 3795 } 3796 3797 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3798 * rights flags 3799 */ 3800 return true; 3801 } 3802 3803 static bool tr_valid(struct kvm_vcpu *vcpu) 3804 { 3805 struct kvm_segment tr; 3806 3807 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3808 3809 if (tr.unusable) 3810 return false; 3811 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3812 return false; 3813 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3814 return false; 3815 if (!tr.present) 3816 return false; 3817 3818 return true; 3819 } 3820 3821 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3822 { 3823 struct kvm_segment ldtr; 3824 3825 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3826 3827 if (ldtr.unusable) 3828 return true; 3829 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3830 return false; 3831 if (ldtr.type != 2) 3832 return false; 3833 if (!ldtr.present) 3834 return false; 3835 3836 return true; 3837 } 3838 3839 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3840 { 3841 struct kvm_segment cs, ss; 3842 3843 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3844 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3845 3846 return ((cs.selector & SEGMENT_RPL_MASK) == 3847 (ss.selector & SEGMENT_RPL_MASK)); 3848 } 3849 3850 /* 3851 * Check if guest state is valid. Returns true if valid, false if 3852 * not. 
3853 * We assume that registers are always usable 3854 */ 3855 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3856 { 3857 /* real mode guest state checks */ 3858 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3859 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3860 return false; 3861 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3862 return false; 3863 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3864 return false; 3865 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3866 return false; 3867 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3868 return false; 3869 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3870 return false; 3871 } else { 3872 /* protected mode guest state checks */ 3873 if (!cs_ss_rpl_check(vcpu)) 3874 return false; 3875 if (!code_segment_valid(vcpu)) 3876 return false; 3877 if (!stack_segment_valid(vcpu)) 3878 return false; 3879 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3880 return false; 3881 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3882 return false; 3883 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3884 return false; 3885 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3886 return false; 3887 if (!tr_valid(vcpu)) 3888 return false; 3889 if (!ldtr_valid(vcpu)) 3890 return false; 3891 } 3892 /* TODO: 3893 * - Add checks on RIP 3894 * - Add checks on RFLAGS 3895 */ 3896 3897 return true; 3898 } 3899 3900 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3901 { 3902 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3903 u16 data; 3904 int i; 3905 3906 for (i = 0; i < 3; i++) { 3907 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3908 return -EFAULT; 3909 } 3910 3911 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3912 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3913 return -EFAULT; 3914 3915 data = ~0; 3916 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3917 return -EFAULT; 3918 3919 return 0; 3920 } 3921 3922 static int 
init_rmode_identity_map(struct kvm *kvm) 3923 { 3924 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3925 int i, r = 0; 3926 void __user *uaddr; 3927 u32 tmp; 3928 3929 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 3930 mutex_lock(&kvm->slots_lock); 3931 3932 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3933 goto out; 3934 3935 if (!kvm_vmx->ept_identity_map_addr) 3936 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3937 3938 uaddr = __x86_set_memory_region(kvm, 3939 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3940 kvm_vmx->ept_identity_map_addr, 3941 PAGE_SIZE); 3942 if (IS_ERR(uaddr)) { 3943 r = PTR_ERR(uaddr); 3944 goto out; 3945 } 3946 3947 /* Set up identity-mapping pagetable for EPT in real mode */ 3948 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3949 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3950 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3951 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3952 r = -EFAULT; 3953 goto out; 3954 } 3955 } 3956 kvm_vmx->ept_identity_pagetable_done = true; 3957 3958 out: 3959 mutex_unlock(&kvm->slots_lock); 3960 return r; 3961 } 3962 3963 static void seg_setup(int seg) 3964 { 3965 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3966 unsigned int ar; 3967 3968 vmcs_write16(sf->selector, 0); 3969 vmcs_writel(sf->base, 0); 3970 vmcs_write32(sf->limit, 0xffff); 3971 ar = 0x93; 3972 if (seg == VCPU_SREG_CS) 3973 ar |= 0x08; /* code segment */ 3974 3975 vmcs_write32(sf->ar_bytes, ar); 3976 } 3977 3978 int allocate_vpid(void) 3979 { 3980 int vpid; 3981 3982 if (!enable_vpid) 3983 return 0; 3984 spin_lock(&vmx_vpid_lock); 3985 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3986 if (vpid < VMX_NR_VPIDS) 3987 __set_bit(vpid, vmx_vpid_bitmap); 3988 else 3989 vpid = 0; 3990 spin_unlock(&vmx_vpid_lock); 3991 return vpid; 3992 } 3993 3994 void free_vpid(int vpid) 3995 { 3996 if (!enable_vpid || vpid == 0) 3997 return; 3998 spin_lock(&vmx_vpid_lock); 
3999 __clear_bit(vpid, vmx_vpid_bitmap); 4000 spin_unlock(&vmx_vpid_lock); 4001 } 4002 4003 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 4004 { 4005 /* 4006 * When KVM is a nested hypervisor on top of Hyper-V and uses 4007 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 4008 * bitmap has changed. 4009 */ 4010 if (kvm_is_using_evmcs()) { 4011 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 4012 4013 if (evmcs->hv_enlightenments_control.msr_bitmap) 4014 evmcs->hv_clean_fields &= 4015 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 4016 } 4017 4018 vmx->nested.force_msr_bitmap_recalc = true; 4019 } 4020 4021 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4022 { 4023 struct vcpu_vmx *vmx = to_vmx(vcpu); 4024 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4025 int idx; 4026 4027 if (!cpu_has_vmx_msr_bitmap()) 4028 return; 4029 4030 vmx_msr_bitmap_l01_changed(vmx); 4031 4032 /* 4033 * Mark the desired intercept state in shadow bitmap, this is needed 4034 * for resync when the MSR filters change. 
4035 */ 4036 idx = vmx_get_passthrough_msr_slot(msr); 4037 if (idx >= 0) { 4038 if (type & MSR_TYPE_R) 4039 clear_bit(idx, vmx->shadow_msr_intercept.read); 4040 if (type & MSR_TYPE_W) 4041 clear_bit(idx, vmx->shadow_msr_intercept.write); 4042 } 4043 4044 if ((type & MSR_TYPE_R) && 4045 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 4046 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4047 type &= ~MSR_TYPE_R; 4048 } 4049 4050 if ((type & MSR_TYPE_W) && 4051 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 4052 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4053 type &= ~MSR_TYPE_W; 4054 } 4055 4056 if (type & MSR_TYPE_R) 4057 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4058 4059 if (type & MSR_TYPE_W) 4060 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4061 } 4062 4063 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4064 { 4065 struct vcpu_vmx *vmx = to_vmx(vcpu); 4066 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4067 int idx; 4068 4069 if (!cpu_has_vmx_msr_bitmap()) 4070 return; 4071 4072 vmx_msr_bitmap_l01_changed(vmx); 4073 4074 /* 4075 * Mark the desired intercept state in shadow bitmap, this is needed 4076 * for resync when the MSR filter changes. 4077 */ 4078 idx = vmx_get_passthrough_msr_slot(msr); 4079 if (idx >= 0) { 4080 if (type & MSR_TYPE_R) 4081 set_bit(idx, vmx->shadow_msr_intercept.read); 4082 if (type & MSR_TYPE_W) 4083 set_bit(idx, vmx->shadow_msr_intercept.write); 4084 } 4085 4086 if (type & MSR_TYPE_R) 4087 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4088 4089 if (type & MSR_TYPE_W) 4090 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4091 } 4092 4093 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4094 { 4095 /* 4096 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4097 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4098 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 
4099 */ 4100 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4101 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4102 struct vcpu_vmx *vmx = to_vmx(vcpu); 4103 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4104 u8 mode; 4105 4106 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4107 return; 4108 4109 if (cpu_has_secondary_exec_ctrls() && 4110 (secondary_exec_controls_get(vmx) & 4111 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4112 mode = MSR_BITMAP_MODE_X2APIC; 4113 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4114 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4115 } else { 4116 mode = 0; 4117 } 4118 4119 if (mode == vmx->x2apic_msr_bitmap_mode) 4120 return; 4121 4122 vmx->x2apic_msr_bitmap_mode = mode; 4123 4124 /* 4125 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4126 * registers (0x840 and above) intercepted, KVM doesn't support them. 4127 * Intercept all writes by default and poke holes as needed. Pass 4128 * through reads for all valid registers by default in x2APIC+APICv 4129 * mode, only the current timer count needs on-demand emulation by KVM. 4130 */ 4131 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4132 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4133 else 4134 msr_bitmap[read_idx] = ~0ull; 4135 msr_bitmap[write_idx] = ~0ull; 4136 4137 /* 4138 * TPR reads and writes can be virtualized even if virtual interrupt 4139 * delivery is not in use. 
4140 */ 4141 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4142 !(mode & MSR_BITMAP_MODE_X2APIC)); 4143 4144 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4145 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4146 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4147 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4148 if (enable_ipiv) 4149 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4150 } 4151 } 4152 4153 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4154 { 4155 struct vcpu_vmx *vmx = to_vmx(vcpu); 4156 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4157 u32 i; 4158 4159 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4160 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4161 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4162 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4163 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4164 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4165 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4166 } 4167 } 4168 4169 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4170 { 4171 struct vcpu_vmx *vmx = to_vmx(vcpu); 4172 u32 i; 4173 4174 if (!cpu_has_vmx_msr_bitmap()) 4175 return; 4176 4177 /* 4178 * Redo intercept permissions for MSRs that KVM is passing through to 4179 * the guest. Disabling interception will check the new MSR filter and 4180 * ensure that KVM enables interception if usersepace wants to filter 4181 * the MSR. MSRs that KVM is already intercepting don't need to be 4182 * refreshed since KVM is going to intercept them regardless of what 4183 * userspace wants. 
4184 */ 4185 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4186 u32 msr = vmx_possible_passthrough_msrs[i]; 4187 4188 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4189 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4190 4191 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4192 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4193 } 4194 4195 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4196 if (vmx_pt_mode_is_host_guest()) 4197 pt_update_intercept_for_msr(vcpu); 4198 } 4199 4200 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4201 int vector) 4202 { 4203 struct vcpu_vmx *vmx = to_vmx(vcpu); 4204 4205 /* 4206 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4207 * and freed, and must not be accessed outside of vcpu->mutex. The 4208 * vCPU's cached PI NV is valid if and only if posted interrupts 4209 * enabled in its vmcs12, i.e. checking the vector also checks that 4210 * L1 has enabled posted interrupts for L2. 4211 */ 4212 if (is_guest_mode(vcpu) && 4213 vector == vmx->nested.posted_intr_nv) { 4214 /* 4215 * If a posted intr is not recognized by hardware, 4216 * we will accomplish it in the next vmentry. 4217 */ 4218 vmx->nested.pi_pending = true; 4219 kvm_make_request(KVM_REQ_EVENT, vcpu); 4220 4221 /* 4222 * This pairs with the smp_mb_*() after setting vcpu->mode in 4223 * vcpu_enter_guest() to guarantee the vCPU sees the event 4224 * request if triggering a posted interrupt "fails" because 4225 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4226 * the smb_wmb() in kvm_make_request() only ensures everything 4227 * done before making the request is visible when the request 4228 * is visible, it doesn't ensure ordering between the store to 4229 * vcpu->requests and the load from vcpu->mode. 4230 */ 4231 smp_mb__after_atomic(); 4232 4233 /* the PIR and ON have been set by L1. 
*/ 4234 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4235 return 0; 4236 } 4237 return -1; 4238 } 4239 /* 4240 * Send interrupt to vcpu via posted interrupt way. 4241 * 1. If target vcpu is running(non-root mode), send posted interrupt 4242 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4243 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4244 * interrupt from PIR in next vmentry. 4245 */ 4246 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4247 { 4248 struct vcpu_vt *vt = to_vt(vcpu); 4249 int r; 4250 4251 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4252 if (!r) 4253 return 0; 4254 4255 /* Note, this is called iff the local APIC is in-kernel. */ 4256 if (!vcpu->arch.apic->apicv_active) 4257 return -1; 4258 4259 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); 4260 return 0; 4261 } 4262 4263 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4264 int trig_mode, int vector) 4265 { 4266 struct kvm_vcpu *vcpu = apic->vcpu; 4267 4268 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4269 kvm_lapic_set_irr(vector, apic); 4270 kvm_make_request(KVM_REQ_EVENT, vcpu); 4271 kvm_vcpu_kick(vcpu); 4272 } else { 4273 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4274 trig_mode, vector); 4275 } 4276 } 4277 4278 /* 4279 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4280 * will not change in the lifetime of the guest. 4281 * Note that host-state that does change is set elsewhere. E.g., host-state 4282 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 
4283 */ 4284 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4285 { 4286 u32 low32, high32; 4287 unsigned long tmpl; 4288 unsigned long cr0, cr3, cr4; 4289 4290 cr0 = read_cr0(); 4291 WARN_ON(cr0 & X86_CR0_TS); 4292 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4293 4294 /* 4295 * Save the most likely value for this task's CR3 in the VMCS. 4296 * We can't use __get_current_cr3_fast() because we're not atomic. 4297 */ 4298 cr3 = __read_cr3(); 4299 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4300 vmx->loaded_vmcs->host_state.cr3 = cr3; 4301 4302 /* Save the most likely value for this task's CR4 in the VMCS. */ 4303 cr4 = cr4_read_shadow(); 4304 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4305 vmx->loaded_vmcs->host_state.cr4 = cr4; 4306 4307 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4308 #ifdef CONFIG_X86_64 4309 /* 4310 * Load null selectors, so we can avoid reloading them in 4311 * vmx_prepare_switch_to_host(), in case userspace uses 4312 * the null selectors too (the expected case). 4313 */ 4314 vmcs_write16(HOST_DS_SELECTOR, 0); 4315 vmcs_write16(HOST_ES_SELECTOR, 0); 4316 #else 4317 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4318 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4319 #endif 4320 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4321 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4322 4323 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4324 4325 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4326 4327 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4328 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4329 4330 /* 4331 * SYSENTER is used for 32-bit system calls on either 32-bit or 4332 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4333 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4334 * have already done so!). 
4335 */ 4336 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4337 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4338 4339 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4340 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4341 4342 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4343 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4344 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4345 } 4346 4347 if (cpu_has_load_ia32_efer()) 4348 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4349 } 4350 4351 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4352 { 4353 struct kvm_vcpu *vcpu = &vmx->vcpu; 4354 4355 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4356 ~vcpu->arch.cr4_guest_rsvd_bits; 4357 if (!enable_ept) { 4358 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4359 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4360 } 4361 if (is_guest_mode(&vmx->vcpu)) 4362 vcpu->arch.cr4_guest_owned_bits &= 4363 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4364 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4365 } 4366 4367 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4368 { 4369 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4370 4371 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4372 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4373 4374 if (!enable_vnmi) 4375 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4376 4377 if (!enable_preemption_timer) 4378 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4379 4380 return pin_based_exec_ctrl; 4381 } 4382 4383 static u32 vmx_vmentry_ctrl(void) 4384 { 4385 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4386 4387 if (vmx_pt_mode_is_system()) 4388 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4389 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4390 /* 4391 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 
4392 */ 4393 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4394 VM_ENTRY_LOAD_IA32_EFER | 4395 VM_ENTRY_IA32E_MODE); 4396 4397 return vmentry_ctrl; 4398 } 4399 4400 static u32 vmx_vmexit_ctrl(void) 4401 { 4402 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4403 4404 /* 4405 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4406 * nested virtualization and thus allowed to be set in vmcs12. 4407 */ 4408 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4409 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4410 4411 if (vmx_pt_mode_is_system()) 4412 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4413 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4414 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4415 return vmexit_ctrl & 4416 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4417 } 4418 4419 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4420 { 4421 struct vcpu_vmx *vmx = to_vmx(vcpu); 4422 4423 if (is_guest_mode(vcpu)) { 4424 vmx->nested.update_vmcs01_apicv_status = true; 4425 return; 4426 } 4427 4428 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4429 4430 if (kvm_vcpu_apicv_active(vcpu)) { 4431 secondary_exec_controls_setbit(vmx, 4432 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4433 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4434 if (enable_ipiv) 4435 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4436 } else { 4437 secondary_exec_controls_clearbit(vmx, 4438 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4439 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4440 if (enable_ipiv) 4441 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4442 } 4443 4444 vmx_update_msr_bitmap_x2apic(vcpu); 4445 } 4446 4447 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4448 { 4449 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4450 4451 /* 4452 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4453 * vmcs12 and propagated to vmcs02 when set in vmcs12. 
4454 */ 4455 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4456 CPU_BASED_USE_IO_BITMAPS | 4457 CPU_BASED_MONITOR_TRAP_FLAG | 4458 CPU_BASED_PAUSE_EXITING); 4459 4460 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4461 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4462 CPU_BASED_NMI_WINDOW_EXITING); 4463 4464 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4465 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4466 4467 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4468 exec_control &= ~CPU_BASED_TPR_SHADOW; 4469 4470 #ifdef CONFIG_X86_64 4471 if (exec_control & CPU_BASED_TPR_SHADOW) 4472 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4473 CPU_BASED_CR8_STORE_EXITING); 4474 else 4475 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4476 CPU_BASED_CR8_LOAD_EXITING; 4477 #endif 4478 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4479 if (enable_ept) 4480 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4481 CPU_BASED_CR3_STORE_EXITING | 4482 CPU_BASED_INVLPG_EXITING); 4483 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4484 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4485 CPU_BASED_MONITOR_EXITING); 4486 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4487 exec_control &= ~CPU_BASED_HLT_EXITING; 4488 return exec_control; 4489 } 4490 4491 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4492 { 4493 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4494 4495 /* 4496 * IPI virtualization relies on APICv. Disable IPI virtualization if 4497 * APICv is inhibited. 4498 */ 4499 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4500 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4501 4502 return exec_control; 4503 } 4504 4505 /* 4506 * Adjust a single secondary execution control bit to intercept/allow an 4507 * instruction in the guest. This is usually done based on whether or not a 4508 * feature has been exposed to the guest in order to correctly emulate faults. 
4509 */ 4510 static inline void 4511 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4512 u32 control, bool enabled, bool exiting) 4513 { 4514 /* 4515 * If the control is for an opt-in feature, clear the control if the 4516 * feature is not exposed to the guest, i.e. not enabled. If the 4517 * control is opt-out, i.e. an exiting control, clear the control if 4518 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4519 * disabled for the associated instruction. Note, the caller is 4520 * responsible presetting exec_control to set all supported bits. 4521 */ 4522 if (enabled == exiting) 4523 *exec_control &= ~control; 4524 4525 /* 4526 * Update the nested MSR settings so that a nested VMM can/can't set 4527 * controls for features that are/aren't exposed to the guest. 4528 */ 4529 if (nested && 4530 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4531 /* 4532 * All features that can be added or removed to VMX MSRs must 4533 * be supported in the first place for nested virtualization. 4534 */ 4535 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4536 enabled = false; 4537 4538 if (enabled) 4539 vmx->nested.msrs.secondary_ctls_high |= control; 4540 else 4541 vmx->nested.msrs.secondary_ctls_high &= ~control; 4542 } 4543 } 4544 4545 /* 4546 * Wrapper macro for the common case of adjusting a secondary execution control 4547 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4548 * verifies that the control is actually supported by KVM and hardware. 
4549 */ 4550 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4551 ({ \ 4552 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4553 bool __enabled; \ 4554 \ 4555 if (cpu_has_vmx_##name()) { \ 4556 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ 4557 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4558 __enabled, exiting); \ 4559 } \ 4560 }) 4561 4562 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4563 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4564 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4565 4566 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4567 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4568 4569 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4570 { 4571 struct kvm_vcpu *vcpu = &vmx->vcpu; 4572 4573 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4574 4575 if (vmx_pt_mode_is_system()) 4576 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4577 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4578 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4579 if (vmx->vpid == 0) 4580 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4581 if (!enable_ept) { 4582 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4583 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4584 enable_unrestricted_guest = 0; 4585 } 4586 if (!enable_unrestricted_guest) 4587 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4588 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4589 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4590 if (!kvm_vcpu_apicv_active(vcpu)) 4591 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4592 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4593 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4594 4595 /* 4596 * KVM doesn't support VMFUNC for L1, but the control is set in 
KVM's 4597 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4598 */ 4599 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4600 4601 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4602 * in vmx_set_cr4. */ 4603 exec_control &= ~SECONDARY_EXEC_DESC; 4604 4605 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4606 (handle_vmptrld). 4607 We can NOT enable shadow_vmcs here because we don't have yet 4608 a current VMCS12 4609 */ 4610 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4611 4612 /* 4613 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4614 * it needs to be set here when dirty logging is already active, e.g. 4615 * if this vCPU was created after dirty logging was enabled. 4616 */ 4617 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4618 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4619 4620 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4621 4622 /* 4623 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4624 * feature is exposed to the guest. This creates a virtualization hole 4625 * if both are supported in hardware but only one is exposed to the 4626 * guest, but letting the guest execute RDTSCP or RDPID when either one 4627 * is advertised is preferable to emulating the advertised instruction 4628 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 
4629 */ 4630 if (cpu_has_vmx_rdtscp()) { 4631 bool rdpid_or_rdtscp_enabled = 4632 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4633 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4634 4635 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4636 SECONDARY_EXEC_ENABLE_RDTSCP, 4637 rdpid_or_rdtscp_enabled, false); 4638 } 4639 4640 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4641 4642 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4643 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4644 4645 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4646 ENABLE_USR_WAIT_PAUSE, false); 4647 4648 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4649 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4650 4651 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4652 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4653 4654 return exec_control; 4655 } 4656 4657 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4658 { 4659 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4660 } 4661 4662 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4663 { 4664 struct page *pages; 4665 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4666 4667 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4668 return 0; 4669 4670 if (kvm_vmx->pid_table) 4671 return 0; 4672 4673 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4674 vmx_get_pid_table_order(kvm)); 4675 if (!pages) 4676 return -ENOMEM; 4677 4678 kvm_vmx->pid_table = (void *)page_address(pages); 4679 return 0; 4680 } 4681 4682 int vmx_vcpu_precreate(struct kvm *kvm) 4683 { 4684 return vmx_alloc_ipiv_pid_table(kvm); 4685 } 4686 4687 #define VMX_XSS_EXIT_BITMAP 0 4688 4689 static void init_vmcs(struct vcpu_vmx *vmx) 4690 { 4691 struct kvm *kvm = vmx->vcpu.kvm; 4692 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4693 4694 if (nested) 4695 nested_vmx_set_vmcs_shadowing_bitmap(); 4696 4697 if (cpu_has_vmx_msr_bitmap()) 4698 vmcs_write64(MSR_BITMAP, 
__pa(vmx->vmcs01.msr_bitmap)); 4699 4700 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4701 4702 /* Control */ 4703 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4704 4705 exec_controls_set(vmx, vmx_exec_control(vmx)); 4706 4707 if (cpu_has_secondary_exec_ctrls()) { 4708 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4709 if (vmx->ve_info) 4710 vmcs_write64(VE_INFORMATION_ADDRESS, 4711 __pa(vmx->ve_info)); 4712 } 4713 4714 if (cpu_has_tertiary_exec_ctrls()) 4715 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4716 4717 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4718 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4719 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4720 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4721 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4722 4723 vmcs_write16(GUEST_INTR_STATUS, 0); 4724 4725 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4726 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); 4727 } 4728 4729 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4730 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4731 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4732 } 4733 4734 if (!kvm_pause_in_guest(kvm)) { 4735 vmcs_write32(PLE_GAP, ple_gap); 4736 vmx->ple_window = ple_window; 4737 vmx->ple_window_dirty = true; 4738 } 4739 4740 if (kvm_notify_vmexit_enabled(kvm)) 4741 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4742 4743 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4744 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4745 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4746 4747 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4748 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4749 vmx_set_constant_host_state(vmx); 4750 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4751 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4752 4753 if (cpu_has_vmx_vmfunc()) 4754 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4755 4756 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4757 
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4758 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4759 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4760 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4761 4762 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4763 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4764 4765 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4766 4767 /* 22.2.1, 20.8.1 */ 4768 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4769 4770 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4771 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4772 4773 set_cr4_guest_host_mask(vmx); 4774 4775 if (vmx->vpid != 0) 4776 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4777 4778 if (cpu_has_vmx_xsaves()) 4779 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4780 4781 if (enable_pml) { 4782 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4783 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 4784 } 4785 4786 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4787 4788 if (vmx_pt_mode_is_host_guest()) { 4789 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4790 /* Bit[6~0] are forced to 1, writes are ignored. 
*/ 4791 vmx->pt_desc.guest.output_mask = 0x7F; 4792 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4793 } 4794 4795 vmcs_write32(GUEST_SYSENTER_CS, 0); 4796 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4797 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4798 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4799 4800 if (cpu_has_vmx_tpr_shadow()) { 4801 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4802 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4803 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4804 __pa(vmx->vcpu.arch.apic->regs)); 4805 vmcs_write32(TPR_THRESHOLD, 0); 4806 } 4807 4808 vmx_setup_uret_msrs(vmx); 4809 } 4810 4811 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4812 { 4813 struct vcpu_vmx *vmx = to_vmx(vcpu); 4814 4815 init_vmcs(vmx); 4816 4817 if (nested && 4818 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4819 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4820 4821 vcpu_setup_sgx_lepubkeyhash(vcpu); 4822 4823 vmx->nested.posted_intr_nv = -1; 4824 vmx->nested.vmxon_ptr = INVALID_GPA; 4825 vmx->nested.current_vmptr = INVALID_GPA; 4826 4827 #ifdef CONFIG_KVM_HYPERV 4828 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4829 #endif 4830 4831 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4832 vcpu->arch.microcode_version = 0x100000000ULL; 4833 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4834 4835 /* 4836 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4837 * or POSTED_INTR_WAKEUP_VECTOR. 
4838 */ 4839 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 4840 __pi_set_sn(&vmx->vt.pi_desc); 4841 } 4842 4843 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4844 { 4845 struct vcpu_vmx *vmx = to_vmx(vcpu); 4846 4847 if (!init_event) 4848 __vmx_vcpu_reset(vcpu); 4849 4850 vmx->rmode.vm86_active = 0; 4851 vmx->spec_ctrl = 0; 4852 4853 vmx->msr_ia32_umwait_control = 0; 4854 4855 vmx->hv_deadline_tsc = -1; 4856 kvm_set_cr8(vcpu, 0); 4857 4858 seg_setup(VCPU_SREG_CS); 4859 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4860 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4861 4862 seg_setup(VCPU_SREG_DS); 4863 seg_setup(VCPU_SREG_ES); 4864 seg_setup(VCPU_SREG_FS); 4865 seg_setup(VCPU_SREG_GS); 4866 seg_setup(VCPU_SREG_SS); 4867 4868 vmcs_write16(GUEST_TR_SELECTOR, 0); 4869 vmcs_writel(GUEST_TR_BASE, 0); 4870 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4871 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4872 4873 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4874 vmcs_writel(GUEST_LDTR_BASE, 0); 4875 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4876 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4877 4878 vmcs_writel(GUEST_GDTR_BASE, 0); 4879 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4880 4881 vmcs_writel(GUEST_IDTR_BASE, 0); 4882 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4883 4884 vmx_segment_cache_clear(vmx); 4885 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4886 4887 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4888 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4889 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4890 if (kvm_mpx_supported()) 4891 vmcs_write64(GUEST_BNDCFGS, 0); 4892 4893 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4894 4895 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4896 4897 vpid_sync_context(vmx->vpid); 4898 4899 vmx_update_fb_clear_dis(vcpu, vmx); 4900 } 4901 4902 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4903 { 4904 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4905 } 4906 4907 void 
vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4908 { 4909 if (!enable_vnmi || 4910 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4911 vmx_enable_irq_window(vcpu); 4912 return; 4913 } 4914 4915 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4916 } 4917 4918 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4919 { 4920 struct vcpu_vmx *vmx = to_vmx(vcpu); 4921 uint32_t intr; 4922 int irq = vcpu->arch.interrupt.nr; 4923 4924 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4925 4926 ++vcpu->stat.irq_injections; 4927 if (vmx->rmode.vm86_active) { 4928 int inc_eip = 0; 4929 if (vcpu->arch.interrupt.soft) 4930 inc_eip = vcpu->arch.event_exit_inst_len; 4931 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4932 return; 4933 } 4934 intr = irq | INTR_INFO_VALID_MASK; 4935 if (vcpu->arch.interrupt.soft) { 4936 intr |= INTR_TYPE_SOFT_INTR; 4937 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4938 vmx->vcpu.arch.event_exit_inst_len); 4939 } else 4940 intr |= INTR_TYPE_EXT_INTR; 4941 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4942 4943 vmx_clear_hlt(vcpu); 4944 } 4945 4946 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4947 { 4948 struct vcpu_vmx *vmx = to_vmx(vcpu); 4949 4950 if (!enable_vnmi) { 4951 /* 4952 * Tracking the NMI-blocked state in software is built upon 4953 * finding the next open IRQ window. This, in turn, depends on 4954 * well-behaving guests: They have to keep IRQs disabled at 4955 * least as long as the NMI handler runs. Otherwise we may 4956 * cause NMI nesting, maybe breaking the guest. But as this is 4957 * highly unlikely, we can live with the residual risk. 
4958 */ 4959 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4960 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4961 } 4962 4963 ++vcpu->stat.nmi_injections; 4964 vmx->loaded_vmcs->nmi_known_unmasked = false; 4965 4966 if (vmx->rmode.vm86_active) { 4967 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4968 return; 4969 } 4970 4971 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4972 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4973 4974 vmx_clear_hlt(vcpu); 4975 } 4976 4977 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4978 { 4979 struct vcpu_vmx *vmx = to_vmx(vcpu); 4980 bool masked; 4981 4982 if (!enable_vnmi) 4983 return vmx->loaded_vmcs->soft_vnmi_blocked; 4984 if (vmx->loaded_vmcs->nmi_known_unmasked) 4985 return false; 4986 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4987 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4988 return masked; 4989 } 4990 4991 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4992 { 4993 struct vcpu_vmx *vmx = to_vmx(vcpu); 4994 4995 if (!enable_vnmi) { 4996 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 4997 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 4998 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4999 } 5000 } else { 5001 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5002 if (masked) 5003 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5004 GUEST_INTR_STATE_NMI); 5005 else 5006 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5007 GUEST_INTR_STATE_NMI); 5008 } 5009 } 5010 5011 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5012 { 5013 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5014 return false; 5015 5016 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5017 return true; 5018 5019 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5020 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5021 GUEST_INTR_STATE_NMI)); 5022 } 5023 5024 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5025 { 5026 if (to_vmx(vcpu)->nested.nested_run_pending) 5027 return -EBUSY; 5028 
5029 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5030 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5031 return -EBUSY; 5032 5033 return !vmx_nmi_blocked(vcpu); 5034 } 5035 5036 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5037 { 5038 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5039 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5040 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5041 } 5042 5043 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5044 { 5045 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5046 return false; 5047 5048 return __vmx_interrupt_blocked(vcpu); 5049 } 5050 5051 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5052 { 5053 if (to_vmx(vcpu)->nested.nested_run_pending) 5054 return -EBUSY; 5055 5056 /* 5057 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5058 * e.g. if the IRQ arrived asynchronously after checking nested events. 5059 */ 5060 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5061 return -EBUSY; 5062 5063 return !vmx_interrupt_blocked(vcpu); 5064 } 5065 5066 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5067 { 5068 void __user *ret; 5069 5070 if (enable_unrestricted_guest) 5071 return 0; 5072 5073 mutex_lock(&kvm->slots_lock); 5074 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5075 PAGE_SIZE * 3); 5076 mutex_unlock(&kvm->slots_lock); 5077 5078 if (IS_ERR(ret)) 5079 return PTR_ERR(ret); 5080 5081 to_kvm_vmx(kvm)->tss_addr = addr; 5082 5083 return init_rmode_tss(kvm, ret); 5084 } 5085 5086 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5087 { 5088 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5089 return 0; 5090 } 5091 5092 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5093 { 5094 switch (vec) { 5095 case BP_VECTOR: 5096 /* 5097 * Update instruction length as we may reinject the exception 5098 * from user space while in guest debugging 
mode. 5099 */ 5100 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5101 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5102 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5103 return false; 5104 fallthrough; 5105 case DB_VECTOR: 5106 return !(vcpu->guest_debug & 5107 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5108 case DE_VECTOR: 5109 case OF_VECTOR: 5110 case BR_VECTOR: 5111 case UD_VECTOR: 5112 case DF_VECTOR: 5113 case SS_VECTOR: 5114 case GP_VECTOR: 5115 case MF_VECTOR: 5116 return true; 5117 } 5118 return false; 5119 } 5120 5121 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5122 int vec, u32 err_code) 5123 { 5124 /* 5125 * Instruction with address size override prefix opcode 0x67 5126 * Cause the #SS fault with 0 error code in VM86 mode. 5127 */ 5128 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5129 if (kvm_emulate_instruction(vcpu, 0)) { 5130 if (vcpu->arch.halt_request) { 5131 vcpu->arch.halt_request = 0; 5132 return kvm_emulate_halt_noskip(vcpu); 5133 } 5134 return 1; 5135 } 5136 return 0; 5137 } 5138 5139 /* 5140 * Forward all other exceptions that are valid in real mode. 5141 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5142 * the required debugging infrastructure rework. 5143 */ 5144 kvm_queue_exception(vcpu, vec); 5145 return 1; 5146 } 5147 5148 static int handle_machine_check(struct kvm_vcpu *vcpu) 5149 { 5150 /* handled by vmx_vcpu_run() */ 5151 return 1; 5152 } 5153 5154 /* 5155 * If the host has split lock detection disabled, then #AC is 5156 * unconditionally injected into the guest, which is the pre split lock 5157 * detection behaviour. 
5158 * 5159 * If the host has split lock detection enabled then #AC is 5160 * only injected into the guest when: 5161 * - Guest CPL == 3 (user mode) 5162 * - Guest has #AC detection enabled in CR0 5163 * - Guest EFLAGS has AC bit set 5164 */ 5165 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5166 { 5167 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5168 return true; 5169 5170 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5171 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5172 } 5173 5174 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) 5175 { 5176 return vcpu->arch.guest_fpu.fpstate->xfd && 5177 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5178 } 5179 5180 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5181 { 5182 struct vcpu_vmx *vmx = to_vmx(vcpu); 5183 struct kvm_run *kvm_run = vcpu->run; 5184 u32 intr_info, ex_no, error_code; 5185 unsigned long cr2, dr6; 5186 u32 vect_info; 5187 5188 vect_info = vmx->idt_vectoring_info; 5189 intr_info = vmx_get_intr_info(vcpu); 5190 5191 /* 5192 * Machine checks are handled by handle_exception_irqoff(), or by 5193 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5194 * vmx_vcpu_enter_exit(). 5195 */ 5196 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5197 return 1; 5198 5199 /* 5200 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5201 * This ensures the nested_vmx check is not skipped so vmexit can 5202 * be reflected to L1 (when it intercepts #NM) before reaching this 5203 * point. 5204 */ 5205 if (is_nm_fault(intr_info)) { 5206 kvm_queue_exception_p(vcpu, NM_VECTOR, 5207 is_xfd_nm_fault(vcpu) ? 
vcpu->arch.guest_fpu.xfd_err : 0); 5208 return 1; 5209 } 5210 5211 if (is_invalid_opcode(intr_info)) 5212 return handle_ud(vcpu); 5213 5214 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5215 struct vmx_ve_information *ve_info = vmx->ve_info; 5216 5217 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5218 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5219 dump_vmcs(vcpu); 5220 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5221 return 1; 5222 } 5223 5224 error_code = 0; 5225 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5226 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5227 5228 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5229 WARN_ON_ONCE(!enable_vmware_backdoor); 5230 5231 /* 5232 * VMware backdoor emulation on #GP interception only handles 5233 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5234 * error code on #GP. 5235 */ 5236 if (error_code) { 5237 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5238 return 1; 5239 } 5240 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5241 } 5242 5243 /* 5244 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5245 * MMIO, it is better to report an internal error. 5246 * See the comments in vmx_handle_exit. 
5247 */ 5248 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5249 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5250 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5251 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5252 vcpu->run->internal.ndata = 4; 5253 vcpu->run->internal.data[0] = vect_info; 5254 vcpu->run->internal.data[1] = intr_info; 5255 vcpu->run->internal.data[2] = error_code; 5256 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5257 return 0; 5258 } 5259 5260 if (is_page_fault(intr_info)) { 5261 cr2 = vmx_get_exit_qual(vcpu); 5262 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5263 /* 5264 * EPT will cause page fault only if we need to 5265 * detect illegal GPAs. 5266 */ 5267 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5268 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5269 return 1; 5270 } else 5271 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5272 } 5273 5274 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5275 5276 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5277 return handle_rmode_exception(vcpu, ex_no, error_code); 5278 5279 switch (ex_no) { 5280 case DB_VECTOR: 5281 dr6 = vmx_get_exit_qual(vcpu); 5282 if (!(vcpu->guest_debug & 5283 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5284 /* 5285 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5286 * instruction. ICEBP generates a trap-like #DB, but 5287 * despite its interception control being tied to #DB, 5288 * is an instruction intercept, i.e. the VM-Exit occurs 5289 * on the ICEBP itself. Use the inner "skip" helper to 5290 * avoid single-step #DB and MTF updates, as ICEBP is 5291 * higher priority. Note, skipping ICEBP still clears 5292 * STI and MOVSS blocking. 5293 * 5294 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5295 * if single-step is enabled in RFLAGS and STI or MOVSS 5296 * blocking is active, as the CPU doesn't set the bit 5297 * on VM-Exit due to #DB interception. 
VM-Entry has a 5298 * consistency check that a single-step #DB is pending 5299 * in this scenario as the previous instruction cannot 5300 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5301 * don't modify RFLAGS), therefore the one instruction 5302 * delay when activating single-step breakpoints must 5303 * have already expired. Note, the CPU sets/clears BS 5304 * as appropriate for all other VM-Exits types. 5305 */ 5306 if (is_icebp(intr_info)) 5307 WARN_ON(!skip_emulated_instruction(vcpu)); 5308 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5309 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5310 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5311 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5312 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5313 5314 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5315 return 1; 5316 } 5317 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5318 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5319 fallthrough; 5320 case BP_VECTOR: 5321 /* 5322 * Update instruction length as we may reinject #BP from 5323 * user space while in guest debugging mode. Reading it for 5324 * #DB as well causes no harm, it is not used in that case. 5325 */ 5326 vmx->vcpu.arch.event_exit_inst_len = 5327 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5328 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5329 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5330 kvm_run->debug.arch.exception = ex_no; 5331 break; 5332 case AC_VECTOR: 5333 if (vmx_guest_inject_ac(vcpu)) { 5334 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5335 return 1; 5336 } 5337 5338 /* 5339 * Handle split lock. Depending on detection mode this will 5340 * either warn and disable split lock detection for this 5341 * task or force SIGBUS on it. 
5342 */ 5343 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5344 return 1; 5345 fallthrough; 5346 default: 5347 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5348 kvm_run->ex.exception = ex_no; 5349 kvm_run->ex.error_code = error_code; 5350 break; 5351 } 5352 return 0; 5353 } 5354 5355 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5356 { 5357 ++vcpu->stat.irq_exits; 5358 return 1; 5359 } 5360 5361 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5362 { 5363 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5364 vcpu->mmio_needed = 0; 5365 return 0; 5366 } 5367 5368 static int handle_io(struct kvm_vcpu *vcpu) 5369 { 5370 unsigned long exit_qualification; 5371 int size, in, string; 5372 unsigned port; 5373 5374 exit_qualification = vmx_get_exit_qual(vcpu); 5375 string = (exit_qualification & 16) != 0; 5376 5377 ++vcpu->stat.io_exits; 5378 5379 if (string) 5380 return kvm_emulate_instruction(vcpu, 0); 5381 5382 port = exit_qualification >> 16; 5383 size = (exit_qualification & 7) + 1; 5384 in = (exit_qualification & 8) != 0; 5385 5386 return kvm_fast_pio(vcpu, size, port, in); 5387 } 5388 5389 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5390 { 5391 /* 5392 * Patch in the VMCALL instruction: 5393 */ 5394 hypercall[0] = 0x0f; 5395 hypercall[1] = 0x01; 5396 hypercall[2] = 0xc1; 5397 } 5398 5399 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5400 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5401 { 5402 if (is_guest_mode(vcpu)) { 5403 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5404 unsigned long orig_val = val; 5405 5406 /* 5407 * We get here when L2 changed cr0 in a way that did not change 5408 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5409 * but did change L0 shadowed bits. So we first calculate the 5410 * effective cr0 value that L1 would like to write into the 5411 * hardware. 
It consists of the L2-owned bits from the new 5412 * value combined with the L1-owned bits from L1's guest_cr0. 5413 */ 5414 val = (val & ~vmcs12->cr0_guest_host_mask) | 5415 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5416 5417 if (kvm_set_cr0(vcpu, val)) 5418 return 1; 5419 vmcs_writel(CR0_READ_SHADOW, orig_val); 5420 return 0; 5421 } else { 5422 return kvm_set_cr0(vcpu, val); 5423 } 5424 } 5425 5426 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5427 { 5428 if (is_guest_mode(vcpu)) { 5429 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5430 unsigned long orig_val = val; 5431 5432 /* analogously to handle_set_cr0 */ 5433 val = (val & ~vmcs12->cr4_guest_host_mask) | 5434 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5435 if (kvm_set_cr4(vcpu, val)) 5436 return 1; 5437 vmcs_writel(CR4_READ_SHADOW, orig_val); 5438 return 0; 5439 } else 5440 return kvm_set_cr4(vcpu, val); 5441 } 5442 5443 static int handle_desc(struct kvm_vcpu *vcpu) 5444 { 5445 /* 5446 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5447 * and other code needs to be updated if UMIP can be guest owned. 
5448 */ 5449 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5450 5451 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5452 return kvm_emulate_instruction(vcpu, 0); 5453 } 5454 5455 static int handle_cr(struct kvm_vcpu *vcpu) 5456 { 5457 unsigned long exit_qualification, val; 5458 int cr; 5459 int reg; 5460 int err; 5461 int ret; 5462 5463 exit_qualification = vmx_get_exit_qual(vcpu); 5464 cr = exit_qualification & 15; 5465 reg = (exit_qualification >> 8) & 15; 5466 switch ((exit_qualification >> 4) & 3) { 5467 case 0: /* mov to cr */ 5468 val = kvm_register_read(vcpu, reg); 5469 trace_kvm_cr_write(cr, val); 5470 switch (cr) { 5471 case 0: 5472 err = handle_set_cr0(vcpu, val); 5473 return kvm_complete_insn_gp(vcpu, err); 5474 case 3: 5475 WARN_ON_ONCE(enable_unrestricted_guest); 5476 5477 err = kvm_set_cr3(vcpu, val); 5478 return kvm_complete_insn_gp(vcpu, err); 5479 case 4: 5480 err = handle_set_cr4(vcpu, val); 5481 return kvm_complete_insn_gp(vcpu, err); 5482 case 8: { 5483 u8 cr8_prev = kvm_get_cr8(vcpu); 5484 u8 cr8 = (u8)val; 5485 err = kvm_set_cr8(vcpu, cr8); 5486 ret = kvm_complete_insn_gp(vcpu, err); 5487 if (lapic_in_kernel(vcpu)) 5488 return ret; 5489 if (cr8_prev <= cr8) 5490 return ret; 5491 /* 5492 * TODO: we might be squashing a 5493 * KVM_GUESTDBG_SINGLESTEP-triggered 5494 * KVM_EXIT_DEBUG here. 
5495 */ 5496 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5497 return 0; 5498 } 5499 } 5500 break; 5501 case 2: /* clts */ 5502 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5503 return -EIO; 5504 case 1: /*mov from cr*/ 5505 switch (cr) { 5506 case 3: 5507 WARN_ON_ONCE(enable_unrestricted_guest); 5508 5509 val = kvm_read_cr3(vcpu); 5510 kvm_register_write(vcpu, reg, val); 5511 trace_kvm_cr_read(cr, val); 5512 return kvm_skip_emulated_instruction(vcpu); 5513 case 8: 5514 val = kvm_get_cr8(vcpu); 5515 kvm_register_write(vcpu, reg, val); 5516 trace_kvm_cr_read(cr, val); 5517 return kvm_skip_emulated_instruction(vcpu); 5518 } 5519 break; 5520 case 3: /* lmsw */ 5521 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5522 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5523 kvm_lmsw(vcpu, val); 5524 5525 return kvm_skip_emulated_instruction(vcpu); 5526 default: 5527 break; 5528 } 5529 vcpu->run->exit_reason = 0; 5530 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5531 (int)(exit_qualification >> 4) & 3, cr); 5532 return 0; 5533 } 5534 5535 static int handle_dr(struct kvm_vcpu *vcpu) 5536 { 5537 unsigned long exit_qualification; 5538 int dr, dr7, reg; 5539 int err = 1; 5540 5541 exit_qualification = vmx_get_exit_qual(vcpu); 5542 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5543 5544 /* First, if DR does not exist, trigger UD */ 5545 if (!kvm_require_dr(vcpu, dr)) 5546 return 1; 5547 5548 if (vmx_get_cpl(vcpu) > 0) 5549 goto out; 5550 5551 dr7 = vmcs_readl(GUEST_DR7); 5552 if (dr7 & DR7_GD) { 5553 /* 5554 * As the vm-exit takes precedence over the debug trap, we 5555 * need to emulate the latter, either for the host or the 5556 * guest debugging itself. 
5557 */ 5558 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5559 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5560 vcpu->run->debug.arch.dr7 = dr7; 5561 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5562 vcpu->run->debug.arch.exception = DB_VECTOR; 5563 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5564 return 0; 5565 } else { 5566 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5567 return 1; 5568 } 5569 } 5570 5571 if (vcpu->guest_debug == 0) { 5572 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5573 5574 /* 5575 * No more DR vmexits; force a reload of the debug registers 5576 * and reenter on this instruction. The next vmexit will 5577 * retrieve the full state of the debug registers. 5578 */ 5579 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5580 return 1; 5581 } 5582 5583 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5584 if (exit_qualification & TYPE_MOV_FROM_DR) { 5585 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5586 err = 0; 5587 } else { 5588 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5589 } 5590 5591 out: 5592 return kvm_complete_insn_gp(vcpu, err); 5593 } 5594 5595 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5596 { 5597 get_debugreg(vcpu->arch.db[0], 0); 5598 get_debugreg(vcpu->arch.db[1], 1); 5599 get_debugreg(vcpu->arch.db[2], 2); 5600 get_debugreg(vcpu->arch.db[3], 3); 5601 get_debugreg(vcpu->arch.dr6, 6); 5602 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5603 5604 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5605 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5606 5607 /* 5608 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5609 * a stale dr6 from the guest. 
5610 */ 5611 set_debugreg(DR6_RESERVED, 6); 5612 } 5613 5614 void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5615 { 5616 lockdep_assert_irqs_disabled(); 5617 set_debugreg(vcpu->arch.dr6, 6); 5618 } 5619 5620 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5621 { 5622 vmcs_writel(GUEST_DR7, val); 5623 } 5624 5625 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5626 { 5627 kvm_apic_update_ppr(vcpu); 5628 return 1; 5629 } 5630 5631 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5632 { 5633 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5634 5635 kvm_make_request(KVM_REQ_EVENT, vcpu); 5636 5637 ++vcpu->stat.irq_window_exits; 5638 return 1; 5639 } 5640 5641 static int handle_invlpg(struct kvm_vcpu *vcpu) 5642 { 5643 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5644 5645 kvm_mmu_invlpg(vcpu, exit_qualification); 5646 return kvm_skip_emulated_instruction(vcpu); 5647 } 5648 5649 static int handle_apic_access(struct kvm_vcpu *vcpu) 5650 { 5651 if (likely(fasteoi)) { 5652 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5653 int access_type, offset; 5654 5655 access_type = exit_qualification & APIC_ACCESS_TYPE; 5656 offset = exit_qualification & APIC_ACCESS_OFFSET; 5657 /* 5658 * Sane guest uses MOV to write EOI, with written value 5659 * not cared. So make a short-circuit here by avoiding 5660 * heavy instruction emulation. 
5661 */ 5662 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5663 (offset == APIC_EOI)) { 5664 kvm_lapic_set_eoi(vcpu); 5665 return kvm_skip_emulated_instruction(vcpu); 5666 } 5667 } 5668 return kvm_emulate_instruction(vcpu, 0); 5669 } 5670 5671 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5672 { 5673 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5674 int vector = exit_qualification & 0xff; 5675 5676 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5677 kvm_apic_set_eoi_accelerated(vcpu, vector); 5678 return 1; 5679 } 5680 5681 static int handle_apic_write(struct kvm_vcpu *vcpu) 5682 { 5683 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5684 5685 /* 5686 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5687 * hardware has done any necessary aliasing, offset adjustments, etc... 5688 * for the access. I.e. the correct value has already been written to 5689 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5690 * retrieve the register value and emulate the access. 
5691 */ 5692 u32 offset = exit_qualification & 0xff0; 5693 5694 kvm_apic_write_nodecode(vcpu, offset); 5695 return 1; 5696 } 5697 5698 static int handle_task_switch(struct kvm_vcpu *vcpu) 5699 { 5700 struct vcpu_vmx *vmx = to_vmx(vcpu); 5701 unsigned long exit_qualification; 5702 bool has_error_code = false; 5703 u32 error_code = 0; 5704 u16 tss_selector; 5705 int reason, type, idt_v, idt_index; 5706 5707 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5708 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5709 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5710 5711 exit_qualification = vmx_get_exit_qual(vcpu); 5712 5713 reason = (u32)exit_qualification >> 30; 5714 if (reason == TASK_SWITCH_GATE && idt_v) { 5715 switch (type) { 5716 case INTR_TYPE_NMI_INTR: 5717 vcpu->arch.nmi_injected = false; 5718 vmx_set_nmi_mask(vcpu, true); 5719 break; 5720 case INTR_TYPE_EXT_INTR: 5721 case INTR_TYPE_SOFT_INTR: 5722 kvm_clear_interrupt_queue(vcpu); 5723 break; 5724 case INTR_TYPE_HARD_EXCEPTION: 5725 if (vmx->idt_vectoring_info & 5726 VECTORING_INFO_DELIVER_CODE_MASK) { 5727 has_error_code = true; 5728 error_code = 5729 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5730 } 5731 fallthrough; 5732 case INTR_TYPE_SOFT_EXCEPTION: 5733 kvm_clear_exception_queue(vcpu); 5734 break; 5735 default: 5736 break; 5737 } 5738 } 5739 tss_selector = exit_qualification; 5740 5741 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5742 type != INTR_TYPE_EXT_INTR && 5743 type != INTR_TYPE_NMI_INTR)) 5744 WARN_ON(!skip_emulated_instruction(vcpu)); 5745 5746 /* 5747 * TODO: What about debug traps on tss switch? 5748 * Are we supposed to inject them and update dr6? 5749 */ 5750 return kvm_task_switch(vcpu, tss_selector, 5751 type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, 5752 reason, has_error_code, error_code); 5753 } 5754 5755 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5756 { 5757 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5758 gpa_t gpa; 5759 5760 /* 5761 * EPT violation happened while executing iret from NMI, 5762 * "blocked by NMI" bit has to be set before next VM entry. 5763 * There are errata that may cause this bit to not be set: 5764 * AAK134, BY25. 5765 */ 5766 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5767 enable_vnmi && 5768 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5769 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5770 5771 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5772 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5773 5774 /* 5775 * Check that the GPA doesn't exceed physical memory limits, as that is 5776 * a guest page fault. We have to emulate the instruction here, because 5777 * if the illegal address is that of a paging structure, then 5778 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5779 * would also use advanced VM-exit information for EPT violations to 5780 * reconstruct the page fault error code. 5781 */ 5782 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 5783 return kvm_emulate_instruction(vcpu, 0); 5784 5785 return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); 5786 } 5787 5788 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5789 { 5790 gpa_t gpa; 5791 5792 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5793 return 1; 5794 5795 /* 5796 * A nested guest cannot optimize MMIO vmexits, because we have an 5797 * nGPA here instead of the required GPA. 
5798 */ 5799 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5800 if (!is_guest_mode(vcpu) && 5801 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5802 trace_kvm_fast_mmio(gpa); 5803 return kvm_skip_emulated_instruction(vcpu); 5804 } 5805 5806 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5807 } 5808 5809 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5810 { 5811 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5812 return -EIO; 5813 5814 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5815 ++vcpu->stat.nmi_window_exits; 5816 kvm_make_request(KVM_REQ_EVENT, vcpu); 5817 5818 return 1; 5819 } 5820 5821 /* 5822 * Returns true if emulation is required (due to the vCPU having invalid state 5823 * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the 5824 * current vCPU state. 5825 */ 5826 static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) 5827 { 5828 struct vcpu_vmx *vmx = to_vmx(vcpu); 5829 5830 if (!vmx->vt.emulation_required) 5831 return false; 5832 5833 /* 5834 * It is architecturally impossible for emulation to be required when a 5835 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if 5836 * guest state is invalid and unrestricted guest is disabled, i.e. KVM 5837 * should synthesize VM-Fail instead emulation L2 code. This path is 5838 * only reachable if userspace modifies L2 guest state after KVM has 5839 * performed the nested VM-Enter consistency checks. 5840 */ 5841 if (vmx->nested.nested_run_pending) 5842 return true; 5843 5844 /* 5845 * KVM only supports emulating exceptions if the vCPU is in Real Mode. 5846 * If emulation is required, KVM can't perform a successful VM-Enter to 5847 * inject the exception. 
5848 */ 5849 return !vmx->rmode.vm86_active && 5850 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5851 } 5852 5853 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5854 { 5855 struct vcpu_vmx *vmx = to_vmx(vcpu); 5856 bool intr_window_requested; 5857 unsigned count = 130; 5858 5859 intr_window_requested = exec_controls_get(vmx) & 5860 CPU_BASED_INTR_WINDOW_EXITING; 5861 5862 while (vmx->vt.emulation_required && count-- != 0) { 5863 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5864 return handle_interrupt_window(&vmx->vcpu); 5865 5866 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5867 return 1; 5868 5869 if (!kvm_emulate_instruction(vcpu, 0)) 5870 return 0; 5871 5872 if (vmx_unhandleable_emulation_required(vcpu)) { 5873 kvm_prepare_emulation_failure_exit(vcpu); 5874 return 0; 5875 } 5876 5877 if (vcpu->arch.halt_request) { 5878 vcpu->arch.halt_request = 0; 5879 return kvm_emulate_halt_noskip(vcpu); 5880 } 5881 5882 /* 5883 * Note, return 1 and not 0, vcpu_run() will invoke 5884 * xfer_to_guest_mode() which will create a proper return 5885 * code. 5886 */ 5887 if (__xfer_to_guest_mode_work_pending()) 5888 return 1; 5889 } 5890 5891 return 1; 5892 } 5893 5894 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5895 { 5896 if (vmx_unhandleable_emulation_required(vcpu)) { 5897 kvm_prepare_emulation_failure_exit(vcpu); 5898 return 0; 5899 } 5900 5901 return 1; 5902 } 5903 5904 /* 5905 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5906 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5907 */ 5908 static int handle_pause(struct kvm_vcpu *vcpu) 5909 { 5910 if (!kvm_pause_in_guest(vcpu->kvm)) 5911 grow_ple_window(vcpu); 5912 5913 /* 5914 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5915 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5916 * never set PAUSE_EXITING and just set PLE if supported, 5917 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 
5918 */ 5919 kvm_vcpu_on_spin(vcpu, true); 5920 return kvm_skip_emulated_instruction(vcpu); 5921 } 5922 5923 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5924 { 5925 return 1; 5926 } 5927 5928 static int handle_invpcid(struct kvm_vcpu *vcpu) 5929 { 5930 u32 vmx_instruction_info; 5931 unsigned long type; 5932 gva_t gva; 5933 struct { 5934 u64 pcid; 5935 u64 gla; 5936 } operand; 5937 int gpr_index; 5938 5939 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 5940 kvm_queue_exception(vcpu, UD_VECTOR); 5941 return 1; 5942 } 5943 5944 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5945 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5946 type = kvm_register_read(vcpu, gpr_index); 5947 5948 /* According to the Intel instruction reference, the memory operand 5949 * is read even if it isn't needed (e.g., for type==all) 5950 */ 5951 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5952 vmx_instruction_info, false, 5953 sizeof(operand), &gva)) 5954 return 1; 5955 5956 return kvm_handle_invpcid(vcpu, type, gva); 5957 } 5958 5959 static int handle_pml_full(struct kvm_vcpu *vcpu) 5960 { 5961 unsigned long exit_qualification; 5962 5963 trace_kvm_pml_full(vcpu->vcpu_id); 5964 5965 exit_qualification = vmx_get_exit_qual(vcpu); 5966 5967 /* 5968 * PML buffer FULL happened while executing iret from NMI, 5969 * "blocked by NMI" bit has to be set before next VM entry. 5970 */ 5971 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5972 enable_vnmi && 5973 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5974 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5975 GUEST_INTR_STATE_NMI); 5976 5977 /* 5978 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5979 * here.., and there's no userspace involvement needed for PML. 
5980 */ 5981 return 1; 5982 } 5983 5984 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 5985 bool force_immediate_exit) 5986 { 5987 struct vcpu_vmx *vmx = to_vmx(vcpu); 5988 5989 /* 5990 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 5991 * due to the timer expiring while it was "soft" disabled, just eat the 5992 * exit and re-enter the guest. 5993 */ 5994 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 5995 return EXIT_FASTPATH_REENTER_GUEST; 5996 5997 /* 5998 * If the timer expired because KVM used it to force an immediate exit, 5999 * then mission accomplished. 6000 */ 6001 if (force_immediate_exit) 6002 return EXIT_FASTPATH_EXIT_HANDLED; 6003 6004 /* 6005 * If L2 is active, go down the slow path as emulating the guest timer 6006 * expiration likely requires synthesizing a nested VM-Exit. 6007 */ 6008 if (is_guest_mode(vcpu)) 6009 return EXIT_FASTPATH_NONE; 6010 6011 kvm_lapic_expired_hv_timer(vcpu); 6012 return EXIT_FASTPATH_REENTER_GUEST; 6013 } 6014 6015 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6016 { 6017 /* 6018 * This non-fastpath handler is reached if and only if the preemption 6019 * timer was being used to emulate a guest timer while L2 is active. 6020 * All other scenarios are supposed to be handled in the fastpath. 6021 */ 6022 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6023 kvm_lapic_expired_hv_timer(vcpu); 6024 return 1; 6025 } 6026 6027 /* 6028 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6029 * are overwritten by nested_vmx_hardware_setup() when nested=1. 6030 */ 6031 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6032 { 6033 kvm_queue_exception(vcpu, UD_VECTOR); 6034 return 1; 6035 } 6036 6037 #ifndef CONFIG_X86_SGX_KVM 6038 static int handle_encls(struct kvm_vcpu *vcpu) 6039 { 6040 /* 6041 * SGX virtualization is disabled. 
There is no software enable bit for 6042 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6043 * the guest from executing ENCLS (when SGX is supported by hardware). 6044 */ 6045 kvm_queue_exception(vcpu, UD_VECTOR); 6046 return 1; 6047 } 6048 #endif /* CONFIG_X86_SGX_KVM */ 6049 6050 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6051 { 6052 /* 6053 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6054 * VM-Exits. Unconditionally set the flag here and leave the handling to 6055 * vmx_handle_exit(). 6056 */ 6057 to_vt(vcpu)->exit_reason.bus_lock_detected = true; 6058 return 1; 6059 } 6060 6061 static int handle_notify(struct kvm_vcpu *vcpu) 6062 { 6063 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6064 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6065 6066 ++vcpu->stat.notify_window_exits; 6067 6068 /* 6069 * Notify VM exit happened while executing iret from NMI, 6070 * "blocked by NMI" bit has to be set before next VM entry. 6071 */ 6072 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6073 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6074 GUEST_INTR_STATE_NMI); 6075 6076 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6077 context_invalid) { 6078 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6079 vcpu->run->notify.flags = context_invalid ? 6080 KVM_NOTIFY_CONTEXT_INVALID : 0; 6081 return 0; 6082 } 6083 6084 return 1; 6085 } 6086 6087 /* 6088 * The exit handlers return 1 if the exit was handled fully and guest execution 6089 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6090 * to be done to userspace and return 0. 
6091 */ 6092 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6093 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6094 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6095 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6096 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6097 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6098 [EXIT_REASON_CR_ACCESS] = handle_cr, 6099 [EXIT_REASON_DR_ACCESS] = handle_dr, 6100 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6101 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6102 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6103 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6104 [EXIT_REASON_HLT] = kvm_emulate_halt, 6105 [EXIT_REASON_INVD] = kvm_emulate_invd, 6106 [EXIT_REASON_INVLPG] = handle_invlpg, 6107 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6108 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6109 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6110 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6111 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6112 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6113 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6114 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6115 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6116 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6117 [EXIT_REASON_VMON] = handle_vmx_instruction, 6118 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6119 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6120 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6121 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6122 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6123 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6124 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6125 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6126 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6127 [EXIT_REASON_LDTR_TR] = handle_desc, 6128 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6129 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6130 
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6131 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6132 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6133 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6134 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6135 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6136 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6137 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6138 [EXIT_REASON_PML_FULL] = handle_pml_full, 6139 [EXIT_REASON_INVPCID] = handle_invpcid, 6140 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6141 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6142 [EXIT_REASON_ENCLS] = handle_encls, 6143 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6144 [EXIT_REASON_NOTIFY] = handle_notify, 6145 }; 6146 6147 static const int kvm_vmx_max_exit_handlers = 6148 ARRAY_SIZE(kvm_vmx_exit_handlers); 6149 6150 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6151 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6152 { 6153 struct vcpu_vmx *vmx = to_vmx(vcpu); 6154 6155 *reason = vmx->vt.exit_reason.full; 6156 *info1 = vmx_get_exit_qual(vcpu); 6157 if (!(vmx->vt.exit_reason.failed_vmentry)) { 6158 *info2 = vmx->idt_vectoring_info; 6159 *intr_info = vmx_get_intr_info(vcpu); 6160 if (is_exception_with_error_code(*intr_info)) 6161 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6162 else 6163 *error_code = 0; 6164 } else { 6165 *info2 = 0; 6166 *intr_info = 0; 6167 *error_code = 0; 6168 } 6169 } 6170 6171 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) 6172 { 6173 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 6174 if (is_exception_with_error_code(*intr_info)) 6175 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); 6176 else 6177 *error_code = 0; 6178 } 6179 6180 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6181 { 6182 if (vmx->pml_pg) { 6183 __free_page(vmx->pml_pg); 6184 vmx->pml_pg = NULL; 6185 } 6186 } 6187 6188 static 
void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6189 { 6190 struct vcpu_vmx *vmx = to_vmx(vcpu); 6191 u16 pml_idx, pml_tail_index; 6192 u64 *pml_buf; 6193 int i; 6194 6195 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6196 6197 /* Do nothing if PML buffer is empty */ 6198 if (pml_idx == PML_HEAD_INDEX) 6199 return; 6200 /* 6201 * PML index always points to the next available PML buffer entity 6202 * unless PML log has just overflowed. 6203 */ 6204 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; 6205 6206 /* 6207 * PML log is written backwards: the CPU first writes the entry 511 6208 * then the entry 510, and so on. 6209 * 6210 * Read the entries in the same order they were written, to ensure that 6211 * the dirty ring is filled in the same order the CPU wrote them. 6212 */ 6213 pml_buf = page_address(vmx->pml_pg); 6214 6215 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { 6216 u64 gpa; 6217 6218 gpa = pml_buf[i]; 6219 WARN_ON(gpa & (PAGE_SIZE - 1)); 6220 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6221 } 6222 6223 /* reset PML index */ 6224 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 6225 } 6226 6227 static void vmx_dump_sel(char *name, uint32_t sel) 6228 { 6229 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6230 name, vmcs_read16(sel), 6231 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6232 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6233 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6234 } 6235 6236 static void vmx_dump_dtsel(char *name, uint32_t limit) 6237 { 6238 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6239 name, vmcs_read32(limit), 6240 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6241 } 6242 6243 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6244 { 6245 unsigned int i; 6246 struct vmx_msr_entry *e; 6247 6248 pr_err("MSR %s:\n", name); 6249 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6250 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, 
e->value);
}

/*
 * Dump the guest state, host state and control fields of the current VMCS to
 * the kernel log with pr_err().
 *
 * If the dump_invalid_vmcs knob is not set, only a rate-limited hint on how
 * to enable the dump is printed and nothing is read from the VMCS.
 */
void dump_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmentry_ctl, vmexit_ctl;
	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
	u64 tertiary_exec_control;
	unsigned long cr4;
	int efer_slot;

	if (!dump_invalid_vmcs) {
		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
		return;
	}

	/*
	 * Cache the control fields up front; they decide which of the
	 * optional fields below are printed.
	 */
	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	cr4 = vmcs_readl(GUEST_CR4);

	/* Secondary/tertiary controls are read only if the CPU has them. */
	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	else
		secondary_exec_control = 0;

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
	else
		tertiary_exec_control = 0;

	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	/* The guest PDPTR fields are dumped only when the CPU supports EPT. */
	if (cpu_has_vmx_ept()) {
		pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	/* Guest segment registers and descriptor-table registers. */
	vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
	/*
	 * Report EFER from the first source that applies: the VMCS guest
	 * field if VM-Entry loads EFER, else the entry in the guest MSR
	 * autoload list, else vcpu->arch.efer with LMA/LME forced on or off
	 * according to the "IA-32e mode guest" VM-Entry control.
	 */
	efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
	else if (efer_slot >= 0)
		pr_err("EFER= 0x%016llx (autoload)\n",
		       vmx->msr_autoload.guest.val[efer_slot].value);
	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer | (EFER_LMA | EFER_LME));
	else
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
	/* The remaining guest MSR fields are printed only if VM-Entry loads them. */
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));
	/* The MSR lists are dumped only if their VMCS count is non-zero. */
	if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
		vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);

	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	/* Host MSR fields are printed only if VM-Exit loads them. */
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);

	pr_err("*** Control State ***\n");
	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6382 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6383 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6384 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6385 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6386 vmcs_read32(EXCEPTION_BITMAP), 6387 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6388 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6389 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6390 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6391 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6392 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6393 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6394 vmcs_read32(VM_EXIT_INTR_INFO), 6395 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6396 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6397 pr_err(" reason=%08x qualification=%016lx\n", 6398 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6399 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6400 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6401 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6402 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6403 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6404 pr_err("TSC Multiplier = 0x%016llx\n", 6405 vmcs_read64(TSC_MULTIPLIER)); 6406 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6407 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6408 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6409 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6410 } 6411 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6412 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6413 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6414 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6415 } 6416 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6417 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6418 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 
pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct vmx_ve_information *ve_info = vmx->ve_info;
		u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);

		/*
		 * If KVM is dumping the VMCS, then something has gone wrong
		 * already.  Dereferencing an address from the VMCS, which could
		 * very well be corrupted, is a terrible idea.  The virtual
		 * address is known so use it.
		 */
		pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
		       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
		       ve_info->exit_reason, ve_info->delivery,
		       ve_info->exit_qualification,
		       ve_info->guest_linear_address,
		       ve_info->guest_physical_address, ve_info->eptp_index);
	}
}

/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 *
 * Returns -EIO if a nested VM-Enter is unexpectedly pending, 0 on the paths
 * that fill in vcpu->run (failed VM-Entry, event-vectoring exit, unexpected
 * exit reason), 1 when the exit was reflected to L1 or already handled by a
 * fastpath, and otherwise whatever the per-exit-reason handler returns.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
	u32 vectoring_info = vmx->idt_vectoring_info;
	u16 exit_handler_index;

	/*
	 * Flush the GPAs logged in the PML buffer so that the dirty_bitmap is
	 * as up to date as possible.  Another benefit is that, before
	 * querying the dirty_bitmap, kvm_vm_ioctl_get_dirty_log only needs to
	 * kick all vCPUs out of guest mode: if a vCPU is in root mode, its
	 * PML buffer must have been flushed already.  Note, PML is never
	 * enabled in hardware while running L2.
*/
	if (enable_pml && !is_guest_mode(vcpu))
		vmx_flush_pml_buffer(vcpu);

	/*
	 * KVM should never reach this point with a pending nested VM-Enter.
	 * More specifically, short-circuiting VM-Entry to emulate L2 due to
	 * invalid guest state should never happen as that means KVM knowingly
	 * allowed a nested VM-Enter with an invalid vmcs12.  More below.
	 */
	if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
		return -EIO;

	if (is_guest_mode(vcpu)) {
		/*
		 * PML is never enabled when running L2, bail immediately if a
		 * PML full exit occurs as something is horribly wrong.
		 */
		if (exit_reason.basic == EXIT_REASON_PML_FULL)
			goto unexpected_vmexit;

		/*
		 * The host physical addresses of some pages of guest memory
		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
		 * Page).  The CPU may write to these pages via their host
		 * physical address while L2 is running, bypassing any
		 * address-translation-based dirty tracking (e.g. EPT write
		 * protection).
		 *
		 * Mark them dirty on every exit from L2 to prevent them from
		 * getting out of sync with dirty tracking.
		 */
		nested_mark_vmcs12_pages_dirty(vcpu);

		/*
		 * Synthesize a triple fault if L2 state is invalid.  In normal
		 * operation, nested VM-Enter rejects any attempt to enter L2
		 * with invalid state.  However, those checks are skipped if
		 * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
		 * L2 state is invalid, it means either L1 modified SMRAM state
		 * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
		 * doing so is architecturally allowed in the RSM case, and is
		 * the least awful solution for the userspace case without
		 * risking false positives.
*/
		if (vmx->vt.emulation_required) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
			return 1;
		}

		/* Done if the exit was reflected into L1. */
		if (nested_vmx_reflect_vmexit(vcpu))
			return 1;
	}

	/* If guest state is invalid, start emulating.  L2 is handled above. */
	if (vmx->vt.emulation_required)
		return handle_invalid_guest_state(vcpu);

	/*
	 * Failed VM-Entry reported via the exit reason: dump the VMCS and
	 * hand the full exit reason to userspace.
	 */
	if (exit_reason.failed_vmentry) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * vmx->fail is set: report the VM-instruction error field to
	 * userspace instead of the exit reason.
	 */
	if (unlikely(vmx->fail)) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * Valid IDT-vectoring info is tolerated only for the exit reasons
	 * listed here; for any other exit reason, exit to userspace via
	 * kvm_prepare_event_vectoring_exit().
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
	     exit_reason.basic != EXIT_REASON_PML_FULL &&
	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
	     exit_reason.basic != EXIT_REASON_NOTIFY &&
	     exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
		kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (!vmx_interrupt_blocked(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't help us find the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled.  So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
*/
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	/* A fastpath handler already dealt with this exit. */
	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
		goto unexpected_vmexit;
#ifdef CONFIG_MITIGATION_RETPOLINE
	/*
	 * With retpolines enabled, dispatch these exit reasons with direct
	 * calls rather than through the handler table below (presumably to
	 * keep the indirect call off these paths).
	 */
	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
		return kvm_emulate_wrmsr(vcpu);
	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
		return handle_preemption_timer(vcpu);
	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
		return handle_interrupt_window(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
		return handle_external_interrupt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_HLT)
		return kvm_emulate_halt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
		return handle_ept_misconfig(vcpu);
#endif

	/*
	 * The index was bounds-checked above; array_index_nospec() also
	 * clamps it under speculation before it is used to index the table.
	 */
	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
						kvm_vmx_max_exit_handlers);
	if (!kvm_vmx_exit_handlers[exit_handler_index])
		goto unexpected_vmexit;

	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

unexpected_vmexit:
	/*
	 * No handler for this exit: dump the VMCS and report an internal
	 * error to userspace, passing the full exit reason and the CPU of the
	 * last VM-Entry as data.
	 */
	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
		    exit_reason.full);
	dump_vmcs(vcpu);
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
	vcpu->run->internal.ndata = 2;
	vcpu->run->internal.data[0] = exit_reason.full;
	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
	return 0;
}

/*
 * Handle a VM-Exit via __vmx_handle_exit(), then force an exit to userspace
 * (return 0) if the exit reason has bus_lock_detected set.
 */
int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	int ret = __vmx_handle_exit(vcpu, exit_fastpath);

	/*
	 * Exit to user space when bus lock detected to inform that there is
	 * a bus lock in guest.
*/
	if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
		/*
		 * Only claim the exit reason if the handler was not already
		 * exiting to userspace (ret > 0); the run flag is set in
		 * either case.
		 */
		if (ret > 0)
			vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

		vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
		return 0;
	}
	return ret;
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		bool flush_l1d;

		/*
		 * Clear the per-vcpu flush bit, it gets set again if the vCPU
		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
		 * exits to userspace, or if KVM reaches one of the unsafe
		 * VMEXIT handlers, e.g. if KVM calls into the emulator.
		 */
		flush_l1d = vcpu->arch.l1tf_flush_l1d;
		vcpu->arch.l1tf_flush_l1d = false;

		/*
		 * Clear the per-cpu flush bit, it gets set again from
		 * the interrupt handlers.
*/
		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
		kvm_clear_cpu_l1tf_flush_l1d();

		/* Conditional mode: skip the flush if neither bit was set. */
		if (!flush_l1d)
			return;
	}

	vcpu->stat.l1d_flush++;

	/* Use the flush command MSR when the CPU provides it. */
	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	/*
	 * Software fallback: read one byte per 4096 bytes of the flush
	 * buffer, then one byte per 64 bytes, over 'size' bytes each time.
	 */
	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lpopulate_tlb\n\t"
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}

/*
 * Update the TPR threshold from the current TPR and the highest pending
 * interrupt (@irr, -1 if none).  The threshold is 0 when @irr is -1 or
 * @tpr < @irr, and @irr otherwise.
 *
 * Nothing is done while L2 is active and vmcs12 enables the TPR shadow.  If
 * L2 is active without it, the value is stashed in nested.l1_tpr_threshold
 * instead of being written to the current VMCS.
 */
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	int tpr_threshold;

	if (is_guest_mode(vcpu) &&
		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
	if (is_guest_mode(vcpu))
		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
	else
		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}

/*
 * Reprogram the "virtualize APIC accesses" and "virtualize x2APIC mode"
 * secondary execution controls to match the vCPU's current local APIC mode,
 * then refresh the x2APIC MSR bitmap.
 */
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 sec_exec_control;

	if (!lapic_in_kernel(vcpu))
		return;

	/* Neither control can be used, so there is nothing to update. */
	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;

	/* Postpone execution until vmcs01 is the current VMCS.
*/
	if (is_guest_mode(vcpu)) {
		vmx->nested.change_vmcs01_virtual_apic_mode = true;
		return;
	}

	/* Start with both virtualization controls cleared. */
	sec_exec_control = secondary_exec_controls_get(vmx);
	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

	switch (kvm_get_apic_mode(vcpu)) {
	case LAPIC_MODE_INVALID:
		WARN_ONCE(true, "Invalid local APIC state");
		break;
	case LAPIC_MODE_DISABLED:
		break;
	case LAPIC_MODE_XAPIC:
		if (flexpriority_enabled) {
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

			/*
			 * Flush the TLB, reloading the APIC access page will
			 * only do so if its physical address has changed, but
			 * the guest may have inserted a non-APIC mapping into
			 * the TLB while the APIC access page was disabled.
			 */
			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
		}
		break;
	case LAPIC_MODE_X2APIC:
		if (cpu_has_vmx_virtualize_x2apic_mode())
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		break;
	}
	secondary_exec_controls_set(vmx, sec_exec_control);

	vmx_update_msr_bitmap_x2apic(vcpu);
}

/*
 * Program APIC_ACCESS_ADDR in the current VMCS with the host physical address
 * backing the guest's default APIC base page.  If an MMU invalidation raced
 * with the lookup, a KVM_REQ_APIC_PAGE_RELOAD request is made instead so the
 * update is retried.
 */
void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
	const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *slot;
	struct page *refcounted_page;
	unsigned long mmu_seq;
	kvm_pfn_t pfn;
	bool writable;

	/* Defer reload until vmcs01 is the current VMCS.
*/
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
		return;
	}

	/* Nothing to program unless APIC accesses are being virtualized. */
	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

	/*
	 * Explicitly grab the memslot using KVM's internal slot ID to ensure
	 * KVM doesn't unintentionally grab a userspace memslot.  It _should_
	 * be impossible for userspace to create a memslot for the APIC when
	 * APICv is enabled, but paranoia won't hurt in this case.
	 */
	slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return;

	/*
	 * Ensure that the mmu_notifier sequence count is read before KVM
	 * retrieves the pfn from the primary MMU.  Note, the memslot is
	 * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
	 * in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	/*
	 * No need to retry if the memslot does not exist or is invalid.  KVM
	 * controls the APIC-access page memslot, and only deletes the memslot
	 * if APICv is permanently inhibited, i.e. the memslot won't reappear.
	 */
	pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return;

	/*
	 * Under mmu_lock, either write the new address or, if an
	 * invalidation raced with the pfn lookup, request another reload.
	 */
	read_lock(&vcpu->kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
	else
		vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));

	/*
	 * Do not pin the APIC access page in memory so that it can be freely
	 * migrated, the MMU notifier will call us again if it is migrated or
	 * swapped out.  KVM backs the memslot with anonymous memory, the pfn
	 * should always point at a refcounted page (if the pfn is valid).
*/
	if (!WARN_ON_ONCE(!refcounted_page))
		kvm_release_page_clean(refcounted_page);

	/*
	 * No need for a manual TLB flush at this point, KVM has already done a
	 * flush if there were SPTEs pointing at the previous page.
	 */
	read_unlock(&vcpu->kvm->mmu_lock);
}

/*
 * Update the upper byte (SVI) of the guest interrupt status field to
 * @max_isr, where -1 is treated as 0.  The VMCS is written only if the value
 * changes, and the update is deferred while L2 is active.
 */
void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
	u16 status;
	u8 old;

	/*
	 * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
	 * is relevant if and only if Virtual Interrupt Delivery is enabled in
	 * vmcs12, and if VID is enabled then L2 EOIs affect L2's vAPIC, not
	 * L1's vAPIC.  KVM must update vmcs01 on the next nested VM-Exit,
	 * otherwise L1 will run with a stale SVI.
	 */
	if (is_guest_mode(vcpu)) {
		/*
		 * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
		 * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
		 * Note, userspace can stuff state while L2 is active; assert
		 * that VID is disabled if and only if the vCPU is in KVM_RUN
		 * to avoid false positives if userspace is setting APIC state.
6860 */ 6861 WARN_ON_ONCE(vcpu->wants_to_run && 6862 nested_cpu_has_vid(get_vmcs12(vcpu))); 6863 to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; 6864 return; 6865 } 6866 6867 if (max_isr == -1) 6868 max_isr = 0; 6869 6870 status = vmcs_read16(GUEST_INTR_STATUS); 6871 old = status >> 8; 6872 if (max_isr != old) { 6873 status &= 0xff; 6874 status |= max_isr << 8; 6875 vmcs_write16(GUEST_INTR_STATUS, status); 6876 } 6877 } 6878 6879 static void vmx_set_rvi(int vector) 6880 { 6881 u16 status; 6882 u8 old; 6883 6884 if (vector == -1) 6885 vector = 0; 6886 6887 status = vmcs_read16(GUEST_INTR_STATUS); 6888 old = (u8)status & 0xff; 6889 if ((u8)vector != old) { 6890 status &= ~0xff; 6891 status |= (u8)vector; 6892 vmcs_write16(GUEST_INTR_STATUS, status); 6893 } 6894 } 6895 6896 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6897 { 6898 struct vcpu_vt *vt = to_vt(vcpu); 6899 int max_irr; 6900 bool got_posted_interrupt; 6901 6902 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6903 return -EIO; 6904 6905 if (pi_test_on(&vt->pi_desc)) { 6906 pi_clear_on(&vt->pi_desc); 6907 /* 6908 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6909 * But on x86 this is just a compiler barrier anyway. 6910 */ 6911 smp_mb__after_atomic(); 6912 got_posted_interrupt = 6913 kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); 6914 } else { 6915 max_irr = kvm_lapic_find_highest_irr(vcpu); 6916 got_posted_interrupt = false; 6917 } 6918 6919 /* 6920 * Newly recognized interrupts are injected via either virtual interrupt 6921 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6922 * disabled in two cases: 6923 * 6924 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6925 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6926 * VM-Exit to L1. 
If L1 doesn't want to exit, the interrupt is injected
	 * into L2, but KVM doesn't use virtual interrupt delivery to inject
	 * interrupts into L2, and so KVM_REQ_EVENT is again needed.
	 *
	 * 2) If APICv is disabled for this vCPU, assigned devices may still
	 * attempt to post interrupts.  The posted interrupt vector will cause
	 * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
	 */
	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
		vmx_set_rvi(max_irr);
	else if (got_posted_interrupt)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return max_irr;
}

/*
 * Write the four 64-bit words of @eoi_exit_bitmap to the EOI-exit bitmap
 * fields of the current VMCS.  Does nothing if APICv is not active for the
 * vCPU.
 */
void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}

/* Prototypes only; the definitions are not in this part of the file. */
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);

/* Capture XFD_ERR for a #NM VM-Exit; see the comment in the body. */
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
	/*
	 * Save xfd_err to guest_fpu before interrupt is enabled, so the
	 * MSR value is not clobbered by the host activity before the guest
	 * has chance to consume it.
	 *
	 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
	 * interception may have been caused by L1 interception.  Per the SDM,
	 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
	 *
	 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
	 * unlike CR2 and DR6, the value is not a payload that is attached to
	 * the #NM exception.
6970 */ 6971 if (is_xfd_nm_fault(vcpu)) 6972 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6973 } 6974 6975 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 6976 { 6977 /* if exit due to PF check for async PF */ 6978 if (is_page_fault(intr_info)) 6979 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6980 /* if exit due to NM, handle before interrupts are enabled */ 6981 else if (is_nm_fault(intr_info)) 6982 handle_nm_fault_irqoff(vcpu); 6983 /* Handle machine checks before interrupts are enabled */ 6984 else if (is_machine_check(intr_info)) 6985 kvm_machine_check(); 6986 } 6987 6988 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 6989 u32 intr_info) 6990 { 6991 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6992 6993 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 6994 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6995 return; 6996 6997 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 6998 if (cpu_feature_enabled(X86_FEATURE_FRED)) 6999 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7000 else 7001 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7002 kvm_after_interrupt(vcpu); 7003 7004 vcpu->arch.at_instruction_boundary = true; 7005 } 7006 7007 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7008 { 7009 if (to_vt(vcpu)->emulation_required) 7010 return; 7011 7012 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7013 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7014 else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) 7015 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7016 } 7017 7018 /* 7019 * The kvm parameter can be NULL (module initialization, or invocation before 7020 * VM creation). Be sure to check the kvm parameter before using it. 
*/
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
	/*
	 * Returns whether the MSR @index is reported as emulated.  Note that
	 * @kvm is not dereferenced by any of the cases below.
	 */
	switch (index) {
	case MSR_IA32_SMBASE:
		if (!IS_ENABLED(CONFIG_KVM_SMM))
			return false;
		/*
		 * We cannot do SMM unless we can run the guest in big
		 * real mode.
		 */
		return enable_unrestricted_guest || emulate_invalid_guest_state;
	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
		/* The emulated VMX MSRs depend on the 'nested' setting. */
		return nested;
	case MSR_AMD64_VIRT_SPEC_CTRL:
	case MSR_AMD64_TSC_RATIO:
		/* This is AMD only. */
		return false;
	default:
		return true;
	}
}

/*
 * Update NMI-blocking bookkeeping from the VM-Exit information.  With
 * enable_vnmi set, this either re-sets "blocked by NMI" in the guest
 * interruptibility state or refreshes the cached nmi_known_unmasked flag;
 * without it, the time spent since entry_time is added to vnmi_blocked_time
 * while soft_vnmi_blocked is set.
 */
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;

	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	if (enable_vnmi) {
		/* Nothing to recover if NMIs are already known to be unmasked. */
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;

		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
*/
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			/* Cache whether the VMCS currently shows NMIs as unblocked. */
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		/* Account the time elapsed since entry_time was recorded. */
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
}

/*
 * Reset KVM's injected-event state (NMI, exception and interrupt queues)
 * and, if @idt_vectoring_info is valid, re-queue the event it describes and
 * raise KVM_REQ_EVENT.
 *
 * @instr_len_field and @error_code_field name the VMCS fields from which the
 * instruction length (software exceptions/interrupts) and the error code
 * (exceptions that deliver one) are read.
 */
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	/* Always start from a clean slate, even if nothing is re-queued. */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
*/
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		/* Software exceptions also need the instruction length. */
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_HARD_EXCEPTION: {
		u32 error_code = 0;

		/* The error code field is only read when the info says one is delivered. */
		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
			error_code = vmcs_read32(error_code_field);

		kvm_requeue_exception(vcpu, vector,
				      idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
				      error_code);
		break;
	}
	case INTR_TYPE_SOFT_INTR:
		/* Software interrupts also need the instruction length. */
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}

/*
 * Re-queue the event, if any, described by the IDT-vectoring info saved in
 * vmx->idt_vectoring_info, using the VM-Exit instruction length and the
 * IDT-vectoring error code fields.
 */
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

/*
 * Cancel an event injection that was programmed for VM-Entry: re-queue
 * whatever VM_ENTRY_INTR_INFO_FIELD describes (using the VM-Entry instruction
 * length and error code fields), then clear the field.
 */
void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

/*
 * Bring the atomic-switch MSR lists in line with the guest/host values that
 * perf reports through perf_guest_get_msrs().
 */
static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;
	struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);

	/* Recompute the cross-mapped mask only if a PEBS counter is enabled. */
	pmu->host_cross_mapped_mask = 0;
	if (pmu->pebs_enable & pmu->global_ctrl)
		intel_pmu_cross_mapped_check(pmu);

	/* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL.
*/
	msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu);
	if (!msrs)
		return;

	/*
	 * An MSR whose guest and host values match needs no switching and is
	 * dropped from the list; otherwise it is (re)added with both values.
	 */
	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					msrs[i].host, false);
}

/*
 * Program VMX_PREEMPTION_TIMER_VALUE for the upcoming VM-Enter:
 *  - 0 if an immediate exit is being forced;
 *  - the time left until hv_deadline_tsc, shifted right by
 *    cpu_preemption_timer_multi, if a deadline is armed (0 if it has already
 *    passed);
 *  - -1 otherwise, written only once and tracked via hv_timer_soft_disabled.
 */
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (force_immediate_exit) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (vmx->hv_deadline_tsc != -1) {
		tscl = rdtsc();
		if (vmx->hv_deadline_tsc > tscl)
			/* set_hv_timer ensures the delta fits in 32-bits */
			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
				cpu_preemption_timer_multi);
		else
			delta_tsc = 0;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
	}
}

/*
 * Update the HOST_RSP VMCS field, skipping the VMCS write when @host_rsp
 * matches the value cached in loaded_vmcs->host_state.
 */
void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
		vmx->loaded_vmcs->host_state.rsp = host_rsp;
		vmcs_writel(HOST_RSP, host_rsp);
	}
}

/*
 * Restore the host's MSR_IA32_SPEC_CTRL value.  If @flags contains
 * VMX_RUN_SAVE_SPEC_CTRL, the current MSR value is first saved into
 * vmx->spec_ctrl.  Does nothing if the CPU lacks X86_FEATURE_MSR_SPEC_CTRL.
 */
void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
					unsigned int flags)
{
	u64 hostval = this_cpu_read(x86_spec_ctrl_current);

	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
		return;

	if (flags & VMX_RUN_SAVE_SPEC_CTRL)
		vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);

	/*
	 * If the guest/host SPEC_CTRL values differ, restore the host value.
*
	 * For legacy IBRS, the IBRS bit always needs to be written after
	 * transitioning from a less privileged predictor mode, regardless of
	 * whether the guest/host values differ.
	 */
	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
	    vmx->spec_ctrl != hostval)
		native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);

	barrier_nospec();
}

/*
 * Try to handle the VM-Exit in the fastpath.  Returns the fastpath verdict
 * of the per-reason handler, or EXIT_FASTPATH_NONE if the exit reason has no
 * fastpath handler here.
 */
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
					     bool force_immediate_exit)
{
	/*
	 * Even if L2 is active, some VMX preemption timer exits can be
	 * handled in the fastpath; all other exits must use the slow path.
	 */
	if (is_guest_mode(vcpu) &&
	    vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
		return EXIT_FASTPATH_NONE;

	switch (vmx_get_exit_reason(vcpu).basic) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_set_msr_irqoff(vcpu);
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
	case EXIT_REASON_HLT:
		return handle_fastpath_hlt(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}

/*
 * If the VM-Exit was an exception/NMI exit whose interrupt info describes an
 * NMI, run the host's NMI handling (through the FRED entry point or
 * vmx_do_nmi_irqoff()), bracketed by kvm_before/after_interrupt().
 */
noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
{
	if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
	    !is_nmi(vmx_get_intr_info(vcpu)))
		return;

	kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
	if (cpu_feature_enabled(X86_FEATURE_FRED))
		fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
	else
		vmx_do_nmi_irqoff();
	kvm_after_interrupt(vcpu);
}

/*
 * The innermost VM-Enter/VM-Exit sequence: everything between
 * guest_state_enter_irqoff() and guest_state_exit_irqoff().  @flags is
 * passed through to __vmx_vcpu_run().
 */
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					unsigned int flags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	guest_state_enter_irqoff();

	/*
	 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
	 * mitigation for MDS is done late in VMentry and is still
	 * executed in spite of L1D Flush.
* This is because an extra VERW
	 * should not matter much after the big hammer L1D Flush.
	 */
	if (static_branch_unlikely(&vmx_l1d_should_flush))
		vmx_l1d_flush(vcpu);
	else if (static_branch_unlikely(&mmio_stale_data_clear) &&
		 kvm_arch_has_assigned_device(vcpu->kvm))
		mds_clear_cpu_buffers();

	vmx_disable_fb_clear(vmx);

	/* Load the guest's CR2, skipping the write if it is already in place. */
	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				   flags);

	/* Save the guest's CR2 and invalidate the lazily-loaded registers. */
	vcpu->arch.cr2 = native_read_cr2();
	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;

	vmx->idt_vectoring_info = 0;

	vmx_enable_fb_clear(vmx);

	/* On failure, record a marker exit reason and skip the VMCS reads. */
	if (unlikely(vmx->fail)) {
		vmx->vt.exit_reason.full = 0xdead;
		goto out;
	}

	/* IDT-vectoring info is only read if VM-Entry did not fail. */
	vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
	if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
		vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	vmx_handle_nmi(vcpu);

out:
	guest_state_exit_irqoff();
}

/*
 * Prepare for, perform and clean up after one VM-Enter.  Returns
 * EXIT_FASTPATH_NONE when emulation is required, when the run or VM-Entry
 * failed, or when no fastpath handler applies; otherwise the verdict of
 * vmx_exit_handlers_fastpath().
 */
fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/*
	 * Don't enter VMX if guest state is invalid, let the exit handler
	 * start emulation until we arrive back to a valid state.  Synthesize a
	 * consistency check VM-Exit due to invalid guest state and bail.
*/
	if (unlikely(vmx->vt.emulation_required)) {
		vmx->fail = 0;

		vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
		vmx->vt.exit_reason.failed_vmentry = 1;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
		vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
		kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
		vmx->vt.exit_intr_info = 0;
		return EXIT_FASTPATH_NONE;
	}

	trace_kvm_entry(vcpu, force_immediate_exit);

	/* Write back a PLE window that changed since the last entry. */
	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
	 */
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

	/* Write dirty RSP/RIP to the VMCS, then mark all registers clean. */
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
	vcpu->arch.regs_dirty = 0;

	/*
	 * Refresh vmcs.HOST_CR3 if necessary.  This must be done immediately
	 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
	 * it switches back to the current->mm, which can occur in KVM context
	 * when switching to a temporary mm to patch kernel code, e.g. if KVM
	 * toggles a static key while handling a VM-Exit.
	 */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	/* Likewise refresh vmcs.HOST_CR4 if the CR4 shadow changed. */
	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state.
* Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	kvm_load_guest_xsave_state(vcpu);

	pt_guest_enter(vmx);

	atomic_switch_perf_msrs(vmx);
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	/*
	 * With the preemption timer, a forced immediate exit is requested by
	 * programming the timer; without it, a reschedule IPI is sent to the
	 * vCPU's own CPU instead.
	 */
	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu, force_immediate_exit);
	else if (force_immediate_exit)
		smp_send_reschedule(vcpu->cpu);

	kvm_wait_lapic_expire(vcpu);

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));

	/* All fields are clean at this point */
	if (kvm_is_using_evmcs()) {
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
	}

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vcpu->arch.host_debugctl)
		update_debugctlmsr(vcpu->arch.host_debugctl);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restores segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	pt_guest_exit(vmx);

	kvm_load_host_xsave_state(vcpu);

	if (is_guest_mode(vcpu)) {
		/*
		 * Track VMLAUNCH/VMRESUME that have made past guest state
		 * checking.
*/
		if (vmx->nested.nested_run_pending &&
		    !vmx_get_exit_reason(vcpu).failed_vmentry)
			++vcpu->stat.nested_run;

		vmx->nested.nested_run_pending = 0;
	}

	if (unlikely(vmx->fail))
		return EXIT_FASTPATH_NONE;

	if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
		kvm_machine_check();

	trace_kvm_exit(vcpu, KVM_ISA_VMX);

	if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
		return EXIT_FASTPATH_NONE;

	/* Reaching this point means VM-Entry succeeded. */
	vmx->loaded_vmcs->launched = 1;

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}

/*
 * Release the per-vCPU VMX resources: the PML buffer (if PML is enabled),
 * the VPID, nested state, the loaded VMCS and the ve_info page.
 */
void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
	free_page((unsigned long)vmx->ve_info);
}

/*
 * Allocate and initialize the VMX-specific state of a new vCPU.  Returns 0
 * on success or a negative error code, in which case everything acquired so
 * far is released through the free_* labels at the end of the function.
 */
int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vmx_uret_msr *tsx_ctrl;
	struct vcpu_vmx *vmx;
	int i, err;

	/* to_vmx() relies on the kvm_vcpu sitting at offset 0 of vcpu_vmx. */
	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest), etc.
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	/* By default every bit of each user-return MSR is guest-controlled. */
	for (i = 0; i < kvm_nr_uret_msrs; ++i)
		vmx->guest_uret_msrs[i].mask = -1ull;
	if (boot_cpu_has(X86_FEATURE_RTM)) {
		/*
		 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
* Keep the host value unchanged to avoid changing CPUID bits
		 * under the host kernel's feet.
		 */
		tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (tsx_ctrl)
			tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	/*
	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
	 * feature only for vmcs01, KVM currently isn't equipped to realize any
	 * performance benefits from enabling it for vmcs02.
	 */
	if (kvm_is_using_evmcs() &&
	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

		evmcs->hv_enlightenments_control.msr_bitmap = 1;
	}

	/* The MSR bitmap starts with all ones */
	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

	/* Stop intercepting the MSRs below (reads only where MSR_TYPE_R). */
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
#ifdef CONFIG_X86_64
	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
	if (kvm_cstate_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
	}

	vmx->loaded_vmcs = &vmx->vmcs01;

	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = kvm_alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	/* The calls above may have overwritten err; reset it for alloc_page(). */
	err = -ENOMEM;
	if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct page *page;

		BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);

		/* ve_info must be page aligned. */
		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!page)
			goto free_vmcs;

		vmx->ve_info = page_to_virt(page);
	}

	/* Publish this vCPU's posted-interrupt descriptor in the PID table. */
	if (vmx_can_use_ipiv(vcpu))
		WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
			   __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);

	return 0;

	/* Error unwinding, in reverse order of acquisition. */
free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}

/* Warnings printed (once) by vmx_vm_init() on L1TF-affected hosts. */
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

/*
 * Per-VM initialization: set pause_in_guest when ple_gap is zero, warn about
 * L1TF exposure where applicable, and set the CPU dirty log size when PML is
 * enabled.  Always returns 0.
 */
int vmx_vm_init(struct kvm *kvm)
{
	if (!ple_gap)
		kvm->arch.pause_in_guest = true;

	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
		case L1TF_MITIGATION_FLUSH_NOWARN:
			/* 'I explicitly don't care' is set */
			break;
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
		case L1TF_MITIGATION_FULL:
			/*
			 * Warn upon starting the first VM in a potentially
			 * insecure environment.
7630 */ 7631 if (sched_smt_active()) 7632 pr_warn_once(L1TF_MSG_SMT); 7633 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7634 pr_warn_once(L1TF_MSG_L1D); 7635 break; 7636 case L1TF_MITIGATION_FULL_FORCE: 7637 /* Flush is enforced */ 7638 break; 7639 } 7640 } 7641 7642 if (enable_pml) 7643 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; 7644 return 0; 7645 } 7646 7647 static inline bool vmx_ignore_guest_pat(struct kvm *kvm) 7648 { 7649 /* 7650 * Non-coherent DMA devices need the guest to flush CPU properly. 7651 * In that case it is not possible to map all guest RAM as WB, so 7652 * always trust guest PAT. 7653 */ 7654 return !kvm_arch_has_noncoherent_dma(kvm) && 7655 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); 7656 } 7657 7658 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7659 { 7660 /* 7661 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7662 * with cacheable accesses will result in Machine Checks. 7663 */ 7664 if (is_mmio) 7665 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7666 7667 /* Force WB if ignoring guest PAT */ 7668 if (vmx_ignore_guest_pat(vcpu->kvm)) 7669 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7670 7671 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7672 } 7673 7674 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7675 { 7676 /* 7677 * These bits in the secondary execution controls field 7678 * are dynamic, the others are mostly based on the hypervisor 7679 * architecture and the guest's CPUID. Do not touch the 7680 * dynamic bits. 
*/
	u32 mask =
		SECONDARY_EXEC_SHADOW_VMCS |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC;

	u32 cur_ctl = secondary_exec_controls_get(vmx);

	/* Take the masked bits from the current value, the rest from new_ctl. */
	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

/*
 * Allow _cr4_mask in CR4_FIXED1 if the CPUID entry currently held in 'entry'
 * exists and has _cpuid_mask set in register _reg.
 */
#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
} while (0)

	/* CPUID leaf 0x1 */
	entry = kvm_find_cpuid_entry(vcpu, 0x1);
	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

	/* CPUID leaf 0x7, sub-leaf 0 */
	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));

	/* CPUID leaf 0x7, sub-leaf 1 */
	entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1);
	cr4_fixed1_update(X86_CR4_LAM_SUP,    eax, feature_bit(LAM));

#undef cr4_fixed1_update
}

/*
 * Cache the guest's CPUID leaf 0x14 sub-leaves in pt_desc.caps and derive
 * from them the number of address ranges and pt_desc.ctl_bitmask (bits are
 * cleared from the mask as the corresponding capability is found).  Returns
 * early, leaving the remaining state untouched, if a sub-leaf is missing.
 */
static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry_index(vcpu, 0x14, i);
		if (!best)
			return;
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
			RTIT_CTL_BRANCH_EN);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
	 * will inject an #GP
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If
* CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
				RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
							RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/*
	 * Unmask the address range configuration area: one 4-bit field per
	 * supported range, starting at bit 32.
	 */
	for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}

/*
 * Re-derive the VMX state that depends on the guest's CPUID (execution
 * controls, FEATURE_CONTROL valid bits, nested CR fixed bits, Intel PT
 * configuration, MSR intercepts, ...) after userspace has set it.
 */
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * XSAVES is effectively enabled if and only if XSAVE is also exposed
	 * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
	 * set if and only if XSAVE is supported.
*/
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
		guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);

	vmx_setup_uret_msrs(vmx);

	if (cpu_has_secondary_exec_ctrls())
		vmcs_set_secondary_exec_control(vmx,
						vmx_secondary_exec_control(vmx));

	/* The VMX enable bits of FEATURE_CONTROL are valid only with guest VMX. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
		nested_vmx_cr_fixed1_bits_update(vcpu);

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
			guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);

	if (boot_cpu_has(X86_FEATURE_RTM)) {
		struct vmx_uret_msr *msr;
		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (msr) {
			/* Set RTM_DISABLE in the guest value unless the guest has RTM. */
			bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
		}
	}

	/*
	 * For each of the MSRs below, the last argument enables interception
	 * when the guest lacks the corresponding feature.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));

	if (boot_cpu_has(X86_FEATURE_IBPB))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
					  !guest_has_pred_cmd_msr(vcpu));

	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
		vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));

	set_cr4_guest_host_mask(vmx);

	vmx_write_encls_bitmap(vcpu, NULL);
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
		vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
		vmx->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_SGX_LC_ENABLED;
	else
		vmx->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_SGX_LC_ENABLED;

	/* Refresh #PF interception to account for MAXPHYADDR changes. */
	vmx_update_exception_bitmap(vcpu);
}

/*
 * Compute the PERF_CAPABILITIES value KVM supports.  Returns 0 if the PMU is
 * disabled.  Otherwise starts from PMU_CAP_FW_WRITES and adds the host's LBR
 * format and PEBS bits when the conditions below are met; the PEBS baseline
 * bit is cleared whenever PEBS is supported.
 */
static __init u64 vmx_get_perf_capabilities(void)
{
	u64 perf_cap = PMU_CAP_FW_WRITES;
	u64 host_perf_cap = 0;

	if (!enable_pmu)
		return 0;

	/* host_perf_cap stays 0 if the host lacks PDCM. */
	if (boot_cpu_has(X86_FEATURE_PDCM))
		rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);

	if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
		x86_perf_get_lbr(&vmx_lbr_caps);

		/*
		 * KVM requires LBR callstack support, as the overhead due to
		 * context switching LBRs without said support is too high.
		 * See intel_pmu_create_guest_lbr_event() for more info.
*/
		if (!vmx_lbr_caps.has_callstack)
			memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
		else if (vmx_lbr_caps.nr)
			/* Advertise the host's LBR format. */
			perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
	}

	if (vmx_pebs_supported()) {
		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;

		/*
		 * Disallow adaptive PEBS as it is functionally broken, can be
		 * used by the guest to read *host* LBRs, and can be used to
		 * bypass userspace event filters.  To correctly and safely
		 * support adaptive PEBS, KVM needs to:
		 *
		 * 1. Account for the ADAPTIVE flag when (re)programming fixed
		 *    counters.
		 *
		 * 2. Gain support from perf (or take direct control of counter
		 *    programming) to support events without adaptive PEBS
		 *    enabled for the hardware counter.
		 *
		 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
		 *    adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
		 *
		 * 4. Document which PMU events are effectively exposed to the
		 *    guest via adaptive PEBS, and make adaptive PEBS mutually
		 *    exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
*/
		perf_cap &= ~PERF_CAP_PEBS_BASELINE;
	}

	return perf_cap;
}

/*
 * Adjust KVM's CPU capabilities for VMX: start from kvm_set_cpu_caps() and
 * then set or clear individual features depending on module parameters and
 * on what the VMX hardware supports.  Also initializes
 * kvm_caps.supported_perf_cap and kvm_caps.supported_xss.
 */
static __init void vmx_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

	/* CPUID 0x1 */
	if (nested)
		kvm_cpu_cap_set(X86_FEATURE_VMX);

	/* CPUID 0x7 */
	if (kvm_mpx_supported())
		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
	if (!cpu_has_vmx_invpcid())
		kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
	if (vmx_pt_mode_is_host_guest())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
	if (vmx_pebs_supported()) {
		kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
		kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
	}

	if (!enable_pmu)
		kvm_cpu_cap_clear(X86_FEATURE_PDCM);
	kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();

	if (!enable_sgx) {
		kvm_cpu_cap_clear(X86_FEATURE_SGX);
		kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
		kvm_cpu_cap_clear(X86_FEATURE_SGX1);
		kvm_cpu_cap_clear(X86_FEATURE_SGX2);
		kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
	}

	if (vmx_umip_emulated())
		kvm_cpu_cap_set(X86_FEATURE_UMIP);

	/* CPUID 0xD.1 */
	kvm_caps.supported_xss = 0;
	if (!cpu_has_vmx_xsaves())
		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

	/* CPUID 0x80000001 and 0x7 (RDPID) */
	if (!cpu_has_vmx_rdtscp()) {
		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
	}

	if (cpu_has_vmx_waitpkg())
		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}

/*
 * Decide, based on vmcs12, whether the emulated I/O instruction described by
 * @info is intercepted.  When the I/O bitmaps are in use, *@exit_qualification
 * is also filled in; on the early-return path it is left untouched.
 */
static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
				  struct x86_instruction_info *info,
				  unsigned long *exit_qualification)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned short port;
	int size;
	bool imm;

	/*
	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
	 * control.
*
	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
	 */
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	/* IN/INS take the port from the source operand, OUT/OUTS from the destination. */
	if (info->intercept == x86_intercept_in ||
	    info->intercept == x86_intercept_ins) {
		port = info->src_val;
		size = info->dst_bytes;
		imm  = info->src_type == OP_IMM;
	} else {
		port = info->dst_val;
		size = info->src_bytes;
		imm  = info->dst_type == OP_IMM;
	}


	/* Port number goes in bits 31:16, access size minus one in the low bits. */
	*exit_qualification = ((unsigned long)port << 16) | (size - 1);

	/* Bit 4: string instruction (INS/OUTS). */
	if (info->intercept == x86_intercept_ins ||
	    info->intercept == x86_intercept_outs)
		*exit_qualification |= BIT(4);

	/* Bit 5: REP prefixed. */
	if (info->rep_prefix)
		*exit_qualification |= BIT(5);

	/* Bit 6: port given as an immediate operand. */
	if (imm)
		*exit_qualification |= BIT(6);

	return nested_vmx_check_io_bitmaps(vcpu, port, size);
}

/*
 * Check whether the instruction being emulated, described by @info, is
 * intercepted according to vmcs12.
 *
 * Returns X86EMUL_CONTINUE if the instruction is not intercepted,
 * X86EMUL_PROPAGATE_FAULT (with @exception filled in) if it must fault,
 * X86EMUL_UNHANDLEABLE for intercept types not handled here or for an
 * implausible instruction length, and X86EMUL_INTERCEPTED after a nested
 * VM-Exit has been performed.  @stage is not consulted in this function.
 */
int vmx_check_intercept(struct kvm_vcpu *vcpu,
			struct x86_instruction_info *info,
			enum x86_intercept_stage stage,
			struct x86_exception *exception)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned long exit_qualification = 0;
	u32 vm_exit_reason;
	u64 exit_insn_len;

	switch (info->intercept) {
	case x86_intercept_rdpid:
		/*
		 * RDPID causes #UD if not enabled through secondary execution
		 * controls (ENABLE_RDTSCP).  Note, the implicit MSR access to
		 * TSC_AUX is NOT subject to interception, i.e. checking only
		 * the dedicated execution control is architecturally correct.
8050 */ 8051 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8052 exception->vector = UD_VECTOR; 8053 exception->error_code_valid = false; 8054 return X86EMUL_PROPAGATE_FAULT; 8055 } 8056 return X86EMUL_CONTINUE; 8057 8058 case x86_intercept_in: 8059 case x86_intercept_ins: 8060 case x86_intercept_out: 8061 case x86_intercept_outs: 8062 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 8063 return X86EMUL_CONTINUE; 8064 8065 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 8066 break; 8067 8068 case x86_intercept_lgdt: 8069 case x86_intercept_lidt: 8070 case x86_intercept_lldt: 8071 case x86_intercept_ltr: 8072 case x86_intercept_sgdt: 8073 case x86_intercept_sidt: 8074 case x86_intercept_sldt: 8075 case x86_intercept_str: 8076 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8077 return X86EMUL_CONTINUE; 8078 8079 if (info->intercept == x86_intercept_lldt || 8080 info->intercept == x86_intercept_ltr || 8081 info->intercept == x86_intercept_sldt || 8082 info->intercept == x86_intercept_str) 8083 vm_exit_reason = EXIT_REASON_LDTR_TR; 8084 else 8085 vm_exit_reason = EXIT_REASON_GDTR_IDTR; 8086 /* 8087 * FIXME: Decode the ModR/M to generate the correct exit 8088 * qualification for memory operands. 8089 */ 8090 break; 8091 8092 case x86_intercept_hlt: 8093 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 8094 return X86EMUL_CONTINUE; 8095 8096 vm_exit_reason = EXIT_REASON_HLT; 8097 break; 8098 8099 case x86_intercept_pause: 8100 /* 8101 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8102 * with vanilla NOPs in the emulator. Apply the interception 8103 * check only to actual PAUSE instructions. Don't check 8104 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8105 * exit, i.e. KVM is within its rights to allow L2 to execute 8106 * the PAUSE. 
8107 */ 8108 if ((info->rep_prefix != REPE_PREFIX) || 8109 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) 8110 return X86EMUL_CONTINUE; 8111 8112 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; 8113 break; 8114 8115 /* TODO: check more intercepts... */ 8116 default: 8117 return X86EMUL_UNHANDLEABLE; 8118 } 8119 8120 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); 8121 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) 8122 return X86EMUL_UNHANDLEABLE; 8123 8124 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, 8125 exit_insn_len); 8126 return X86EMUL_INTERCEPTED; 8127 } 8128 8129 #ifdef CONFIG_X86_64 8130 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8131 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8132 u64 divisor, u64 *result) 8133 { 8134 u64 low = a << shift, high = a >> (64 - shift); 8135 8136 /* To avoid the overflow on divq */ 8137 if (high >= divisor) 8138 return 1; 8139 8140 /* Low hold the result, high hold rem which is discarded */ 8141 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8142 "rm" (divisor), "0" (low), "1" (high)); 8143 *result = low; 8144 8145 return 0; 8146 } 8147 8148 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8149 bool *expired) 8150 { 8151 struct vcpu_vmx *vmx; 8152 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8153 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8154 8155 vmx = to_vmx(vcpu); 8156 tscl = rdtsc(); 8157 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8158 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8159 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8160 ktimer->timer_advance_ns); 8161 8162 if (delta_tsc > lapic_timer_advance_cycles) 8163 delta_tsc -= lapic_timer_advance_cycles; 8164 else 8165 delta_tsc = 0; 8166 8167 /* Convert to host delta tsc if tsc scaling is enabled */ 8168 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8169 delta_tsc && 
u64_shl_div_u64(delta_tsc, 8170 kvm_caps.tsc_scaling_ratio_frac_bits, 8171 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8172 return -ERANGE; 8173 8174 /* 8175 * If the delta tsc can't fit in the 32 bit after the multi shift, 8176 * we can't use the preemption timer. 8177 * It's possible that it fits on later vmentries, but checking 8178 * on every vmentry is costly so we just use an hrtimer. 8179 */ 8180 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8181 return -ERANGE; 8182 8183 vmx->hv_deadline_tsc = tscl + delta_tsc; 8184 *expired = !delta_tsc; 8185 return 0; 8186 } 8187 8188 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8189 { 8190 to_vmx(vcpu)->hv_deadline_tsc = -1; 8191 } 8192 #endif 8193 8194 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8195 { 8196 struct vcpu_vmx *vmx = to_vmx(vcpu); 8197 8198 if (WARN_ON_ONCE(!enable_pml)) 8199 return; 8200 8201 if (is_guest_mode(vcpu)) { 8202 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8203 return; 8204 } 8205 8206 /* 8207 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8208 * code, but in that case another update request will be made and so 8209 * the guest will never run with a stale PML value. 
8210 */ 8211 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8212 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8213 else 8214 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8215 } 8216 8217 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8218 { 8219 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8220 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8221 FEAT_CTL_LMCE_ENABLED; 8222 else 8223 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8224 ~FEAT_CTL_LMCE_ENABLED; 8225 } 8226 8227 #ifdef CONFIG_KVM_SMM 8228 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8229 { 8230 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8231 if (to_vmx(vcpu)->nested.nested_run_pending) 8232 return -EBUSY; 8233 return !is_smm(vcpu); 8234 } 8235 8236 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8237 { 8238 struct vcpu_vmx *vmx = to_vmx(vcpu); 8239 8240 /* 8241 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8242 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8243 * SMI and RSM only modify state that is saved and restored via SMRAM. 8244 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8245 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 
8246 */ 8247 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8248 if (vmx->nested.smm.guest_mode) 8249 nested_vmx_vmexit(vcpu, -1, 0, 0); 8250 8251 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8252 vmx->nested.vmxon = false; 8253 vmx_clear_hlt(vcpu); 8254 return 0; 8255 } 8256 8257 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8258 { 8259 struct vcpu_vmx *vmx = to_vmx(vcpu); 8260 int ret; 8261 8262 if (vmx->nested.smm.vmxon) { 8263 vmx->nested.vmxon = true; 8264 vmx->nested.smm.vmxon = false; 8265 } 8266 8267 if (vmx->nested.smm.guest_mode) { 8268 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8269 if (ret) 8270 return ret; 8271 8272 vmx->nested.nested_run_pending = 1; 8273 vmx->nested.smm.guest_mode = false; 8274 } 8275 return 0; 8276 } 8277 8278 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8279 { 8280 /* RSM will cause a vmexit anyway. */ 8281 } 8282 #endif 8283 8284 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8285 { 8286 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8287 } 8288 8289 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8290 { 8291 if (is_guest_mode(vcpu)) { 8292 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8293 8294 if (hrtimer_try_to_cancel(timer) == 1) 8295 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8296 } 8297 } 8298 8299 void vmx_hardware_unsetup(void) 8300 { 8301 kvm_set_posted_intr_wakeup_handler(NULL); 8302 8303 if (nested) 8304 nested_vmx_hardware_unsetup(); 8305 8306 free_kvm_area(); 8307 } 8308 8309 void vmx_vm_destroy(struct kvm *kvm) 8310 { 8311 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8312 8313 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8314 } 8315 8316 /* 8317 * Note, the SDM states that the linear address is masked *after* the modified 8318 * canonicality check, whereas KVM masks (untags) the address and then performs 8319 * a "normal" canonicality check. 
Functionally, the two methods are identical, 8320 * and when the masking occurs relative to the canonicality check isn't visible 8321 * to software, i.e. KVM's behavior doesn't violate the SDM. 8322 */ 8323 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8324 { 8325 int lam_bit; 8326 unsigned long cr3_bits; 8327 8328 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8329 return gva; 8330 8331 if (!is_64_bit_mode(vcpu)) 8332 return gva; 8333 8334 /* 8335 * Bit 63 determines if the address should be treated as user address 8336 * or a supervisor address. 8337 */ 8338 if (!(gva & BIT_ULL(63))) { 8339 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8340 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8341 return gva; 8342 8343 /* LAM_U48 is ignored if LAM_U57 is set. */ 8344 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8345 } else { 8346 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8347 return gva; 8348 8349 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8350 } 8351 8352 /* 8353 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8354 * Bit 63 is retained from the raw virtual address so that untagging 8355 * doesn't change a user access to a supervisor access, and vice versa. 8356 */ 8357 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8358 } 8359 8360 static unsigned int vmx_handle_intel_pt_intr(void) 8361 { 8362 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8363 8364 /* '0' on failure so that the !PT case can use a RET0 static call. 
*/ 8365 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8366 return 0; 8367 8368 kvm_make_request(KVM_REQ_PMI, vcpu); 8369 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8370 (unsigned long *)&vcpu->arch.pmu.global_status); 8371 return 1; 8372 } 8373 8374 static __init void vmx_setup_user_return_msrs(void) 8375 { 8376 8377 /* 8378 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8379 * will emulate SYSCALL in legacy mode if the vendor string in guest 8380 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8381 * support this emulation, MSR_STAR is included in the list for i386, 8382 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8383 * into hardware and is here purely for emulation purposes. 8384 */ 8385 const u32 vmx_uret_msrs_list[] = { 8386 #ifdef CONFIG_X86_64 8387 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8388 #endif 8389 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8390 MSR_IA32_TSX_CTRL, 8391 }; 8392 int i; 8393 8394 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8395 8396 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8397 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8398 } 8399 8400 static void __init vmx_setup_me_spte_mask(void) 8401 { 8402 u64 me_mask = 0; 8403 8404 /* 8405 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8406 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8407 * boot_cpu_data.x86_phys_bits holds the actual physical address 8408 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8409 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8410 */ 8411 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8412 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8413 kvm_host.maxphyaddr - 1); 8414 8415 /* 8416 * Unlike SME, host kernel doesn't support setting up any 8417 * MKTME KeyID on Intel platforms. No memory encryption 8418 * bits should be included into the SPTE. 
8419 */ 8420 kvm_mmu_set_me_spte_mask(0, me_mask); 8421 } 8422 8423 __init int vmx_hardware_setup(void) 8424 { 8425 unsigned long host_bndcfgs; 8426 struct desc_ptr dt; 8427 int r; 8428 8429 store_idt(&dt); 8430 host_idt_base = dt.address; 8431 8432 vmx_setup_user_return_msrs(); 8433 8434 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8435 return -EIO; 8436 8437 if (boot_cpu_has(X86_FEATURE_NX)) 8438 kvm_enable_efer_bits(EFER_NX); 8439 8440 if (boot_cpu_has(X86_FEATURE_MPX)) { 8441 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 8442 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8443 } 8444 8445 if (!cpu_has_vmx_mpx()) 8446 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8447 XFEATURE_MASK_BNDCSR); 8448 8449 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8450 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8451 enable_vpid = 0; 8452 8453 if (!cpu_has_vmx_ept() || 8454 !cpu_has_vmx_ept_4levels() || 8455 !cpu_has_vmx_ept_mt_wb() || 8456 !cpu_has_vmx_invept_global()) 8457 enable_ept = 0; 8458 8459 /* NX support is required for shadow paging. */ 8460 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8461 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8462 return -EOPNOTSUPP; 8463 } 8464 8465 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8466 enable_ept_ad_bits = 0; 8467 8468 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8469 enable_unrestricted_guest = 0; 8470 8471 if (!cpu_has_vmx_flexpriority()) 8472 flexpriority_enabled = 0; 8473 8474 if (!cpu_has_virtual_nmis()) 8475 enable_vnmi = 0; 8476 8477 #ifdef CONFIG_X86_SGX_KVM 8478 if (!cpu_has_vmx_encls_vmexit()) 8479 enable_sgx = false; 8480 #endif 8481 8482 /* 8483 * set_apic_access_page_addr() is used to reload apic access 8484 * page upon invalidation. No need to do anything if not 8485 * using the APIC_ACCESS_ADDR VMCS field. 
8486 */ 8487 if (!flexpriority_enabled) 8488 vt_x86_ops.set_apic_access_page_addr = NULL; 8489 8490 if (!cpu_has_vmx_tpr_shadow()) 8491 vt_x86_ops.update_cr8_intercept = NULL; 8492 8493 #if IS_ENABLED(CONFIG_HYPERV) 8494 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8495 && enable_ept) { 8496 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8497 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8498 } 8499 #endif 8500 8501 if (!cpu_has_vmx_ple()) { 8502 ple_gap = 0; 8503 ple_window = 0; 8504 ple_window_grow = 0; 8505 ple_window_max = 0; 8506 ple_window_shrink = 0; 8507 } 8508 8509 if (!cpu_has_vmx_apicv()) 8510 enable_apicv = 0; 8511 if (!enable_apicv) 8512 vt_x86_ops.sync_pir_to_irr = NULL; 8513 8514 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8515 enable_ipiv = false; 8516 8517 if (cpu_has_vmx_tsc_scaling()) 8518 kvm_caps.has_tsc_control = true; 8519 8520 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8521 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8522 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8523 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8524 8525 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8526 8527 if (enable_ept) 8528 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8529 cpu_has_vmx_ept_execute_only()); 8530 else 8531 vt_x86_ops.get_mt_mask = NULL; 8532 8533 /* 8534 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8535 * bits to shadow_zero_check. 8536 */ 8537 vmx_setup_me_spte_mask(); 8538 8539 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8540 ept_caps_to_lpage_level(vmx_capability.ept)); 8541 8542 /* 8543 * Only enable PML when hardware supports PML feature, and both EPT 8544 * and EPT A/D bit features are enabled -- PML depends on them to work. 
8545 */ 8546 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8547 enable_pml = 0; 8548 8549 if (!cpu_has_vmx_preemption_timer()) 8550 enable_preemption_timer = false; 8551 8552 if (enable_preemption_timer) { 8553 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8554 8555 cpu_preemption_timer_multi = 8556 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8557 8558 if (tsc_khz) 8559 use_timer_freq = (u64)tsc_khz * 1000; 8560 use_timer_freq >>= cpu_preemption_timer_multi; 8561 8562 /* 8563 * KVM "disables" the preemption timer by setting it to its max 8564 * value. Don't use the timer if it might cause spurious exits 8565 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8566 */ 8567 if (use_timer_freq > 0xffffffffu / 10) 8568 enable_preemption_timer = false; 8569 } 8570 8571 if (!enable_preemption_timer) { 8572 vt_x86_ops.set_hv_timer = NULL; 8573 vt_x86_ops.cancel_hv_timer = NULL; 8574 } 8575 8576 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8577 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8578 8579 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8580 return -EINVAL; 8581 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8582 pt_mode = PT_MODE_SYSTEM; 8583 if (pt_mode == PT_MODE_HOST_GUEST) 8584 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8585 else 8586 vt_init_ops.handle_intel_pt_intr = NULL; 8587 8588 setup_default_sgx_lepubkeyhash(); 8589 8590 if (nested) { 8591 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8592 8593 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8594 if (r) 8595 return r; 8596 } 8597 8598 vmx_set_cpu_caps(); 8599 8600 r = alloc_kvm_area(); 8601 if (r && nested) 8602 nested_vmx_hardware_unsetup(); 8603 8604 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8605 8606 /* 8607 * On Intel CPUs that lack self-snoop feature, letting the guest control 8608 * memory types may result in unexpected behavior. 
So always ignore guest 8609 * PAT on those CPUs and map VM as writeback, not allowing userspace to 8610 * disable the quirk. 8611 * 8612 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is 8613 * supported, UC is slow enough to cause issues with some older guests (e.g. 8614 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to 8615 * map the video RAM, causing wayland desktop to fail to get started 8616 * correctly). To avoid breaking those older guests that rely on KVM to force 8617 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the 8618 * safer (for performance) default behavior. 8619 * 8620 * On top of this, non-coherent DMA devices need the guest to flush CPU 8621 * caches properly. This also requires honoring guest PAT, and is forced 8622 * independent of the quirk in vmx_ignore_guest_pat(). 8623 */ 8624 if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) 8625 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8626 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8627 return r; 8628 } 8629 8630 static void vmx_cleanup_l1d_flush(void) 8631 { 8632 if (vmx_l1d_flush_pages) { 8633 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8634 vmx_l1d_flush_pages = NULL; 8635 } 8636 /* Restore state so sysfs ignores VMX */ 8637 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8638 } 8639 8640 void vmx_exit(void) 8641 { 8642 allow_smaller_maxphyaddr = false; 8643 8644 vmx_cleanup_l1d_flush(); 8645 8646 kvm_x86_vendor_exit(); 8647 } 8648 8649 int __init vmx_init(void) 8650 { 8651 int r, cpu; 8652 8653 if (!kvm_is_vmx_supported()) 8654 return -EOPNOTSUPP; 8655 8656 /* 8657 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8658 * to unwind if a later step fails. 8659 */ 8660 hv_init_evmcs(); 8661 8662 r = kvm_x86_vendor_init(&vt_init_ops); 8663 if (r) 8664 return r; 8665 8666 /* 8667 * Must be called after common x86 init so enable_ept is properly set 8668 * up. 
Hand the parameter mitigation value in which was stored in 8669 * the pre module init parser. If no parameter was given, it will 8670 * contain 'auto' which will be turned into the default 'cond' 8671 * mitigation mode. 8672 */ 8673 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8674 if (r) 8675 goto err_l1d_flush; 8676 8677 for_each_possible_cpu(cpu) { 8678 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8679 8680 pi_init_cpu(cpu); 8681 } 8682 8683 vmx_check_vmcs12_offsets(); 8684 8685 /* 8686 * Shadow paging doesn't have a (further) performance penalty 8687 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8688 * by default 8689 */ 8690 if (!enable_ept) 8691 allow_smaller_maxphyaddr = true; 8692 8693 return 0; 8694 8695 err_l1d_flush: 8696 kvm_x86_vendor_exit(); 8697 return r; 8698 } 8699