1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 #include <linux/entry-kvm.h> 32 33 #include <asm/apic.h> 34 #include <asm/asm.h> 35 #include <asm/cpu.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/fred.h> 42 #include <asm/idtentry.h> 43 #include <asm/io.h> 44 #include <asm/irq_remapping.h> 45 #include <asm/reboot.h> 46 #include <asm/perf_event.h> 47 #include <asm/mmu_context.h> 48 #include <asm/mshyperv.h> 49 #include <asm/mwait.h> 50 #include <asm/spec-ctrl.h> 51 #include <asm/vmx.h> 52 53 #include <trace/events/ipi.h> 54 55 #include "capabilities.h" 56 #include "cpuid.h" 57 #include "hyperv.h" 58 #include "kvm_onhyperv.h" 59 #include "irq.h" 60 #include "kvm_cache_regs.h" 61 #include "lapic.h" 62 #include "mmu.h" 63 #include "nested.h" 64 #include "pmu.h" 65 #include "sgx.h" 66 #include "trace.h" 67 #include "vmcs.h" 68 #include "vmcs12.h" 69 #include "vmx.h" 70 #include "x86.h" 71 #include "x86_ops.h" 72 #include "smm.h" 73 #include "vmx_onhyperv.h" 74 #include "posted_intr.h" 75 76 
MODULE_AUTHOR("Qumranet"); 77 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); 78 MODULE_LICENSE("GPL"); 79 80 #ifdef MODULE 81 static const struct x86_cpu_id vmx_cpu_id[] = { 82 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 83 {} 84 }; 85 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 86 #endif 87 88 bool __read_mostly enable_vpid = 1; 89 module_param_named(vpid, enable_vpid, bool, 0444); 90 91 static bool __read_mostly enable_vnmi = 1; 92 module_param_named(vnmi, enable_vnmi, bool, 0444); 93 94 bool __read_mostly flexpriority_enabled = 1; 95 module_param_named(flexpriority, flexpriority_enabled, bool, 0444); 96 97 bool __read_mostly enable_ept = 1; 98 module_param_named(ept, enable_ept, bool, 0444); 99 100 bool __read_mostly enable_unrestricted_guest = 1; 101 module_param_named(unrestricted_guest, 102 enable_unrestricted_guest, bool, 0444); 103 104 bool __read_mostly enable_ept_ad_bits = 1; 105 module_param_named(eptad, enable_ept_ad_bits, bool, 0444); 106 107 static bool __read_mostly emulate_invalid_guest_state = true; 108 module_param(emulate_invalid_guest_state, bool, 0444); 109 110 static bool __read_mostly fasteoi = 1; 111 module_param(fasteoi, bool, 0444); 112 113 module_param(enable_apicv, bool, 0444); 114 115 bool __read_mostly enable_ipiv = true; 116 module_param(enable_ipiv, bool, 0444); 117 118 /* 119 * If nested=1, nested virtualization is supported, i.e., guests may use 120 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 121 * use VMX instructions. 
122 */ 123 static bool __read_mostly nested = 1; 124 module_param(nested, bool, 0444); 125 126 bool __read_mostly enable_pml = 1; 127 module_param_named(pml, enable_pml, bool, 0444); 128 129 static bool __read_mostly error_on_inconsistent_vmcs_config = true; 130 module_param(error_on_inconsistent_vmcs_config, bool, 0444); 131 132 static bool __read_mostly dump_invalid_vmcs = 0; 133 module_param(dump_invalid_vmcs, bool, 0644); 134 135 #define MSR_BITMAP_MODE_X2APIC 1 136 #define MSR_BITMAP_MODE_X2APIC_APICV 2 137 138 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 139 140 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ 141 static int __read_mostly cpu_preemption_timer_multi; 142 static bool __read_mostly enable_preemption_timer = 1; 143 #ifdef CONFIG_X86_64 144 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 145 #endif 146 147 extern bool __read_mostly allow_smaller_maxphyaddr; 148 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 149 150 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 151 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 152 #define KVM_VM_CR0_ALWAYS_ON \ 153 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 154 155 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 156 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 157 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 158 159 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 160 161 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 162 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 163 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 164 RTIT_STATUS_BYTECNT)) 165 166 /* 167 * List of MSRs that can be directly passed to the guest. 168 * In addition to these x2apic, PT and LBR MSRs are handled specially. 
169 */ 170 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { 171 MSR_IA32_SPEC_CTRL, 172 MSR_IA32_PRED_CMD, 173 MSR_IA32_FLUSH_CMD, 174 MSR_IA32_TSC, 175 #ifdef CONFIG_X86_64 176 MSR_FS_BASE, 177 MSR_GS_BASE, 178 MSR_KERNEL_GS_BASE, 179 MSR_IA32_XFD, 180 MSR_IA32_XFD_ERR, 181 #endif 182 MSR_IA32_SYSENTER_CS, 183 MSR_IA32_SYSENTER_ESP, 184 MSR_IA32_SYSENTER_EIP, 185 MSR_CORE_C1_RES, 186 MSR_CORE_C3_RESIDENCY, 187 MSR_CORE_C6_RESIDENCY, 188 MSR_CORE_C7_RESIDENCY, 189 }; 190 191 /* 192 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 193 * ple_gap: upper bound on the amount of time between two successive 194 * executions of PAUSE in a loop. Also indicate if ple enabled. 195 * According to test, this time is usually smaller than 128 cycles. 196 * ple_window: upper bound on the amount of time a guest is allowed to execute 197 * in a PAUSE loop. Tests indicate that most spinlocks are held for 198 * less than 2^12 cycles 199 * Time is measured based on a counter that runs at the same rate as the TSC, 200 * refer SDM volume 3b section 21.6.13 & 22.1.3. 201 */ 202 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 203 module_param(ple_gap, uint, 0444); 204 205 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 206 module_param(ple_window, uint, 0444); 207 208 /* Default doubles per-vcpu window every exit. */ 209 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 210 module_param(ple_window_grow, uint, 0444); 211 212 /* Default resets per-vcpu window every exit to ple_window. */ 213 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 214 module_param(ple_window_shrink, uint, 0444); 215 216 /* Default is to compute the maximum so we can never overflow. 
*/ 217 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 218 module_param(ple_window_max, uint, 0444); 219 220 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 221 int __read_mostly pt_mode = PT_MODE_SYSTEM; 222 #ifdef CONFIG_BROKEN 223 module_param(pt_mode, int, S_IRUGO); 224 #endif 225 226 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 227 228 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 229 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 230 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 231 232 /* Storage for pre module init parameter parsing */ 233 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 234 235 static const struct { 236 const char *option; 237 bool for_parse; 238 } vmentry_l1d_param[] = { 239 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 240 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 241 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 242 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 243 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 244 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 245 }; 246 247 #define L1D_CACHE_ORDER 4 248 static void *vmx_l1d_flush_pages; 249 250 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 251 { 252 struct page *page; 253 unsigned int i; 254 255 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 256 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 257 return 0; 258 } 259 260 if (!enable_ept) { 261 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 262 return 0; 263 } 264 265 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 266 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 267 return 0; 268 } 269 270 /* If set to auto use the default l1tf mitigation method */ 271 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 272 switch (l1tf_mitigation) { 273 case L1TF_MITIGATION_OFF: 274 l1tf = VMENTER_L1D_FLUSH_NEVER; 275 break; 276 case L1TF_MITIGATION_FLUSH_NOWARN: 277 case 
L1TF_MITIGATION_FLUSH: 278 case L1TF_MITIGATION_FLUSH_NOSMT: 279 l1tf = VMENTER_L1D_FLUSH_COND; 280 break; 281 case L1TF_MITIGATION_FULL: 282 case L1TF_MITIGATION_FULL_FORCE: 283 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 284 break; 285 } 286 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 287 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 288 } 289 290 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 291 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 292 /* 293 * This allocation for vmx_l1d_flush_pages is not tied to a VM 294 * lifetime and so should not be charged to a memcg. 295 */ 296 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 297 if (!page) 298 return -ENOMEM; 299 vmx_l1d_flush_pages = page_address(page); 300 301 /* 302 * Initialize each page with a different pattern in 303 * order to protect against KSM in the nested 304 * virtualization case. 305 */ 306 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 307 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 308 PAGE_SIZE); 309 } 310 } 311 312 l1tf_vmx_mitigation = l1tf; 313 314 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 315 static_branch_enable(&vmx_l1d_should_flush); 316 else 317 static_branch_disable(&vmx_l1d_should_flush); 318 319 if (l1tf == VMENTER_L1D_FLUSH_COND) 320 static_branch_enable(&vmx_l1d_flush_cond); 321 else 322 static_branch_disable(&vmx_l1d_flush_cond); 323 return 0; 324 } 325 326 static int vmentry_l1d_flush_parse(const char *s) 327 { 328 unsigned int i; 329 330 if (s) { 331 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 332 if (vmentry_l1d_param[i].for_parse && 333 sysfs_streq(s, vmentry_l1d_param[i].option)) 334 return i; 335 } 336 } 337 return -EINVAL; 338 } 339 340 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 341 { 342 int l1tf, ret; 343 344 l1tf = vmentry_l1d_flush_parse(s); 345 if (l1tf < 0) 346 return l1tf; 347 348 if (!boot_cpu_has(X86_BUG_L1TF)) 349 return 0; 350 351 /* 352 * Has vmx_init() run already? 
If not then this is the pre init 353 * parameter parsing. In that case just store the value and let 354 * vmx_init() do the proper setup after enable_ept has been 355 * established. 356 */ 357 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 358 vmentry_l1d_flush_param = l1tf; 359 return 0; 360 } 361 362 mutex_lock(&vmx_l1d_flush_mutex); 363 ret = vmx_setup_l1d_flush(l1tf); 364 mutex_unlock(&vmx_l1d_flush_mutex); 365 return ret; 366 } 367 368 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 369 { 370 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 371 return sysfs_emit(s, "???\n"); 372 373 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 374 } 375 376 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 377 { 378 u64 msr; 379 380 if (!vmx->disable_fb_clear) 381 return; 382 383 msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); 384 msr |= FB_CLEAR_DIS; 385 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); 386 /* Cache the MSR value to avoid reading it later */ 387 vmx->msr_ia32_mcu_opt_ctrl = msr; 388 } 389 390 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 391 { 392 if (!vmx->disable_fb_clear) 393 return; 394 395 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 396 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 397 } 398 399 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 400 { 401 /* 402 * Disable VERW's behavior of clearing CPU buffers for the guest if the 403 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 404 * the mitigation. Disabling the clearing behavior provides a 405 * performance boost for guests that aren't aware that manually clearing 406 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 407 * and VM-Exit. 
408 */ 409 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 410 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 411 !boot_cpu_has_bug(X86_BUG_MDS) && 412 !boot_cpu_has_bug(X86_BUG_TAA); 413 414 /* 415 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 416 * at VMEntry. Skip the MSR read/write when a guest has no use case to 417 * execute VERW. 418 */ 419 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 420 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 421 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 422 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 423 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 424 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 425 vmx->disable_fb_clear = false; 426 } 427 428 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 429 .set = vmentry_l1d_flush_set, 430 .get = vmentry_l1d_flush_get, 431 }; 432 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 433 434 static u32 vmx_segment_access_rights(struct kvm_segment *var); 435 436 void vmx_vmexit(void); 437 438 #define vmx_insn_failed(fmt...) 
\ 439 do { \ 440 WARN_ONCE(1, fmt); \ 441 pr_warn_ratelimited(fmt); \ 442 } while (0) 443 444 noinline void vmread_error(unsigned long field) 445 { 446 vmx_insn_failed("vmread failed: field=%lx\n", field); 447 } 448 449 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 450 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 451 { 452 if (fault) { 453 kvm_spurious_fault(); 454 } else { 455 instrumentation_begin(); 456 vmread_error(field); 457 instrumentation_end(); 458 } 459 } 460 #endif 461 462 noinline void vmwrite_error(unsigned long field, unsigned long value) 463 { 464 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 465 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 466 } 467 468 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 469 { 470 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 471 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 472 } 473 474 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 475 { 476 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 477 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 478 } 479 480 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 481 { 482 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 483 ext, vpid, gva); 484 } 485 486 noinline void invept_error(unsigned long ext, u64 eptp) 487 { 488 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 489 } 490 491 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 492 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 493 /* 494 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 495 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
496 */ 497 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 498 499 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 500 static DEFINE_SPINLOCK(vmx_vpid_lock); 501 502 struct vmcs_config vmcs_config __ro_after_init; 503 struct vmx_capability vmx_capability __ro_after_init; 504 505 #define VMX_SEGMENT_FIELD(seg) \ 506 [VCPU_SREG_##seg] = { \ 507 .selector = GUEST_##seg##_SELECTOR, \ 508 .base = GUEST_##seg##_BASE, \ 509 .limit = GUEST_##seg##_LIMIT, \ 510 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 511 } 512 513 static const struct kvm_vmx_segment_field { 514 unsigned selector; 515 unsigned base; 516 unsigned limit; 517 unsigned ar_bytes; 518 } kvm_vmx_segment_fields[] = { 519 VMX_SEGMENT_FIELD(CS), 520 VMX_SEGMENT_FIELD(DS), 521 VMX_SEGMENT_FIELD(ES), 522 VMX_SEGMENT_FIELD(FS), 523 VMX_SEGMENT_FIELD(GS), 524 VMX_SEGMENT_FIELD(SS), 525 VMX_SEGMENT_FIELD(TR), 526 VMX_SEGMENT_FIELD(LDTR), 527 }; 528 529 530 static unsigned long host_idt_base; 531 532 #if IS_ENABLED(CONFIG_HYPERV) 533 static bool __read_mostly enlightened_vmcs = true; 534 module_param(enlightened_vmcs, bool, 0444); 535 536 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 537 { 538 struct hv_enlightened_vmcs *evmcs; 539 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 540 541 if (partition_assist_page == INVALID_PAGE) 542 return -ENOMEM; 543 544 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 545 546 evmcs->partition_assist_page = partition_assist_page; 547 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 548 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 549 550 return 0; 551 } 552 553 static __init void hv_init_evmcs(void) 554 { 555 int cpu; 556 557 if (!enlightened_vmcs) 558 return; 559 560 /* 561 * Enlightened VMCS usage should be recommended and the host needs 562 * to support eVMCS v1 or above. 
563 */ 564 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 565 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 566 KVM_EVMCS_VERSION) { 567 568 /* Check that we have assist pages on all online CPUs */ 569 for_each_online_cpu(cpu) { 570 if (!hv_get_vp_assist_page(cpu)) { 571 enlightened_vmcs = false; 572 break; 573 } 574 } 575 576 if (enlightened_vmcs) { 577 pr_info("Using Hyper-V Enlightened VMCS\n"); 578 static_branch_enable(&__kvm_is_using_evmcs); 579 } 580 581 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 582 vt_x86_ops.enable_l2_tlb_flush 583 = hv_enable_l2_tlb_flush; 584 } else { 585 enlightened_vmcs = false; 586 } 587 } 588 589 static void hv_reset_evmcs(void) 590 { 591 struct hv_vp_assist_page *vp_ap; 592 593 if (!kvm_is_using_evmcs()) 594 return; 595 596 /* 597 * KVM should enable eVMCS if and only if all CPUs have a VP assist 598 * page, and should reject CPU onlining if eVMCS is enabled the CPU 599 * doesn't have a VP assist page allocated. 600 */ 601 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 602 if (WARN_ON_ONCE(!vp_ap)) 603 return; 604 605 /* 606 * Reset everything to support using non-enlightened VMCS access later 607 * (e.g. when we reload the module with enlightened_vmcs=0) 608 */ 609 vp_ap->nested_control.features.directhypercall = 0; 610 vp_ap->current_nested_vmcs = 0; 611 vp_ap->enlighten_vmentry = 0; 612 } 613 614 #else /* IS_ENABLED(CONFIG_HYPERV) */ 615 static void hv_init_evmcs(void) {} 616 static void hv_reset_evmcs(void) {} 617 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 618 619 /* 620 * Comment's format: document - errata name - stepping - processor name. 
621 * Refer from 622 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 623 */ 624 static u32 vmx_preemption_cpu_tfms[] = { 625 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 626 0x000206E6, 627 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 628 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 629 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 630 0x00020652, 631 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 632 0x00020655, 633 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 634 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 635 /* 636 * 320767.pdf - AAP86 - B1 - 637 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 638 */ 639 0x000106E5, 640 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 641 0x000106A0, 642 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 643 0x000106A1, 644 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 645 0x000106A4, 646 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 647 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 648 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 649 0x000106A5, 650 /* Xeon E3-1220 V2 */ 651 0x000306A8, 652 }; 653 654 static inline bool cpu_has_broken_vmx_preemption_timer(void) 655 { 656 u32 eax = cpuid_eax(0x00000001), i; 657 658 /* Clear the reserved bits */ 659 eax &= ~(0x3U << 14 | 0xfU << 28); 660 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 661 if (eax == vmx_preemption_cpu_tfms[i]) 662 return true; 663 664 return false; 665 } 666 667 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 668 { 669 return flexpriority_enabled && lapic_in_kernel(vcpu); 670 } 671 672 static int vmx_get_passthrough_msr_slot(u32 msr) 673 { 674 int i; 675 676 switch (msr) { 677 case 0x800 ... 0x8ff: 678 /* x2APIC MSRs. 
These are handled in vmx_update_msr_bitmap_x2apic() */ 679 return -ENOENT; 680 case MSR_IA32_RTIT_STATUS: 681 case MSR_IA32_RTIT_OUTPUT_BASE: 682 case MSR_IA32_RTIT_OUTPUT_MASK: 683 case MSR_IA32_RTIT_CR3_MATCH: 684 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 685 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 686 case MSR_LBR_SELECT: 687 case MSR_LBR_TOS: 688 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 689 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 690 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 691 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 692 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 693 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ 694 return -ENOENT; 695 } 696 697 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 698 if (vmx_possible_passthrough_msrs[i] == msr) 699 return i; 700 } 701 702 WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 703 return -ENOENT; 704 } 705 706 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 707 { 708 int i; 709 710 i = kvm_find_user_return_msr(msr); 711 if (i >= 0) 712 return &vmx->guest_uret_msrs[i]; 713 return NULL; 714 } 715 716 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 717 struct vmx_uret_msr *msr, u64 data) 718 { 719 unsigned int slot = msr - vmx->guest_uret_msrs; 720 int ret = 0; 721 722 if (msr->load_into_hardware) { 723 preempt_disable(); 724 ret = kvm_set_user_return_msr(slot, data, msr->mask); 725 preempt_enable(); 726 } 727 if (!ret) 728 msr->data = data; 729 return ret; 730 } 731 732 /* 733 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 734 * 735 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 736 * atomically track post-VMXON state, e.g. this may be called in NMI context. 737 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 
738 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 739 * magically in RM, VM86, compat mode, or at CPL>0. 740 */ 741 static int kvm_cpu_vmxoff(void) 742 { 743 asm goto("1: vmxoff\n\t" 744 _ASM_EXTABLE(1b, %l[fault]) 745 ::: "cc", "memory" : fault); 746 747 cr4_clear_bits(X86_CR4_VMXE); 748 return 0; 749 750 fault: 751 cr4_clear_bits(X86_CR4_VMXE); 752 return -EIO; 753 } 754 755 void vmx_emergency_disable_virtualization_cpu(void) 756 { 757 int cpu = raw_smp_processor_id(); 758 struct loaded_vmcs *v; 759 760 kvm_rebooting = true; 761 762 /* 763 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 764 * set in task context. If this races with VMX is disabled by an NMI, 765 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 766 * kvm_rebooting set. 767 */ 768 if (!(__read_cr4() & X86_CR4_VMXE)) 769 return; 770 771 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 772 loaded_vmcss_on_cpu_link) 773 vmcs_clear(v->vmcs); 774 775 kvm_cpu_vmxoff(); 776 } 777 778 static void __loaded_vmcs_clear(void *arg) 779 { 780 struct loaded_vmcs *loaded_vmcs = arg; 781 int cpu = raw_smp_processor_id(); 782 783 if (loaded_vmcs->cpu != cpu) 784 return; /* vcpu migration can race with cpu offline */ 785 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 786 per_cpu(current_vmcs, cpu) = NULL; 787 788 vmcs_clear(loaded_vmcs->vmcs); 789 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 790 vmcs_clear(loaded_vmcs->shadow_vmcs); 791 792 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 793 794 /* 795 * Ensure all writes to loaded_vmcs, including deleting it from its 796 * current percpu list, complete before setting loaded_vmcs->cpu to 797 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 798 * and add loaded_vmcs to its percpu list before it's deleted from this 799 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
800 */ 801 smp_wmb(); 802 803 loaded_vmcs->cpu = -1; 804 loaded_vmcs->launched = 0; 805 } 806 807 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 808 { 809 int cpu = loaded_vmcs->cpu; 810 811 if (cpu != -1) 812 smp_call_function_single(cpu, 813 __loaded_vmcs_clear, loaded_vmcs, 1); 814 } 815 816 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 817 unsigned field) 818 { 819 bool ret; 820 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 821 822 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 823 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 824 vmx->segment_cache.bitmask = 0; 825 } 826 ret = vmx->segment_cache.bitmask & mask; 827 vmx->segment_cache.bitmask |= mask; 828 return ret; 829 } 830 831 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 832 { 833 u16 *p = &vmx->segment_cache.seg[seg].selector; 834 835 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 836 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 837 return *p; 838 } 839 840 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 841 { 842 ulong *p = &vmx->segment_cache.seg[seg].base; 843 844 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 845 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 846 return *p; 847 } 848 849 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 850 { 851 u32 *p = &vmx->segment_cache.seg[seg].limit; 852 853 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 854 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 855 return *p; 856 } 857 858 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 859 { 860 u32 *p = &vmx->segment_cache.seg[seg].ar; 861 862 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 863 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 864 return *p; 865 } 866 867 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 868 { 869 u32 eb; 870 871 eb = (1u << PF_VECTOR) | 
(1u << UD_VECTOR) | (1u << MC_VECTOR) | 872 (1u << DB_VECTOR) | (1u << AC_VECTOR); 873 /* 874 * #VE isn't used for VMX. To test against unexpected changes 875 * related to #VE for VMX, intercept unexpected #VE and warn on it. 876 */ 877 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 878 eb |= 1u << VE_VECTOR; 879 /* 880 * Guest access to VMware backdoor ports could legitimately 881 * trigger #GP because of TSS I/O permission bitmap. 882 * We intercept those #GP and allow access to them anyway 883 * as VMware does. 884 */ 885 if (enable_vmware_backdoor) 886 eb |= (1u << GP_VECTOR); 887 if ((vcpu->guest_debug & 888 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 889 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 890 eb |= 1u << BP_VECTOR; 891 if (to_vmx(vcpu)->rmode.vm86_active) 892 eb = ~0; 893 if (!vmx_need_pf_intercept(vcpu)) 894 eb &= ~(1u << PF_VECTOR); 895 896 /* When we are running a nested L2 guest and L1 specified for it a 897 * certain exception bitmap, we must trap the same exceptions and pass 898 * them to L1. When running L2, we will only handle the exceptions 899 * specified above if L1 did not want them. 900 */ 901 if (is_guest_mode(vcpu)) 902 eb |= get_vmcs12(vcpu)->exception_bitmap; 903 else { 904 int mask = 0, match = 0; 905 906 if (enable_ept && (eb & (1u << PF_VECTOR))) { 907 /* 908 * If EPT is enabled, #PF is currently only intercepted 909 * if MAXPHYADDR is smaller on the guest than on the 910 * host. In that case we only care about present, 911 * non-reserved faults. For vmcs02, however, PFEC_MASK 912 * and PFEC_MATCH are set in prepare_vmcs02_rare. 913 */ 914 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 915 match = PFERR_PRESENT_MASK; 916 } 917 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 918 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 919 } 920 921 /* 922 * Disabling xfd interception indicates that dynamic xfeatures 923 * might be used in the guest. Always trap #NM in this case 924 * to save guest xfd_err timely. 
925 */ 926 if (vcpu->arch.xfd_no_write_intercept) 927 eb |= (1u << NM_VECTOR); 928 929 vmcs_write32(EXCEPTION_BITMAP, eb); 930 } 931 932 /* 933 * Check if MSR is intercepted for currently loaded MSR bitmap. 934 */ 935 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 936 { 937 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 938 return true; 939 940 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 941 } 942 943 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 944 { 945 unsigned int flags = 0; 946 947 if (vmx->loaded_vmcs->launched) 948 flags |= VMX_RUN_VMRESUME; 949 950 /* 951 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 952 * to change it directly without causing a vmexit. In that case read 953 * it after vmexit and store it in vmx->spec_ctrl. 954 */ 955 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 956 flags |= VMX_RUN_SAVE_SPEC_CTRL; 957 958 return flags; 959 } 960 961 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 962 unsigned long entry, unsigned long exit) 963 { 964 vm_entry_controls_clearbit(vmx, entry); 965 vm_exit_controls_clearbit(vmx, exit); 966 } 967 968 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 969 { 970 unsigned int i; 971 972 for (i = 0; i < m->nr; ++i) { 973 if (m->val[i].index == msr) 974 return i; 975 } 976 return -ENOENT; 977 } 978 979 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 980 { 981 int i; 982 struct msr_autoload *m = &vmx->msr_autoload; 983 984 switch (msr) { 985 case MSR_EFER: 986 if (cpu_has_load_ia32_efer()) { 987 clear_atomic_switch_msr_special(vmx, 988 VM_ENTRY_LOAD_IA32_EFER, 989 VM_EXIT_LOAD_IA32_EFER); 990 return; 991 } 992 break; 993 case MSR_CORE_PERF_GLOBAL_CTRL: 994 if (cpu_has_load_perf_global_ctrl()) { 995 clear_atomic_switch_msr_special(vmx, 996 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 997 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 998 return; 999 } 1000 break; 1001 } 
1002 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1003 if (i < 0) 1004 goto skip_guest; 1005 --m->guest.nr; 1006 m->guest.val[i] = m->guest.val[m->guest.nr]; 1007 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1008 1009 skip_guest: 1010 i = vmx_find_loadstore_msr_slot(&m->host, msr); 1011 if (i < 0) 1012 return; 1013 1014 --m->host.nr; 1015 m->host.val[i] = m->host.val[m->host.nr]; 1016 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1017 } 1018 1019 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1020 unsigned long entry, unsigned long exit, 1021 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1022 u64 guest_val, u64 host_val) 1023 { 1024 vmcs_write64(guest_val_vmcs, guest_val); 1025 if (host_val_vmcs != HOST_IA32_EFER) 1026 vmcs_write64(host_val_vmcs, host_val); 1027 vm_entry_controls_setbit(vmx, entry); 1028 vm_exit_controls_setbit(vmx, exit); 1029 } 1030 1031 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1032 u64 guest_val, u64 host_val, bool entry_only) 1033 { 1034 int i, j = 0; 1035 struct msr_autoload *m = &vmx->msr_autoload; 1036 1037 switch (msr) { 1038 case MSR_EFER: 1039 if (cpu_has_load_ia32_efer()) { 1040 add_atomic_switch_msr_special(vmx, 1041 VM_ENTRY_LOAD_IA32_EFER, 1042 VM_EXIT_LOAD_IA32_EFER, 1043 GUEST_IA32_EFER, 1044 HOST_IA32_EFER, 1045 guest_val, host_val); 1046 return; 1047 } 1048 break; 1049 case MSR_CORE_PERF_GLOBAL_CTRL: 1050 if (cpu_has_load_perf_global_ctrl()) { 1051 add_atomic_switch_msr_special(vmx, 1052 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1053 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1054 GUEST_IA32_PERF_GLOBAL_CTRL, 1055 HOST_IA32_PERF_GLOBAL_CTRL, 1056 guest_val, host_val); 1057 return; 1058 } 1059 break; 1060 case MSR_IA32_PEBS_ENABLE: 1061 /* PEBS needs a quiescent period after being disabled (to write 1062 * a record). Disabling PEBS through VMX MSR swapping doesn't 1063 * provide that period, so a CPU could write host's record into 1064 * guest's memory. 
1065 */ 1066 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 1067 } 1068 1069 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1070 if (!entry_only) 1071 j = vmx_find_loadstore_msr_slot(&m->host, msr); 1072 1073 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1074 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1075 printk_once(KERN_WARNING "Not enough msr switch entries. " 1076 "Can't add msr %x\n", msr); 1077 return; 1078 } 1079 if (i < 0) { 1080 i = m->guest.nr++; 1081 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1082 } 1083 m->guest.val[i].index = msr; 1084 m->guest.val[i].value = guest_val; 1085 1086 if (entry_only) 1087 return; 1088 1089 if (j < 0) { 1090 j = m->host.nr++; 1091 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1092 } 1093 m->host.val[j].index = msr; 1094 m->host.val[j].value = host_val; 1095 } 1096 1097 static bool update_transition_efer(struct vcpu_vmx *vmx) 1098 { 1099 u64 guest_efer = vmx->vcpu.arch.efer; 1100 u64 ignore_bits = 0; 1101 int i; 1102 1103 /* Shadow paging assumes NX to be available. */ 1104 if (!enable_ept) 1105 guest_efer |= EFER_NX; 1106 1107 /* 1108 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1109 */ 1110 ignore_bits |= EFER_SCE; 1111 #ifdef CONFIG_X86_64 1112 ignore_bits |= EFER_LMA | EFER_LME; 1113 /* SCE is meaningful only in long mode on Intel */ 1114 if (guest_efer & EFER_LMA) 1115 ignore_bits &= ~(u64)EFER_SCE; 1116 #endif 1117 1118 /* 1119 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1120 * On CPUs that support "load IA32_EFER", always switch EFER 1121 * atomically, since it's faster than switching it manually. 
1122 */ 1123 if (cpu_has_load_ia32_efer() || 1124 (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { 1125 if (!(guest_efer & EFER_LMA)) 1126 guest_efer &= ~EFER_LME; 1127 if (guest_efer != kvm_host.efer) 1128 add_atomic_switch_msr(vmx, MSR_EFER, 1129 guest_efer, kvm_host.efer, false); 1130 else 1131 clear_atomic_switch_msr(vmx, MSR_EFER); 1132 return false; 1133 } 1134 1135 i = kvm_find_user_return_msr(MSR_EFER); 1136 if (i < 0) 1137 return false; 1138 1139 clear_atomic_switch_msr(vmx, MSR_EFER); 1140 1141 guest_efer &= ~ignore_bits; 1142 guest_efer |= kvm_host.efer & ignore_bits; 1143 1144 vmx->guest_uret_msrs[i].data = guest_efer; 1145 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1146 1147 return true; 1148 } 1149 1150 #ifdef CONFIG_X86_32 1151 /* 1152 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1153 * VMCS rather than the segment table. KVM uses this helper to figure 1154 * out the current bases to poke them into the VMCS before entry. 1155 */ 1156 static unsigned long segment_base(u16 selector) 1157 { 1158 struct desc_struct *table; 1159 unsigned long v; 1160 1161 if (!(selector & ~SEGMENT_RPL_MASK)) 1162 return 0; 1163 1164 table = get_current_gdt_ro(); 1165 1166 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1167 u16 ldt_selector = kvm_read_ldt(); 1168 1169 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1170 return 0; 1171 1172 table = (struct desc_struct *)segment_base(ldt_selector); 1173 } 1174 v = get_desc_base(&table[selector >> 3]); 1175 return v; 1176 } 1177 #endif 1178 1179 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1180 { 1181 return vmx_pt_mode_is_host_guest() && 1182 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1183 } 1184 1185 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1186 { 1187 /* The base must be 128-byte aligned and a legal physical address. 
*/ 1188 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1189 } 1190 1191 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1192 { 1193 u32 i; 1194 1195 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1196 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1197 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1198 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1199 for (i = 0; i < addr_range; i++) { 1200 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1201 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1202 } 1203 } 1204 1205 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1206 { 1207 u32 i; 1208 1209 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1210 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1211 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1212 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1213 for (i = 0; i < addr_range; i++) { 1214 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1215 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1216 } 1217 } 1218 1219 static void pt_guest_enter(struct vcpu_vmx *vmx) 1220 { 1221 if (vmx_pt_mode_is_system()) 1222 return; 1223 1224 /* 1225 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1226 * Save host state before VM entry. 
1227 */ 1228 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1229 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1230 wrmsrl(MSR_IA32_RTIT_CTL, 0); 1231 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1232 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1233 } 1234 } 1235 1236 static void pt_guest_exit(struct vcpu_vmx *vmx) 1237 { 1238 if (vmx_pt_mode_is_system()) 1239 return; 1240 1241 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1242 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1243 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1244 } 1245 1246 /* 1247 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1248 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 1249 */ 1250 if (vmx->pt_desc.host.ctl) 1251 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1252 } 1253 1254 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1255 unsigned long fs_base, unsigned long gs_base) 1256 { 1257 if (unlikely(fs_sel != host->fs_sel)) { 1258 if (!(fs_sel & 7)) 1259 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1260 else 1261 vmcs_write16(HOST_FS_SELECTOR, 0); 1262 host->fs_sel = fs_sel; 1263 } 1264 if (unlikely(gs_sel != host->gs_sel)) { 1265 if (!(gs_sel & 7)) 1266 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1267 else 1268 vmcs_write16(HOST_GS_SELECTOR, 0); 1269 host->gs_sel = gs_sel; 1270 } 1271 if (unlikely(fs_base != host->fs_base)) { 1272 vmcs_writel(HOST_FS_BASE, fs_base); 1273 host->fs_base = fs_base; 1274 } 1275 if (unlikely(gs_base != host->gs_base)) { 1276 vmcs_writel(HOST_GS_BASE, gs_base); 1277 host->gs_base = gs_base; 1278 } 1279 } 1280 1281 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1282 { 1283 struct vcpu_vmx *vmx = to_vmx(vcpu); 1284 struct vmcs_host_state *host_state; 1285 #ifdef CONFIG_X86_64 1286 int cpu = raw_smp_processor_id(); 1287 #endif 1288 unsigned long fs_base, gs_base; 1289 u16 fs_sel, gs_sel; 1290 
int i; 1291 1292 /* 1293 * Note that guest MSRs to be saved/restored can also be changed 1294 * when guest state is loaded. This happens when guest transitions 1295 * to/from long-mode by setting MSR_EFER.LMA. 1296 */ 1297 if (!vmx->guest_uret_msrs_loaded) { 1298 vmx->guest_uret_msrs_loaded = true; 1299 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1300 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1301 continue; 1302 1303 kvm_set_user_return_msr(i, 1304 vmx->guest_uret_msrs[i].data, 1305 vmx->guest_uret_msrs[i].mask); 1306 } 1307 } 1308 1309 if (vmx->nested.need_vmcs12_to_shadow_sync) 1310 nested_sync_vmcs12_to_shadow(vcpu); 1311 1312 if (vmx->guest_state_loaded) 1313 return; 1314 1315 host_state = &vmx->loaded_vmcs->host_state; 1316 1317 /* 1318 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1319 * allow segment selectors with cpl > 0 or ti == 1. 1320 */ 1321 host_state->ldt_sel = kvm_read_ldt(); 1322 1323 #ifdef CONFIG_X86_64 1324 savesegment(ds, host_state->ds_sel); 1325 savesegment(es, host_state->es_sel); 1326 1327 gs_base = cpu_kernelmode_gs_base(cpu); 1328 if (likely(is_64bit_mm(current->mm))) { 1329 current_save_fsgs(); 1330 fs_sel = current->thread.fsindex; 1331 gs_sel = current->thread.gsindex; 1332 fs_base = current->thread.fsbase; 1333 vmx->msr_host_kernel_gs_base = current->thread.gsbase; 1334 } else { 1335 savesegment(fs, fs_sel); 1336 savesegment(gs, gs_sel); 1337 fs_base = read_msr(MSR_FS_BASE); 1338 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1339 } 1340 1341 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1342 #else 1343 savesegment(fs, fs_sel); 1344 savesegment(gs, gs_sel); 1345 fs_base = segment_base(fs_sel); 1346 gs_base = segment_base(gs_sel); 1347 #endif 1348 1349 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1350 vmx->guest_state_loaded = true; 1351 } 1352 1353 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1354 { 1355 struct vmcs_host_state *host_state; 1356 1357 
if (!vmx->guest_state_loaded) 1358 return; 1359 1360 host_state = &vmx->loaded_vmcs->host_state; 1361 1362 ++vmx->vcpu.stat.host_state_reload; 1363 1364 #ifdef CONFIG_X86_64 1365 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1366 #endif 1367 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1368 kvm_load_ldt(host_state->ldt_sel); 1369 #ifdef CONFIG_X86_64 1370 load_gs_index(host_state->gs_sel); 1371 #else 1372 loadsegment(gs, host_state->gs_sel); 1373 #endif 1374 } 1375 if (host_state->fs_sel & 7) 1376 loadsegment(fs, host_state->fs_sel); 1377 #ifdef CONFIG_X86_64 1378 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1379 loadsegment(ds, host_state->ds_sel); 1380 loadsegment(es, host_state->es_sel); 1381 } 1382 #endif 1383 invalidate_tss_limit(); 1384 #ifdef CONFIG_X86_64 1385 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1386 #endif 1387 load_fixmap_gdt(raw_smp_processor_id()); 1388 vmx->guest_state_loaded = false; 1389 vmx->guest_uret_msrs_loaded = false; 1390 } 1391 1392 #ifdef CONFIG_X86_64 1393 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1394 { 1395 preempt_disable(); 1396 if (vmx->guest_state_loaded) 1397 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1398 preempt_enable(); 1399 return vmx->msr_guest_kernel_gs_base; 1400 } 1401 1402 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 1403 { 1404 preempt_disable(); 1405 if (vmx->guest_state_loaded) 1406 wrmsrl(MSR_KERNEL_GS_BASE, data); 1407 preempt_enable(); 1408 vmx->msr_guest_kernel_gs_base = data; 1409 } 1410 #endif 1411 1412 static void grow_ple_window(struct kvm_vcpu *vcpu) 1413 { 1414 struct vcpu_vmx *vmx = to_vmx(vcpu); 1415 unsigned int old = vmx->ple_window; 1416 1417 vmx->ple_window = __grow_ple_window(old, ple_window, 1418 ple_window_grow, 1419 ple_window_max); 1420 1421 if (vmx->ple_window != old) { 1422 vmx->ple_window_dirty = true; 1423 trace_kvm_ple_window_update(vcpu->vcpu_id, 1424 vmx->ple_window, 
old); 1425 } 1426 } 1427 1428 static void shrink_ple_window(struct kvm_vcpu *vcpu) 1429 { 1430 struct vcpu_vmx *vmx = to_vmx(vcpu); 1431 unsigned int old = vmx->ple_window; 1432 1433 vmx->ple_window = __shrink_ple_window(old, ple_window, 1434 ple_window_shrink, 1435 ple_window); 1436 1437 if (vmx->ple_window != old) { 1438 vmx->ple_window_dirty = true; 1439 trace_kvm_ple_window_update(vcpu->vcpu_id, 1440 vmx->ple_window, old); 1441 } 1442 } 1443 1444 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, 1445 struct loaded_vmcs *buddy) 1446 { 1447 struct vcpu_vmx *vmx = to_vmx(vcpu); 1448 bool already_loaded = vmx->loaded_vmcs->cpu == cpu; 1449 struct vmcs *prev; 1450 1451 if (!already_loaded) { 1452 loaded_vmcs_clear(vmx->loaded_vmcs); 1453 local_irq_disable(); 1454 1455 /* 1456 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to 1457 * this cpu's percpu list, otherwise it may not yet be deleted 1458 * from its previous cpu's percpu list. Pairs with the 1459 * smb_wmb() in __loaded_vmcs_clear(). 1460 */ 1461 smp_rmb(); 1462 1463 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1464 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1465 local_irq_enable(); 1466 } 1467 1468 prev = per_cpu(current_vmcs, cpu); 1469 if (prev != vmx->loaded_vmcs->vmcs) { 1470 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1471 vmcs_load(vmx->loaded_vmcs->vmcs); 1472 1473 /* 1474 * No indirect branch prediction barrier needed when switching 1475 * the active VMCS within a vCPU, unless IBRS is advertised to 1476 * the vCPU. To minimize the number of IBPBs executed, KVM 1477 * performs IBPB on nested VM-Exit (a single nested transition 1478 * may switch the active VMCS multiple times). 
1479 */ 1480 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) 1481 indirect_branch_prediction_barrier(); 1482 } 1483 1484 if (!already_loaded) { 1485 void *gdt = get_current_gdt_ro(); 1486 1487 /* 1488 * Flush all EPTP/VPID contexts, the new pCPU may have stale 1489 * TLB entries from its previous association with the vCPU. 1490 */ 1491 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1492 1493 /* 1494 * Linux uses per-cpu TSS and GDT, so set these when switching 1495 * processors. See 22.2.4. 1496 */ 1497 vmcs_writel(HOST_TR_BASE, 1498 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1499 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1500 1501 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { 1502 /* 22.2.3 */ 1503 vmcs_writel(HOST_IA32_SYSENTER_ESP, 1504 (unsigned long)(cpu_entry_stack(cpu) + 1)); 1505 } 1506 1507 vmx->loaded_vmcs->cpu = cpu; 1508 } 1509 } 1510 1511 /* 1512 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1513 * vcpu mutex is already taken. 
1514 */ 1515 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1516 { 1517 struct vcpu_vmx *vmx = to_vmx(vcpu); 1518 1519 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1520 shrink_ple_window(vcpu); 1521 1522 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1523 1524 vmx_vcpu_pi_load(vcpu, cpu); 1525 1526 vmx->host_debugctlmsr = get_debugctlmsr(); 1527 } 1528 1529 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1530 { 1531 vmx_vcpu_pi_put(vcpu); 1532 1533 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1534 } 1535 1536 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1537 { 1538 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1539 } 1540 1541 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1542 { 1543 struct vcpu_vmx *vmx = to_vmx(vcpu); 1544 unsigned long rflags, save_rflags; 1545 1546 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1547 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1548 rflags = vmcs_readl(GUEST_RFLAGS); 1549 if (vmx->rmode.vm86_active) { 1550 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1551 save_rflags = vmx->rmode.save_rflags; 1552 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1553 } 1554 vmx->rflags = rflags; 1555 } 1556 return vmx->rflags; 1557 } 1558 1559 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1560 { 1561 struct vcpu_vmx *vmx = to_vmx(vcpu); 1562 unsigned long old_rflags; 1563 1564 /* 1565 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1566 * is an unrestricted guest in order to mark L2 as needing emulation 1567 * if L1 runs L2 as a restricted guest. 
1568 */ 1569 if (is_unrestricted_guest(vcpu)) { 1570 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1571 vmx->rflags = rflags; 1572 vmcs_writel(GUEST_RFLAGS, rflags); 1573 return; 1574 } 1575 1576 old_rflags = vmx_get_rflags(vcpu); 1577 vmx->rflags = rflags; 1578 if (vmx->rmode.vm86_active) { 1579 vmx->rmode.save_rflags = rflags; 1580 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1581 } 1582 vmcs_writel(GUEST_RFLAGS, rflags); 1583 1584 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1585 vmx->emulation_required = vmx_emulation_required(vcpu); 1586 } 1587 1588 bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1589 { 1590 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1591 } 1592 1593 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1594 { 1595 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1596 int ret = 0; 1597 1598 if (interruptibility & GUEST_INTR_STATE_STI) 1599 ret |= KVM_X86_SHADOW_INT_STI; 1600 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1601 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1602 1603 return ret; 1604 } 1605 1606 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1607 { 1608 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1609 u32 interruptibility = interruptibility_old; 1610 1611 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1612 1613 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1614 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1615 else if (mask & KVM_X86_SHADOW_INT_STI) 1616 interruptibility |= GUEST_INTR_STATE_STI; 1617 1618 if ((interruptibility != interruptibility_old)) 1619 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1620 } 1621 1622 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1623 { 1624 struct vcpu_vmx *vmx = to_vmx(vcpu); 1625 unsigned long value; 1626 1627 /* 1628 * Any MSR write that attempts to change bits marked reserved will 1629 * case a #GP fault. 
1630 */ 1631 if (data & vmx->pt_desc.ctl_bitmask) 1632 return 1; 1633 1634 /* 1635 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1636 * result in a #GP unless the same write also clears TraceEn. 1637 */ 1638 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1639 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) 1640 return 1; 1641 1642 /* 1643 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1644 * and FabricEn would cause #GP, if 1645 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1646 */ 1647 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1648 !(data & RTIT_CTL_FABRIC_EN) && 1649 !intel_pt_validate_cap(vmx->pt_desc.caps, 1650 PT_CAP_single_range_output)) 1651 return 1; 1652 1653 /* 1654 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1655 * utilize encodings marked reserved will cause a #GP fault. 1656 */ 1657 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1658 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1659 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1660 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1661 return 1; 1662 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1663 PT_CAP_cycle_thresholds); 1664 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1665 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1666 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1667 return 1; 1668 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1669 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1670 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1671 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1672 return 1; 1673 1674 /* 1675 * If ADDRx_CFG is reserved or the encodings is >2 will 1676 * cause a #GP fault. 
1677 */ 1678 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1679 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1680 return 1; 1681 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1682 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1683 return 1; 1684 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1685 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1686 return 1; 1687 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1688 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1689 return 1; 1690 1691 return 0; 1692 } 1693 1694 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1695 void *insn, int insn_len) 1696 { 1697 /* 1698 * Emulation of instructions in SGX enclaves is impossible as RIP does 1699 * not point at the failing instruction, and even if it did, the code 1700 * stream is inaccessible. Inject #UD instead of exiting to userspace 1701 * so that guest userspace can't DoS the guest simply by triggering 1702 * emulation (enclaves are CPL3 only). 1703 */ 1704 if (to_vmx(vcpu)->exit_reason.enclave_mode) { 1705 kvm_queue_exception(vcpu, UD_VECTOR); 1706 return X86EMUL_PROPAGATE_FAULT; 1707 } 1708 1709 /* Check that emulation is possible during event vectoring */ 1710 if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 1711 !kvm_can_emulate_event_vectoring(emul_type)) 1712 return X86EMUL_UNHANDLEABLE_VECTORING; 1713 1714 return X86EMUL_CONTINUE; 1715 } 1716 1717 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1718 { 1719 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; 1720 unsigned long rip, orig_rip; 1721 u32 instr_len; 1722 1723 /* 1724 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1725 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1726 * set when EPT misconfig occurs. 
In practice, real hardware updates 1727 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1728 * (namely Hyper-V) don't set it due to it being undefined behavior, 1729 * i.e. we end up advancing IP with some random value. 1730 */ 1731 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1732 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1733 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1734 1735 /* 1736 * Emulating an enclave's instructions isn't supported as KVM 1737 * cannot access the enclave's memory or its true RIP, e.g. the 1738 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1739 * the RIP that actually triggered the VM-Exit. But, because 1740 * most instructions that cause VM-Exit will #UD in an enclave, 1741 * most instruction-based VM-Exits simply do not occur. 1742 * 1743 * There are a few exceptions, notably the debug instructions 1744 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1745 * and generate #DB/#BP as expected, which KVM might intercept. 1746 * But again, the CPU does the dirty work and saves an instr 1747 * length of zero so VMMs don't shoot themselves in the foot. 1748 * WARN if KVM tries to skip a non-zero length instruction on 1749 * a VM-Exit from an enclave. 1750 */ 1751 if (!instr_len) 1752 goto rip_updated; 1753 1754 WARN_ONCE(exit_reason.enclave_mode, 1755 "skipping instruction after SGX enclave VM-Exit"); 1756 1757 orig_rip = kvm_rip_read(vcpu); 1758 rip = orig_rip + instr_len; 1759 #ifdef CONFIG_X86_64 1760 /* 1761 * We need to mask out the high 32 bits of RIP if not in 64-bit 1762 * mode, but just finding out that we are in 64-bit mode is 1763 * quite expensive. Only do it if there was a carry. 
1764 */ 1765 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1766 rip = (u32)rip; 1767 #endif 1768 kvm_rip_write(vcpu, rip); 1769 } else { 1770 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1771 return 0; 1772 } 1773 1774 rip_updated: 1775 /* skipping an emulated instruction also counts */ 1776 vmx_set_interrupt_shadow(vcpu, 0); 1777 1778 return 1; 1779 } 1780 1781 /* 1782 * Recognizes a pending MTF VM-exit and records the nested state for later 1783 * delivery. 1784 */ 1785 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1786 { 1787 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1788 struct vcpu_vmx *vmx = to_vmx(vcpu); 1789 1790 if (!is_guest_mode(vcpu)) 1791 return; 1792 1793 /* 1794 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1795 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1796 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1797 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1798 * as ICEBP is higher priority than both. As instruction emulation is 1799 * completed at this point (i.e. KVM is at the instruction boundary), 1800 * any #DB exception pending delivery must be a debug-trap of lower 1801 * priority than MTF. Record the pending MTF state to be delivered in 1802 * vmx_check_nested_events(). 
1803 */ 1804 if (nested_cpu_has_mtf(vmcs12) && 1805 (!vcpu->arch.exception.pending || 1806 vcpu->arch.exception.vector == DB_VECTOR) && 1807 (!vcpu->arch.exception_vmexit.pending || 1808 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1809 vmx->nested.mtf_pending = true; 1810 kvm_make_request(KVM_REQ_EVENT, vcpu); 1811 } else { 1812 vmx->nested.mtf_pending = false; 1813 } 1814 } 1815 1816 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1817 { 1818 vmx_update_emulated_instruction(vcpu); 1819 return skip_emulated_instruction(vcpu); 1820 } 1821 1822 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1823 { 1824 /* 1825 * Ensure that we clear the HLT state in the VMCS. We don't need to 1826 * explicitly skip the instruction because if the HLT state is set, 1827 * then the instruction is already executing and RIP has already been 1828 * advanced. 1829 */ 1830 if (kvm_hlt_in_guest(vcpu->kvm) && 1831 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1832 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1833 } 1834 1835 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1836 { 1837 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1838 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1839 struct vcpu_vmx *vmx = to_vmx(vcpu); 1840 1841 kvm_deliver_exception_payload(vcpu, ex); 1842 1843 if (ex->has_error_code) { 1844 /* 1845 * Despite the error code being architecturally defined as 32 1846 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1847 * VMX don't actually supporting setting bits 31:16. Hardware 1848 * will (should) never provide a bogus error code, but AMD CPUs 1849 * do generate error codes with bits 31:16 set, and so KVM's 1850 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1851 * the upper bits to avoid VM-Fail, losing information that 1852 * doesn't really exist is preferable to killing the VM. 
1853 */ 1854 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1855 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1856 } 1857 1858 if (vmx->rmode.vm86_active) { 1859 int inc_eip = 0; 1860 if (kvm_exception_is_soft(ex->vector)) 1861 inc_eip = vcpu->arch.event_exit_inst_len; 1862 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1863 return; 1864 } 1865 1866 WARN_ON_ONCE(vmx->emulation_required); 1867 1868 if (kvm_exception_is_soft(ex->vector)) { 1869 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1870 vmx->vcpu.arch.event_exit_inst_len); 1871 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1872 } else 1873 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1874 1875 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1876 1877 vmx_clear_hlt(vcpu); 1878 } 1879 1880 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1881 bool load_into_hardware) 1882 { 1883 struct vmx_uret_msr *uret_msr; 1884 1885 uret_msr = vmx_find_uret_msr(vmx, msr); 1886 if (!uret_msr) 1887 return; 1888 1889 uret_msr->load_into_hardware = load_into_hardware; 1890 } 1891 1892 /* 1893 * Configuring user return MSRs to automatically save, load, and restore MSRs 1894 * that need to be shoved into hardware when running the guest. Note, omitting 1895 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1896 * loaded into hardware when running the guest. 1897 */ 1898 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1899 { 1900 #ifdef CONFIG_X86_64 1901 bool load_syscall_msrs; 1902 1903 /* 1904 * The SYSCALL MSRs are only needed on long mode guests, and only 1905 * when EFER.SCE is set. 
1906 */ 1907 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1908 (vmx->vcpu.arch.efer & EFER_SCE); 1909 1910 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1911 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1912 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1913 #endif 1914 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1915 1916 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1917 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1918 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1919 1920 /* 1921 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1922 * kernel and old userspace. If those guests run on a tsx=off host, do 1923 * allow guests to use TSX_CTRL, but don't change the value in hardware 1924 * so that TSX remains always disabled. 1925 */ 1926 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1927 1928 /* 1929 * The set of MSRs to load may have changed, reload MSRs before the 1930 * next VM-Enter. 
1931 */ 1932 vmx->guest_uret_msrs_loaded = false; 1933 } 1934 1935 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1936 { 1937 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1938 1939 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1940 return vmcs12->tsc_offset; 1941 1942 return 0; 1943 } 1944 1945 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1946 { 1947 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1948 1949 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1950 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1951 return vmcs12->tsc_multiplier; 1952 1953 return kvm_caps.default_tsc_scaling_ratio; 1954 } 1955 1956 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1957 { 1958 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1959 } 1960 1961 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1962 { 1963 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1964 } 1965 1966 /* 1967 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1968 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1969 * backwards compatibility even though KVM doesn't support emulating SMX. And 1970 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1971 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1972 */ 1973 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1974 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1975 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1976 FEAT_CTL_SGX_LC_ENABLED | \ 1977 FEAT_CTL_SGX_ENABLED | \ 1978 FEAT_CTL_LMCE_ENABLED) 1979 1980 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1981 struct msr_data *msr) 1982 { 1983 uint64_t valid_bits; 1984 1985 /* 1986 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1987 * exposed to the guest. 
1988 */ 1989 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1990 ~KVM_SUPPORTED_FEATURE_CONTROL); 1991 1992 if (!msr->host_initiated && 1993 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1994 return false; 1995 1996 if (msr->host_initiated) 1997 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 1998 else 1999 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 2000 2001 return !(msr->data & ~valid_bits); 2002 } 2003 2004 int vmx_get_feature_msr(u32 msr, u64 *data) 2005 { 2006 switch (msr) { 2007 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2008 if (!nested) 2009 return 1; 2010 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2011 default: 2012 return KVM_MSR_RET_UNSUPPORTED; 2013 } 2014 } 2015 2016 /* 2017 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 2018 * Returns 0 on success, non-0 otherwise. 2019 * Assumes vcpu_load() was already called. 2020 */ 2021 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2022 { 2023 struct vcpu_vmx *vmx = to_vmx(vcpu); 2024 struct vmx_uret_msr *msr; 2025 u32 index; 2026 2027 switch (msr_info->index) { 2028 #ifdef CONFIG_X86_64 2029 case MSR_FS_BASE: 2030 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2031 break; 2032 case MSR_GS_BASE: 2033 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2034 break; 2035 case MSR_KERNEL_GS_BASE: 2036 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2037 break; 2038 #endif 2039 case MSR_EFER: 2040 return kvm_get_msr_common(vcpu, msr_info); 2041 case MSR_IA32_TSX_CTRL: 2042 if (!msr_info->host_initiated && 2043 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2044 return 1; 2045 goto find_uret_msr; 2046 case MSR_IA32_UMWAIT_CONTROL: 2047 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2048 return 1; 2049 2050 msr_info->data = vmx->msr_ia32_umwait_control; 2051 break; 2052 case MSR_IA32_SPEC_CTRL: 2053 if (!msr_info->host_initiated && 2054 !guest_has_spec_ctrl_msr(vcpu)) 2055 return 1; 2056 2057 msr_info->data = 
to_vmx(vcpu)->spec_ctrl; 2058 break; 2059 case MSR_IA32_SYSENTER_CS: 2060 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2061 break; 2062 case MSR_IA32_SYSENTER_EIP: 2063 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2064 break; 2065 case MSR_IA32_SYSENTER_ESP: 2066 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2067 break; 2068 /* Fails when kvm_mpx_supported() is false; guest reads additionally need X86_FEATURE_MPX in the guest CPU caps. */ case MSR_IA32_BNDCFGS: 2069 if (!kvm_mpx_supported() || 2070 (!msr_info->host_initiated && 2071 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2072 return 1; 2073 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2074 break; 2075 /* Guest reads require FEAT_CTL_LMCE_ENABLED in the cached IA32_FEATURE_CONTROL value. */ case MSR_IA32_MCG_EXT_CTL: 2076 if (!msr_info->host_initiated && 2077 !(vmx->msr_ia32_feature_control & 2078 FEAT_CTL_LMCE_ENABLED)) 2079 return 1; 2080 msr_info->data = vcpu->arch.mcg_ext_ctl; 2081 break; 2082 case MSR_IA32_FEAT_CTL: 2083 msr_info->data = vmx->msr_ia32_feature_control; 2084 break; 2085 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2086 if (!msr_info->host_initiated && 2087 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 2088 return 1; 2089 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2090 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2091 break; 2092 /* VMX capability MSRs are read from vmx->nested.msrs and fail unless the guest CPU caps include X86_FEATURE_VMX. */ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2093 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2094 return 1; 2095 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2096 &msr_info->data)) 2097 return 1; 2098 #ifdef CONFIG_KVM_HYPERV 2099 /* 2100 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2101 * instead of just ignoring the features, different Hyper-V 2102 * versions are either trying to use them and fail or do some 2103 * sanity checking and refuse to boot. Filter all unsupported 2104 * features out. 
2105 */ 2106 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) 2107 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2108 &msr_info->data); 2109 #endif 2110 break; 2111 /* Every Intel PT (RTIT) MSR read below fails unless vmx_pt_mode_is_host_guest() is true; several also require a matching capability in vmx->pt_desc.caps. The values come from the cached vmx->pt_desc.guest state. */ case MSR_IA32_RTIT_CTL: 2112 if (!vmx_pt_mode_is_host_guest()) 2113 return 1; 2114 msr_info->data = vmx->pt_desc.guest.ctl; 2115 break; 2116 case MSR_IA32_RTIT_STATUS: 2117 if (!vmx_pt_mode_is_host_guest()) 2118 return 1; 2119 msr_info->data = vmx->pt_desc.guest.status; 2120 break; 2121 case MSR_IA32_RTIT_CR3_MATCH: 2122 if (!vmx_pt_mode_is_host_guest() || 2123 !intel_pt_validate_cap(vmx->pt_desc.caps, 2124 PT_CAP_cr3_filtering)) 2125 return 1; 2126 msr_info->data = vmx->pt_desc.guest.cr3_match; 2127 break; 2128 /* OUTPUT_BASE and OUTPUT_MASK need at least one of the ToPA or single-range output capabilities. */ case MSR_IA32_RTIT_OUTPUT_BASE: 2129 if (!vmx_pt_mode_is_host_guest() || 2130 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2131 PT_CAP_topa_output) && 2132 !intel_pt_validate_cap(vmx->pt_desc.caps, 2133 PT_CAP_single_range_output))) 2134 return 1; 2135 msr_info->data = vmx->pt_desc.guest.output_base; 2136 break; 2137 case MSR_IA32_RTIT_OUTPUT_MASK: 2138 if (!vmx_pt_mode_is_host_guest() || 2139 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2140 PT_CAP_topa_output) && 2141 !intel_pt_validate_cap(vmx->pt_desc.caps, 2142 PT_CAP_single_range_output))) 2143 return 1; 2144 msr_info->data = vmx->pt_desc.guest.output_mask; 2145 break; 2146 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2147 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2148 if (!vmx_pt_mode_is_host_guest() || 2149 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2150 return 1; 2151 /* Odd offsets from MSR_IA32_RTIT_ADDR0_A map to addr_b[], even offsets to addr_a[]. */ if (index % 2) 2152 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2153 else 2154 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2155 break; 2156 case MSR_IA32_DEBUGCTLMSR: 2157 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2158 break; 2159 /* Anything else is looked up with vmx_find_uret_msr() and, if not found there, handed to kvm_get_msr_common(). */ default: 2160 find_uret_msr: 2161 msr = vmx_find_uret_msr(vmx, msr_info->index); 2162 if (msr) { 2163 msr_info->data = msr->data; 2164 break; 2165 } 2166 return kvm_get_msr_common(vcpu, msr_info); 2167 } 2168 2169 return 0; 2170 } 2171 2172 /* On CONFIG_X86_64 builds, truncate data to 32 bits when the guest CPU caps lack X86_FEATURE_LM; otherwise return data converted through unsigned long. */ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2173 u64 data) 2174 { 2175 #ifdef CONFIG_X86_64 2176 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2177 return (u32)data; 2178 #endif 2179 return (unsigned long)data; 2180 } 2181 2182 /* Build the mask of DEBUGCTL bits that a write may set. Each group of bits must be available on the host and, unless host_initiated is true, also enabled for this vCPU. */ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2183 { 2184 u64 debugctl = 0; 2185 2186 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2187 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2188 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2189 2190 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2191 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2192 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2193 2194 return debugctl; 2195 } 2196 2197 /* 2198 * Writes msr value into the appropriate "register". 2199 * Returns 0 on success, non-0 otherwise. 2200 * Assumes vcpu_load() was already called. 
2201 */ 2202 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2203 { 2204 struct vcpu_vmx *vmx = to_vmx(vcpu); 2205 struct vmx_uret_msr *msr; 2206 int ret = 0; 2207 u32 msr_index = msr_info->index; 2208 u64 data = msr_info->data; 2209 u32 index; 2210 2211 switch (msr_index) { 2212 /* EFER is written through the common x86 MSR code. */ case MSR_EFER: 2213 ret = kvm_set_msr_common(vcpu, msr_info); 2214 break; 2215 #ifdef CONFIG_X86_64 2216 /* FS and GS base writes clear the segment cache before updating the VMCS field. */ case MSR_FS_BASE: 2217 vmx_segment_cache_clear(vmx); 2218 vmcs_writel(GUEST_FS_BASE, data); 2219 break; 2220 case MSR_GS_BASE: 2221 vmx_segment_cache_clear(vmx); 2222 vmcs_writel(GUEST_GS_BASE, data); 2223 break; 2224 case MSR_KERNEL_GS_BASE: 2225 vmx_write_guest_kernel_gs_base(vmx, data); 2226 break; 2227 case MSR_IA32_XFD: 2228 ret = kvm_set_msr_common(vcpu, msr_info); 2229 /* 2230 * Always intercepting WRMSR could incur non-negligible 2231 * overhead given xfd might be changed frequently in 2232 * guest context switch. Disable write interception 2233 * upon the first write with a non-zero value (indicating 2234 * potential usage on dynamic xfeatures). Also update 2235 * exception bitmap to trap #NM for proper virtualization 2236 * of guest xfd_err. 
2237 */ 2238 if (!ret && data) { 2239 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2240 MSR_TYPE_RW); 2241 vcpu->arch.xfd_no_write_intercept = true; 2242 vmx_update_exception_bitmap(vcpu); 2243 } 2244 break; 2245 #endif 2246 /* SYSENTER MSR writes are mirrored into vmcs12 while the vCPU is in guest mode; EIP and ESP are first passed through nested_vmx_truncate_sysenter_addr() in that case. */ case MSR_IA32_SYSENTER_CS: 2247 if (is_guest_mode(vcpu)) 2248 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2249 vmcs_write32(GUEST_SYSENTER_CS, data); 2250 break; 2251 case MSR_IA32_SYSENTER_EIP: 2252 if (is_guest_mode(vcpu)) { 2253 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2254 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2255 } 2256 vmcs_writel(GUEST_SYSENTER_EIP, data); 2257 break; 2258 case MSR_IA32_SYSENTER_ESP: 2259 if (is_guest_mode(vcpu)) { 2260 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2261 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2262 } 2263 vmcs_writel(GUEST_SYSENTER_ESP, data); 2264 break; 2265 case MSR_IA32_DEBUGCTLMSR: { 2266 u64 invalid; 2267 2268 /* Unsupported BTF and LBR bits are reported through kvm_pr_unimpl_wrmsr() and dropped from data; any other unsupported bit fails the write. */ invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2269 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2270 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2271 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2272 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2273 } 2274 2275 if (invalid) 2276 return 1; 2277 2278 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2279 VM_EXIT_SAVE_DEBUG_CONTROLS) 2280 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2281 2282 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2283 /* Create the guest LBR event when LBR is being enabled and no event exists yet. */ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2284 (data & DEBUGCTLMSR_LBR)) 2285 intel_pmu_create_guest_lbr_event(vcpu); 2286 return 0; 2287 } 2288 case MSR_IA32_BNDCFGS: 2289 if (!kvm_mpx_supported() || 2290 (!msr_info->host_initiated && 2291 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2292 return 1; 2293 /* Reject a non-canonical base (data masked with PAGE_MASK) or any bit in MSR_IA32_BNDCFGS_RSVD. */ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2294 (data & MSR_IA32_BNDCFGS_RSVD)) 2295 return 1; 2296 2297 if (is_guest_mode(vcpu) && 2298 ((vmx->nested.msrs.entry_ctls_high & 
VM_ENTRY_LOAD_BNDCFGS) || 2299 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2300 get_vmcs12(vcpu)->guest_bndcfgs = data; 2301 2302 vmcs_write64(GUEST_BNDCFGS, data); 2303 break; 2304 case MSR_IA32_UMWAIT_CONTROL: 2305 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2306 return 1; 2307 2308 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2309 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2310 return 1; 2311 2312 vmx->msr_ia32_umwait_control = data; 2313 break; 2314 case MSR_IA32_SPEC_CTRL: 2315 if (!msr_info->host_initiated && 2316 !guest_has_spec_ctrl_msr(vcpu)) 2317 return 1; 2318 2319 /* Fail the write when kvm_spec_ctrl_test_value() returns non-zero for the value. */ if (kvm_spec_ctrl_test_value(data)) 2320 return 1; 2321 2322 /* The value is cached in vmx->spec_ctrl; a zero value stops here and leaves MSR interception unchanged. */ vmx->spec_ctrl = data; 2323 if (!data) 2324 break; 2325 2326 /* 2327 * For non-nested: 2328 * When it's written (to non-zero) for the first time, pass 2329 * it through. 2330 * 2331 * For nested: 2332 * The handling of the MSR bitmap for L2 guests is done in 2333 * nested_vmx_prepare_msr_bitmap. We should not touch the 2334 * vmcs02.msr_bitmap here since it gets completely overwritten 2335 * in the merging. We update the vmcs01 here for L1 as well 2336 * since it will end up touching the MSR anyway now. 
2337 */ 2338 vmx_disable_intercept_for_msr(vcpu, 2339 MSR_IA32_SPEC_CTRL, 2340 MSR_TYPE_RW); 2341 break; 2342 /* Guest writes need ARCH_CAP_TSX_CTRL_MSR; only the RTM_DISABLE and CPUID_CLEAR bits may be set. The value is then stored at the find_uret_msr label. */ case MSR_IA32_TSX_CTRL: 2343 if (!msr_info->host_initiated && 2344 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2345 return 1; 2346 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2347 return 1; 2348 goto find_uret_msr; 2349 /* PAT goes through the common code first. On success it is copied to vmcs12 when in guest mode with VM_EXIT_SAVE_IA32_PAT set, and to GUEST_IA32_PAT when vmcs_config uses VM_ENTRY_LOAD_IA32_PAT. */ case MSR_IA32_CR_PAT: 2350 ret = kvm_set_msr_common(vcpu, msr_info); 2351 if (ret) 2352 break; 2353 2354 if (is_guest_mode(vcpu) && 2355 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2356 get_vmcs12(vcpu)->guest_ia32_pat = data; 2357 2358 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2359 vmcs_write64(GUEST_IA32_PAT, data); 2360 break; 2361 /* Guest writes need FEAT_CTL_LMCE_ENABLED; no bit other than MCG_EXT_CTL_LMCE_EN may be set. */ case MSR_IA32_MCG_EXT_CTL: 2362 if ((!msr_info->host_initiated && 2363 !(to_vmx(vcpu)->msr_ia32_feature_control & 2364 FEAT_CTL_LMCE_ENABLED)) || 2365 (data & ~MCG_EXT_CTL_LMCE_EN)) 2366 return 1; 2367 vcpu->arch.mcg_ext_ctl = data; 2368 break; 2369 case MSR_IA32_FEAT_CTL: 2370 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2371 return 1; 2372 2373 vmx->msr_ia32_feature_control = data; 2374 /* A host-initiated write of 0 also calls vmx_leave_nested(). */ if (msr_info->host_initiated && data == 0) 2375 vmx_leave_nested(vcpu); 2376 2377 /* SGX may be enabled/disabled by guest's firmware */ 2378 vmx_write_encls_bitmap(vcpu, NULL); 2379 break; 2380 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2381 /* 2382 * On real hardware, the LE hash MSRs are writable before 2383 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2384 * at which point SGX related bits in IA32_FEATURE_CONTROL 2385 * become writable. 2386 * 2387 * KVM does not emulate SGX activation for simplicity, so 2388 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2389 * is unlocked. This is technically not architectural 2390 * behavior, but it's close enough. 
2391 */ 2392 if (!msr_info->host_initiated && 2393 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2394 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2395 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2396 return 1; 2397 vmx->msr_ia32_sgxlepubkeyhash 2398 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2399 break; 2400 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2401 if (!msr_info->host_initiated) 2402 return 1; /* they are read-only */ 2403 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2404 return 1; 2405 return vmx_set_vmx_msr(vcpu, msr_index, data); 2406 /* Fails unless PT is in host-guest mode, vmx_rtit_ctl_check() returns 0 and vmx->nested.vmxon is clear. On success the VMCS field, the cached value and the PT MSR intercepts are updated. */ case MSR_IA32_RTIT_CTL: 2407 if (!vmx_pt_mode_is_host_guest() || 2408 vmx_rtit_ctl_check(vcpu, data) || 2409 vmx->nested.vmxon) 2410 return 1; 2411 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2412 vmx->pt_desc.guest.ctl = data; 2413 pt_update_intercept_for_msr(vcpu); 2414 break; 2415 /* The remaining RTIT MSR writes are gated by pt_can_write_msr() and only update the cached vmx->pt_desc.guest state. */ case MSR_IA32_RTIT_STATUS: 2416 if (!pt_can_write_msr(vmx)) 2417 return 1; 2418 if (data & MSR_IA32_RTIT_STATUS_MASK) 2419 return 1; 2420 vmx->pt_desc.guest.status = data; 2421 break; 2422 case MSR_IA32_RTIT_CR3_MATCH: 2423 if (!pt_can_write_msr(vmx)) 2424 return 1; 2425 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2426 PT_CAP_cr3_filtering)) 2427 return 1; 2428 vmx->pt_desc.guest.cr3_match = data; 2429 break; 2430 case MSR_IA32_RTIT_OUTPUT_BASE: 2431 if (!pt_can_write_msr(vmx)) 2432 return 1; 2433 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2434 PT_CAP_topa_output) && 2435 !intel_pt_validate_cap(vmx->pt_desc.caps, 2436 PT_CAP_single_range_output)) 2437 return 1; 2438 if (!pt_output_base_valid(vcpu, data)) 2439 return 1; 2440 vmx->pt_desc.guest.output_base = data; 2441 break; 2442 case MSR_IA32_RTIT_OUTPUT_MASK: 2443 if (!pt_can_write_msr(vmx)) 2444 return 1; 2445 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2446 PT_CAP_topa_output) && 2447 !intel_pt_validate_cap(vmx->pt_desc.caps, 2448 PT_CAP_single_range_output)) 2449 return 1; 2450 vmx->pt_desc.guest.output_mask = data; 2451 break; 
2452 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2453 if (!pt_can_write_msr(vmx)) 2454 return 1; 2455 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2456 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2457 return 1; 2458 if (is_noncanonical_msr_address(data, vcpu)) 2459 return 1; 2460 /* Odd offsets from MSR_IA32_RTIT_ADDR0_A map to addr_b[], even offsets to addr_a[]. */ if (index % 2) 2461 vmx->pt_desc.guest.addr_b[index / 2] = data; 2462 else 2463 vmx->pt_desc.guest.addr_a[index / 2] = data; 2464 break; 2465 /* A non-zero LBR format must equal the host-supported format. Non-zero PEBS format bits require the PEBS mask bits to equal the host-supported ones and need DS and DTES64 in the guest CPU caps. Both cases also need cpuid_model_is_consistent(). */ case MSR_IA32_PERF_CAPABILITIES: 2466 if (data & PMU_CAP_LBR_FMT) { 2467 if ((data & PMU_CAP_LBR_FMT) != 2468 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2469 return 1; 2470 if (!cpuid_model_is_consistent(vcpu)) 2471 return 1; 2472 } 2473 if (data & PERF_CAP_PEBS_FORMAT) { 2474 if ((data & PERF_CAP_PEBS_MASK) != 2475 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2476 return 1; 2477 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2478 return 1; 2479 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2480 return 1; 2481 if (!cpuid_model_is_consistent(vcpu)) 2482 return 1; 2483 } 2484 ret = kvm_set_msr_common(vcpu, msr_info); 2485 break; 2486 2487 /* Anything else is written through vmx_set_guest_uret_msr() when a uret MSR entry exists, otherwise through kvm_set_msr_common(). */ default: 2488 find_uret_msr: 2489 msr = vmx_find_uret_msr(vmx, msr_index); 2490 if (msr) 2491 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2492 else 2493 ret = kvm_set_msr_common(vcpu, msr_info); 2494 } 2495 2496 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2497 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2498 vmx_update_fb_clear_dis(vcpu, vmx); 2499 2500 return ret; 2501 } 2502 2503 /* Mark reg as available and refresh its cached copy in vcpu->arch (RSP, RIP, PDPTRs, CR0, CR3 or CR4). Any other register triggers KVM_BUG_ON. */ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2504 { 2505 unsigned long guest_owned_bits; 2506 2507 kvm_register_mark_available(vcpu, reg); 2508 2509 switch (reg) { 2510 case VCPU_REGS_RSP: 2511 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2512 break; 2513 case VCPU_REGS_RIP: 2514 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2515 break; 2516 case VCPU_EXREG_PDPTR: 2517 if (enable_ept) 2518 ept_save_pdptrs(vcpu); 2519 break; 2520 case 
VCPU_EXREG_CR0: 2521 /* Only the guest-owned CR0 bits are taken from the VMCS; all other bits keep their cached value. */ guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2522 2523 vcpu->arch.cr0 &= ~guest_owned_bits; 2524 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2525 break; 2526 case VCPU_EXREG_CR3: 2527 /* 2528 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2529 * CR3 is loaded into hardware, not the guest's CR3. 2530 */ 2531 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2532 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2533 break; 2534 case VCPU_EXREG_CR4: 2535 /* Same scheme as CR0: only the guest-owned CR4 bits are refreshed. */ guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2536 2537 vcpu->arch.cr4 &= ~guest_owned_bits; 2538 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2539 break; 2540 default: 2541 KVM_BUG_ON(1, vcpu->kvm); 2542 break; 2543 } 2544 } 2545 2546 /* 2547 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2548 * directly instead of going through cpu_has(), to ensure KVM is trapping 2549 * ENCLS whenever it's supported in hardware. It does not matter whether 2550 * the host OS supports or has enabled SGX. 2551 */ 2552 static bool cpu_has_sgx(void) 2553 { 2554 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2555 } 2556 2557 /* Combine ctl_min and ctl_opt with the must-be-zero (high word) and must-be-one (low word) settings read from the capability MSR and store the result in *result. Returns -EIO when a bit of ctl_min cannot be set, 0 otherwise. */ static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2558 { 2559 u32 vmx_msr_low, vmx_msr_high; 2560 u32 ctl = ctl_min | ctl_opt; 2561 2562 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2563 2564 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2565 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2566 2567 /* Ensure minimum (required) set of control bits are supported. 
*/ 2568 if (ctl_min & ~ctl) 2569 return -EIO; 2570 2571 *result = ctl; 2572 return 0; 2573 } 2574 2575 /* Return the bits of ctl_opt that the 64-bit capability MSR reports as allowed. */ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2576 { 2577 u64 allowed; 2578 2579 rdmsrl(msr, allowed); 2580 2581 return ctl_opt & allowed; 2582 } 2583 2584 /* Read the VMX capability MSRs, derive the control settings KVM will use and store them in *vmcs_conf and *vmx_cap. Returns 0 on success or -EIO when a required control is unavailable or, with error_on_inconsistent_vmcs_config set, when the reported capabilities are inconsistent. */ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2585 struct vmx_capability *vmx_cap) 2586 { 2587 u32 _pin_based_exec_control = 0; 2588 u32 _cpu_based_exec_control = 0; 2589 u32 _cpu_based_2nd_exec_control = 0; 2590 u64 _cpu_based_3rd_exec_control = 0; 2591 u32 _vmexit_control = 0; 2592 u32 _vmentry_control = 0; 2593 u64 basic_msr; 2594 u64 misc_msr; 2595 int i; 2596 2597 /* 2598 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2599 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2600 * intercepts writes to PAT and EFER, i.e. never enables those controls. 2601 */ 2602 struct { 2603 u32 entry_control; 2604 u32 exit_control; 2605 } const vmcs_entry_exit_pairs[] = { 2606 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2607 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2608 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2609 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2610 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2611 }; 2612 2613 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2614 2615 /* Primary processor-based controls come first; secondary controls are only probed when the primary set activates them. */ if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2616 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2617 MSR_IA32_VMX_PROCBASED_CTLS, 2618 &_cpu_based_exec_control)) 2619 return -EIO; 2620 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2621 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2622 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2623 MSR_IA32_VMX_PROCBASED_CTLS2, 2624 &_cpu_based_2nd_exec_control)) 2625 return -EIO; 2626 } 2627 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2628 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2629 2630 
#ifndef CONFIG_X86_64 2631 if (!(_cpu_based_2nd_exec_control & 2632 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2633 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2634 #endif 2635 2636 /* Without TPR shadow, also clear APIC register virtualization, x2APIC mode virtualization and virtual interrupt delivery. */ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2637 _cpu_based_2nd_exec_control &= ~( 2638 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2639 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2640 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2641 2642 /* Read MSR_IA32_VMX_EPT_VPID_CAP into vmx_cap->ept and vmx_cap->vpid, then cross-check both against the corresponding enable controls. */ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2643 &vmx_cap->ept, &vmx_cap->vpid); 2644 2645 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2646 vmx_cap->ept) { 2647 pr_warn_once("EPT CAP should not exist if not support " 2648 "1-setting enable EPT VM-execution control\n"); 2649 2650 if (error_on_inconsistent_vmcs_config) 2651 return -EIO; 2652 2653 vmx_cap->ept = 0; 2654 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2655 } 2656 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2657 vmx_cap->vpid) { 2658 pr_warn_once("VPID CAP should not exist if not support " 2659 "1-setting enable VPID VM-execution control\n"); 2660 2661 if (error_on_inconsistent_vmcs_config) 2662 return -EIO; 2663 2664 vmx_cap->vpid = 0; 2665 } 2666 2667 /* ENCLS exiting is only kept when cpu_has_sgx() reports SGX. */ if (!cpu_has_sgx()) 2668 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2669 2670 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2671 _cpu_based_3rd_exec_control = 2672 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2673 MSR_IA32_VMX_PROCBASED_CTLS3); 2674 2675 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2676 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2677 MSR_IA32_VMX_EXIT_CTLS, 2678 &_vmexit_control)) 2679 return -EIO; 2680 2681 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2682 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2683 MSR_IA32_VMX_PINBASED_CTLS, 2684 &_pin_based_exec_control)) 2685 return -EIO; 2686 2687 if (cpu_has_broken_vmx_preemption_timer()) 2688 _pin_based_exec_control &= 
~PIN_BASED_VMX_PREEMPTION_TIMER; 2689 /* Posted interrupts are dropped when virtual interrupt delivery is not available. */ if (!(_cpu_based_2nd_exec_control & 2690 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2691 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2692 2693 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2694 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2695 MSR_IA32_VMX_ENTRY_CTLS, 2696 &_vmentry_control)) 2697 return -EIO; 2698 2699 /* Each entry control must be available exactly when its paired exit control is. A mismatched pair is cleared on both sides, or fails with -EIO when error_on_inconsistent_vmcs_config is set. */ for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { 2700 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; 2701 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; 2702 2703 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) 2704 continue; 2705 2706 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", 2707 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); 2708 2709 if (error_on_inconsistent_vmcs_config) 2710 return -EIO; 2711 2712 _vmentry_control &= ~n_ctrl; 2713 _vmexit_control &= ~x_ctrl; 2714 } 2715 2716 /* 2717 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2718 * can't be used due to an errata where VM Exit may incorrectly clear 2719 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2720 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2721 */ 2722 switch (boot_cpu_data.x86_vfm) { 2723 case INTEL_NEHALEM_EP: /* AAK155 */ 2724 case INTEL_NEHALEM: /* AAP115 */ 2725 case INTEL_WESTMERE: /* AAT100 */ 2726 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2727 case INTEL_NEHALEM_EX: /* BA97 */ 2728 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2729 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2730 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2731 "does not work properly. Using workaround\n"); 2732 break; 2733 default: 2734 break; 2735 } 2736 2737 rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); 2738 2739 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. 
*/ 2740 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2741 return -EIO; 2742 2743 #ifdef CONFIG_X86_64 2744 /* 2745 * KVM expects to be able to shove all legal physical addresses into 2746 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2747 * 0 for processors that support Intel 64 architecture". 2748 */ 2749 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2750 return -EIO; 2751 #endif 2752 2753 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2754 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2755 return -EIO; 2756 2757 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2758 2759 /* All checks passed: publish the computed controls and the raw BASIC and MISC MSR values. */ vmcs_conf->basic = basic_msr; 2760 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2761 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2762 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2763 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2764 vmcs_conf->vmexit_ctrl = _vmexit_control; 2765 vmcs_conf->vmentry_ctrl = _vmentry_control; 2766 vmcs_conf->misc = misc_msr; 2767 2768 #if IS_ENABLED(CONFIG_HYPERV) 2769 if (enlightened_vmcs) 2770 evmcs_sanitize_exec_ctrls(vmcs_conf); 2771 #endif 2772 2773 return 0; 2774 } 2775 2776 /* Check the current CPU: VMX must be reported in CPUID.1:ECX, and both X86_FEATURE_MSR_IA32_FEAT_CTL and X86_FEATURE_VMX must be set for this CPU. Logs the reason and returns false otherwise. */ static bool __kvm_is_vmx_supported(void) 2777 { 2778 int cpu = smp_processor_id(); 2779 2780 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2781 pr_err("VMX not supported by CPU %d\n", cpu); 2782 return false; 2783 } 2784 2785 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2786 !this_cpu_has(X86_FEATURE_VMX)) { 2787 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2788 return false; 2789 } 2790 2791 return true; 2792 } 2793 2794 /* Run __kvm_is_vmx_supported() with migration disabled for the duration of the check. */ static bool kvm_is_vmx_supported(void) 2795 { 2796 bool supported; 2797 2798 migrate_disable(); 2799 supported = __kvm_is_vmx_supported(); 2800 migrate_enable(); 2801 2802 return supported; 2803 } 2804 2805 /* Verify that the current CPU supports VMX and yields a VMCS configuration that is byte-identical to the global vmcs_config. Returns 0 or -EIO. */ int vmx_check_processor_compat(void) 2806 { 2807 int cpu = raw_smp_processor_id(); 2808 struct vmcs_config vmcs_conf; 2809 
struct vmx_capability vmx_cap; 2810 2811 if (!__kvm_is_vmx_supported()) 2812 return -EIO; 2813 2814 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2815 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2816 return -EIO; 2817 } 2818 if (nested) 2819 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2820 /* The freshly built configuration must match the global one byte for byte. */ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2821 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2822 return -EIO; 2823 } 2824 return 0; 2825 } 2826 2827 /* Set CR4.VMXE and execute VMXON with vmxon_pointer. If VMXON faults, warn once with the MSR_IA32_FEAT_CTL value (0xdeadbeef if the MSR read fails), clear CR4.VMXE again and return -EFAULT. Returns 0 on success. */ static int kvm_cpu_vmxon(u64 vmxon_pointer) 2828 { 2829 u64 msr; 2830 2831 cr4_set_bits(X86_CR4_VMXE); 2832 2833 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2834 _ASM_EXTABLE(1b, %l[fault]) 2835 : : [vmxon_pointer] "m"(vmxon_pointer) 2836 : : fault); 2837 return 0; 2838 2839 fault: 2840 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2841 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 2842 cr4_clear_bits(X86_CR4_VMXE); 2843 2844 return -EFAULT; 2845 } 2846 2847 /* Enter VMX operation on the current CPU using the physical address of its per-CPU vmxarea. Returns -EBUSY when the CR4 shadow already has VMXE set, -EFAULT when eVMCS is in use but the CPU has no VP assist page, or the result of kvm_cpu_vmxon(). */ int vmx_enable_virtualization_cpu(void) 2848 { 2849 int cpu = raw_smp_processor_id(); 2850 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2851 int r; 2852 2853 if (cr4_read_shadow() & X86_CR4_VMXE) 2854 return -EBUSY; 2855 2856 /* 2857 * This can happen if we hot-added a CPU but failed to allocate 2858 * VP assist page for it. 
2859 */ 2860 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2861 return -EFAULT; 2862 2863 /* Intel PT is notified before VMXON, and the notification is undone when VMXON fails. */ intel_pt_handle_vmx(1); 2864 2865 r = kvm_cpu_vmxon(phys_addr); 2866 if (r) { 2867 intel_pt_handle_vmx(0); 2868 return r; 2869 } 2870 2871 return 0; 2872 } 2873 2874 /* Call __loaded_vmcs_clear() on every entry of the loaded_vmcss_on_cpu list of the current CPU. */ static void vmclear_local_loaded_vmcss(void) 2875 { 2876 int cpu = raw_smp_processor_id(); 2877 struct loaded_vmcs *v, *n; 2878 2879 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2880 loaded_vmcss_on_cpu_link) 2881 __loaded_vmcs_clear(v); 2882 } 2883 2884 /* Leave VMX operation on the current CPU: clear the locally loaded VMCSs, call kvm_cpu_vmxoff() (reporting a spurious fault if it fails), reset eVMCS state and notify Intel PT. */ void vmx_disable_virtualization_cpu(void) 2885 { 2886 vmclear_local_loaded_vmcss(); 2887 2888 if (kvm_cpu_vmxoff()) 2889 kvm_spurious_fault(); 2890 2891 hv_reset_evmcs(); 2892 2893 intel_pt_handle_vmx(0); 2894 } 2895 2896 /* Allocate one page on the memory node of cpu for a VMCS, zero the first vmx_basic_vmcs_size() bytes, and fill in the revision id (KVM_EVMCS_VERSION when eVMCS is in use) and the shadow flag. Returns NULL when the page allocation fails. */ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2897 { 2898 int node = cpu_to_node(cpu); 2899 struct page *pages; 2900 struct vmcs *vmcs; 2901 2902 pages = __alloc_pages_node(node, flags, 0); 2903 if (!pages) 2904 return NULL; 2905 vmcs = page_address(pages); 2906 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2907 2908 /* KVM supports Enlightened VMCS v1 only */ 2909 if (kvm_is_using_evmcs()) 2910 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2911 else 2912 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2913 2914 if (shadow) 2915 vmcs->hdr.shadow_vmcs = 1; 2916 return vmcs; 2917 } 2918 2919 /* Release the page backing vmcs. */ void free_vmcs(struct vmcs *vmcs) 2920 { 2921 free_page((unsigned long)vmcs); 2922 } 2923 2924 /* 2925 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2926 */ 2927 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2928 { 2929 if (!loaded_vmcs->vmcs) 2930 return; 2931 loaded_vmcs_clear(loaded_vmcs); 2932 free_vmcs(loaded_vmcs->vmcs); 2933 loaded_vmcs->vmcs = NULL; 2934 if (loaded_vmcs->msr_bitmap) 2935 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2936 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2937 } 2938 2939 /* Allocate and initialise a loaded_vmcs: a non-shadow VMCS plus, when MSR bitmaps are supported, an MSR bitmap page filled with 0xff. Returns 0 on success or -ENOMEM. */ int alloc_loaded_vmcs(struct 
loaded_vmcs *loaded_vmcs) 2940 { 2941 loaded_vmcs->vmcs = alloc_vmcs(false); 2942 if (!loaded_vmcs->vmcs) 2943 return -ENOMEM; 2944 2945 vmcs_clear(loaded_vmcs->vmcs); 2946 2947 /* Start from a clean tracking state: no shadow VMCS, not loaded on any CPU, not yet launched. */ loaded_vmcs->shadow_vmcs = NULL; 2948 loaded_vmcs->hv_timer_soft_disabled = false; 2949 loaded_vmcs->cpu = -1; 2950 loaded_vmcs->launched = 0; 2951 2952 if (cpu_has_vmx_msr_bitmap()) { 2953 loaded_vmcs->msr_bitmap = (unsigned long *) 2954 __get_free_page(GFP_KERNEL_ACCOUNT); 2955 if (!loaded_vmcs->msr_bitmap) 2956 goto out_vmcs; 2957 /* The bitmap page starts with every bit set. */ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2958 } 2959 2960 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2961 memset(&loaded_vmcs->controls_shadow, 0, 2962 sizeof(struct vmcs_controls_shadow)); 2963 2964 return 0; 2965 2966 out_vmcs: 2967 free_loaded_vmcs(loaded_vmcs); 2968 return -ENOMEM; 2969 } 2970 2971 /* Free the per-CPU vmxarea of every possible CPU and reset each pointer to NULL. */ static void free_kvm_area(void) 2972 { 2973 int cpu; 2974 2975 for_each_possible_cpu(cpu) { 2976 free_vmcs(per_cpu(vmxarea, cpu)); 2977 per_cpu(vmxarea, cpu) = NULL; 2978 } 2979 } 2980 2981 /* Allocate the per-CPU vmxarea for every possible CPU. On an allocation failure every area is released through free_kvm_area() and -ENOMEM is returned; otherwise returns 0. */ static __init int alloc_kvm_area(void) 2982 { 2983 int cpu; 2984 2985 for_each_possible_cpu(cpu) { 2986 struct vmcs *vmcs; 2987 2988 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2989 if (!vmcs) { 2990 free_kvm_area(); 2991 return -ENOMEM; 2992 } 2993 2994 /* 2995 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2996 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2997 * revision_id reported by MSR_IA32_VMX_BASIC. 2998 * 2999 * However, even though not explicitly documented by 3000 * TLFS, VMXArea passed as VMXON argument should 3001 * still be marked with revision_id reported by 3002 * physical CPU. 
3003 */ 3004 if (kvm_is_using_evmcs()) 3005 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3006 3007 per_cpu(vmxarea, cpu) = vmcs; 3008 } 3009 return 0; 3010 } 3011 3012 /* Load the saved segment into the VMCS through __vmx_set_segment(). When emulate_invalid_guest_state is off, the saved segment is first adjusted in place: the RPL of CS and SS is cleared, dpl is set from the selector RPL and s is set to 1. */ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3013 struct kvm_segment *save) 3014 { 3015 if (!emulate_invalid_guest_state) { 3016 /* 3017 * CS and SS RPL should be equal during guest entry according 3018 * to VMX spec, but in reality it is not always so. Since vcpu 3019 * is in the middle of the transition from real mode to 3020 * protected mode it is safe to assume that RPL 0 is a good 3021 * default value. 3022 */ 3023 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3024 save->selector &= ~SEGMENT_RPL_MASK; 3025 save->dpl = save->selector & SEGMENT_RPL_MASK; 3026 save->s = 1; 3027 } 3028 __vmx_set_segment(vcpu, save, seg); 3029 } 3030 3031 /* Leave the vm86-based real mode emulation: refresh the saved segments, clear rmode.vm86_active, restore TR, RFLAGS and CR4.VME, update the exception bitmap and reload the six segment registers through fix_pmode_seg(). */ static void enter_pmode(struct kvm_vcpu *vcpu) 3032 { 3033 unsigned long flags; 3034 struct vcpu_vmx *vmx = to_vmx(vcpu); 3035 3036 /* 3037 * Update real mode segment cache. It may be not up-to-date if segment 3038 * register was written while vcpu was in a guest mode. 
3039 */ 3040 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3041 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3042 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3043 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3044 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3045 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3046 3047 vmx->rmode.vm86_active = 0; 3048 3049 /* Restore the TR value held in rmode.segs (enter_rmode() stores it there). */ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3050 3051 /* Keep the RMODE_GUEST_OWNED_EFLAGS_BITS from the VMCS value and take every other bit from rmode.save_rflags. */ flags = vmcs_readl(GUEST_RFLAGS); 3052 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3053 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3054 vmcs_writel(GUEST_RFLAGS, flags); 3055 3056 /* CR4.VME in the VMCS is replaced by the value from the CR4 read shadow. */ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3057 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3058 3059 vmx_update_exception_bitmap(vcpu); 3060 3061 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3062 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3063 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3064 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3065 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3066 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3067 } 3068 3069 /* Write a copy of *save to the VMCS segment fields for seg. The copy always gets dpl 3 (and type 3 for CS). When emulate_invalid_guest_state is off, the copy is rebuilt from the base: selector is base >> 4, limit is 0xffff and the attribute fields are set to fixed values. */ static void fix_rmode_seg(int seg, struct kvm_segment *save) 3070 { 3071 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3072 struct kvm_segment var = *save; 3073 3074 var.dpl = 0x3; 3075 if (seg == VCPU_SREG_CS) 3076 var.type = 0x3; 3077 3078 if (!emulate_invalid_guest_state) { 3079 var.selector = var.base >> 4; 3080 var.base = var.base & 0xffff0; 3081 var.limit = 0xffff; 3082 var.g = 0; 3083 var.db = 0; 3084 var.present = 1; 3085 var.s = 1; 3086 var.l = 0; 3087 var.unusable = 0; 3088 var.type = 0x3; 3089 var.avl = 0; 3090 if (save->base & 0xf) 3091 pr_warn_once("segment 
base is not paragraph aligned " 3092 "when entering protected mode (seg=%d)", seg); 3093 } 3094 3095 vmcs_write16(sf->selector, var.selector); 3096 vmcs_writel(sf->base, var.base); 3097 vmcs_write32(sf->limit, var.limit); 3098 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3099 } 3100 3101 static void enter_rmode(struct kvm_vcpu *vcpu) 3102 { 3103 unsigned long flags; 3104 struct vcpu_vmx *vmx = to_vmx(vcpu); 3105 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3106 3107 /* 3108 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3109 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3110 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3111 * should VM-Fail and KVM should reject userspace attempts to stuff 3112 * CR0.PG=0 when L2 is active. 3113 */ 3114 WARN_ON_ONCE(is_guest_mode(vcpu)); 3115 3116 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3117 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3118 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3119 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3120 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3121 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3122 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3123 3124 vmx->rmode.vm86_active = 1; 3125 3126 vmx_segment_cache_clear(vmx); 3127 3128 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3129 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3130 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3131 3132 flags = vmcs_readl(GUEST_RFLAGS); 3133 vmx->rmode.save_rflags = flags; 3134 3135 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3136 3137 vmcs_writel(GUEST_RFLAGS, flags); 3138 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3139 vmx_update_exception_bitmap(vcpu); 3140 3141 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3142 
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3143 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3144 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3145 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3146 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3147 } 3148 3149 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3150 { 3151 struct vcpu_vmx *vmx = to_vmx(vcpu); 3152 3153 /* Nothing to do if hardware doesn't support EFER. */ 3154 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3155 return 0; 3156 3157 vcpu->arch.efer = efer; 3158 #ifdef CONFIG_X86_64 3159 if (efer & EFER_LMA) 3160 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3161 else 3162 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3163 #else 3164 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3165 return 1; 3166 #endif 3167 3168 vmx_setup_uret_msrs(vmx); 3169 return 0; 3170 } 3171 3172 #ifdef CONFIG_X86_64 3173 3174 static void enter_lmode(struct kvm_vcpu *vcpu) 3175 { 3176 u32 guest_tr_ar; 3177 3178 vmx_segment_cache_clear(to_vmx(vcpu)); 3179 3180 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3181 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3182 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3183 __func__); 3184 vmcs_write32(GUEST_TR_AR_BYTES, 3185 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3186 | VMX_AR_TYPE_BUSY_64_TSS); 3187 } 3188 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3189 } 3190 3191 static void exit_lmode(struct kvm_vcpu *vcpu) 3192 { 3193 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3194 } 3195 3196 #endif 3197 3198 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3199 { 3200 struct vcpu_vmx *vmx = to_vmx(vcpu); 3201 3202 /* 3203 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3204 * the CPU is not required to invalidate guest-physical mappings on 3205 * VM-Entry, even if VPID is disabled. 
Guest-physical mappings are 3206 * associated with the root EPT structure and not any particular VPID 3207 * (INVVPID also isn't required to invalidate guest-physical mappings). 3208 */ 3209 if (enable_ept) { 3210 ept_sync_global(); 3211 } else if (enable_vpid) { 3212 if (cpu_has_vmx_invvpid_global()) { 3213 vpid_sync_vcpu_global(); 3214 } else { 3215 vpid_sync_vcpu_single(vmx->vpid); 3216 vpid_sync_vcpu_single(vmx->nested.vpid02); 3217 } 3218 } 3219 } 3220 3221 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3222 { 3223 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3224 return nested_get_vpid02(vcpu); 3225 return to_vmx(vcpu)->vpid; 3226 } 3227 3228 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3229 { 3230 struct kvm_mmu *mmu = vcpu->arch.mmu; 3231 u64 root_hpa = mmu->root.hpa; 3232 3233 /* No flush required if the current context is invalid. */ 3234 if (!VALID_PAGE(root_hpa)) 3235 return; 3236 3237 if (enable_ept) 3238 ept_sync_context(construct_eptp(vcpu, root_hpa, 3239 mmu->root_role.level)); 3240 else 3241 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3242 } 3243 3244 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3245 { 3246 /* 3247 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3248 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3249 */ 3250 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3251 } 3252 3253 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3254 { 3255 /* 3256 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3257 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3258 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3259 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3260 * i.e. no explicit INVVPID is necessary. 
3261 */ 3262 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3263 } 3264 3265 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3266 { 3267 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3268 3269 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3270 return; 3271 3272 if (is_pae_paging(vcpu)) { 3273 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3274 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3275 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3276 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3277 } 3278 } 3279 3280 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3281 { 3282 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3283 3284 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3285 return; 3286 3287 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3288 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3289 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3290 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3291 3292 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3293 } 3294 3295 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3296 CPU_BASED_CR3_STORE_EXITING) 3297 3298 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3299 { 3300 if (is_guest_mode(vcpu)) 3301 return nested_guest_cr0_valid(vcpu, cr0); 3302 3303 if (to_vmx(vcpu)->nested.vmxon) 3304 return nested_host_cr0_valid(vcpu, cr0); 3305 3306 return true; 3307 } 3308 3309 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3310 { 3311 struct vcpu_vmx *vmx = to_vmx(vcpu); 3312 unsigned long hw_cr0, old_cr0_pg; 3313 u32 tmp; 3314 3315 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3316 3317 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3318 if (enable_unrestricted_guest) 3319 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3320 else { 3321 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3322 if (!enable_ept) 3323 hw_cr0 |= X86_CR0_WP; 3324 3325 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3326 enter_pmode(vcpu); 3327 3328 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3329 enter_rmode(vcpu); 3330 } 3331 3332 
vmcs_writel(CR0_READ_SHADOW, cr0); 3333 vmcs_writel(GUEST_CR0, hw_cr0); 3334 vcpu->arch.cr0 = cr0; 3335 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3336 3337 #ifdef CONFIG_X86_64 3338 if (vcpu->arch.efer & EFER_LME) { 3339 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3340 enter_lmode(vcpu); 3341 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3342 exit_lmode(vcpu); 3343 } 3344 #endif 3345 3346 if (enable_ept && !enable_unrestricted_guest) { 3347 /* 3348 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3349 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3350 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3351 * KVM's CR3 is installed. 3352 */ 3353 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3354 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3355 3356 /* 3357 * When running with EPT but not unrestricted guest, KVM must 3358 * intercept CR3 accesses when paging is _disabled_. This is 3359 * necessary because restricted guests can't actually run with 3360 * paging disabled, and so KVM stuffs its own CR3 in order to 3361 * run the guest when identity mapped page tables. 3362 * 3363 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3364 * update, it may be stale with respect to CR3 interception, 3365 * e.g. after nested VM-Enter. 3366 * 3367 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3368 * stores to forward them to L1, even if KVM does not need to 3369 * intercept them to preserve its identity mapped page tables. 3370 */ 3371 if (!(cr0 & X86_CR0_PG)) { 3372 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3373 } else if (!is_guest_mode(vcpu)) { 3374 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3375 } else { 3376 tmp = exec_controls_get(vmx); 3377 tmp &= ~CR3_EXITING_BITS; 3378 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3379 exec_controls_set(vmx, tmp); 3380 } 3381 3382 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. 
*/ 3383 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3384 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3385 3386 /* 3387 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3388 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3389 */ 3390 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3391 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3392 } 3393 3394 /* depends on vcpu->arch.cr0 to be set to a new value */ 3395 vmx->emulation_required = vmx_emulation_required(vcpu); 3396 } 3397 3398 static int vmx_get_max_ept_level(void) 3399 { 3400 if (cpu_has_vmx_ept_5levels()) 3401 return 5; 3402 return 4; 3403 } 3404 3405 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3406 { 3407 u64 eptp = VMX_EPTP_MT_WB; 3408 3409 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3410 3411 if (enable_ept_ad_bits && 3412 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3413 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3414 eptp |= root_hpa; 3415 3416 return eptp; 3417 } 3418 3419 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3420 { 3421 struct kvm *kvm = vcpu->kvm; 3422 bool update_guest_cr3 = true; 3423 unsigned long guest_cr3; 3424 u64 eptp; 3425 3426 if (enable_ept) { 3427 eptp = construct_eptp(vcpu, root_hpa, root_level); 3428 vmcs_write64(EPT_POINTER, eptp); 3429 3430 hv_track_root_tdp(vcpu, root_hpa); 3431 3432 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3433 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3434 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3435 guest_cr3 = vcpu->arch.cr3; 3436 else /* vmcs.GUEST_CR3 is already up-to-date. 
*/ 3437 update_guest_cr3 = false; 3438 vmx_ept_load_pdptrs(vcpu); 3439 } else { 3440 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3441 kvm_get_active_cr3_lam_bits(vcpu); 3442 } 3443 3444 if (update_guest_cr3) 3445 vmcs_writel(GUEST_CR3, guest_cr3); 3446 } 3447 3448 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3449 { 3450 /* 3451 * We operate under the default treatment of SMM, so VMX cannot be 3452 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3453 * i.e. is a reserved bit, is handled by common x86 code. 3454 */ 3455 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3456 return false; 3457 3458 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3459 return false; 3460 3461 return true; 3462 } 3463 3464 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3465 { 3466 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3467 struct vcpu_vmx *vmx = to_vmx(vcpu); 3468 unsigned long hw_cr4; 3469 3470 /* 3471 * Pass through host's Machine Check Enable value to hw_cr4, which 3472 * is in force while we are in guest mode. Do not let guests control 3473 * this bit, even if host CR4.MCE == 0. 
3474 */ 3475 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3476 if (enable_unrestricted_guest) 3477 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3478 else if (vmx->rmode.vm86_active) 3479 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3480 else 3481 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3482 3483 if (vmx_umip_emulated()) { 3484 if (cr4 & X86_CR4_UMIP) { 3485 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3486 hw_cr4 &= ~X86_CR4_UMIP; 3487 } else if (!is_guest_mode(vcpu) || 3488 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3489 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3490 } 3491 } 3492 3493 vcpu->arch.cr4 = cr4; 3494 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3495 3496 if (!enable_unrestricted_guest) { 3497 if (enable_ept) { 3498 if (!is_paging(vcpu)) { 3499 hw_cr4 &= ~X86_CR4_PAE; 3500 hw_cr4 |= X86_CR4_PSE; 3501 } else if (!(cr4 & X86_CR4_PAE)) { 3502 hw_cr4 &= ~X86_CR4_PAE; 3503 } 3504 } 3505 3506 /* 3507 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3508 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3509 * to be manually disabled when guest switches to non-paging 3510 * mode. 3511 * 3512 * If !enable_unrestricted_guest, the CPU is always running 3513 * with CR0.PG=1 and CR4 needs to be modified. 3514 * If enable_unrestricted_guest, the CPU automatically 3515 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 
3516 */ 3517 if (!is_paging(vcpu)) 3518 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3519 } 3520 3521 vmcs_writel(CR4_READ_SHADOW, cr4); 3522 vmcs_writel(GUEST_CR4, hw_cr4); 3523 3524 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3525 kvm_update_cpuid_runtime(vcpu); 3526 } 3527 3528 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3529 { 3530 struct vcpu_vmx *vmx = to_vmx(vcpu); 3531 u32 ar; 3532 3533 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3534 *var = vmx->rmode.segs[seg]; 3535 if (seg == VCPU_SREG_TR 3536 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3537 return; 3538 var->base = vmx_read_guest_seg_base(vmx, seg); 3539 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3540 return; 3541 } 3542 var->base = vmx_read_guest_seg_base(vmx, seg); 3543 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3544 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3545 ar = vmx_read_guest_seg_ar(vmx, seg); 3546 var->unusable = (ar >> 16) & 1; 3547 var->type = ar & 15; 3548 var->s = (ar >> 4) & 1; 3549 var->dpl = (ar >> 5) & 3; 3550 /* 3551 * Some userspaces do not preserve unusable property. Since usable 3552 * segment has to be present according to VMX spec we can use present 3553 * property to amend userspace bug by making unusable segment always 3554 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3555 * segment as unusable. 
3556 */ 3557 var->present = !var->unusable; 3558 var->avl = (ar >> 12) & 1; 3559 var->l = (ar >> 13) & 1; 3560 var->db = (ar >> 14) & 1; 3561 var->g = (ar >> 15) & 1; 3562 } 3563 3564 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3565 { 3566 struct kvm_segment s; 3567 3568 if (to_vmx(vcpu)->rmode.vm86_active) { 3569 vmx_get_segment(vcpu, &s, seg); 3570 return s.base; 3571 } 3572 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3573 } 3574 3575 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3576 { 3577 struct vcpu_vmx *vmx = to_vmx(vcpu); 3578 int ar; 3579 3580 if (unlikely(vmx->rmode.vm86_active)) 3581 return 0; 3582 3583 if (no_cache) 3584 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3585 else 3586 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3587 return VMX_AR_DPL(ar); 3588 } 3589 3590 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3591 { 3592 return __vmx_get_cpl(vcpu, false); 3593 } 3594 3595 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3596 { 3597 return __vmx_get_cpl(vcpu, true); 3598 } 3599 3600 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3601 { 3602 u32 ar; 3603 3604 ar = var->type & 15; 3605 ar |= (var->s & 1) << 4; 3606 ar |= (var->dpl & 3) << 5; 3607 ar |= (var->present & 1) << 7; 3608 ar |= (var->avl & 1) << 12; 3609 ar |= (var->l & 1) << 13; 3610 ar |= (var->db & 1) << 14; 3611 ar |= (var->g & 1) << 15; 3612 ar |= (var->unusable || !var->present) << 16; 3613 3614 return ar; 3615 } 3616 3617 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3618 { 3619 struct vcpu_vmx *vmx = to_vmx(vcpu); 3620 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3621 3622 vmx_segment_cache_clear(vmx); 3623 3624 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3625 vmx->rmode.segs[seg] = *var; 3626 if (seg == VCPU_SREG_TR) 3627 vmcs_write16(sf->selector, var->selector); 3628 else if (var->s) 3629 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3630 return; 3631 } 3632 3633 
vmcs_writel(sf->base, var->base); 3634 vmcs_write32(sf->limit, var->limit); 3635 vmcs_write16(sf->selector, var->selector); 3636 3637 /* 3638 * Fix the "Accessed" bit in AR field of segment registers for older 3639 * qemu binaries. 3640 * IA32 arch specifies that at the time of processor reset the 3641 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3642 * is setting it to 0 in the userland code. This causes invalid guest 3643 * state vmexit when "unrestricted guest" mode is turned on. 3644 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3645 * tree. Newer qemu binaries with that qemu fix would not need this 3646 * kvm hack. 3647 */ 3648 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3649 var->type |= 0x1; /* Accessed */ 3650 3651 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3652 } 3653 3654 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3655 { 3656 __vmx_set_segment(vcpu, var, seg); 3657 3658 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3659 } 3660 3661 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3662 { 3663 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3664 3665 *db = (ar >> 14) & 1; 3666 *l = (ar >> 13) & 1; 3667 } 3668 3669 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3670 { 3671 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3672 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3673 } 3674 3675 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3676 { 3677 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3678 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3679 } 3680 3681 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3682 { 3683 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3684 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3685 } 3686 3687 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3688 { 3689 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3690 vmcs_writel(GUEST_GDTR_BASE, 
dt->address); 3691 } 3692 3693 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3694 { 3695 struct kvm_segment var; 3696 u32 ar; 3697 3698 vmx_get_segment(vcpu, &var, seg); 3699 var.dpl = 0x3; 3700 if (seg == VCPU_SREG_CS) 3701 var.type = 0x3; 3702 ar = vmx_segment_access_rights(&var); 3703 3704 if (var.base != (var.selector << 4)) 3705 return false; 3706 if (var.limit != 0xffff) 3707 return false; 3708 if (ar != 0xf3) 3709 return false; 3710 3711 return true; 3712 } 3713 3714 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3715 { 3716 struct kvm_segment cs; 3717 unsigned int cs_rpl; 3718 3719 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3720 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3721 3722 if (cs.unusable) 3723 return false; 3724 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3725 return false; 3726 if (!cs.s) 3727 return false; 3728 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3729 if (cs.dpl > cs_rpl) 3730 return false; 3731 } else { 3732 if (cs.dpl != cs_rpl) 3733 return false; 3734 } 3735 if (!cs.present) 3736 return false; 3737 3738 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3739 return true; 3740 } 3741 3742 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3743 { 3744 struct kvm_segment ss; 3745 unsigned int ss_rpl; 3746 3747 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3748 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3749 3750 if (ss.unusable) 3751 return true; 3752 if (ss.type != 3 && ss.type != 7) 3753 return false; 3754 if (!ss.s) 3755 return false; 3756 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3757 return false; 3758 if (!ss.present) 3759 return false; 3760 3761 return true; 3762 } 3763 3764 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3765 { 3766 struct kvm_segment var; 3767 unsigned int rpl; 3768 3769 vmx_get_segment(vcpu, &var, seg); 3770 rpl = var.selector & SEGMENT_RPL_MASK; 3771 3772 if (var.unusable) 3773 return true; 3774 if (!var.s) 
3775 return false; 3776 if (!var.present) 3777 return false; 3778 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3779 if (var.dpl < rpl) /* DPL < RPL */ 3780 return false; 3781 } 3782 3783 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3784 * rights flags 3785 */ 3786 return true; 3787 } 3788 3789 static bool tr_valid(struct kvm_vcpu *vcpu) 3790 { 3791 struct kvm_segment tr; 3792 3793 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3794 3795 if (tr.unusable) 3796 return false; 3797 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3798 return false; 3799 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3800 return false; 3801 if (!tr.present) 3802 return false; 3803 3804 return true; 3805 } 3806 3807 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3808 { 3809 struct kvm_segment ldtr; 3810 3811 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3812 3813 if (ldtr.unusable) 3814 return true; 3815 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3816 return false; 3817 if (ldtr.type != 2) 3818 return false; 3819 if (!ldtr.present) 3820 return false; 3821 3822 return true; 3823 } 3824 3825 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3826 { 3827 struct kvm_segment cs, ss; 3828 3829 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3830 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3831 3832 return ((cs.selector & SEGMENT_RPL_MASK) == 3833 (ss.selector & SEGMENT_RPL_MASK)); 3834 } 3835 3836 /* 3837 * Check if guest state is valid. Returns true if valid, false if 3838 * not. 
3839 * We assume that registers are always usable 3840 */ 3841 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3842 { 3843 /* real mode guest state checks */ 3844 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3845 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3846 return false; 3847 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3848 return false; 3849 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3850 return false; 3851 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3852 return false; 3853 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3854 return false; 3855 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3856 return false; 3857 } else { 3858 /* protected mode guest state checks */ 3859 if (!cs_ss_rpl_check(vcpu)) 3860 return false; 3861 if (!code_segment_valid(vcpu)) 3862 return false; 3863 if (!stack_segment_valid(vcpu)) 3864 return false; 3865 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3866 return false; 3867 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3868 return false; 3869 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3870 return false; 3871 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3872 return false; 3873 if (!tr_valid(vcpu)) 3874 return false; 3875 if (!ldtr_valid(vcpu)) 3876 return false; 3877 } 3878 /* TODO: 3879 * - Add checks on RIP 3880 * - Add checks on RFLAGS 3881 */ 3882 3883 return true; 3884 } 3885 3886 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3887 { 3888 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3889 u16 data; 3890 int i; 3891 3892 for (i = 0; i < 3; i++) { 3893 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3894 return -EFAULT; 3895 } 3896 3897 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3898 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3899 return -EFAULT; 3900 3901 data = ~0; 3902 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3903 return -EFAULT; 3904 3905 return 0; 3906 } 3907 3908 static int 
init_rmode_identity_map(struct kvm *kvm) 3909 { 3910 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3911 int i, r = 0; 3912 void __user *uaddr; 3913 u32 tmp; 3914 3915 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 3916 mutex_lock(&kvm->slots_lock); 3917 3918 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3919 goto out; 3920 3921 if (!kvm_vmx->ept_identity_map_addr) 3922 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3923 3924 uaddr = __x86_set_memory_region(kvm, 3925 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3926 kvm_vmx->ept_identity_map_addr, 3927 PAGE_SIZE); 3928 if (IS_ERR(uaddr)) { 3929 r = PTR_ERR(uaddr); 3930 goto out; 3931 } 3932 3933 /* Set up identity-mapping pagetable for EPT in real mode */ 3934 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3935 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3936 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3937 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3938 r = -EFAULT; 3939 goto out; 3940 } 3941 } 3942 kvm_vmx->ept_identity_pagetable_done = true; 3943 3944 out: 3945 mutex_unlock(&kvm->slots_lock); 3946 return r; 3947 } 3948 3949 static void seg_setup(int seg) 3950 { 3951 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3952 unsigned int ar; 3953 3954 vmcs_write16(sf->selector, 0); 3955 vmcs_writel(sf->base, 0); 3956 vmcs_write32(sf->limit, 0xffff); 3957 ar = 0x93; 3958 if (seg == VCPU_SREG_CS) 3959 ar |= 0x08; /* code segment */ 3960 3961 vmcs_write32(sf->ar_bytes, ar); 3962 } 3963 3964 int allocate_vpid(void) 3965 { 3966 int vpid; 3967 3968 if (!enable_vpid) 3969 return 0; 3970 spin_lock(&vmx_vpid_lock); 3971 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3972 if (vpid < VMX_NR_VPIDS) 3973 __set_bit(vpid, vmx_vpid_bitmap); 3974 else 3975 vpid = 0; 3976 spin_unlock(&vmx_vpid_lock); 3977 return vpid; 3978 } 3979 3980 void free_vpid(int vpid) 3981 { 3982 if (!enable_vpid || vpid == 0) 3983 return; 3984 spin_lock(&vmx_vpid_lock); 
3985 __clear_bit(vpid, vmx_vpid_bitmap); 3986 spin_unlock(&vmx_vpid_lock); 3987 } 3988 3989 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 3990 { 3991 /* 3992 * When KVM is a nested hypervisor on top of Hyper-V and uses 3993 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 3994 * bitmap has changed. 3995 */ 3996 if (kvm_is_using_evmcs()) { 3997 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 3998 3999 if (evmcs->hv_enlightenments_control.msr_bitmap) 4000 evmcs->hv_clean_fields &= 4001 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 4002 } 4003 4004 vmx->nested.force_msr_bitmap_recalc = true; 4005 } 4006 4007 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4008 { 4009 struct vcpu_vmx *vmx = to_vmx(vcpu); 4010 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4011 int idx; 4012 4013 if (!cpu_has_vmx_msr_bitmap()) 4014 return; 4015 4016 vmx_msr_bitmap_l01_changed(vmx); 4017 4018 /* 4019 * Mark the desired intercept state in shadow bitmap, this is needed 4020 * for resync when the MSR filters change. 
4021 */ 4022 idx = vmx_get_passthrough_msr_slot(msr); 4023 if (idx >= 0) { 4024 if (type & MSR_TYPE_R) 4025 clear_bit(idx, vmx->shadow_msr_intercept.read); 4026 if (type & MSR_TYPE_W) 4027 clear_bit(idx, vmx->shadow_msr_intercept.write); 4028 } 4029 4030 if ((type & MSR_TYPE_R) && 4031 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 4032 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4033 type &= ~MSR_TYPE_R; 4034 } 4035 4036 if ((type & MSR_TYPE_W) && 4037 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 4038 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4039 type &= ~MSR_TYPE_W; 4040 } 4041 4042 if (type & MSR_TYPE_R) 4043 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4044 4045 if (type & MSR_TYPE_W) 4046 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4047 } 4048 4049 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4050 { 4051 struct vcpu_vmx *vmx = to_vmx(vcpu); 4052 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4053 int idx; 4054 4055 if (!cpu_has_vmx_msr_bitmap()) 4056 return; 4057 4058 vmx_msr_bitmap_l01_changed(vmx); 4059 4060 /* 4061 * Mark the desired intercept state in shadow bitmap, this is needed 4062 * for resync when the MSR filter changes. 4063 */ 4064 idx = vmx_get_passthrough_msr_slot(msr); 4065 if (idx >= 0) { 4066 if (type & MSR_TYPE_R) 4067 set_bit(idx, vmx->shadow_msr_intercept.read); 4068 if (type & MSR_TYPE_W) 4069 set_bit(idx, vmx->shadow_msr_intercept.write); 4070 } 4071 4072 if (type & MSR_TYPE_R) 4073 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4074 4075 if (type & MSR_TYPE_W) 4076 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4077 } 4078 4079 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4080 { 4081 /* 4082 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4083 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4084 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 
4085 */ 4086 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4087 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4088 struct vcpu_vmx *vmx = to_vmx(vcpu); 4089 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4090 u8 mode; 4091 4092 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4093 return; 4094 4095 if (cpu_has_secondary_exec_ctrls() && 4096 (secondary_exec_controls_get(vmx) & 4097 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4098 mode = MSR_BITMAP_MODE_X2APIC; 4099 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4100 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4101 } else { 4102 mode = 0; 4103 } 4104 4105 if (mode == vmx->x2apic_msr_bitmap_mode) 4106 return; 4107 4108 vmx->x2apic_msr_bitmap_mode = mode; 4109 4110 /* 4111 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4112 * registers (0x840 and above) intercepted, KVM doesn't support them. 4113 * Intercept all writes by default and poke holes as needed. Pass 4114 * through reads for all valid registers by default in x2APIC+APICv 4115 * mode, only the current timer count needs on-demand emulation by KVM. 4116 */ 4117 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4118 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4119 else 4120 msr_bitmap[read_idx] = ~0ull; 4121 msr_bitmap[write_idx] = ~0ull; 4122 4123 /* 4124 * TPR reads and writes can be virtualized even if virtual interrupt 4125 * delivery is not in use. 
4126 */ 4127 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4128 !(mode & MSR_BITMAP_MODE_X2APIC)); 4129 4130 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4131 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4132 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4133 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4134 if (enable_ipiv) 4135 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4136 } 4137 } 4138 4139 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4140 { 4141 struct vcpu_vmx *vmx = to_vmx(vcpu); 4142 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4143 u32 i; 4144 4145 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4146 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4147 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4148 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4149 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4150 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4151 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4152 } 4153 } 4154 4155 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4156 { 4157 struct vcpu_vmx *vmx = to_vmx(vcpu); 4158 u32 i; 4159 4160 if (!cpu_has_vmx_msr_bitmap()) 4161 return; 4162 4163 /* 4164 * Redo intercept permissions for MSRs that KVM is passing through to 4165 * the guest. Disabling interception will check the new MSR filter and 4166 * ensure that KVM enables interception if usersepace wants to filter 4167 * the MSR. MSRs that KVM is already intercepting don't need to be 4168 * refreshed since KVM is going to intercept them regardless of what 4169 * userspace wants. 
4170 */ 4171 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4172 u32 msr = vmx_possible_passthrough_msrs[i]; 4173 4174 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4175 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4176 4177 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4178 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4179 } 4180 4181 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4182 if (vmx_pt_mode_is_host_guest()) 4183 pt_update_intercept_for_msr(vcpu); 4184 } 4185 4186 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 4187 int pi_vec) 4188 { 4189 #ifdef CONFIG_SMP 4190 if (vcpu->mode == IN_GUEST_MODE) { 4191 /* 4192 * The vector of the virtual has already been set in the PIR. 4193 * Send a notification event to deliver the virtual interrupt 4194 * unless the vCPU is the currently running vCPU, i.e. the 4195 * event is being sent from a fastpath VM-Exit handler, in 4196 * which case the PIR will be synced to the vIRR before 4197 * re-entering the guest. 4198 * 4199 * When the target is not the running vCPU, the following 4200 * possibilities emerge: 4201 * 4202 * Case 1: vCPU stays in non-root mode. Sending a notification 4203 * event posts the interrupt to the vCPU. 4204 * 4205 * Case 2: vCPU exits to root mode and is still runnable. The 4206 * PIR will be synced to the vIRR before re-entering the guest. 4207 * Sending a notification event is ok as the host IRQ handler 4208 * will ignore the spurious event. 4209 * 4210 * Case 3: vCPU exits to root mode and is blocked. vcpu_block() 4211 * has already synced PIR to vIRR and never blocks the vCPU if 4212 * the vIRR is not empty. Therefore, a blocked vCPU here does 4213 * not wait for any requested interrupts in PIR, and sending a 4214 * notification event also results in a benign, spurious event. 
4215 */ 4216 4217 if (vcpu != kvm_get_running_vcpu()) 4218 __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 4219 return; 4220 } 4221 #endif 4222 /* 4223 * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 4224 * otherwise do nothing as KVM will grab the highest priority pending 4225 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 4226 */ 4227 kvm_vcpu_wake_up(vcpu); 4228 } 4229 4230 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4231 int vector) 4232 { 4233 struct vcpu_vmx *vmx = to_vmx(vcpu); 4234 4235 /* 4236 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4237 * and freed, and must not be accessed outside of vcpu->mutex. The 4238 * vCPU's cached PI NV is valid if and only if posted interrupts 4239 * enabled in its vmcs12, i.e. checking the vector also checks that 4240 * L1 has enabled posted interrupts for L2. 4241 */ 4242 if (is_guest_mode(vcpu) && 4243 vector == vmx->nested.posted_intr_nv) { 4244 /* 4245 * If a posted intr is not recognized by hardware, 4246 * we will accomplish it in the next vmentry. 4247 */ 4248 vmx->nested.pi_pending = true; 4249 kvm_make_request(KVM_REQ_EVENT, vcpu); 4250 4251 /* 4252 * This pairs with the smp_mb_*() after setting vcpu->mode in 4253 * vcpu_enter_guest() to guarantee the vCPU sees the event 4254 * request if triggering a posted interrupt "fails" because 4255 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4256 * the smb_wmb() in kvm_make_request() only ensures everything 4257 * done before making the request is visible when the request 4258 * is visible, it doesn't ensure ordering between the store to 4259 * vcpu->requests and the load from vcpu->mode. 4260 */ 4261 smp_mb__after_atomic(); 4262 4263 /* the PIR and ON have been set by L1. */ 4264 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4265 return 0; 4266 } 4267 return -1; 4268 } 4269 /* 4270 * Send interrupt to vcpu via posted interrupt way. 4271 * 1. 
If target vcpu is running(non-root mode), send posted interrupt 4272 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4273 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4274 * interrupt from PIR in next vmentry. 4275 */ 4276 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4277 { 4278 struct vcpu_vmx *vmx = to_vmx(vcpu); 4279 int r; 4280 4281 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4282 if (!r) 4283 return 0; 4284 4285 /* Note, this is called iff the local APIC is in-kernel. */ 4286 if (!vcpu->arch.apic->apicv_active) 4287 return -1; 4288 4289 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4290 return 0; 4291 4292 /* If a previous notification has sent the IPI, nothing to do. */ 4293 if (pi_test_and_set_on(&vmx->pi_desc)) 4294 return 0; 4295 4296 /* 4297 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() 4298 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is 4299 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4300 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 4301 */ 4302 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4303 return 0; 4304 } 4305 4306 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4307 int trig_mode, int vector) 4308 { 4309 struct kvm_vcpu *vcpu = apic->vcpu; 4310 4311 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4312 kvm_lapic_set_irr(vector, apic); 4313 kvm_make_request(KVM_REQ_EVENT, vcpu); 4314 kvm_vcpu_kick(vcpu); 4315 } else { 4316 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4317 trig_mode, vector); 4318 } 4319 } 4320 4321 /* 4322 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4323 * will not change in the lifetime of the guest. 4324 * Note that host-state that does change is set elsewhere. E.g., host-state 4325 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 
4326 */ 4327 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4328 { 4329 u32 low32, high32; 4330 unsigned long tmpl; 4331 unsigned long cr0, cr3, cr4; 4332 4333 cr0 = read_cr0(); 4334 WARN_ON(cr0 & X86_CR0_TS); 4335 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4336 4337 /* 4338 * Save the most likely value for this task's CR3 in the VMCS. 4339 * We can't use __get_current_cr3_fast() because we're not atomic. 4340 */ 4341 cr3 = __read_cr3(); 4342 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4343 vmx->loaded_vmcs->host_state.cr3 = cr3; 4344 4345 /* Save the most likely value for this task's CR4 in the VMCS. */ 4346 cr4 = cr4_read_shadow(); 4347 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4348 vmx->loaded_vmcs->host_state.cr4 = cr4; 4349 4350 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4351 #ifdef CONFIG_X86_64 4352 /* 4353 * Load null selectors, so we can avoid reloading them in 4354 * vmx_prepare_switch_to_host(), in case userspace uses 4355 * the null selectors too (the expected case). 4356 */ 4357 vmcs_write16(HOST_DS_SELECTOR, 0); 4358 vmcs_write16(HOST_ES_SELECTOR, 0); 4359 #else 4360 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4361 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4362 #endif 4363 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4364 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4365 4366 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4367 4368 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4369 4370 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4371 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4372 4373 /* 4374 * SYSENTER is used for 32-bit system calls on either 32-bit or 4375 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4376 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4377 * have already done so!). 
4378 */ 4379 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4380 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4381 4382 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4383 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4384 4385 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4386 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4387 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4388 } 4389 4390 if (cpu_has_load_ia32_efer()) 4391 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4392 } 4393 4394 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4395 { 4396 struct kvm_vcpu *vcpu = &vmx->vcpu; 4397 4398 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4399 ~vcpu->arch.cr4_guest_rsvd_bits; 4400 if (!enable_ept) { 4401 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4402 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4403 } 4404 if (is_guest_mode(&vmx->vcpu)) 4405 vcpu->arch.cr4_guest_owned_bits &= 4406 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4407 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4408 } 4409 4410 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4411 { 4412 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4413 4414 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4415 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4416 4417 if (!enable_vnmi) 4418 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4419 4420 if (!enable_preemption_timer) 4421 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4422 4423 return pin_based_exec_ctrl; 4424 } 4425 4426 static u32 vmx_vmentry_ctrl(void) 4427 { 4428 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4429 4430 if (vmx_pt_mode_is_system()) 4431 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4432 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4433 /* 4434 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 
4435 */ 4436 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4437 VM_ENTRY_LOAD_IA32_EFER | 4438 VM_ENTRY_IA32E_MODE); 4439 4440 return vmentry_ctrl; 4441 } 4442 4443 static u32 vmx_vmexit_ctrl(void) 4444 { 4445 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4446 4447 /* 4448 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4449 * nested virtualization and thus allowed to be set in vmcs12. 4450 */ 4451 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4452 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4453 4454 if (vmx_pt_mode_is_system()) 4455 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4456 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4457 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4458 return vmexit_ctrl & 4459 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4460 } 4461 4462 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4463 { 4464 struct vcpu_vmx *vmx = to_vmx(vcpu); 4465 4466 if (is_guest_mode(vcpu)) { 4467 vmx->nested.update_vmcs01_apicv_status = true; 4468 return; 4469 } 4470 4471 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4472 4473 if (kvm_vcpu_apicv_active(vcpu)) { 4474 secondary_exec_controls_setbit(vmx, 4475 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4476 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4477 if (enable_ipiv) 4478 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4479 } else { 4480 secondary_exec_controls_clearbit(vmx, 4481 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4482 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4483 if (enable_ipiv) 4484 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4485 } 4486 4487 vmx_update_msr_bitmap_x2apic(vcpu); 4488 } 4489 4490 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4491 { 4492 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4493 4494 /* 4495 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4496 * vmcs12 and propagated to vmcs02 when set in vmcs12. 
4497 */ 4498 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4499 CPU_BASED_USE_IO_BITMAPS | 4500 CPU_BASED_MONITOR_TRAP_FLAG | 4501 CPU_BASED_PAUSE_EXITING); 4502 4503 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4504 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4505 CPU_BASED_NMI_WINDOW_EXITING); 4506 4507 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4508 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4509 4510 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4511 exec_control &= ~CPU_BASED_TPR_SHADOW; 4512 4513 #ifdef CONFIG_X86_64 4514 if (exec_control & CPU_BASED_TPR_SHADOW) 4515 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4516 CPU_BASED_CR8_STORE_EXITING); 4517 else 4518 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4519 CPU_BASED_CR8_LOAD_EXITING; 4520 #endif 4521 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4522 if (enable_ept) 4523 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4524 CPU_BASED_CR3_STORE_EXITING | 4525 CPU_BASED_INVLPG_EXITING); 4526 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4527 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4528 CPU_BASED_MONITOR_EXITING); 4529 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4530 exec_control &= ~CPU_BASED_HLT_EXITING; 4531 return exec_control; 4532 } 4533 4534 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4535 { 4536 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4537 4538 /* 4539 * IPI virtualization relies on APICv. Disable IPI virtualization if 4540 * APICv is inhibited. 4541 */ 4542 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4543 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4544 4545 return exec_control; 4546 } 4547 4548 /* 4549 * Adjust a single secondary execution control bit to intercept/allow an 4550 * instruction in the guest. This is usually done based on whether or not a 4551 * feature has been exposed to the guest in order to correctly emulate faults. 
4552 */ 4553 static inline void 4554 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4555 u32 control, bool enabled, bool exiting) 4556 { 4557 /* 4558 * If the control is for an opt-in feature, clear the control if the 4559 * feature is not exposed to the guest, i.e. not enabled. If the 4560 * control is opt-out, i.e. an exiting control, clear the control if 4561 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4562 * disabled for the associated instruction. Note, the caller is 4563 * responsible presetting exec_control to set all supported bits. 4564 */ 4565 if (enabled == exiting) 4566 *exec_control &= ~control; 4567 4568 /* 4569 * Update the nested MSR settings so that a nested VMM can/can't set 4570 * controls for features that are/aren't exposed to the guest. 4571 */ 4572 if (nested && 4573 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4574 /* 4575 * All features that can be added or removed to VMX MSRs must 4576 * be supported in the first place for nested virtualization. 4577 */ 4578 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4579 enabled = false; 4580 4581 if (enabled) 4582 vmx->nested.msrs.secondary_ctls_high |= control; 4583 else 4584 vmx->nested.msrs.secondary_ctls_high &= ~control; 4585 } 4586 } 4587 4588 /* 4589 * Wrapper macro for the common case of adjusting a secondary execution control 4590 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4591 * verifies that the control is actually supported by KVM and hardware. 
*/
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting)	\
({												\
	struct kvm_vcpu *__vcpu = &(vmx)->vcpu;							\
	bool __enabled;										\
												\
	if (cpu_has_vmx_##name()) {								\
		__enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name);			\
		vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
						  __enabled, exiting);				\
	}											\
})

/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)

/*
 * Compute the secondary processor-based VM-execution controls for @vmx.
 *
 * Starts from the baseline in vmcs_config.cpu_based_2nd_exec_ctrl and clears
 * every control that does not apply to this vCPU/VM or that is toggled
 * elsewhere at runtime.  Note the side effect: when EPT is disabled the
 * global enable_unrestricted_guest is forced to 0.
 */
static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

	if (vmx_pt_mode_is_system())
		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
		enable_unrestricted_guest = 0;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (kvm_pause_in_guest(vmx->vcpu.kvm))
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

	/*
	 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
	 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
	 */
	exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;

	/*
	 * SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.
	 */
	exec_control &= ~SECONDARY_EXEC_DESC;

	/*
	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	 * (handle_vmptrld).  We can NOT enable shadow_vmcs here because we
	 * don't have yet a current VMCS12.
	 */
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	/*
	 * PML is enabled/disabled when dirty logging of memslots changes, but
	 * it needs to be set here when dirty logging is already active, e.g.
	 * if this vCPU was created after dirty logging was enabled.
	 */
	if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);

	/*
	 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
	 * feature is exposed to the guest.  This creates a virtualization hole
	 * if both are supported in hardware but only one is exposed to the
	 * guest, but letting the guest execute RDTSCP or RDPID when either one
	 * is advertised is preferable to emulating the advertised instruction
	 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
	 */
	if (cpu_has_vmx_rdtscp()) {
		bool rdpid_or_rdtscp_enabled =
			guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
			guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);

		vmx_adjust_secondary_exec_control(vmx, &exec_control,
						  SECONDARY_EXEC_ENABLE_RDTSCP,
						  rdpid_or_rdtscp_enabled, false);
	}

	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);

	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);

	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
				    ENABLE_USR_WAIT_PAUSE, false);

	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
		exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;

	if (!kvm_notify_vmexit_enabled(vcpu->kvm))
		exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;

	return exec_control;
}

/*
 * Page allocation order needed for a PID table with one entry per possible
 * vCPU ID (kvm->arch.max_vcpu_ids entries).
 */
static inline int vmx_get_pid_table_order(struct kvm *kvm)
{
	return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
}

/*
 * Allocate the VM's zero-initialized PID table if it does not exist yet.
 *
 * Nothing is allocated without an in-kernel irqchip or when enable_ipiv is
 * off.  Returns 0 on success (or when nothing had to be done), -ENOMEM if the
 * page allocation fails.
 */
static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
{
	struct page *pages;
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

	if (!irqchip_in_kernel(kvm) || !enable_ipiv)
		return 0;

	/* The table is per-VM; a previous call already allocated it. */
	if (kvm_vmx->pid_table)
		return 0;

	pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
			    vmx_get_pid_table_order(kvm));
	if (!pages)
		return -ENOMEM;

	kvm_vmx->pid_table = (void *)page_address(pages);
	return 0;
}

/* Pre-creation hook: makes sure the PID table exists. */
int vmx_vcpu_precreate(struct kvm *kvm)
{
	return vmx_alloc_ipiv_pid_table(kvm);
}

/* Value written to the XSS-exiting bitmap by init_vmcs(). */
#define VMX_XSS_EXIT_BITMAP 0

/*
 * Program the initial contents of the vCPU's VMCS: MSR bitmap and link
 * pointer, the pin/primary/secondary/tertiary execution controls, APICv and
 * IPI-virtualization fields, PLE and notify windows, constant host state,
 * MSR autoload areas, VM-exit/VM-entry controls, CR0/CR4 guest/host masks,
 * VPID, PML, PT and TPR-shadow state, and finally the user-return MSRs.
 */
static void init_vmcs(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);

	if (nested)
		nested_vmx_set_vmcs_shadowing_bitmap();

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */

	/* Control */
	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));

	exec_controls_set(vmx, vmx_exec_control(vmx));

	if (cpu_has_secondary_exec_ctrls()) {
		secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
		if (vmx->ve_info)
			vmcs_write64(VE_INFORMATION_ADDRESS,
				     __pa(vmx->ve_info));
	}

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));

	if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);

		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
	}

	if (vmx_can_use_ipiv(&vmx->vcpu)) {
		vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
		vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1);
	}

	if (!kvm_pause_in_guest(kvm)) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
	}

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
	vmx_set_constant_host_state(vmx);
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	/* MSR autoload areas start out empty; only their addresses are set. */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());

	/* 22.2.1, 20.8.1 */
	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());

	vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);

	set_cr4_guest_host_mask(vmx);

	if (vmx->vpid != 0)
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

	if (cpu_has_vmx_xsaves())
		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}

	vmx_write_encls_bitmap(&vmx->vcpu, NULL);

	if (vmx_pt_mode_is_host_guest()) {
		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
		/* Bit[6~0] are forced to 1, writes are ignored. */
		vmx->pt_desc.guest.output_mask = 0x7F;
		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
	}

	vmcs_write32(GUEST_SYSENTER_CS, 0);
	vmcs_writel(GUEST_SYSENTER_ESP, 0);
	vmcs_writel(GUEST_SYSENTER_EIP, 0);
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	if (cpu_has_vmx_tpr_shadow()) {
		/* Default to 0, then point at the APIC page if a shadow is needed. */
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
		if (cpu_need_tpr_shadow(&vmx->vcpu))
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				     __pa(vmx->vcpu.arch.apic->regs));
		vmcs_write32(TPR_THRESHOLD, 0);
	}

	vmx_setup_uret_msrs(vmx);
}

/*
 * Reset work that vmx_vcpu_reset() performs only when it is not handling an
 * INIT event: (re)initialize the VMCS, the nested VMX MSRs and pointers, the
 * SGX launch-enclave key hash, and the posted-interrupt descriptor.
 */
static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	init_vmcs(vmx);

	if (nested &&
	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
		memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));

	vcpu_setup_sgx_lepubkeyhash(vcpu);

	vmx->nested.posted_intr_nv = -1;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	vmx->nested.current_vmptr = INVALID_GPA;

#ifdef CONFIG_KVM_HYPERV
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
#endif

	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
		vcpu->arch.microcode_version = 0x100000000ULL;
	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

	/*
	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
	 * or POSTED_INTR_WAKEUP_VECTOR.
	 */
	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
	__pi_set_sn(&vmx->pi_desc);
}

/*
 * Reset the VMX-specific vCPU state.  @init_event is true for an INIT, in
 * which case the one-time setup in __vmx_vcpu_reset() is skipped.  The guest
 * segment, descriptor-table and event-injection fields are always rewritten.
 */
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!init_event)
		__vmx_vcpu_reset(vcpu);

	vmx->rmode.vm86_active = 0;
	vmx->spec_ctrl = 0;

	vmx->msr_ia32_umwait_control = 0;

	vmx->hv_deadline_tsc = -1;
	kvm_set_cr8(vcpu, 0);

	/* CS gets selector 0xf000 and base 0xffff0000 on top of seg_setup(). */
	seg_setup(VCPU_SREG_CS);
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	/* The cached segment values are stale after the writes above. */
	vmx_segment_cache_clear(vmx);
	kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);

	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
	if (kvm_mpx_supported())
		vmcs_write64(GUEST_BNDCFGS, 0);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	vpid_sync_context(vmx->vpid);

	vmx_update_fb_clear_dis(vcpu, vmx);
}

/* Request a VM-exit as soon as the guest can take an interrupt. */
void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}

/*
 * Request an NMI-window exit.  Falls back to an interrupt-window exit when
 * virtual NMIs are disabled or while the guest is in an STI shadow.
 */
void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
	if (!enable_vnmi ||
	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
		vmx_enable_irq_window(vcpu);
		return;
	}

	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}

/*
 * Inject the interrupt described by vcpu->arch.interrupt.  With vm86 active
 * it is injected through the real-mode path; otherwise it is written to the
 * VM-entry interruption-information field (with the instruction length for
 * soft interrupts).
 */
void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;

	trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);

	++vcpu->stat.irq_injections;
	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
		return;
	}
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

	vmx_clear_hlt(vcpu);
}

/*
 * Inject an NMI, either through the real-mode path (vm86 active) or via the
 * VM-entry interruption-information field.
 */
void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		/*
		 * Tracking the NMI-blocked state in software is built upon
		 * finding the next open IRQ window. This, in turn, depends on
		 * well-behaving guests: They have to keep IRQs disabled at
		 * least as long as the NMI handler runs. Otherwise we may
		 * cause NMI nesting, maybe breaking the guest. But as this is
		 * highly unlikely, we can live with the residual risk.
		 */
		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
		vmx->loaded_vmcs->vnmi_blocked_time = 0;
	}

	++vcpu->stat.nmi_injections;
	vmx->loaded_vmcs->nmi_known_unmasked = false;

	if (vmx->rmode.vm86_active) {
		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
		return;
	}

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

	vmx_clear_hlt(vcpu);
}

/*
 * Report whether NMIs are masked.  Uses the software flag without vNMI;
 * otherwise reads the interruptibility state unless NMIs are already known
 * to be unmasked, and caches an "unmasked" result.
 */
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool masked;

	if (!enable_vnmi)
		return vmx->loaded_vmcs->soft_vnmi_blocked;
	if (vmx->loaded_vmcs->nmi_known_unmasked)
		return false;
	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	return masked;
}

/*
 * Set or clear NMI masking: via the software flag without vNMI (resetting
 * the blocked-time counter on a change), via the NMI-blocking bit of the
 * interruptibility state otherwise.
 */
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
			vmx->loaded_vmcs->vnmi_blocked_time = 0;
		}
	} else {
		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
		if (masked)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
					GUEST_INTR_STATE_NMI);
	}
}

/*
 * Return true if an NMI cannot currently be delivered.  Never blocked while
 * in guest mode with NMI exiting requested by L1; otherwise blocked by the
 * software vNMI flag or by MOV-SS/STI/NMI blocking in the guest.
 */
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
		return false;

	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
		return true;

	return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
		 GUEST_INTR_STATE_NMI));
}

/*
 * Returns -EBUSY if the decision must be retried later, otherwise non-zero
 * when an NMI may be delivered now and 0 when it is blocked.
 */
int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return -EBUSY;

	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
		return -EBUSY;

	return !vmx_nmi_blocked(vcpu);
}

/*
 * Raw interrupt-blocked check: true if RFLAGS.IF is clear or the guest is in
 * an STI or MOV-SS interrupt shadow.
 */
bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
	       (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}

/*
 * Like __vmx_interrupt_blocked(), except that interrupts are never reported
 * as blocked while in guest mode with external-interrupt exiting requested.
 */
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
		return false;

	return __vmx_interrupt_blocked(vcpu);
}

/*
 * Returns -EBUSY if the decision must be retried later, otherwise non-zero
 * when an IRQ may be delivered now and 0 when it is blocked.
 */
int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return -EBUSY;

	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
		return -EBUSY;

	return !vmx_interrupt_blocked(vcpu);
}

/*
 * Install the three-page private TSS memslot at @addr and initialize it.
 * A no-op returning 0 when unrestricted guest is enabled.  Returns the error
 * from the memslot update, or the result of init_rmode_tss().
 */
int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	void __user *ret;

	if (enable_unrestricted_guest)
		return 0;

	mutex_lock(&kvm->slots_lock);
	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				      PAGE_SIZE * 3);
	mutex_unlock(&kvm->slots_lock);

	if (IS_ERR(ret))
		return PTR_ERR(ret);

	to_kvm_vmx(kvm)->tss_addr = addr;

	return init_rmode_tss(kvm, ret);
}

/* Record the EPT identity-map address for the VM; always returns 0. */
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
	return 0;
}

/*
 * Classify exception vector @vec for the real-mode exception path.  #BP and
 * #DB yield false when the matching guest-debug feature is in use; the other
 * listed vectors yield true, everything else false.  For #BP the exit
 * instruction length is refreshed as a side effect.
 */
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
	switch (vec) {
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject the exception
		 * from user space while in guest debugging mode.
		 */
		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		fallthrough;
	case DB_VECTOR:
		return !(vcpu->guest_debug &
			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
	case DE_VECTOR:
	case OF_VECTOR:
	case BR_VECTOR:
	case UD_VECTOR:
	case DF_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
	case MF_VECTOR:
		return true;
	}
	return false;
}

/*
 * Handle an exception taken in emulated real mode.  #GP/#SS with a zero
 * error code are emulated (returning 0 if emulation fails); every other
 * exception is queued for the guest.
 */
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * Instruction with address size override prefix opcode 0x67
	 * Cause the #SS fault with 0 error code in VM86 mode.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
		if (kvm_emulate_instruction(vcpu, 0)) {
			/* The emulated instruction asked for a halt. */
			if (vcpu->arch.halt_request) {
				vcpu->arch.halt_request = 0;
				return kvm_emulate_halt_noskip(vcpu);
			}
			return 1;
		}
		return 0;
	}

	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
}

/* Machine-check exit handler; the real work happens elsewhere. */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	/* handled by vmx_vcpu_run() */
	return 1;
}

/*
 * If the host has split lock detection disabled, then #AC is
 * unconditionally injected into the guest, which is the pre split lock
 * detection behaviour.
5201 * 5202 * If the host has split lock detection enabled then #AC is 5203 * only injected into the guest when: 5204 * - Guest CPL == 3 (user mode) 5205 * - Guest has #AC detection enabled in CR0 5206 * - Guest EFLAGS has AC bit set 5207 */ 5208 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5209 { 5210 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5211 return true; 5212 5213 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5214 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5215 } 5216 5217 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5218 { 5219 struct vcpu_vmx *vmx = to_vmx(vcpu); 5220 struct kvm_run *kvm_run = vcpu->run; 5221 u32 intr_info, ex_no, error_code; 5222 unsigned long cr2, dr6; 5223 u32 vect_info; 5224 5225 vect_info = vmx->idt_vectoring_info; 5226 intr_info = vmx_get_intr_info(vcpu); 5227 5228 /* 5229 * Machine checks are handled by handle_exception_irqoff(), or by 5230 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5231 * vmx_vcpu_enter_exit(). 5232 */ 5233 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5234 return 1; 5235 5236 /* 5237 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5238 * This ensures the nested_vmx check is not skipped so vmexit can 5239 * be reflected to L1 (when it intercepts #NM) before reaching this 5240 * point. 
5241 */ 5242 if (is_nm_fault(intr_info)) { 5243 kvm_queue_exception(vcpu, NM_VECTOR); 5244 return 1; 5245 } 5246 5247 if (is_invalid_opcode(intr_info)) 5248 return handle_ud(vcpu); 5249 5250 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5251 struct vmx_ve_information *ve_info = vmx->ve_info; 5252 5253 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5254 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5255 dump_vmcs(vcpu); 5256 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5257 return 1; 5258 } 5259 5260 error_code = 0; 5261 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5262 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5263 5264 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5265 WARN_ON_ONCE(!enable_vmware_backdoor); 5266 5267 /* 5268 * VMware backdoor emulation on #GP interception only handles 5269 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5270 * error code on #GP. 5271 */ 5272 if (error_code) { 5273 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5274 return 1; 5275 } 5276 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5277 } 5278 5279 /* 5280 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5281 * MMIO, it is better to report an internal error. 5282 * See the comments in vmx_handle_exit. 
5283 */ 5284 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5285 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5286 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5287 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5288 vcpu->run->internal.ndata = 4; 5289 vcpu->run->internal.data[0] = vect_info; 5290 vcpu->run->internal.data[1] = intr_info; 5291 vcpu->run->internal.data[2] = error_code; 5292 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5293 return 0; 5294 } 5295 5296 if (is_page_fault(intr_info)) { 5297 cr2 = vmx_get_exit_qual(vcpu); 5298 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5299 /* 5300 * EPT will cause page fault only if we need to 5301 * detect illegal GPAs. 5302 */ 5303 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5304 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5305 return 1; 5306 } else 5307 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5308 } 5309 5310 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5311 5312 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5313 return handle_rmode_exception(vcpu, ex_no, error_code); 5314 5315 switch (ex_no) { 5316 case DB_VECTOR: 5317 dr6 = vmx_get_exit_qual(vcpu); 5318 if (!(vcpu->guest_debug & 5319 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5320 /* 5321 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5322 * instruction. ICEBP generates a trap-like #DB, but 5323 * despite its interception control being tied to #DB, 5324 * is an instruction intercept, i.e. the VM-Exit occurs 5325 * on the ICEBP itself. Use the inner "skip" helper to 5326 * avoid single-step #DB and MTF updates, as ICEBP is 5327 * higher priority. Note, skipping ICEBP still clears 5328 * STI and MOVSS blocking. 5329 * 5330 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5331 * if single-step is enabled in RFLAGS and STI or MOVSS 5332 * blocking is active, as the CPU doesn't set the bit 5333 * on VM-Exit due to #DB interception. 
VM-Entry has a 5334 * consistency check that a single-step #DB is pending 5335 * in this scenario as the previous instruction cannot 5336 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5337 * don't modify RFLAGS), therefore the one instruction 5338 * delay when activating single-step breakpoints must 5339 * have already expired. Note, the CPU sets/clears BS 5340 * as appropriate for all other VM-Exits types. 5341 */ 5342 if (is_icebp(intr_info)) 5343 WARN_ON(!skip_emulated_instruction(vcpu)); 5344 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5345 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5346 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5347 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5348 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5349 5350 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5351 return 1; 5352 } 5353 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5354 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5355 fallthrough; 5356 case BP_VECTOR: 5357 /* 5358 * Update instruction length as we may reinject #BP from 5359 * user space while in guest debugging mode. Reading it for 5360 * #DB as well causes no harm, it is not used in that case. 5361 */ 5362 vmx->vcpu.arch.event_exit_inst_len = 5363 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5364 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5365 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5366 kvm_run->debug.arch.exception = ex_no; 5367 break; 5368 case AC_VECTOR: 5369 if (vmx_guest_inject_ac(vcpu)) { 5370 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5371 return 1; 5372 } 5373 5374 /* 5375 * Handle split lock. Depending on detection mode this will 5376 * either warn and disable split lock detection for this 5377 * task or force SIGBUS on it. 
5378 */ 5379 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5380 return 1; 5381 fallthrough; 5382 default: 5383 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5384 kvm_run->ex.exception = ex_no; 5385 kvm_run->ex.error_code = error_code; 5386 break; 5387 } 5388 return 0; 5389 } 5390 5391 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5392 { 5393 ++vcpu->stat.irq_exits; 5394 return 1; 5395 } 5396 5397 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5398 { 5399 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5400 vcpu->mmio_needed = 0; 5401 return 0; 5402 } 5403 5404 static int handle_io(struct kvm_vcpu *vcpu) 5405 { 5406 unsigned long exit_qualification; 5407 int size, in, string; 5408 unsigned port; 5409 5410 exit_qualification = vmx_get_exit_qual(vcpu); 5411 string = (exit_qualification & 16) != 0; 5412 5413 ++vcpu->stat.io_exits; 5414 5415 if (string) 5416 return kvm_emulate_instruction(vcpu, 0); 5417 5418 port = exit_qualification >> 16; 5419 size = (exit_qualification & 7) + 1; 5420 in = (exit_qualification & 8) != 0; 5421 5422 return kvm_fast_pio(vcpu, size, port, in); 5423 } 5424 5425 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5426 { 5427 /* 5428 * Patch in the VMCALL instruction: 5429 */ 5430 hypercall[0] = 0x0f; 5431 hypercall[1] = 0x01; 5432 hypercall[2] = 0xc1; 5433 } 5434 5435 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5436 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5437 { 5438 if (is_guest_mode(vcpu)) { 5439 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5440 unsigned long orig_val = val; 5441 5442 /* 5443 * We get here when L2 changed cr0 in a way that did not change 5444 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5445 * but did change L0 shadowed bits. So we first calculate the 5446 * effective cr0 value that L1 would like to write into the 5447 * hardware. 
It consists of the L2-owned bits from the new 5448 * value combined with the L1-owned bits from L1's guest_cr0. 5449 */ 5450 val = (val & ~vmcs12->cr0_guest_host_mask) | 5451 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5452 5453 if (kvm_set_cr0(vcpu, val)) 5454 return 1; 5455 vmcs_writel(CR0_READ_SHADOW, orig_val); 5456 return 0; 5457 } else { 5458 return kvm_set_cr0(vcpu, val); 5459 } 5460 } 5461 5462 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5463 { 5464 if (is_guest_mode(vcpu)) { 5465 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5466 unsigned long orig_val = val; 5467 5468 /* analogously to handle_set_cr0 */ 5469 val = (val & ~vmcs12->cr4_guest_host_mask) | 5470 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5471 if (kvm_set_cr4(vcpu, val)) 5472 return 1; 5473 vmcs_writel(CR4_READ_SHADOW, orig_val); 5474 return 0; 5475 } else 5476 return kvm_set_cr4(vcpu, val); 5477 } 5478 5479 static int handle_desc(struct kvm_vcpu *vcpu) 5480 { 5481 /* 5482 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5483 * and other code needs to be updated if UMIP can be guest owned. 
5484 */ 5485 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5486 5487 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5488 return kvm_emulate_instruction(vcpu, 0); 5489 } 5490 5491 static int handle_cr(struct kvm_vcpu *vcpu) 5492 { 5493 unsigned long exit_qualification, val; 5494 int cr; 5495 int reg; 5496 int err; 5497 int ret; 5498 5499 exit_qualification = vmx_get_exit_qual(vcpu); 5500 cr = exit_qualification & 15; 5501 reg = (exit_qualification >> 8) & 15; 5502 switch ((exit_qualification >> 4) & 3) { 5503 case 0: /* mov to cr */ 5504 val = kvm_register_read(vcpu, reg); 5505 trace_kvm_cr_write(cr, val); 5506 switch (cr) { 5507 case 0: 5508 err = handle_set_cr0(vcpu, val); 5509 return kvm_complete_insn_gp(vcpu, err); 5510 case 3: 5511 WARN_ON_ONCE(enable_unrestricted_guest); 5512 5513 err = kvm_set_cr3(vcpu, val); 5514 return kvm_complete_insn_gp(vcpu, err); 5515 case 4: 5516 err = handle_set_cr4(vcpu, val); 5517 return kvm_complete_insn_gp(vcpu, err); 5518 case 8: { 5519 u8 cr8_prev = kvm_get_cr8(vcpu); 5520 u8 cr8 = (u8)val; 5521 err = kvm_set_cr8(vcpu, cr8); 5522 ret = kvm_complete_insn_gp(vcpu, err); 5523 if (lapic_in_kernel(vcpu)) 5524 return ret; 5525 if (cr8_prev <= cr8) 5526 return ret; 5527 /* 5528 * TODO: we might be squashing a 5529 * KVM_GUESTDBG_SINGLESTEP-triggered 5530 * KVM_EXIT_DEBUG here. 
5531 */ 5532 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5533 return 0; 5534 } 5535 } 5536 break; 5537 case 2: /* clts */ 5538 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5539 return -EIO; 5540 case 1: /*mov from cr*/ 5541 switch (cr) { 5542 case 3: 5543 WARN_ON_ONCE(enable_unrestricted_guest); 5544 5545 val = kvm_read_cr3(vcpu); 5546 kvm_register_write(vcpu, reg, val); 5547 trace_kvm_cr_read(cr, val); 5548 return kvm_skip_emulated_instruction(vcpu); 5549 case 8: 5550 val = kvm_get_cr8(vcpu); 5551 kvm_register_write(vcpu, reg, val); 5552 trace_kvm_cr_read(cr, val); 5553 return kvm_skip_emulated_instruction(vcpu); 5554 } 5555 break; 5556 case 3: /* lmsw */ 5557 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5558 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5559 kvm_lmsw(vcpu, val); 5560 5561 return kvm_skip_emulated_instruction(vcpu); 5562 default: 5563 break; 5564 } 5565 vcpu->run->exit_reason = 0; 5566 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5567 (int)(exit_qualification >> 4) & 3, cr); 5568 return 0; 5569 } 5570 5571 static int handle_dr(struct kvm_vcpu *vcpu) 5572 { 5573 unsigned long exit_qualification; 5574 int dr, dr7, reg; 5575 int err = 1; 5576 5577 exit_qualification = vmx_get_exit_qual(vcpu); 5578 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5579 5580 /* First, if DR does not exist, trigger UD */ 5581 if (!kvm_require_dr(vcpu, dr)) 5582 return 1; 5583 5584 if (vmx_get_cpl(vcpu) > 0) 5585 goto out; 5586 5587 dr7 = vmcs_readl(GUEST_DR7); 5588 if (dr7 & DR7_GD) { 5589 /* 5590 * As the vm-exit takes precedence over the debug trap, we 5591 * need to emulate the latter, either for the host or the 5592 * guest debugging itself. 
5593 */ 5594 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5595 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5596 vcpu->run->debug.arch.dr7 = dr7; 5597 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5598 vcpu->run->debug.arch.exception = DB_VECTOR; 5599 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5600 return 0; 5601 } else { 5602 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5603 return 1; 5604 } 5605 } 5606 5607 if (vcpu->guest_debug == 0) { 5608 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5609 5610 /* 5611 * No more DR vmexits; force a reload of the debug registers 5612 * and reenter on this instruction. The next vmexit will 5613 * retrieve the full state of the debug registers. 5614 */ 5615 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5616 return 1; 5617 } 5618 5619 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5620 if (exit_qualification & TYPE_MOV_FROM_DR) { 5621 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5622 err = 0; 5623 } else { 5624 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5625 } 5626 5627 out: 5628 return kvm_complete_insn_gp(vcpu, err); 5629 } 5630 5631 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5632 { 5633 get_debugreg(vcpu->arch.db[0], 0); 5634 get_debugreg(vcpu->arch.db[1], 1); 5635 get_debugreg(vcpu->arch.db[2], 2); 5636 get_debugreg(vcpu->arch.db[3], 3); 5637 get_debugreg(vcpu->arch.dr6, 6); 5638 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5639 5640 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5641 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5642 5643 /* 5644 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5645 * a stale dr6 from the guest. 
5646 */ 5647 set_debugreg(DR6_RESERVED, 6); 5648 } 5649 5650 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5651 { 5652 vmcs_writel(GUEST_DR7, val); 5653 } 5654 5655 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5656 { 5657 kvm_apic_update_ppr(vcpu); 5658 return 1; 5659 } 5660 5661 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5662 { 5663 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5664 5665 kvm_make_request(KVM_REQ_EVENT, vcpu); 5666 5667 ++vcpu->stat.irq_window_exits; 5668 return 1; 5669 } 5670 5671 static int handle_invlpg(struct kvm_vcpu *vcpu) 5672 { 5673 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5674 5675 kvm_mmu_invlpg(vcpu, exit_qualification); 5676 return kvm_skip_emulated_instruction(vcpu); 5677 } 5678 5679 static int handle_apic_access(struct kvm_vcpu *vcpu) 5680 { 5681 if (likely(fasteoi)) { 5682 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5683 int access_type, offset; 5684 5685 access_type = exit_qualification & APIC_ACCESS_TYPE; 5686 offset = exit_qualification & APIC_ACCESS_OFFSET; 5687 /* 5688 * Sane guest uses MOV to write EOI, with written value 5689 * not cared. So make a short-circuit here by avoiding 5690 * heavy instruction emulation. 
5691 */ 5692 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5693 (offset == APIC_EOI)) { 5694 kvm_lapic_set_eoi(vcpu); 5695 return kvm_skip_emulated_instruction(vcpu); 5696 } 5697 } 5698 return kvm_emulate_instruction(vcpu, 0); 5699 } 5700 5701 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5702 { 5703 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5704 int vector = exit_qualification & 0xff; 5705 5706 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5707 kvm_apic_set_eoi_accelerated(vcpu, vector); 5708 return 1; 5709 } 5710 5711 static int handle_apic_write(struct kvm_vcpu *vcpu) 5712 { 5713 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5714 5715 /* 5716 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5717 * hardware has done any necessary aliasing, offset adjustments, etc... 5718 * for the access. I.e. the correct value has already been written to 5719 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5720 * retrieve the register value and emulate the access. 
5721 */ 5722 u32 offset = exit_qualification & 0xff0; 5723 5724 kvm_apic_write_nodecode(vcpu, offset); 5725 return 1; 5726 } 5727 5728 static int handle_task_switch(struct kvm_vcpu *vcpu) 5729 { 5730 struct vcpu_vmx *vmx = to_vmx(vcpu); 5731 unsigned long exit_qualification; 5732 bool has_error_code = false; 5733 u32 error_code = 0; 5734 u16 tss_selector; 5735 int reason, type, idt_v, idt_index; 5736 5737 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5738 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5739 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5740 5741 exit_qualification = vmx_get_exit_qual(vcpu); 5742 5743 reason = (u32)exit_qualification >> 30; 5744 if (reason == TASK_SWITCH_GATE && idt_v) { 5745 switch (type) { 5746 case INTR_TYPE_NMI_INTR: 5747 vcpu->arch.nmi_injected = false; 5748 vmx_set_nmi_mask(vcpu, true); 5749 break; 5750 case INTR_TYPE_EXT_INTR: 5751 case INTR_TYPE_SOFT_INTR: 5752 kvm_clear_interrupt_queue(vcpu); 5753 break; 5754 case INTR_TYPE_HARD_EXCEPTION: 5755 if (vmx->idt_vectoring_info & 5756 VECTORING_INFO_DELIVER_CODE_MASK) { 5757 has_error_code = true; 5758 error_code = 5759 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5760 } 5761 fallthrough; 5762 case INTR_TYPE_SOFT_EXCEPTION: 5763 kvm_clear_exception_queue(vcpu); 5764 break; 5765 default: 5766 break; 5767 } 5768 } 5769 tss_selector = exit_qualification; 5770 5771 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5772 type != INTR_TYPE_EXT_INTR && 5773 type != INTR_TYPE_NMI_INTR)) 5774 WARN_ON(!skip_emulated_instruction(vcpu)); 5775 5776 /* 5777 * TODO: What about debug traps on tss switch? 5778 * Are we supposed to inject them and update dr6? 5779 */ 5780 return kvm_task_switch(vcpu, tss_selector, 5781 type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, 5782 reason, has_error_code, error_code); 5783 } 5784 5785 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5786 { 5787 unsigned long exit_qualification; 5788 gpa_t gpa; 5789 u64 error_code; 5790 5791 exit_qualification = vmx_get_exit_qual(vcpu); 5792 5793 /* 5794 * EPT violation happened while executing iret from NMI, 5795 * "blocked by NMI" bit has to be set before next VM entry. 5796 * There are errata that may cause this bit to not be set: 5797 * AAK134, BY25. 5798 */ 5799 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5800 enable_vnmi && 5801 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5802 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5803 5804 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5805 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5806 5807 /* Is it a read fault? */ 5808 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5809 ? PFERR_USER_MASK : 0; 5810 /* Is it a write fault? */ 5811 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5812 ? PFERR_WRITE_MASK : 0; 5813 /* Is it a fetch fault? */ 5814 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5815 ? PFERR_FETCH_MASK : 0; 5816 /* ept page table entry is present? */ 5817 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5818 ? PFERR_PRESENT_MASK : 0; 5819 5820 if (error_code & EPT_VIOLATION_GVA_IS_VALID) 5821 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 5822 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5823 5824 /* 5825 * Check that the GPA doesn't exceed physical memory limits, as that is 5826 * a guest page fault. We have to emulate the instruction here, because 5827 * if the illegal address is that of a paging structure, then 5828 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5829 * would also use advanced VM-exit information for EPT violations to 5830 * reconstruct the page fault error code. 
5831 */ 5832 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 5833 return kvm_emulate_instruction(vcpu, 0); 5834 5835 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5836 } 5837 5838 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5839 { 5840 gpa_t gpa; 5841 5842 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5843 return 1; 5844 5845 /* 5846 * A nested guest cannot optimize MMIO vmexits, because we have an 5847 * nGPA here instead of the required GPA. 5848 */ 5849 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5850 if (!is_guest_mode(vcpu) && 5851 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5852 trace_kvm_fast_mmio(gpa); 5853 return kvm_skip_emulated_instruction(vcpu); 5854 } 5855 5856 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5857 } 5858 5859 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5860 { 5861 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5862 return -EIO; 5863 5864 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5865 ++vcpu->stat.nmi_window_exits; 5866 kvm_make_request(KVM_REQ_EVENT, vcpu); 5867 5868 return 1; 5869 } 5870 5871 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) 5872 { 5873 struct vcpu_vmx *vmx = to_vmx(vcpu); 5874 5875 return vmx->emulation_required && !vmx->rmode.vm86_active && 5876 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5877 } 5878 5879 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5880 { 5881 struct vcpu_vmx *vmx = to_vmx(vcpu); 5882 bool intr_window_requested; 5883 unsigned count = 130; 5884 5885 intr_window_requested = exec_controls_get(vmx) & 5886 CPU_BASED_INTR_WINDOW_EXITING; 5887 5888 while (vmx->emulation_required && count-- != 0) { 5889 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5890 return handle_interrupt_window(&vmx->vcpu); 5891 5892 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5893 return 1; 5894 5895 if 
(!kvm_emulate_instruction(vcpu, 0)) 5896 return 0; 5897 5898 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5899 kvm_prepare_emulation_failure_exit(vcpu); 5900 return 0; 5901 } 5902 5903 if (vcpu->arch.halt_request) { 5904 vcpu->arch.halt_request = 0; 5905 return kvm_emulate_halt_noskip(vcpu); 5906 } 5907 5908 /* 5909 * Note, return 1 and not 0, vcpu_run() will invoke 5910 * xfer_to_guest_mode() which will create a proper return 5911 * code. 5912 */ 5913 if (__xfer_to_guest_mode_work_pending()) 5914 return 1; 5915 } 5916 5917 return 1; 5918 } 5919 5920 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5921 { 5922 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5923 kvm_prepare_emulation_failure_exit(vcpu); 5924 return 0; 5925 } 5926 5927 return 1; 5928 } 5929 5930 /* 5931 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5932 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5933 */ 5934 static int handle_pause(struct kvm_vcpu *vcpu) 5935 { 5936 if (!kvm_pause_in_guest(vcpu->kvm)) 5937 grow_ple_window(vcpu); 5938 5939 /* 5940 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5941 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5942 * never set PAUSE_EXITING and just set PLE if supported, 5943 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 
5944 */ 5945 kvm_vcpu_on_spin(vcpu, true); 5946 return kvm_skip_emulated_instruction(vcpu); 5947 } 5948 5949 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5950 { 5951 return 1; 5952 } 5953 5954 static int handle_invpcid(struct kvm_vcpu *vcpu) 5955 { 5956 u32 vmx_instruction_info; 5957 unsigned long type; 5958 gva_t gva; 5959 struct { 5960 u64 pcid; 5961 u64 gla; 5962 } operand; 5963 int gpr_index; 5964 5965 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 5966 kvm_queue_exception(vcpu, UD_VECTOR); 5967 return 1; 5968 } 5969 5970 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5971 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5972 type = kvm_register_read(vcpu, gpr_index); 5973 5974 /* According to the Intel instruction reference, the memory operand 5975 * is read even if it isn't needed (e.g., for type==all) 5976 */ 5977 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5978 vmx_instruction_info, false, 5979 sizeof(operand), &gva)) 5980 return 1; 5981 5982 return kvm_handle_invpcid(vcpu, type, gva); 5983 } 5984 5985 static int handle_pml_full(struct kvm_vcpu *vcpu) 5986 { 5987 unsigned long exit_qualification; 5988 5989 trace_kvm_pml_full(vcpu->vcpu_id); 5990 5991 exit_qualification = vmx_get_exit_qual(vcpu); 5992 5993 /* 5994 * PML buffer FULL happened while executing iret from NMI, 5995 * "blocked by NMI" bit has to be set before next VM entry. 5996 */ 5997 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5998 enable_vnmi && 5999 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 6000 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6001 GUEST_INTR_STATE_NMI); 6002 6003 /* 6004 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 6005 * here.., and there's no userspace involvement needed for PML. 
6006 */ 6007 return 1; 6008 } 6009 6010 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 6011 bool force_immediate_exit) 6012 { 6013 struct vcpu_vmx *vmx = to_vmx(vcpu); 6014 6015 /* 6016 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 6017 * due to the timer expiring while it was "soft" disabled, just eat the 6018 * exit and re-enter the guest. 6019 */ 6020 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 6021 return EXIT_FASTPATH_REENTER_GUEST; 6022 6023 /* 6024 * If the timer expired because KVM used it to force an immediate exit, 6025 * then mission accomplished. 6026 */ 6027 if (force_immediate_exit) 6028 return EXIT_FASTPATH_EXIT_HANDLED; 6029 6030 /* 6031 * If L2 is active, go down the slow path as emulating the guest timer 6032 * expiration likely requires synthesizing a nested VM-Exit. 6033 */ 6034 if (is_guest_mode(vcpu)) 6035 return EXIT_FASTPATH_NONE; 6036 6037 kvm_lapic_expired_hv_timer(vcpu); 6038 return EXIT_FASTPATH_REENTER_GUEST; 6039 } 6040 6041 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6042 { 6043 /* 6044 * This non-fastpath handler is reached if and only if the preemption 6045 * timer was being used to emulate a guest timer while L2 is active. 6046 * All other scenarios are supposed to be handled in the fastpath. 6047 */ 6048 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6049 kvm_lapic_expired_hv_timer(vcpu); 6050 return 1; 6051 } 6052 6053 /* 6054 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6055 * are overwritten by nested_vmx_setup() when nested=1. 6056 */ 6057 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6058 { 6059 kvm_queue_exception(vcpu, UD_VECTOR); 6060 return 1; 6061 } 6062 6063 #ifndef CONFIG_X86_SGX_KVM 6064 static int handle_encls(struct kvm_vcpu *vcpu) 6065 { 6066 /* 6067 * SGX virtualization is disabled. 
There is no software enable bit for 6068 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6069 * the guest from executing ENCLS (when SGX is supported by hardware). 6070 */ 6071 kvm_queue_exception(vcpu, UD_VECTOR); 6072 return 1; 6073 } 6074 #endif /* CONFIG_X86_SGX_KVM */ 6075 6076 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6077 { 6078 /* 6079 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6080 * VM-Exits. Unconditionally set the flag here and leave the handling to 6081 * vmx_handle_exit(). 6082 */ 6083 to_vmx(vcpu)->exit_reason.bus_lock_detected = true; 6084 return 1; 6085 } 6086 6087 static int handle_notify(struct kvm_vcpu *vcpu) 6088 { 6089 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6090 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6091 6092 ++vcpu->stat.notify_window_exits; 6093 6094 /* 6095 * Notify VM exit happened while executing iret from NMI, 6096 * "blocked by NMI" bit has to be set before next VM entry. 6097 */ 6098 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6099 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6100 GUEST_INTR_STATE_NMI); 6101 6102 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6103 context_invalid) { 6104 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6105 vcpu->run->notify.flags = context_invalid ? 6106 KVM_NOTIFY_CONTEXT_INVALID : 0; 6107 return 0; 6108 } 6109 6110 return 1; 6111 } 6112 6113 /* 6114 * The exit handlers return 1 if the exit was handled fully and guest execution 6115 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6116 * to be done to userspace and return 0. 
6117 */ 6118 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6119 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6120 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6121 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6122 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6123 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6124 [EXIT_REASON_CR_ACCESS] = handle_cr, 6125 [EXIT_REASON_DR_ACCESS] = handle_dr, 6126 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6127 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6128 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6129 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6130 [EXIT_REASON_HLT] = kvm_emulate_halt, 6131 [EXIT_REASON_INVD] = kvm_emulate_invd, 6132 [EXIT_REASON_INVLPG] = handle_invlpg, 6133 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6134 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6135 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6136 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6137 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6138 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6139 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6140 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6141 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6142 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6143 [EXIT_REASON_VMON] = handle_vmx_instruction, 6144 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6145 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6146 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6147 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6148 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6149 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6150 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6151 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6152 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6153 [EXIT_REASON_LDTR_TR] = handle_desc, 6154 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6155 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6156 
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6157 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6158 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6159 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6160 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6161 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6162 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6163 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6164 [EXIT_REASON_PML_FULL] = handle_pml_full, 6165 [EXIT_REASON_INVPCID] = handle_invpcid, 6166 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6167 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6168 [EXIT_REASON_ENCLS] = handle_encls, 6169 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6170 [EXIT_REASON_NOTIFY] = handle_notify, 6171 }; 6172 6173 static const int kvm_vmx_max_exit_handlers = 6174 ARRAY_SIZE(kvm_vmx_exit_handlers); 6175 6176 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6177 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6178 { 6179 struct vcpu_vmx *vmx = to_vmx(vcpu); 6180 6181 *reason = vmx->exit_reason.full; 6182 *info1 = vmx_get_exit_qual(vcpu); 6183 if (!(vmx->exit_reason.failed_vmentry)) { 6184 *info2 = vmx->idt_vectoring_info; 6185 *intr_info = vmx_get_intr_info(vcpu); 6186 if (is_exception_with_error_code(*intr_info)) 6187 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6188 else 6189 *error_code = 0; 6190 } else { 6191 *info2 = 0; 6192 *intr_info = 0; 6193 *error_code = 0; 6194 } 6195 } 6196 6197 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6198 { 6199 if (vmx->pml_pg) { 6200 __free_page(vmx->pml_pg); 6201 vmx->pml_pg = NULL; 6202 } 6203 } 6204 6205 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6206 { 6207 struct vcpu_vmx *vmx = to_vmx(vcpu); 6208 u64 *pml_buf; 6209 u16 pml_idx; 6210 6211 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6212 6213 /* Do nothing if PML buffer is empty */ 6214 if (pml_idx == (PML_ENTITY_NUM - 1)) 6215 return; 6216 6217 /* PML index 
always points to next available PML buffer entity */ 6218 if (pml_idx >= PML_ENTITY_NUM) 6219 pml_idx = 0; 6220 else 6221 pml_idx++; 6222 6223 pml_buf = page_address(vmx->pml_pg); 6224 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 6225 u64 gpa; 6226 6227 gpa = pml_buf[pml_idx]; 6228 WARN_ON(gpa & (PAGE_SIZE - 1)); 6229 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6230 } 6231 6232 /* reset PML index */ 6233 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 6234 } 6235 6236 static void vmx_dump_sel(char *name, uint32_t sel) 6237 { 6238 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6239 name, vmcs_read16(sel), 6240 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6241 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6242 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6243 } 6244 6245 static void vmx_dump_dtsel(char *name, uint32_t limit) 6246 { 6247 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6248 name, vmcs_read32(limit), 6249 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6250 } 6251 6252 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6253 { 6254 unsigned int i; 6255 struct vmx_msr_entry *e; 6256 6257 pr_err("MSR %s:\n", name); 6258 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6259 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6260 } 6261 6262 void dump_vmcs(struct kvm_vcpu *vcpu) 6263 { 6264 struct vcpu_vmx *vmx = to_vmx(vcpu); 6265 u32 vmentry_ctl, vmexit_ctl; 6266 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6267 u64 tertiary_exec_control; 6268 unsigned long cr4; 6269 int efer_slot; 6270 6271 if (!dump_invalid_vmcs) { 6272 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6273 return; 6274 } 6275 6276 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6277 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6278 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6279 pin_based_exec_ctrl = 
vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	cr4 = vmcs_readl(GUEST_CR4);

	/* Optional control fields are only read when the CPU has them. */
	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	else
		secondary_exec_control = 0;

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
	else
		tertiary_exec_control = 0;

	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	if (cpu_has_vmx_ept()) {
		pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
	/*
	 * Report guest EFER from, in order of preference: the VMCS field (if
	 * it is loaded on VM-Entry), the MSR autoload list, or the vCPU's
	 * software copy with LMA/LME forced to match the IA-32e entry control.
	 */
	efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
	else if (efer_slot >= 0)
		pr_err("EFER= 0x%016llx (autoload)\n",
		       vmx->msr_autoload.guest.val[efer_slot].value);
	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer | (EFER_LMA | EFER_LME));
	else
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));
	/* MSR lists are only dumped when the VMCS says they are non-empty. */
	if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
		vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);

	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);

	pr_err("*** Control State ***\n");
	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
	       cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
	pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
	       pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
	       vmcs_read32(EXCEPTION_BITMAP),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_EXIT_INTR_INFO),
	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
	pr_err(" reason=%08x qualification=%016lx\n",
	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
		pr_err("TSC Multiplier = 0x%016llx\n",
		       vmcs_read64(TSC_MULTIPLIER));
	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
		/*
		 * The pr_cont() calls below continue whichever pr_err() line
		 * was started just before them (when its condition held).
		 */
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
			u16 status = vmcs_read16(GUEST_INTR_STATUS);
			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
		}
		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
	}
	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct vmx_ve_information *ve_info = vmx->ve_info;
		u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);

		/*
		 * If KVM is dumping the VMCS, then something has gone wrong
		 * already.  Dereferencing an address from the VMCS, which could
		 * very well be corrupted, is a terrible idea.  The virtual
		 * address is known so use it.
		 */
		pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
		       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
		       ve_info->exit_reason, ve_info->delivery,
		       ve_info->exit_qualification,
		       ve_info->guest_linear_address,
		       ve_info->guest_physical_address, ve_info->eptp_index);
	}
}

/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->exit_reason;
	u32 vectoring_info = vmx->idt_vectoring_info;
	u16 exit_handler_index;

	/*
	 * Flush the GPAs logged in the PML buffer so that dirty_bitmap is as
	 * up to date as possible.  A further benefit is that
	 * kvm_vm_ioctl_get_dirty_log only needs to kick all vCPUs out of guest
	 * mode before querying dirty_bitmap: if a vCPU is in root mode, its
	 * PML buffer must already have been flushed.  Note, PML is never
	 * enabled in hardware while running L2.
	 */
	if (enable_pml && !is_guest_mode(vcpu))
		vmx_flush_pml_buffer(vcpu);

	/*
	 * KVM should never reach this point with a pending nested VM-Enter.
	 * More specifically, short-circuiting VM-Entry to emulate L2 due to
	 * invalid guest state should never happen as that means KVM knowingly
	 * allowed a nested VM-Enter with an invalid vmcs12.  More below.
	 */
	if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
		return -EIO;

	if (is_guest_mode(vcpu)) {
		/*
		 * PML is never enabled when running L2, bail immediately if a
		 * PML full exit occurs as something is horribly wrong.
*/
		if (exit_reason.basic == EXIT_REASON_PML_FULL)
			goto unexpected_vmexit;

		/*
		 * The host physical addresses of some pages of guest memory
		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
		 * Page).  The CPU may write to these pages via their host
		 * physical address while L2 is running, bypassing any
		 * address-translation-based dirty tracking (e.g. EPT write
		 * protection).
		 *
		 * Mark them dirty on every exit from L2 to prevent them from
		 * getting out of sync with dirty tracking.
		 */
		nested_mark_vmcs12_pages_dirty(vcpu);

		/*
		 * Synthesize a triple fault if L2 state is invalid.  In normal
		 * operation, nested VM-Enter rejects any attempt to enter L2
		 * with invalid state.  However, those checks are skipped if
		 * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
		 * L2 state is invalid, it means either L1 modified SMRAM state
		 * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
		 * doing so is architecturally allowed in the RSM case, and is
		 * the least awful solution for the userspace case without
		 * risking false positives.
		 */
		if (vmx->emulation_required) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
			return 1;
		}

		if (nested_vmx_reflect_vmexit(vcpu))
			return 1;
	}

	/* If guest state is invalid, start emulating.  L2 is handled above. */
	if (vmx->emulation_required)
		return handle_invalid_guest_state(vcpu);

	/* VM-Entry failed with an exit reason: dump state, report to userspace. */
	if (exit_reason.failed_vmentry) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/* vmx->fail is set: report the VM-instruction error to userspace. */
	if (unlikely(vmx->fail)) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * An exit that interrupted event delivery is only handled in the
	 * kernel for the exit reasons listed here; anything else is reported
	 * to userspace as an event-vectoring exit.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
	     exit_reason.basic != EXIT_REASON_PML_FULL &&
	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
	     exit_reason.basic != EXIT_REASON_NOTIFY &&
	     exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
		kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (!vmx_interrupt_blocked(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't support us in finding the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled.  So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	/* A fastpath handler already dealt with this exit. */
	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
		goto unexpected_vmexit;
#ifdef CONFIG_MITIGATION_RETPOLINE
	/* Call these handlers directly instead of through the table. */
	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
		return kvm_emulate_wrmsr(vcpu);
	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
		return handle_preemption_timer(vcpu);
	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
		return handle_interrupt_window(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
		return handle_external_interrupt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_HLT)
		return kvm_emulate_halt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
		return handle_ept_misconfig(vcpu);
#endif

	/* Clamp the index under speculation before using it on the table. */
	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
						kvm_vmx_max_exit_handlers);
	if (!kvm_vmx_exit_handlers[exit_handler_index])
		goto unexpected_vmexit;

	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

unexpected_vmexit:
	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
		    exit_reason.full);
	dump_vmcs(vcpu);
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
	vcpu->run->internal.ndata = 2;
	vcpu->run->internal.data[0] = exit_reason.full;
	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
	return 0;
}

/*
 * Wrapper around __vmx_handle_exit() that additionally reports a detected bus
 * lock to userspace.  Returns 0 whenever the bus-lock flag is set, otherwise
 * whatever __vmx_handle_exit() returned.
 */
int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	int ret = __vmx_handle_exit(vcpu, exit_fastpath);

	/*
	 * Exit to user space when bus lock detected to inform that there is
	 * a bus lock in guest.
	 */
	if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
		/* Only overwrite the exit reason if the handler did not set one. */
		if (ret > 0)
			vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

		vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
		return 0;
	}
	return ret;
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU.  This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		bool flush_l1d;

		/*
		 * Clear the per-vcpu flush bit, it gets set again if the vCPU
		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
		 * exits to userspace, or if KVM reaches one of the unsafe
		 * VMEXIT handlers, e.g. if KVM calls into the emulator.
		 */
		flush_l1d = vcpu->arch.l1tf_flush_l1d;
		vcpu->arch.l1tf_flush_l1d = false;

		/*
		 * Clear the per-cpu flush bit, it gets set again from
		 * the interrupt handlers.
6671 */ 6672 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6673 kvm_clear_cpu_l1tf_flush_l1d(); 6674 6675 if (!flush_l1d) 6676 return; 6677 } 6678 6679 vcpu->stat.l1d_flush++; 6680 6681 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6682 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6683 return; 6684 } 6685 6686 asm volatile( 6687 /* First ensure the pages are in the TLB */ 6688 "xorl %%eax, %%eax\n" 6689 ".Lpopulate_tlb:\n\t" 6690 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6691 "addl $4096, %%eax\n\t" 6692 "cmpl %%eax, %[size]\n\t" 6693 "jne .Lpopulate_tlb\n\t" 6694 "xorl %%eax, %%eax\n\t" 6695 "cpuid\n\t" 6696 /* Now fill the cache */ 6697 "xorl %%eax, %%eax\n" 6698 ".Lfill_cache:\n" 6699 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6700 "addl $64, %%eax\n\t" 6701 "cmpl %%eax, %[size]\n\t" 6702 "jne .Lfill_cache\n\t" 6703 "lfence\n" 6704 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6705 [size] "r" (size) 6706 : "eax", "ebx", "ecx", "edx"); 6707 } 6708 6709 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6710 { 6711 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6712 int tpr_threshold; 6713 6714 if (is_guest_mode(vcpu) && 6715 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6716 return; 6717 6718 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6719 if (is_guest_mode(vcpu)) 6720 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6721 else 6722 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6723 } 6724 6725 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6726 { 6727 struct vcpu_vmx *vmx = to_vmx(vcpu); 6728 u32 sec_exec_control; 6729 6730 if (!lapic_in_kernel(vcpu)) 6731 return; 6732 6733 if (!flexpriority_enabled && 6734 !cpu_has_vmx_virtualize_x2apic_mode()) 6735 return; 6736 6737 /* Postpone execution until vmcs01 is the current VMCS. 
*/ 6738 if (is_guest_mode(vcpu)) { 6739 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6740 return; 6741 } 6742 6743 sec_exec_control = secondary_exec_controls_get(vmx); 6744 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6745 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6746 6747 switch (kvm_get_apic_mode(vcpu)) { 6748 case LAPIC_MODE_INVALID: 6749 WARN_ONCE(true, "Invalid local APIC state"); 6750 break; 6751 case LAPIC_MODE_DISABLED: 6752 break; 6753 case LAPIC_MODE_XAPIC: 6754 if (flexpriority_enabled) { 6755 sec_exec_control |= 6756 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6757 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6758 6759 /* 6760 * Flush the TLB, reloading the APIC access page will 6761 * only do so if its physical address has changed, but 6762 * the guest may have inserted a non-APIC mapping into 6763 * the TLB while the APIC access page was disabled. 6764 */ 6765 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6766 } 6767 break; 6768 case LAPIC_MODE_X2APIC: 6769 if (cpu_has_vmx_virtualize_x2apic_mode()) 6770 sec_exec_control |= 6771 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6772 break; 6773 } 6774 secondary_exec_controls_set(vmx, sec_exec_control); 6775 6776 vmx_update_msr_bitmap_x2apic(vcpu); 6777 } 6778 6779 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6780 { 6781 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6782 struct kvm *kvm = vcpu->kvm; 6783 struct kvm_memslots *slots = kvm_memslots(kvm); 6784 struct kvm_memory_slot *slot; 6785 struct page *refcounted_page; 6786 unsigned long mmu_seq; 6787 kvm_pfn_t pfn; 6788 bool writable; 6789 6790 /* Defer reload until vmcs01 is the current VMCS. 
*/ 6791 if (is_guest_mode(vcpu)) { 6792 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6793 return; 6794 } 6795 6796 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6797 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6798 return; 6799 6800 /* 6801 * Explicitly grab the memslot using KVM's internal slot ID to ensure 6802 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 6803 * be impossible for userspace to create a memslot for the APIC when 6804 * APICv is enabled, but paranoia won't hurt in this case. 6805 */ 6806 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6807 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6808 return; 6809 6810 /* 6811 * Ensure that the mmu_notifier sequence count is read before KVM 6812 * retrieves the pfn from the primary MMU. Note, the memslot is 6813 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6814 * in kvm_mmu_invalidate_end(). 6815 */ 6816 mmu_seq = kvm->mmu_invalidate_seq; 6817 smp_rmb(); 6818 6819 /* 6820 * No need to retry if the memslot does not exist or is invalid. KVM 6821 * controls the APIC-access page memslot, and only deletes the memslot 6822 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6823 */ 6824 pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); 6825 if (is_error_noslot_pfn(pfn)) 6826 return; 6827 6828 read_lock(&vcpu->kvm->mmu_lock); 6829 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) 6830 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6831 else 6832 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6833 6834 /* 6835 * Do not pin the APIC access page in memory so that it can be freely 6836 * migrated, the MMU notifier will call us again if it is migrated or 6837 * swapped out. KVM backs the memslot with anonymous memory, the pfn 6838 * should always point at a refcounted page (if the pfn is valid). 
6839 */ 6840 if (!WARN_ON_ONCE(!refcounted_page)) 6841 kvm_release_page_clean(refcounted_page); 6842 6843 /* 6844 * No need for a manual TLB flush at this point, KVM has already done a 6845 * flush if there were SPTEs pointing at the previous page. 6846 */ 6847 read_unlock(&vcpu->kvm->mmu_lock); 6848 } 6849 6850 void vmx_hwapic_isr_update(int max_isr) 6851 { 6852 u16 status; 6853 u8 old; 6854 6855 if (max_isr == -1) 6856 max_isr = 0; 6857 6858 status = vmcs_read16(GUEST_INTR_STATUS); 6859 old = status >> 8; 6860 if (max_isr != old) { 6861 status &= 0xff; 6862 status |= max_isr << 8; 6863 vmcs_write16(GUEST_INTR_STATUS, status); 6864 } 6865 } 6866 6867 static void vmx_set_rvi(int vector) 6868 { 6869 u16 status; 6870 u8 old; 6871 6872 if (vector == -1) 6873 vector = 0; 6874 6875 status = vmcs_read16(GUEST_INTR_STATUS); 6876 old = (u8)status & 0xff; 6877 if ((u8)vector != old) { 6878 status &= ~0xff; 6879 status |= (u8)vector; 6880 vmcs_write16(GUEST_INTR_STATUS, status); 6881 } 6882 } 6883 6884 void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6885 { 6886 /* 6887 * When running L2, updating RVI is only relevant when 6888 * vmcs12 virtual-interrupt-delivery enabled. 6889 * However, it can be enabled only when L1 also 6890 * intercepts external-interrupts and in that case 6891 * we should not update vmcs02 RVI but instead intercept 6892 * interrupt. Therefore, do nothing when running L2. 6893 */ 6894 if (!is_guest_mode(vcpu)) 6895 vmx_set_rvi(max_irr); 6896 } 6897 6898 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6899 { 6900 struct vcpu_vmx *vmx = to_vmx(vcpu); 6901 int max_irr; 6902 bool got_posted_interrupt; 6903 6904 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6905 return -EIO; 6906 6907 if (pi_test_on(&vmx->pi_desc)) { 6908 pi_clear_on(&vmx->pi_desc); 6909 /* 6910 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6911 * But on x86 this is just a compiler barrier anyway. 
6912 */ 6913 smp_mb__after_atomic(); 6914 got_posted_interrupt = 6915 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6916 } else { 6917 max_irr = kvm_lapic_find_highest_irr(vcpu); 6918 got_posted_interrupt = false; 6919 } 6920 6921 /* 6922 * Newly recognized interrupts are injected via either virtual interrupt 6923 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6924 * disabled in two cases: 6925 * 6926 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6927 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6928 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected 6929 * into L2, but KVM doesn't use virtual interrupt delivery to inject 6930 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 6931 * 6932 * 2) If APICv is disabled for this vCPU, assigned devices may still 6933 * attempt to post interrupts. The posted interrupt vector will cause 6934 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 
6935 */ 6936 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6937 vmx_set_rvi(max_irr); 6938 else if (got_posted_interrupt) 6939 kvm_make_request(KVM_REQ_EVENT, vcpu); 6940 6941 return max_irr; 6942 } 6943 6944 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6945 { 6946 if (!kvm_vcpu_apicv_active(vcpu)) 6947 return; 6948 6949 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6950 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6951 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6952 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6953 } 6954 6955 void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) 6956 { 6957 struct vcpu_vmx *vmx = to_vmx(vcpu); 6958 6959 pi_clear_on(&vmx->pi_desc); 6960 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6961 } 6962 6963 void vmx_do_interrupt_irqoff(unsigned long entry); 6964 void vmx_do_nmi_irqoff(void); 6965 6966 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6967 { 6968 /* 6969 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6970 * MSR value is not clobbered by the host activity before the guest 6971 * has chance to consume it. 6972 * 6973 * Do not blindly read xfd_err here, since this exception might 6974 * be caused by L1 interception on a platform which doesn't 6975 * support xfd at all. 6976 * 6977 * Do it conditionally upon guest_fpu::xfd. xfd_err matters 6978 * only when xfd contains a non-zero value. 6979 * 6980 * Queuing exception is done in vmx_handle_exit. See comment there. 
6981 */ 6982 if (vcpu->arch.guest_fpu.fpstate->xfd) 6983 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6984 } 6985 6986 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 6987 { 6988 /* if exit due to PF check for async PF */ 6989 if (is_page_fault(intr_info)) 6990 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6991 /* if exit due to NM, handle before interrupts are enabled */ 6992 else if (is_nm_fault(intr_info)) 6993 handle_nm_fault_irqoff(vcpu); 6994 /* Handle machine checks before interrupts are enabled */ 6995 else if (is_machine_check(intr_info)) 6996 kvm_machine_check(); 6997 } 6998 6999 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 7000 u32 intr_info) 7001 { 7002 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 7003 7004 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 7005 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 7006 return; 7007 7008 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7009 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7010 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7011 else 7012 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7013 kvm_after_interrupt(vcpu); 7014 7015 vcpu->arch.at_instruction_boundary = true; 7016 } 7017 7018 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7019 { 7020 struct vcpu_vmx *vmx = to_vmx(vcpu); 7021 7022 if (vmx->emulation_required) 7023 return; 7024 7025 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7026 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7027 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) 7028 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7029 } 7030 7031 /* 7032 * The kvm parameter can be NULL (module initialization, or invocation before 7033 * VM creation). Be sure to check the kvm parameter before using it. 
7034 */ 7035 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7036 { 7037 switch (index) { 7038 case MSR_IA32_SMBASE: 7039 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7040 return false; 7041 /* 7042 * We cannot do SMM unless we can run the guest in big 7043 * real mode. 7044 */ 7045 return enable_unrestricted_guest || emulate_invalid_guest_state; 7046 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7047 return nested; 7048 case MSR_AMD64_VIRT_SPEC_CTRL: 7049 case MSR_AMD64_TSC_RATIO: 7050 /* This is AMD only. */ 7051 return false; 7052 default: 7053 return true; 7054 } 7055 } 7056 7057 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7058 { 7059 u32 exit_intr_info; 7060 bool unblock_nmi; 7061 u8 vector; 7062 bool idtv_info_valid; 7063 7064 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7065 7066 if (enable_vnmi) { 7067 if (vmx->loaded_vmcs->nmi_known_unmasked) 7068 return; 7069 7070 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7071 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7072 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7073 /* 7074 * SDM 3: 27.7.1.2 (September 2008) 7075 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7076 * a guest IRET fault. 7077 * SDM 3: 23.2.2 (September 2008) 7078 * Bit 12 is undefined in any of the following cases: 7079 * If the VM exit sets the valid bit in the IDT-vectoring 7080 * information field. 7081 * If the VM exit is due to a double fault. 
7082 */ 7083 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7084 vector != DF_VECTOR && !idtv_info_valid) 7085 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7086 GUEST_INTR_STATE_NMI); 7087 else 7088 vmx->loaded_vmcs->nmi_known_unmasked = 7089 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7090 & GUEST_INTR_STATE_NMI); 7091 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7092 vmx->loaded_vmcs->vnmi_blocked_time += 7093 ktime_to_ns(ktime_sub(ktime_get(), 7094 vmx->loaded_vmcs->entry_time)); 7095 } 7096 7097 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7098 u32 idt_vectoring_info, 7099 int instr_len_field, 7100 int error_code_field) 7101 { 7102 u8 vector; 7103 int type; 7104 bool idtv_info_valid; 7105 7106 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7107 7108 vcpu->arch.nmi_injected = false; 7109 kvm_clear_exception_queue(vcpu); 7110 kvm_clear_interrupt_queue(vcpu); 7111 7112 if (!idtv_info_valid) 7113 return; 7114 7115 kvm_make_request(KVM_REQ_EVENT, vcpu); 7116 7117 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7118 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7119 7120 switch (type) { 7121 case INTR_TYPE_NMI_INTR: 7122 vcpu->arch.nmi_injected = true; 7123 /* 7124 * SDM 3: 27.7.1.2 (September 2008) 7125 * Clear bit "block by NMI" before VM entry if a NMI 7126 * delivery faulted. 
7127 */ 7128 vmx_set_nmi_mask(vcpu, false); 7129 break; 7130 case INTR_TYPE_SOFT_EXCEPTION: 7131 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7132 fallthrough; 7133 case INTR_TYPE_HARD_EXCEPTION: 7134 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 7135 u32 err = vmcs_read32(error_code_field); 7136 kvm_requeue_exception_e(vcpu, vector, err); 7137 } else 7138 kvm_requeue_exception(vcpu, vector); 7139 break; 7140 case INTR_TYPE_SOFT_INTR: 7141 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7142 fallthrough; 7143 case INTR_TYPE_EXT_INTR: 7144 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7145 break; 7146 default: 7147 break; 7148 } 7149 } 7150 7151 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7152 { 7153 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7154 VM_EXIT_INSTRUCTION_LEN, 7155 IDT_VECTORING_ERROR_CODE); 7156 } 7157 7158 void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7159 { 7160 __vmx_complete_interrupts(vcpu, 7161 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7162 VM_ENTRY_INSTRUCTION_LEN, 7163 VM_ENTRY_EXCEPTION_ERROR_CODE); 7164 7165 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7166 } 7167 7168 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7169 { 7170 int i, nr_msrs; 7171 struct perf_guest_switch_msr *msrs; 7172 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7173 7174 pmu->host_cross_mapped_mask = 0; 7175 if (pmu->pebs_enable & pmu->global_ctrl) 7176 intel_pmu_cross_mapped_check(pmu); 7177 7178 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
*/ 7179 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7180 if (!msrs) 7181 return; 7182 7183 for (i = 0; i < nr_msrs; i++) 7184 if (msrs[i].host == msrs[i].guest) 7185 clear_atomic_switch_msr(vmx, msrs[i].msr); 7186 else 7187 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7188 msrs[i].host, false); 7189 } 7190 7191 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7192 { 7193 struct vcpu_vmx *vmx = to_vmx(vcpu); 7194 u64 tscl; 7195 u32 delta_tsc; 7196 7197 if (force_immediate_exit) { 7198 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7199 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7200 } else if (vmx->hv_deadline_tsc != -1) { 7201 tscl = rdtsc(); 7202 if (vmx->hv_deadline_tsc > tscl) 7203 /* set_hv_timer ensures the delta fits in 32-bits */ 7204 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7205 cpu_preemption_timer_multi); 7206 else 7207 delta_tsc = 0; 7208 7209 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7210 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7211 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7212 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7213 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7214 } 7215 } 7216 7217 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7218 { 7219 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7220 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7221 vmcs_writel(HOST_RSP, host_rsp); 7222 } 7223 } 7224 7225 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7226 unsigned int flags) 7227 { 7228 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7229 7230 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7231 return; 7232 7233 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7234 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); 7235 7236 /* 7237 * If the guest/host SPEC_CTRL values differ, restore the host value. 
7238 * 7239 * For legacy IBRS, the IBRS bit always needs to be written after 7240 * transitioning from a less privileged predictor mode, regardless of 7241 * whether the guest/host values differ. 7242 */ 7243 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7244 vmx->spec_ctrl != hostval) 7245 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); 7246 7247 barrier_nospec(); 7248 } 7249 7250 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, 7251 bool force_immediate_exit) 7252 { 7253 /* 7254 * If L2 is active, some VMX preemption timer exits can be handled in 7255 * the fastpath even, all other exits must use the slow path. 7256 */ 7257 if (is_guest_mode(vcpu) && 7258 to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) 7259 return EXIT_FASTPATH_NONE; 7260 7261 switch (to_vmx(vcpu)->exit_reason.basic) { 7262 case EXIT_REASON_MSR_WRITE: 7263 return handle_fastpath_set_msr_irqoff(vcpu); 7264 case EXIT_REASON_PREEMPTION_TIMER: 7265 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7266 case EXIT_REASON_HLT: 7267 return handle_fastpath_hlt(vcpu); 7268 default: 7269 return EXIT_FASTPATH_NONE; 7270 } 7271 } 7272 7273 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7274 unsigned int flags) 7275 { 7276 struct vcpu_vmx *vmx = to_vmx(vcpu); 7277 7278 guest_state_enter_irqoff(); 7279 7280 /* 7281 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7282 * mitigation for MDS is done late in VMentry and is still 7283 * executed in spite of L1D Flush. This is because an extra VERW 7284 * should not matter much after the big hammer L1D Flush. 
7285 */ 7286 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7287 vmx_l1d_flush(vcpu); 7288 else if (static_branch_unlikely(&mmio_stale_data_clear) && 7289 kvm_arch_has_assigned_device(vcpu->kvm)) 7290 mds_clear_cpu_buffers(); 7291 7292 vmx_disable_fb_clear(vmx); 7293 7294 if (vcpu->arch.cr2 != native_read_cr2()) 7295 native_write_cr2(vcpu->arch.cr2); 7296 7297 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7298 flags); 7299 7300 vcpu->arch.cr2 = native_read_cr2(); 7301 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7302 7303 vmx->idt_vectoring_info = 0; 7304 7305 vmx_enable_fb_clear(vmx); 7306 7307 if (unlikely(vmx->fail)) { 7308 vmx->exit_reason.full = 0xdead; 7309 goto out; 7310 } 7311 7312 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7313 if (likely(!vmx->exit_reason.failed_vmentry)) 7314 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7315 7316 if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && 7317 is_nmi(vmx_get_intr_info(vcpu))) { 7318 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 7319 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7320 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); 7321 else 7322 vmx_do_nmi_irqoff(); 7323 kvm_after_interrupt(vcpu); 7324 } 7325 7326 out: 7327 guest_state_exit_irqoff(); 7328 } 7329 7330 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7331 { 7332 struct vcpu_vmx *vmx = to_vmx(vcpu); 7333 unsigned long cr3, cr4; 7334 7335 /* Record the guest's net vcpu time for enforced NMI injections. */ 7336 if (unlikely(!enable_vnmi && 7337 vmx->loaded_vmcs->soft_vnmi_blocked)) 7338 vmx->loaded_vmcs->entry_time = ktime_get(); 7339 7340 /* 7341 * Don't enter VMX if guest state is invalid, let the exit handler 7342 * start emulation until we arrive back to a valid state. Synthesize a 7343 * consistency check VM-Exit due to invalid guest state and bail. 
7344 */ 7345 if (unlikely(vmx->emulation_required)) { 7346 vmx->fail = 0; 7347 7348 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 7349 vmx->exit_reason.failed_vmentry = 1; 7350 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7351 vmx->exit_qualification = ENTRY_FAIL_DEFAULT; 7352 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7353 vmx->exit_intr_info = 0; 7354 return EXIT_FASTPATH_NONE; 7355 } 7356 7357 trace_kvm_entry(vcpu, force_immediate_exit); 7358 7359 if (vmx->ple_window_dirty) { 7360 vmx->ple_window_dirty = false; 7361 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7362 } 7363 7364 /* 7365 * We did this in prepare_switch_to_guest, because it needs to 7366 * be within srcu_read_lock. 7367 */ 7368 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7369 7370 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7371 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7372 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7373 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7374 vcpu->arch.regs_dirty = 0; 7375 7376 /* 7377 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7378 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7379 * it switches back to the current->mm, which can occur in KVM context 7380 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7381 * toggles a static key while handling a VM-Exit. 7382 */ 7383 cr3 = __get_current_cr3_fast(); 7384 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7385 vmcs_writel(HOST_CR3, cr3); 7386 vmx->loaded_vmcs->host_state.cr3 = cr3; 7387 } 7388 7389 cr4 = cr4_read_shadow(); 7390 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7391 vmcs_writel(HOST_CR4, cr4); 7392 vmx->loaded_vmcs->host_state.cr4 = cr4; 7393 } 7394 7395 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. 
*/ 7396 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 7397 set_debugreg(vcpu->arch.dr6, 6); 7398 7399 /* When single-stepping over STI and MOV SS, we must clear the 7400 * corresponding interruptibility bits in the guest state. Otherwise 7401 * vmentry fails as it then expects bit 14 (BS) in pending debug 7402 * exceptions being set, but that's not correct for the guest debugging 7403 * case. */ 7404 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7405 vmx_set_interrupt_shadow(vcpu, 0); 7406 7407 kvm_load_guest_xsave_state(vcpu); 7408 7409 pt_guest_enter(vmx); 7410 7411 atomic_switch_perf_msrs(vmx); 7412 if (intel_pmu_lbr_is_enabled(vcpu)) 7413 vmx_passthrough_lbr_msrs(vcpu); 7414 7415 if (enable_preemption_timer) 7416 vmx_update_hv_timer(vcpu, force_immediate_exit); 7417 else if (force_immediate_exit) 7418 smp_send_reschedule(vcpu->cpu); 7419 7420 kvm_wait_lapic_expire(vcpu); 7421 7422 /* The actual VMENTER/EXIT is in the .noinstr.text section. */ 7423 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); 7424 7425 /* All fields are clean at this point */ 7426 if (kvm_is_using_evmcs()) { 7427 current_evmcs->hv_clean_fields |= 7428 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 7429 7430 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); 7431 } 7432 7433 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7434 if (vmx->host_debugctlmsr) 7435 update_debugctlmsr(vmx->host_debugctlmsr); 7436 7437 #ifndef CONFIG_X86_64 7438 /* 7439 * The sysexit path does not restore ds/es, so we must set them to 7440 * a reasonable value ourselves. 7441 * 7442 * We can't defer this to vmx_prepare_switch_to_host() since that 7443 * function may be executed in interrupt context, which saves and 7444 * restore segments around it, nullifying its effect. 
7445 */ 7446 loadsegment(ds, __USER_DS); 7447 loadsegment(es, __USER_DS); 7448 #endif 7449 7450 pt_guest_exit(vmx); 7451 7452 kvm_load_host_xsave_state(vcpu); 7453 7454 if (is_guest_mode(vcpu)) { 7455 /* 7456 * Track VMLAUNCH/VMRESUME that have made past guest state 7457 * checking. 7458 */ 7459 if (vmx->nested.nested_run_pending && 7460 !vmx->exit_reason.failed_vmentry) 7461 ++vcpu->stat.nested_run; 7462 7463 vmx->nested.nested_run_pending = 0; 7464 } 7465 7466 if (unlikely(vmx->fail)) 7467 return EXIT_FASTPATH_NONE; 7468 7469 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7470 kvm_machine_check(); 7471 7472 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7473 7474 if (unlikely(vmx->exit_reason.failed_vmentry)) 7475 return EXIT_FASTPATH_NONE; 7476 7477 vmx->loaded_vmcs->launched = 1; 7478 7479 vmx_recover_nmi_blocking(vmx); 7480 vmx_complete_interrupts(vmx); 7481 7482 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); 7483 } 7484 7485 void vmx_vcpu_free(struct kvm_vcpu *vcpu) 7486 { 7487 struct vcpu_vmx *vmx = to_vmx(vcpu); 7488 7489 if (enable_pml) 7490 vmx_destroy_pml_buffer(vmx); 7491 free_vpid(vmx->vpid); 7492 nested_vmx_free_vcpu(vcpu); 7493 free_loaded_vmcs(vmx->loaded_vmcs); 7494 free_page((unsigned long)vmx->ve_info); 7495 } 7496 7497 int vmx_vcpu_create(struct kvm_vcpu *vcpu) 7498 { 7499 struct vmx_uret_msr *tsx_ctrl; 7500 struct vcpu_vmx *vmx; 7501 int i, err; 7502 7503 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 7504 vmx = to_vmx(vcpu); 7505 7506 INIT_LIST_HEAD(&vmx->pi_wakeup_list); 7507 7508 err = -ENOMEM; 7509 7510 vmx->vpid = allocate_vpid(); 7511 7512 /* 7513 * If PML is turned on, failure on enabling PML just results in failure 7514 * of creating the vcpu, therefore we can simplify PML logic (by 7515 * avoiding dealing with cases, such as enabling PML partially on vcpus 7516 * for the guest), etc. 
7517 */ 7518 if (enable_pml) { 7519 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7520 if (!vmx->pml_pg) 7521 goto free_vpid; 7522 } 7523 7524 for (i = 0; i < kvm_nr_uret_msrs; ++i) 7525 vmx->guest_uret_msrs[i].mask = -1ull; 7526 if (boot_cpu_has(X86_FEATURE_RTM)) { 7527 /* 7528 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 7529 * Keep the host value unchanged to avoid changing CPUID bits 7530 * under the host kernel's feet. 7531 */ 7532 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7533 if (tsx_ctrl) 7534 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 7535 } 7536 7537 err = alloc_loaded_vmcs(&vmx->vmcs01); 7538 if (err < 0) 7539 goto free_pml; 7540 7541 /* 7542 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a 7543 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the 7544 * feature only for vmcs01, KVM currently isn't equipped to realize any 7545 * performance benefits from enabling it for vmcs02. 7546 */ 7547 if (kvm_is_using_evmcs() && 7548 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7549 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7550 7551 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7552 } 7553 7554 /* The MSR bitmap starts with all ones */ 7555 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7556 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7557 7558 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7559 #ifdef CONFIG_X86_64 7560 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7561 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7562 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7563 #endif 7564 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7565 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7566 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7567 if 
(kvm_cstate_in_guest(vcpu->kvm)) { 7568 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7569 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7570 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7571 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7572 } 7573 7574 vmx->loaded_vmcs = &vmx->vmcs01; 7575 7576 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7577 err = kvm_alloc_apic_access_page(vcpu->kvm); 7578 if (err) 7579 goto free_vmcs; 7580 } 7581 7582 if (enable_ept && !enable_unrestricted_guest) { 7583 err = init_rmode_identity_map(vcpu->kvm); 7584 if (err) 7585 goto free_vmcs; 7586 } 7587 7588 err = -ENOMEM; 7589 if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { 7590 struct page *page; 7591 7592 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); 7593 7594 /* ve_info must be page aligned. */ 7595 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7596 if (!page) 7597 goto free_vmcs; 7598 7599 vmx->ve_info = page_to_virt(page); 7600 } 7601 7602 if (vmx_can_use_ipiv(vcpu)) 7603 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7604 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); 7605 7606 return 0; 7607 7608 free_vmcs: 7609 free_loaded_vmcs(vmx->loaded_vmcs); 7610 free_pml: 7611 vmx_destroy_pml_buffer(vmx); 7612 free_vpid: 7613 free_vpid(vmx->vpid); 7614 return err; 7615 } 7616 7617 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7618 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7619 7620 int vmx_vm_init(struct kvm *kvm) 7621 { 7622 if (!ple_gap) 7623 kvm->arch.pause_in_guest = true; 7624 7625 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7626 switch (l1tf_mitigation) { 7627 case L1TF_MITIGATION_OFF: 7628 case L1TF_MITIGATION_FLUSH_NOWARN: 7629 /* 'I explicitly don't care' is set */ 7630 break; 7631 case L1TF_MITIGATION_FLUSH: 7632 case L1TF_MITIGATION_FLUSH_NOSMT: 7633 case L1TF_MITIGATION_FULL: 7634 /* 7635 * Warn upon starting the first VM in a potentially 7636 * insecure environment. 7637 */ 7638 if (sched_smt_active()) 7639 pr_warn_once(L1TF_MSG_SMT); 7640 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7641 pr_warn_once(L1TF_MSG_L1D); 7642 break; 7643 case L1TF_MITIGATION_FULL_FORCE: 7644 /* Flush is enforced */ 7645 break; 7646 } 7647 } 7648 return 0; 7649 } 7650 7651 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7652 { 7653 /* 7654 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7655 * with cacheable accesses will result in Machine Checks. 7656 */ 7657 if (is_mmio) 7658 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7659 7660 /* 7661 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent 7662 * device attached. Letting the guest control memory types on Intel 7663 * CPUs may result in unexpected behavior, and so KVM's ABI is to trust 7664 * the guest to behave only as a last resort. 7665 */ 7666 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) 7667 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7668 7669 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7670 } 7671 7672 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7673 { 7674 /* 7675 * These bits in the secondary execution controls field 7676 * are dynamic, the others are mostly based on the hypervisor 7677 * architecture and the guest's CPUID. 
Do not touch the 7678 * dynamic bits. 7679 */ 7680 u32 mask = 7681 SECONDARY_EXEC_SHADOW_VMCS | 7682 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7683 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7684 SECONDARY_EXEC_DESC; 7685 7686 u32 cur_ctl = secondary_exec_controls_get(vmx); 7687 7688 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7689 } 7690 7691 /* 7692 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 7693 * (indicating "allowed-1") if they are supported in the guest's CPUID. 7694 */ 7695 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7696 { 7697 struct vcpu_vmx *vmx = to_vmx(vcpu); 7698 struct kvm_cpuid_entry2 *entry; 7699 7700 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7701 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7702 7703 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7704 if (entry && (entry->_reg & (_cpuid_mask))) \ 7705 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7706 } while (0) 7707 7708 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7709 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7710 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7711 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7712 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7713 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7714 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7715 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7716 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7717 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7718 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7719 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7720 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7721 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7722 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7723 7724 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7725 cr4_fixed1_update(X86_CR4_FSGSBASE, 
ebx, feature_bit(FSGSBASE)); 7726 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7727 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7728 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7729 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7730 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7731 7732 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7733 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); 7734 7735 #undef cr4_fixed1_update 7736 } 7737 7738 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7739 { 7740 struct vcpu_vmx *vmx = to_vmx(vcpu); 7741 struct kvm_cpuid_entry2 *best = NULL; 7742 int i; 7743 7744 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7745 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7746 if (!best) 7747 return; 7748 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7749 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7750 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7751 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7752 } 7753 7754 /* Get the number of configurable Address Ranges for filtering */ 7755 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7756 PT_CAP_num_address_ranges); 7757 7758 /* Initialize and clear the no dependency bits */ 7759 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7760 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7761 RTIT_CTL_BRANCH_EN); 7762 7763 /* 7764 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7765 * will inject an #GP 7766 */ 7767 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7768 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7769 7770 /* 7771 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7772 * PSBFreq can be set 7773 */ 7774 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7775 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7776 RTIT_CTL_CYC_THRESH | 
RTIT_CTL_PSB_FREQ); 7777 7778 /* 7779 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7780 */ 7781 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7782 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 7783 RTIT_CTL_MTC_RANGE); 7784 7785 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7786 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7787 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7788 RTIT_CTL_PTW_EN); 7789 7790 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7791 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7792 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7793 7794 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7795 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7796 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7797 7798 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7799 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7800 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7801 7802 /* unmask address range configure area */ 7803 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7804 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7805 } 7806 7807 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7808 { 7809 struct vcpu_vmx *vmx = to_vmx(vcpu); 7810 7811 /* 7812 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7813 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7814 * set if and only if XSAVE is supported. 
7815 */ 7816 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) 7817 guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); 7818 7819 vmx_setup_uret_msrs(vmx); 7820 7821 if (cpu_has_secondary_exec_ctrls()) 7822 vmcs_set_secondary_exec_control(vmx, 7823 vmx_secondary_exec_control(vmx)); 7824 7825 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7826 vmx->msr_ia32_feature_control_valid_bits |= 7827 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7828 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7829 else 7830 vmx->msr_ia32_feature_control_valid_bits &= 7831 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7832 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7833 7834 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7835 nested_vmx_cr_fixed1_bits_update(vcpu); 7836 7837 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7838 guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) 7839 update_intel_pt_cfg(vcpu); 7840 7841 if (boot_cpu_has(X86_FEATURE_RTM)) { 7842 struct vmx_uret_msr *msr; 7843 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7844 if (msr) { 7845 bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); 7846 vmx_set_guest_uret_msr(vmx, msr, enabled ? 
0 : TSX_CTRL_RTM_DISABLE); 7847 } 7848 } 7849 7850 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7851 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7852 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 7853 7854 if (boot_cpu_has(X86_FEATURE_IBPB)) 7855 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 7856 !guest_has_pred_cmd_msr(vcpu)); 7857 7858 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7859 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7860 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7861 7862 set_cr4_guest_host_mask(vmx); 7863 7864 vmx_write_encls_bitmap(vcpu, NULL); 7865 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) 7866 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7867 else 7868 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7869 7870 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 7871 vmx->msr_ia32_feature_control_valid_bits |= 7872 FEAT_CTL_SGX_LC_ENABLED; 7873 else 7874 vmx->msr_ia32_feature_control_valid_bits &= 7875 ~FEAT_CTL_SGX_LC_ENABLED; 7876 7877 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7878 vmx_update_exception_bitmap(vcpu); 7879 } 7880 7881 static __init u64 vmx_get_perf_capabilities(void) 7882 { 7883 u64 perf_cap = PMU_CAP_FW_WRITES; 7884 u64 host_perf_cap = 0; 7885 7886 if (!enable_pmu) 7887 return 0; 7888 7889 if (boot_cpu_has(X86_FEATURE_PDCM)) 7890 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7891 7892 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7893 x86_perf_get_lbr(&vmx_lbr_caps); 7894 7895 /* 7896 * KVM requires LBR callstack support, as the overhead due to 7897 * context switching LBRs without said support is too high. 7898 * See intel_pmu_create_guest_lbr_event() for more info. 
7899 */ 7900 if (!vmx_lbr_caps.has_callstack) 7901 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 7902 else if (vmx_lbr_caps.nr) 7903 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7904 } 7905 7906 if (vmx_pebs_supported()) { 7907 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7908 7909 /* 7910 * Disallow adaptive PEBS as it is functionally broken, can be 7911 * used by the guest to read *host* LBRs, and can be used to 7912 * bypass userspace event filters. To correctly and safely 7913 * support adaptive PEBS, KVM needs to: 7914 * 7915 * 1. Account for the ADAPTIVE flag when (re)programming fixed 7916 * counters. 7917 * 7918 * 2. Gain support from perf (or take direct control of counter 7919 * programming) to support events without adaptive PEBS 7920 * enabled for the hardware counter. 7921 * 7922 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 7923 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 7924 * 7925 * 4. Document which PMU events are effectively exposed to the 7926 * guest via adaptive PEBS, and make adaptive PEBS mutually 7927 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
7928 */ 7929 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 7930 } 7931 7932 return perf_cap; 7933 } 7934 7935 static __init void vmx_set_cpu_caps(void) 7936 { 7937 kvm_set_cpu_caps(); 7938 7939 /* CPUID 0x1 */ 7940 if (nested) 7941 kvm_cpu_cap_set(X86_FEATURE_VMX); 7942 7943 /* CPUID 0x7 */ 7944 if (kvm_mpx_supported()) 7945 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7946 if (!cpu_has_vmx_invpcid()) 7947 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7948 if (vmx_pt_mode_is_host_guest()) 7949 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7950 if (vmx_pebs_supported()) { 7951 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7952 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7953 } 7954 7955 if (!enable_pmu) 7956 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7957 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 7958 7959 if (!enable_sgx) { 7960 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7961 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7962 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7963 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7964 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 7965 } 7966 7967 if (vmx_umip_emulated()) 7968 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7969 7970 /* CPUID 0xD.1 */ 7971 kvm_caps.supported_xss = 0; 7972 if (!cpu_has_vmx_xsaves()) 7973 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7974 7975 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7976 if (!cpu_has_vmx_rdtscp()) { 7977 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7978 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7979 } 7980 7981 if (cpu_has_vmx_waitpkg()) 7982 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7983 } 7984 7985 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, 7986 struct x86_instruction_info *info) 7987 { 7988 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7989 unsigned short port; 7990 bool intercept; 7991 int size; 7992 7993 if (info->intercept == x86_intercept_in || 7994 info->intercept == x86_intercept_ins) { 7995 port = info->src_val; 7996 size = info->dst_bytes; 7997 } else { 7998 port = info->dst_val; 7999 size = info->src_bytes; 8000 } 
8001 8002 /* 8003 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 8004 * VM-exits depend on the 'unconditional IO exiting' VM-execution 8005 * control. 8006 * 8007 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 8008 */ 8009 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8010 intercept = nested_cpu_has(vmcs12, 8011 CPU_BASED_UNCOND_IO_EXITING); 8012 else 8013 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); 8014 8015 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 8016 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; 8017 } 8018 8019 int vmx_check_intercept(struct kvm_vcpu *vcpu, 8020 struct x86_instruction_info *info, 8021 enum x86_intercept_stage stage, 8022 struct x86_exception *exception) 8023 { 8024 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8025 8026 switch (info->intercept) { 8027 /* 8028 * RDPID causes #UD if disabled through secondary execution controls. 8029 * Because it is marked as EmulateOnUD, we need to intercept it here. 8030 * Note, RDPID is hidden behind ENABLE_RDTSCP. 8031 */ 8032 case x86_intercept_rdpid: 8033 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8034 exception->vector = UD_VECTOR; 8035 exception->error_code_valid = false; 8036 return X86EMUL_PROPAGATE_FAULT; 8037 } 8038 break; 8039 8040 case x86_intercept_in: 8041 case x86_intercept_ins: 8042 case x86_intercept_out: 8043 case x86_intercept_outs: 8044 return vmx_check_intercept_io(vcpu, info); 8045 8046 case x86_intercept_lgdt: 8047 case x86_intercept_lidt: 8048 case x86_intercept_lldt: 8049 case x86_intercept_ltr: 8050 case x86_intercept_sgdt: 8051 case x86_intercept_sidt: 8052 case x86_intercept_sldt: 8053 case x86_intercept_str: 8054 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8055 return X86EMUL_CONTINUE; 8056 8057 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. 
*/ 8058 break; 8059 8060 case x86_intercept_pause: 8061 /* 8062 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8063 * with vanilla NOPs in the emulator. Apply the interception 8064 * check only to actual PAUSE instructions. Don't check 8065 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8066 * exit, i.e. KVM is within its rights to allow L2 to execute 8067 * the PAUSE. 8068 */ 8069 if ((info->rep_prefix != REPE_PREFIX) || 8070 !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) 8071 return X86EMUL_CONTINUE; 8072 8073 break; 8074 8075 /* TODO: check more intercepts... */ 8076 default: 8077 break; 8078 } 8079 8080 return X86EMUL_UNHANDLEABLE; 8081 } 8082 8083 #ifdef CONFIG_X86_64 8084 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8085 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8086 u64 divisor, u64 *result) 8087 { 8088 u64 low = a << shift, high = a >> (64 - shift); 8089 8090 /* To avoid the overflow on divq */ 8091 if (high >= divisor) 8092 return 1; 8093 8094 /* Low hold the result, high hold rem which is discarded */ 8095 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8096 "rm" (divisor), "0" (low), "1" (high)); 8097 *result = low; 8098 8099 return 0; 8100 } 8101 8102 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8103 bool *expired) 8104 { 8105 struct vcpu_vmx *vmx; 8106 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8107 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8108 8109 vmx = to_vmx(vcpu); 8110 tscl = rdtsc(); 8111 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8112 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8113 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8114 ktimer->timer_advance_ns); 8115 8116 if (delta_tsc > lapic_timer_advance_cycles) 8117 delta_tsc -= lapic_timer_advance_cycles; 8118 else 8119 delta_tsc = 0; 8120 8121 /* Convert to host delta tsc if tsc scaling is enabled */ 8122 if (vcpu->arch.l1_tsc_scaling_ratio != 
kvm_caps.default_tsc_scaling_ratio && 8123 delta_tsc && u64_shl_div_u64(delta_tsc, 8124 kvm_caps.tsc_scaling_ratio_frac_bits, 8125 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8126 return -ERANGE; 8127 8128 /* 8129 * If the delta tsc can't fit in the 32 bit after the multi shift, 8130 * we can't use the preemption timer. 8131 * It's possible that it fits on later vmentries, but checking 8132 * on every vmentry is costly so we just use an hrtimer. 8133 */ 8134 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8135 return -ERANGE; 8136 8137 vmx->hv_deadline_tsc = tscl + delta_tsc; 8138 *expired = !delta_tsc; 8139 return 0; 8140 } 8141 8142 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8143 { 8144 to_vmx(vcpu)->hv_deadline_tsc = -1; 8145 } 8146 #endif 8147 8148 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8149 { 8150 struct vcpu_vmx *vmx = to_vmx(vcpu); 8151 8152 if (WARN_ON_ONCE(!enable_pml)) 8153 return; 8154 8155 if (is_guest_mode(vcpu)) { 8156 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8157 return; 8158 } 8159 8160 /* 8161 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8162 * code, but in that case another update request will be made and so 8163 * the guest will never run with a stale PML value. 
8164 */ 8165 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8166 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8167 else 8168 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8169 } 8170 8171 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8172 { 8173 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8174 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8175 FEAT_CTL_LMCE_ENABLED; 8176 else 8177 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8178 ~FEAT_CTL_LMCE_ENABLED; 8179 } 8180 8181 #ifdef CONFIG_KVM_SMM 8182 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8183 { 8184 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8185 if (to_vmx(vcpu)->nested.nested_run_pending) 8186 return -EBUSY; 8187 return !is_smm(vcpu); 8188 } 8189 8190 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8191 { 8192 struct vcpu_vmx *vmx = to_vmx(vcpu); 8193 8194 /* 8195 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8196 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8197 * SMI and RSM only modify state that is saved and restored via SMRAM. 8198 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8199 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 
8200 */ 8201 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8202 if (vmx->nested.smm.guest_mode) 8203 nested_vmx_vmexit(vcpu, -1, 0, 0); 8204 8205 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8206 vmx->nested.vmxon = false; 8207 vmx_clear_hlt(vcpu); 8208 return 0; 8209 } 8210 8211 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8212 { 8213 struct vcpu_vmx *vmx = to_vmx(vcpu); 8214 int ret; 8215 8216 if (vmx->nested.smm.vmxon) { 8217 vmx->nested.vmxon = true; 8218 vmx->nested.smm.vmxon = false; 8219 } 8220 8221 if (vmx->nested.smm.guest_mode) { 8222 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8223 if (ret) 8224 return ret; 8225 8226 vmx->nested.nested_run_pending = 1; 8227 vmx->nested.smm.guest_mode = false; 8228 } 8229 return 0; 8230 } 8231 8232 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8233 { 8234 /* RSM will cause a vmexit anyway. */ 8235 } 8236 #endif 8237 8238 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8239 { 8240 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8241 } 8242 8243 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8244 { 8245 if (is_guest_mode(vcpu)) { 8246 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8247 8248 if (hrtimer_try_to_cancel(timer) == 1) 8249 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8250 } 8251 } 8252 8253 void vmx_hardware_unsetup(void) 8254 { 8255 kvm_set_posted_intr_wakeup_handler(NULL); 8256 8257 if (nested) 8258 nested_vmx_hardware_unsetup(); 8259 8260 free_kvm_area(); 8261 } 8262 8263 void vmx_vm_destroy(struct kvm *kvm) 8264 { 8265 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8266 8267 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8268 } 8269 8270 /* 8271 * Note, the SDM states that the linear address is masked *after* the modified 8272 * canonicality check, whereas KVM masks (untags) the address and then performs 8273 * a "normal" canonicality check. 
Functionally, the two methods are identical, 8274 * and when the masking occurs relative to the canonicality check isn't visible 8275 * to software, i.e. KVM's behavior doesn't violate the SDM. 8276 */ 8277 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8278 { 8279 int lam_bit; 8280 unsigned long cr3_bits; 8281 8282 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8283 return gva; 8284 8285 if (!is_64_bit_mode(vcpu)) 8286 return gva; 8287 8288 /* 8289 * Bit 63 determines if the address should be treated as user address 8290 * or a supervisor address. 8291 */ 8292 if (!(gva & BIT_ULL(63))) { 8293 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8294 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8295 return gva; 8296 8297 /* LAM_U48 is ignored if LAM_U57 is set. */ 8298 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8299 } else { 8300 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8301 return gva; 8302 8303 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8304 } 8305 8306 /* 8307 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8308 * Bit 63 is retained from the raw virtual address so that untagging 8309 * doesn't change a user access to a supervisor access, and vice versa. 8310 */ 8311 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8312 } 8313 8314 static unsigned int vmx_handle_intel_pt_intr(void) 8315 { 8316 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8317 8318 /* '0' on failure so that the !PT case can use a RET0 static call. 
*/ 8319 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8320 return 0; 8321 8322 kvm_make_request(KVM_REQ_PMI, vcpu); 8323 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8324 (unsigned long *)&vcpu->arch.pmu.global_status); 8325 return 1; 8326 } 8327 8328 static __init void vmx_setup_user_return_msrs(void) 8329 { 8330 8331 /* 8332 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8333 * will emulate SYSCALL in legacy mode if the vendor string in guest 8334 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8335 * support this emulation, MSR_STAR is included in the list for i386, 8336 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8337 * into hardware and is here purely for emulation purposes. 8338 */ 8339 const u32 vmx_uret_msrs_list[] = { 8340 #ifdef CONFIG_X86_64 8341 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8342 #endif 8343 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8344 MSR_IA32_TSX_CTRL, 8345 }; 8346 int i; 8347 8348 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8349 8350 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8351 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8352 } 8353 8354 static void __init vmx_setup_me_spte_mask(void) 8355 { 8356 u64 me_mask = 0; 8357 8358 /* 8359 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8360 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8361 * boot_cpu_data.x86_phys_bits holds the actual physical address 8362 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8363 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8364 */ 8365 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8366 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8367 kvm_host.maxphyaddr - 1); 8368 8369 /* 8370 * Unlike SME, host kernel doesn't support setting up any 8371 * MKTME KeyID on Intel platforms. No memory encryption 8372 * bits should be included into the SPTE. 
8373 */ 8374 kvm_mmu_set_me_spte_mask(0, me_mask); 8375 } 8376 8377 __init int vmx_hardware_setup(void) 8378 { 8379 unsigned long host_bndcfgs; 8380 struct desc_ptr dt; 8381 int r; 8382 8383 store_idt(&dt); 8384 host_idt_base = dt.address; 8385 8386 vmx_setup_user_return_msrs(); 8387 8388 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8389 return -EIO; 8390 8391 if (boot_cpu_has(X86_FEATURE_NX)) 8392 kvm_enable_efer_bits(EFER_NX); 8393 8394 if (boot_cpu_has(X86_FEATURE_MPX)) { 8395 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 8396 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8397 } 8398 8399 if (!cpu_has_vmx_mpx()) 8400 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8401 XFEATURE_MASK_BNDCSR); 8402 8403 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8404 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8405 enable_vpid = 0; 8406 8407 if (!cpu_has_vmx_ept() || 8408 !cpu_has_vmx_ept_4levels() || 8409 !cpu_has_vmx_ept_mt_wb() || 8410 !cpu_has_vmx_invept_global()) 8411 enable_ept = 0; 8412 8413 /* NX support is required for shadow paging. */ 8414 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8415 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8416 return -EOPNOTSUPP; 8417 } 8418 8419 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8420 enable_ept_ad_bits = 0; 8421 8422 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8423 enable_unrestricted_guest = 0; 8424 8425 if (!cpu_has_vmx_flexpriority()) 8426 flexpriority_enabled = 0; 8427 8428 if (!cpu_has_virtual_nmis()) 8429 enable_vnmi = 0; 8430 8431 #ifdef CONFIG_X86_SGX_KVM 8432 if (!cpu_has_vmx_encls_vmexit()) 8433 enable_sgx = false; 8434 #endif 8435 8436 /* 8437 * set_apic_access_page_addr() is used to reload apic access 8438 * page upon invalidation. No need to do anything if not 8439 * using the APIC_ACCESS_ADDR VMCS field. 
8440 */ 8441 if (!flexpriority_enabled) 8442 vt_x86_ops.set_apic_access_page_addr = NULL; 8443 8444 if (!cpu_has_vmx_tpr_shadow()) 8445 vt_x86_ops.update_cr8_intercept = NULL; 8446 8447 #if IS_ENABLED(CONFIG_HYPERV) 8448 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8449 && enable_ept) { 8450 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8451 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8452 } 8453 #endif 8454 8455 if (!cpu_has_vmx_ple()) { 8456 ple_gap = 0; 8457 ple_window = 0; 8458 ple_window_grow = 0; 8459 ple_window_max = 0; 8460 ple_window_shrink = 0; 8461 } 8462 8463 if (!cpu_has_vmx_apicv()) 8464 enable_apicv = 0; 8465 if (!enable_apicv) 8466 vt_x86_ops.sync_pir_to_irr = NULL; 8467 8468 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8469 enable_ipiv = false; 8470 8471 if (cpu_has_vmx_tsc_scaling()) 8472 kvm_caps.has_tsc_control = true; 8473 8474 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8475 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8476 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8477 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8478 8479 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8480 8481 if (enable_ept) 8482 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8483 cpu_has_vmx_ept_execute_only()); 8484 8485 /* 8486 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8487 * bits to shadow_zero_check. 8488 */ 8489 vmx_setup_me_spte_mask(); 8490 8491 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8492 ept_caps_to_lpage_level(vmx_capability.ept)); 8493 8494 /* 8495 * Only enable PML when hardware supports PML feature, and both EPT 8496 * and EPT A/D bit features are enabled -- PML depends on them to work. 
8497 */ 8498 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8499 enable_pml = 0; 8500 8501 if (!enable_pml) 8502 vt_x86_ops.cpu_dirty_log_size = 0; 8503 8504 if (!cpu_has_vmx_preemption_timer()) 8505 enable_preemption_timer = false; 8506 8507 if (enable_preemption_timer) { 8508 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8509 8510 cpu_preemption_timer_multi = 8511 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8512 8513 if (tsc_khz) 8514 use_timer_freq = (u64)tsc_khz * 1000; 8515 use_timer_freq >>= cpu_preemption_timer_multi; 8516 8517 /* 8518 * KVM "disables" the preemption timer by setting it to its max 8519 * value. Don't use the timer if it might cause spurious exits 8520 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8521 */ 8522 if (use_timer_freq > 0xffffffffu / 10) 8523 enable_preemption_timer = false; 8524 } 8525 8526 if (!enable_preemption_timer) { 8527 vt_x86_ops.set_hv_timer = NULL; 8528 vt_x86_ops.cancel_hv_timer = NULL; 8529 } 8530 8531 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8532 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8533 8534 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8535 return -EINVAL; 8536 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8537 pt_mode = PT_MODE_SYSTEM; 8538 if (pt_mode == PT_MODE_HOST_GUEST) 8539 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8540 else 8541 vt_init_ops.handle_intel_pt_intr = NULL; 8542 8543 setup_default_sgx_lepubkeyhash(); 8544 8545 if (nested) { 8546 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8547 8548 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8549 if (r) 8550 return r; 8551 } 8552 8553 vmx_set_cpu_caps(); 8554 8555 r = alloc_kvm_area(); 8556 if (r && nested) 8557 nested_vmx_hardware_unsetup(); 8558 8559 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8560 8561 return r; 8562 } 8563 8564 static void vmx_cleanup_l1d_flush(void) 8565 { 8566 if (vmx_l1d_flush_pages) { 8567 free_pages((unsigned 
long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8568 vmx_l1d_flush_pages = NULL; 8569 } 8570 /* Restore state so sysfs ignores VMX */ 8571 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8572 } 8573 8574 static void __vmx_exit(void) 8575 { 8576 allow_smaller_maxphyaddr = false; 8577 8578 vmx_cleanup_l1d_flush(); 8579 } 8580 8581 static void vmx_exit(void) 8582 { 8583 kvm_exit(); 8584 __vmx_exit(); 8585 kvm_x86_vendor_exit(); 8586 8587 } 8588 module_exit(vmx_exit); 8589 8590 static int __init vmx_init(void) 8591 { 8592 int r, cpu; 8593 8594 if (!kvm_is_vmx_supported()) 8595 return -EOPNOTSUPP; 8596 8597 /* 8598 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8599 * to unwind if a later step fails. 8600 */ 8601 hv_init_evmcs(); 8602 8603 r = kvm_x86_vendor_init(&vt_init_ops); 8604 if (r) 8605 return r; 8606 8607 /* 8608 * Must be called after common x86 init so enable_ept is properly set 8609 * up. Hand the parameter mitigation value in which was stored in 8610 * the pre module init parser. If no parameter was given, it will 8611 * contain 'auto' which will be turned into the default 'cond' 8612 * mitigation mode. 8613 */ 8614 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8615 if (r) 8616 goto err_l1d_flush; 8617 8618 for_each_possible_cpu(cpu) { 8619 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8620 8621 pi_init_cpu(cpu); 8622 } 8623 8624 vmx_check_vmcs12_offsets(); 8625 8626 /* 8627 * Shadow paging doesn't have a (further) performance penalty 8628 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8629 * by default 8630 */ 8631 if (!enable_ept) 8632 allow_smaller_maxphyaddr = true; 8633 8634 /* 8635 * Common KVM initialization _must_ come last, after this, /dev/kvm is 8636 * exposed to userspace! 
8637 */ 8638 r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), 8639 THIS_MODULE); 8640 if (r) 8641 goto err_kvm_init; 8642 8643 return 0; 8644 8645 err_kvm_init: 8646 __vmx_exit(); 8647 err_l1d_flush: 8648 kvm_x86_vendor_exit(); 8649 return r; 8650 } 8651 module_init(vmx_init); 8652