1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 #include <linux/entry-kvm.h> 32 33 #include <asm/apic.h> 34 #include <asm/asm.h> 35 #include <asm/cpu.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/fred.h> 42 #include <asm/idtentry.h> 43 #include <asm/io.h> 44 #include <asm/irq_remapping.h> 45 #include <asm/reboot.h> 46 #include <asm/perf_event.h> 47 #include <asm/mmu_context.h> 48 #include <asm/mshyperv.h> 49 #include <asm/msr.h> 50 #include <asm/mwait.h> 51 #include <asm/spec-ctrl.h> 52 #include <asm/vmx.h> 53 54 #include <trace/events/ipi.h> 55 56 #include "capabilities.h" 57 #include "common.h" 58 #include "cpuid.h" 59 #include "hyperv.h" 60 #include "kvm_onhyperv.h" 61 #include "irq.h" 62 #include "kvm_cache_regs.h" 63 #include "lapic.h" 64 #include "mmu.h" 65 #include "nested.h" 66 #include "pmu.h" 67 #include "sgx.h" 68 #include "trace.h" 69 #include "vmcs.h" 70 #include "vmcs12.h" 71 #include "vmx.h" 72 #include "x86.h" 73 #include "x86_ops.h" 74 #include "smm.h" 75 #include "vmx_onhyperv.h" 76 
#include "posted_intr.h"

/* Module identification. */
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");

/*
 * CPU match table keyed on X86_FEATURE_VMX.  It is only emitted when this
 * code is built as a module.
 */
#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

/*
 * Feature knobs.  Every parameter below is registered with mode 0444, i.e.
 * it is readable but not writable through the parameter file.  Knobs without
 * "static" have external linkage.
 */
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, 0444);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, 0444);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, 0444);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, 0444);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);

/* enable_apicv is not defined in this part of the file. */
module_param(enable_apicv, bool, 0444);

bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, 0444);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, 0444);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

/* Unlike the knobs above, this one is registered 0644 (owner-writable). */
static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/*
 * Guest_tsc -> host_tsc conversion requires 64-bit division, so the
 * preemption_timer parameter is only registered on CONFIG_X86_64 builds.
 */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

/* Defined elsewhere; only the module parameter is registered here. */
extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

/* CR0 bit sets, named for bits that are always off / always on. */
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

/* CR4 bit sets for unrestricted guest, protected mode and real mode. */
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/* Every RTIT_STATUS bit except the ones listed. */
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these x2apic, PT and LBR MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_FLUSH_CMD,
	MSR_IA32_TSC,
	/* The following entries exist only on 64-bit builds. */
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
	MSR_IA32_XFD,
	MSR_IA32_XFD_ERR,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

/*
 * These two parameters configure the controls for Pause-Loop Exiting (PLE):
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to tests, this time is usually smaller
 *             than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b sections 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow.
*/ 219 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 220 module_param(ple_window_max, uint, 0444); 221 222 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 223 int __read_mostly pt_mode = PT_MODE_SYSTEM; 224 #ifdef CONFIG_BROKEN 225 module_param(pt_mode, int, S_IRUGO); 226 #endif 227 228 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 229 230 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 231 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 232 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 233 234 /* Storage for pre module init parameter parsing */ 235 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 236 237 static const struct { 238 const char *option; 239 bool for_parse; 240 } vmentry_l1d_param[] = { 241 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 242 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 243 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 244 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 245 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 246 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 247 }; 248 249 #define L1D_CACHE_ORDER 4 250 static void *vmx_l1d_flush_pages; 251 252 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 253 { 254 struct page *page; 255 unsigned int i; 256 257 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 258 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 259 return 0; 260 } 261 262 if (!enable_ept) { 263 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 264 return 0; 265 } 266 267 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 268 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 269 return 0; 270 } 271 272 /* If set to auto use the default l1tf mitigation method */ 273 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 274 switch (l1tf_mitigation) { 275 case L1TF_MITIGATION_OFF: 276 l1tf = VMENTER_L1D_FLUSH_NEVER; 277 break; 278 case L1TF_MITIGATION_AUTO: 279 case 
L1TF_MITIGATION_FLUSH_NOWARN: 280 case L1TF_MITIGATION_FLUSH: 281 case L1TF_MITIGATION_FLUSH_NOSMT: 282 l1tf = VMENTER_L1D_FLUSH_COND; 283 break; 284 case L1TF_MITIGATION_FULL: 285 case L1TF_MITIGATION_FULL_FORCE: 286 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 287 break; 288 } 289 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 290 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 291 } 292 293 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 294 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 295 /* 296 * This allocation for vmx_l1d_flush_pages is not tied to a VM 297 * lifetime and so should not be charged to a memcg. 298 */ 299 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 300 if (!page) 301 return -ENOMEM; 302 vmx_l1d_flush_pages = page_address(page); 303 304 /* 305 * Initialize each page with a different pattern in 306 * order to protect against KSM in the nested 307 * virtualization case. 308 */ 309 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 310 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 311 PAGE_SIZE); 312 } 313 } 314 315 l1tf_vmx_mitigation = l1tf; 316 317 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 318 static_branch_enable(&vmx_l1d_should_flush); 319 else 320 static_branch_disable(&vmx_l1d_should_flush); 321 322 if (l1tf == VMENTER_L1D_FLUSH_COND) 323 static_branch_enable(&vmx_l1d_flush_cond); 324 else 325 static_branch_disable(&vmx_l1d_flush_cond); 326 return 0; 327 } 328 329 static int vmentry_l1d_flush_parse(const char *s) 330 { 331 unsigned int i; 332 333 if (s) { 334 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 335 if (vmentry_l1d_param[i].for_parse && 336 sysfs_streq(s, vmentry_l1d_param[i].option)) 337 return i; 338 } 339 } 340 return -EINVAL; 341 } 342 343 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 344 { 345 int l1tf, ret; 346 347 l1tf = vmentry_l1d_flush_parse(s); 348 if (l1tf < 0) 349 return l1tf; 350 351 if (!boot_cpu_has(X86_BUG_L1TF)) 352 return 0; 353 354 /* 355 * Has vmx_init() run already? 
If not then this is the pre init 356 * parameter parsing. In that case just store the value and let 357 * vmx_init() do the proper setup after enable_ept has been 358 * established. 359 */ 360 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 361 vmentry_l1d_flush_param = l1tf; 362 return 0; 363 } 364 365 mutex_lock(&vmx_l1d_flush_mutex); 366 ret = vmx_setup_l1d_flush(l1tf); 367 mutex_unlock(&vmx_l1d_flush_mutex); 368 return ret; 369 } 370 371 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 372 { 373 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 374 return sysfs_emit(s, "???\n"); 375 376 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 377 } 378 379 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 380 { 381 u64 msr; 382 383 if (!vmx->disable_fb_clear) 384 return; 385 386 msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); 387 msr |= FB_CLEAR_DIS; 388 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); 389 /* Cache the MSR value to avoid reading it later */ 390 vmx->msr_ia32_mcu_opt_ctrl = msr; 391 } 392 393 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 394 { 395 if (!vmx->disable_fb_clear) 396 return; 397 398 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 399 native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 400 } 401 402 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 403 { 404 /* 405 * Disable VERW's behavior of clearing CPU buffers for the guest if the 406 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 407 * the mitigation. Disabling the clearing behavior provides a 408 * performance boost for guests that aren't aware that manually clearing 409 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 410 * and VM-Exit. 
411 */ 412 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 413 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 414 !boot_cpu_has_bug(X86_BUG_MDS) && 415 !boot_cpu_has_bug(X86_BUG_TAA); 416 417 /* 418 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 419 * at VMEntry. Skip the MSR read/write when a guest has no use case to 420 * execute VERW. 421 */ 422 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 423 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 424 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 425 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 426 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 427 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 428 vmx->disable_fb_clear = false; 429 } 430 431 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 432 .set = vmentry_l1d_flush_set, 433 .get = vmentry_l1d_flush_get, 434 }; 435 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 436 437 static u32 vmx_segment_access_rights(struct kvm_segment *var); 438 439 void vmx_vmexit(void); 440 441 #define vmx_insn_failed(fmt...) 
\ 442 do { \ 443 WARN_ONCE(1, fmt); \ 444 pr_warn_ratelimited(fmt); \ 445 } while (0) 446 447 noinline void vmread_error(unsigned long field) 448 { 449 vmx_insn_failed("vmread failed: field=%lx\n", field); 450 } 451 452 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 453 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 454 { 455 if (fault) { 456 kvm_spurious_fault(); 457 } else { 458 instrumentation_begin(); 459 vmread_error(field); 460 instrumentation_end(); 461 } 462 } 463 #endif 464 465 noinline void vmwrite_error(unsigned long field, unsigned long value) 466 { 467 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 468 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 469 } 470 471 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 472 { 473 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 474 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 475 } 476 477 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 478 { 479 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 480 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 481 } 482 483 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 484 { 485 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 486 ext, vpid, gva); 487 } 488 489 noinline void invept_error(unsigned long ext, u64 eptp) 490 { 491 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); 492 } 493 494 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 495 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 496 /* 497 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 498 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
499 */ 500 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 501 502 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 503 static DEFINE_SPINLOCK(vmx_vpid_lock); 504 505 struct vmcs_config vmcs_config __ro_after_init; 506 struct vmx_capability vmx_capability __ro_after_init; 507 508 #define VMX_SEGMENT_FIELD(seg) \ 509 [VCPU_SREG_##seg] = { \ 510 .selector = GUEST_##seg##_SELECTOR, \ 511 .base = GUEST_##seg##_BASE, \ 512 .limit = GUEST_##seg##_LIMIT, \ 513 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 514 } 515 516 static const struct kvm_vmx_segment_field { 517 unsigned selector; 518 unsigned base; 519 unsigned limit; 520 unsigned ar_bytes; 521 } kvm_vmx_segment_fields[] = { 522 VMX_SEGMENT_FIELD(CS), 523 VMX_SEGMENT_FIELD(DS), 524 VMX_SEGMENT_FIELD(ES), 525 VMX_SEGMENT_FIELD(FS), 526 VMX_SEGMENT_FIELD(GS), 527 VMX_SEGMENT_FIELD(SS), 528 VMX_SEGMENT_FIELD(TR), 529 VMX_SEGMENT_FIELD(LDTR), 530 }; 531 532 533 static unsigned long host_idt_base; 534 535 #if IS_ENABLED(CONFIG_HYPERV) 536 static bool __read_mostly enlightened_vmcs = true; 537 module_param(enlightened_vmcs, bool, 0444); 538 539 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 540 { 541 struct hv_enlightened_vmcs *evmcs; 542 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 543 544 if (partition_assist_page == INVALID_PAGE) 545 return -ENOMEM; 546 547 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 548 549 evmcs->partition_assist_page = partition_assist_page; 550 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 551 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 552 553 return 0; 554 } 555 556 static __init void hv_init_evmcs(void) 557 { 558 int cpu; 559 560 if (!enlightened_vmcs) 561 return; 562 563 /* 564 * Enlightened VMCS usage should be recommended and the host needs 565 * to support eVMCS v1 or above. 
566 */ 567 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 568 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 569 KVM_EVMCS_VERSION) { 570 571 /* Check that we have assist pages on all online CPUs */ 572 for_each_online_cpu(cpu) { 573 if (!hv_get_vp_assist_page(cpu)) { 574 enlightened_vmcs = false; 575 break; 576 } 577 } 578 579 if (enlightened_vmcs) { 580 pr_info("Using Hyper-V Enlightened VMCS\n"); 581 static_branch_enable(&__kvm_is_using_evmcs); 582 } 583 584 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 585 vt_x86_ops.enable_l2_tlb_flush 586 = hv_enable_l2_tlb_flush; 587 } else { 588 enlightened_vmcs = false; 589 } 590 } 591 592 static void hv_reset_evmcs(void) 593 { 594 struct hv_vp_assist_page *vp_ap; 595 596 if (!kvm_is_using_evmcs()) 597 return; 598 599 /* 600 * KVM should enable eVMCS if and only if all CPUs have a VP assist 601 * page, and should reject CPU onlining if eVMCS is enabled the CPU 602 * doesn't have a VP assist page allocated. 603 */ 604 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 605 if (WARN_ON_ONCE(!vp_ap)) 606 return; 607 608 /* 609 * Reset everything to support using non-enlightened VMCS access later 610 * (e.g. when we reload the module with enlightened_vmcs=0) 611 */ 612 vp_ap->nested_control.features.directhypercall = 0; 613 vp_ap->current_nested_vmcs = 0; 614 vp_ap->enlighten_vmentry = 0; 615 } 616 617 #else /* IS_ENABLED(CONFIG_HYPERV) */ 618 static void hv_init_evmcs(void) {} 619 static void hv_reset_evmcs(void) {} 620 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 621 622 /* 623 * Comment's format: document - errata name - stepping - processor name. 
624 * Refer from 625 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 626 */ 627 static u32 vmx_preemption_cpu_tfms[] = { 628 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 629 0x000206E6, 630 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 631 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 632 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 633 0x00020652, 634 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 635 0x00020655, 636 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 637 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 638 /* 639 * 320767.pdf - AAP86 - B1 - 640 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 641 */ 642 0x000106E5, 643 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 644 0x000106A0, 645 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 646 0x000106A1, 647 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 648 0x000106A4, 649 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 650 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 651 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 652 0x000106A5, 653 /* Xeon E3-1220 V2 */ 654 0x000306A8, 655 }; 656 657 static inline bool cpu_has_broken_vmx_preemption_timer(void) 658 { 659 u32 eax = cpuid_eax(0x00000001), i; 660 661 /* Clear the reserved bits */ 662 eax &= ~(0x3U << 14 | 0xfU << 28); 663 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 664 if (eax == vmx_preemption_cpu_tfms[i]) 665 return true; 666 667 return false; 668 } 669 670 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 671 { 672 return flexpriority_enabled && lapic_in_kernel(vcpu); 673 } 674 675 static int vmx_get_passthrough_msr_slot(u32 msr) 676 { 677 int i; 678 679 switch (msr) { 680 case 0x800 ... 0x8ff: 681 /* x2APIC MSRs. 
These are handled in vmx_update_msr_bitmap_x2apic() */ 682 return -ENOENT; 683 case MSR_IA32_RTIT_STATUS: 684 case MSR_IA32_RTIT_OUTPUT_BASE: 685 case MSR_IA32_RTIT_OUTPUT_MASK: 686 case MSR_IA32_RTIT_CR3_MATCH: 687 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 688 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 689 case MSR_LBR_SELECT: 690 case MSR_LBR_TOS: 691 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 692 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 693 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 694 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 695 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 696 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ 697 return -ENOENT; 698 } 699 700 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 701 if (vmx_possible_passthrough_msrs[i] == msr) 702 return i; 703 } 704 705 WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 706 return -ENOENT; 707 } 708 709 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 710 { 711 int i; 712 713 i = kvm_find_user_return_msr(msr); 714 if (i >= 0) 715 return &vmx->guest_uret_msrs[i]; 716 return NULL; 717 } 718 719 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 720 struct vmx_uret_msr *msr, u64 data) 721 { 722 unsigned int slot = msr - vmx->guest_uret_msrs; 723 int ret = 0; 724 725 if (msr->load_into_hardware) { 726 preempt_disable(); 727 ret = kvm_set_user_return_msr(slot, data, msr->mask); 728 preempt_enable(); 729 } 730 if (!ret) 731 msr->data = data; 732 return ret; 733 } 734 735 /* 736 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 737 * 738 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 739 * atomically track post-VMXON state, e.g. this may be called in NMI context. 740 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 
741 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 742 * magically in RM, VM86, compat mode, or at CPL>0. 743 */ 744 static int kvm_cpu_vmxoff(void) 745 { 746 asm goto("1: vmxoff\n\t" 747 _ASM_EXTABLE(1b, %l[fault]) 748 ::: "cc", "memory" : fault); 749 750 cr4_clear_bits(X86_CR4_VMXE); 751 return 0; 752 753 fault: 754 cr4_clear_bits(X86_CR4_VMXE); 755 return -EIO; 756 } 757 758 void vmx_emergency_disable_virtualization_cpu(void) 759 { 760 int cpu = raw_smp_processor_id(); 761 struct loaded_vmcs *v; 762 763 kvm_rebooting = true; 764 765 /* 766 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 767 * set in task context. If this races with VMX is disabled by an NMI, 768 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 769 * kvm_rebooting set. 770 */ 771 if (!(__read_cr4() & X86_CR4_VMXE)) 772 return; 773 774 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 775 loaded_vmcss_on_cpu_link) 776 vmcs_clear(v->vmcs); 777 778 kvm_cpu_vmxoff(); 779 } 780 781 static void __loaded_vmcs_clear(void *arg) 782 { 783 struct loaded_vmcs *loaded_vmcs = arg; 784 int cpu = raw_smp_processor_id(); 785 786 if (loaded_vmcs->cpu != cpu) 787 return; /* vcpu migration can race with cpu offline */ 788 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 789 per_cpu(current_vmcs, cpu) = NULL; 790 791 vmcs_clear(loaded_vmcs->vmcs); 792 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 793 vmcs_clear(loaded_vmcs->shadow_vmcs); 794 795 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 796 797 /* 798 * Ensure all writes to loaded_vmcs, including deleting it from its 799 * current percpu list, complete before setting loaded_vmcs->cpu to 800 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 801 * and add loaded_vmcs to its percpu list before it's deleted from this 802 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
803 */ 804 smp_wmb(); 805 806 loaded_vmcs->cpu = -1; 807 loaded_vmcs->launched = 0; 808 } 809 810 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 811 { 812 int cpu = loaded_vmcs->cpu; 813 814 if (cpu != -1) 815 smp_call_function_single(cpu, 816 __loaded_vmcs_clear, loaded_vmcs, 1); 817 } 818 819 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 820 unsigned field) 821 { 822 bool ret; 823 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 824 825 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 826 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 827 vmx->segment_cache.bitmask = 0; 828 } 829 ret = vmx->segment_cache.bitmask & mask; 830 vmx->segment_cache.bitmask |= mask; 831 return ret; 832 } 833 834 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 835 { 836 u16 *p = &vmx->segment_cache.seg[seg].selector; 837 838 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 839 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 840 return *p; 841 } 842 843 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 844 { 845 ulong *p = &vmx->segment_cache.seg[seg].base; 846 847 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 848 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 849 return *p; 850 } 851 852 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 853 { 854 u32 *p = &vmx->segment_cache.seg[seg].limit; 855 856 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 857 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 858 return *p; 859 } 860 861 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 862 { 863 u32 *p = &vmx->segment_cache.seg[seg].ar; 864 865 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 866 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 867 return *p; 868 } 869 870 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 871 { 872 u32 eb; 873 874 eb = (1u << PF_VECTOR) | 
(1u << UD_VECTOR) | (1u << MC_VECTOR) | 875 (1u << DB_VECTOR) | (1u << AC_VECTOR); 876 /* 877 * #VE isn't used for VMX. To test against unexpected changes 878 * related to #VE for VMX, intercept unexpected #VE and warn on it. 879 */ 880 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 881 eb |= 1u << VE_VECTOR; 882 /* 883 * Guest access to VMware backdoor ports could legitimately 884 * trigger #GP because of TSS I/O permission bitmap. 885 * We intercept those #GP and allow access to them anyway 886 * as VMware does. 887 */ 888 if (enable_vmware_backdoor) 889 eb |= (1u << GP_VECTOR); 890 if ((vcpu->guest_debug & 891 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 892 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 893 eb |= 1u << BP_VECTOR; 894 if (to_vmx(vcpu)->rmode.vm86_active) 895 eb = ~0; 896 if (!vmx_need_pf_intercept(vcpu)) 897 eb &= ~(1u << PF_VECTOR); 898 899 /* When we are running a nested L2 guest and L1 specified for it a 900 * certain exception bitmap, we must trap the same exceptions and pass 901 * them to L1. When running L2, we will only handle the exceptions 902 * specified above if L1 did not want them. 903 */ 904 if (is_guest_mode(vcpu)) 905 eb |= get_vmcs12(vcpu)->exception_bitmap; 906 else { 907 int mask = 0, match = 0; 908 909 if (enable_ept && (eb & (1u << PF_VECTOR))) { 910 /* 911 * If EPT is enabled, #PF is currently only intercepted 912 * if MAXPHYADDR is smaller on the guest than on the 913 * host. In that case we only care about present, 914 * non-reserved faults. For vmcs02, however, PFEC_MASK 915 * and PFEC_MATCH are set in prepare_vmcs02_rare. 916 */ 917 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 918 match = PFERR_PRESENT_MASK; 919 } 920 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 921 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 922 } 923 924 /* 925 * Disabling xfd interception indicates that dynamic xfeatures 926 * might be used in the guest. Always trap #NM in this case 927 * to save guest xfd_err timely. 
928 */ 929 if (vcpu->arch.xfd_no_write_intercept) 930 eb |= (1u << NM_VECTOR); 931 932 vmcs_write32(EXCEPTION_BITMAP, eb); 933 } 934 935 /* 936 * Check if MSR is intercepted for currently loaded MSR bitmap. 937 */ 938 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 939 { 940 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 941 return true; 942 943 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 944 } 945 946 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 947 { 948 unsigned int flags = 0; 949 950 if (vmx->loaded_vmcs->launched) 951 flags |= VMX_RUN_VMRESUME; 952 953 /* 954 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 955 * to change it directly without causing a vmexit. In that case read 956 * it after vmexit and store it in vmx->spec_ctrl. 957 */ 958 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 959 flags |= VMX_RUN_SAVE_SPEC_CTRL; 960 961 return flags; 962 } 963 964 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 965 unsigned long entry, unsigned long exit) 966 { 967 vm_entry_controls_clearbit(vmx, entry); 968 vm_exit_controls_clearbit(vmx, exit); 969 } 970 971 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 972 { 973 unsigned int i; 974 975 for (i = 0; i < m->nr; ++i) { 976 if (m->val[i].index == msr) 977 return i; 978 } 979 return -ENOENT; 980 } 981 982 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 983 { 984 int i; 985 struct msr_autoload *m = &vmx->msr_autoload; 986 987 switch (msr) { 988 case MSR_EFER: 989 if (cpu_has_load_ia32_efer()) { 990 clear_atomic_switch_msr_special(vmx, 991 VM_ENTRY_LOAD_IA32_EFER, 992 VM_EXIT_LOAD_IA32_EFER); 993 return; 994 } 995 break; 996 case MSR_CORE_PERF_GLOBAL_CTRL: 997 if (cpu_has_load_perf_global_ctrl()) { 998 clear_atomic_switch_msr_special(vmx, 999 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1000 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1001 return; 1002 } 1003 break; 1004 } 
1005 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1006 if (i < 0) 1007 goto skip_guest; 1008 --m->guest.nr; 1009 m->guest.val[i] = m->guest.val[m->guest.nr]; 1010 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1011 1012 skip_guest: 1013 i = vmx_find_loadstore_msr_slot(&m->host, msr); 1014 if (i < 0) 1015 return; 1016 1017 --m->host.nr; 1018 m->host.val[i] = m->host.val[m->host.nr]; 1019 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1020 } 1021 1022 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1023 unsigned long entry, unsigned long exit, 1024 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1025 u64 guest_val, u64 host_val) 1026 { 1027 vmcs_write64(guest_val_vmcs, guest_val); 1028 if (host_val_vmcs != HOST_IA32_EFER) 1029 vmcs_write64(host_val_vmcs, host_val); 1030 vm_entry_controls_setbit(vmx, entry); 1031 vm_exit_controls_setbit(vmx, exit); 1032 } 1033 1034 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1035 u64 guest_val, u64 host_val, bool entry_only) 1036 { 1037 int i, j = 0; 1038 struct msr_autoload *m = &vmx->msr_autoload; 1039 1040 switch (msr) { 1041 case MSR_EFER: 1042 if (cpu_has_load_ia32_efer()) { 1043 add_atomic_switch_msr_special(vmx, 1044 VM_ENTRY_LOAD_IA32_EFER, 1045 VM_EXIT_LOAD_IA32_EFER, 1046 GUEST_IA32_EFER, 1047 HOST_IA32_EFER, 1048 guest_val, host_val); 1049 return; 1050 } 1051 break; 1052 case MSR_CORE_PERF_GLOBAL_CTRL: 1053 if (cpu_has_load_perf_global_ctrl()) { 1054 add_atomic_switch_msr_special(vmx, 1055 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1056 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1057 GUEST_IA32_PERF_GLOBAL_CTRL, 1058 HOST_IA32_PERF_GLOBAL_CTRL, 1059 guest_val, host_val); 1060 return; 1061 } 1062 break; 1063 case MSR_IA32_PEBS_ENABLE: 1064 /* PEBS needs a quiescent period after being disabled (to write 1065 * a record). Disabling PEBS through VMX MSR swapping doesn't 1066 * provide that period, so a CPU could write host's record into 1067 * guest's memory. 
1068 */ 1069 wrmsrq(MSR_IA32_PEBS_ENABLE, 0); 1070 } 1071 1072 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1073 if (!entry_only) 1074 j = vmx_find_loadstore_msr_slot(&m->host, msr); 1075 1076 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1077 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1078 printk_once(KERN_WARNING "Not enough msr switch entries. " 1079 "Can't add msr %x\n", msr); 1080 return; 1081 } 1082 if (i < 0) { 1083 i = m->guest.nr++; 1084 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1085 } 1086 m->guest.val[i].index = msr; 1087 m->guest.val[i].value = guest_val; 1088 1089 if (entry_only) 1090 return; 1091 1092 if (j < 0) { 1093 j = m->host.nr++; 1094 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1095 } 1096 m->host.val[j].index = msr; 1097 m->host.val[j].value = host_val; 1098 } 1099 1100 static bool update_transition_efer(struct vcpu_vmx *vmx) 1101 { 1102 u64 guest_efer = vmx->vcpu.arch.efer; 1103 u64 ignore_bits = 0; 1104 int i; 1105 1106 /* Shadow paging assumes NX to be available. */ 1107 if (!enable_ept) 1108 guest_efer |= EFER_NX; 1109 1110 /* 1111 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1112 */ 1113 ignore_bits |= EFER_SCE; 1114 #ifdef CONFIG_X86_64 1115 ignore_bits |= EFER_LMA | EFER_LME; 1116 /* SCE is meaningful only in long mode on Intel */ 1117 if (guest_efer & EFER_LMA) 1118 ignore_bits &= ~(u64)EFER_SCE; 1119 #endif 1120 1121 /* 1122 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1123 * On CPUs that support "load IA32_EFER", always switch EFER 1124 * atomically, since it's faster than switching it manually. 
1125 */ 1126 if (cpu_has_load_ia32_efer() || 1127 (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { 1128 if (!(guest_efer & EFER_LMA)) 1129 guest_efer &= ~EFER_LME; 1130 if (guest_efer != kvm_host.efer) 1131 add_atomic_switch_msr(vmx, MSR_EFER, 1132 guest_efer, kvm_host.efer, false); 1133 else 1134 clear_atomic_switch_msr(vmx, MSR_EFER); 1135 return false; 1136 } 1137 1138 i = kvm_find_user_return_msr(MSR_EFER); 1139 if (i < 0) 1140 return false; 1141 1142 clear_atomic_switch_msr(vmx, MSR_EFER); 1143 1144 guest_efer &= ~ignore_bits; 1145 guest_efer |= kvm_host.efer & ignore_bits; 1146 1147 vmx->guest_uret_msrs[i].data = guest_efer; 1148 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1149 1150 return true; 1151 } 1152 1153 #ifdef CONFIG_X86_32 1154 /* 1155 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1156 * VMCS rather than the segment table. KVM uses this helper to figure 1157 * out the current bases to poke them into the VMCS before entry. 1158 */ 1159 static unsigned long segment_base(u16 selector) 1160 { 1161 struct desc_struct *table; 1162 unsigned long v; 1163 1164 if (!(selector & ~SEGMENT_RPL_MASK)) 1165 return 0; 1166 1167 table = get_current_gdt_ro(); 1168 1169 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1170 u16 ldt_selector = kvm_read_ldt(); 1171 1172 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1173 return 0; 1174 1175 table = (struct desc_struct *)segment_base(ldt_selector); 1176 } 1177 v = get_desc_base(&table[selector >> 3]); 1178 return v; 1179 } 1180 #endif 1181 1182 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1183 { 1184 return vmx_pt_mode_is_host_guest() && 1185 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1186 } 1187 1188 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1189 { 1190 /* The base must be 128-byte aligned and a legal physical address. 
*/ 1191 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1192 } 1193 1194 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1195 { 1196 u32 i; 1197 1198 wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); 1199 wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1200 wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1201 wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1202 for (i = 0; i < addr_range; i++) { 1203 wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1204 wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1205 } 1206 } 1207 1208 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1209 { 1210 u32 i; 1211 1212 rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); 1213 rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1214 rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1215 rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1216 for (i = 0; i < addr_range; i++) { 1217 rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1218 rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1219 } 1220 } 1221 1222 static void pt_guest_enter(struct vcpu_vmx *vmx) 1223 { 1224 if (vmx_pt_mode_is_system()) 1225 return; 1226 1227 /* 1228 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1229 * Save host state before VM entry. 
1230 */ 1231 rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1232 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1233 wrmsrq(MSR_IA32_RTIT_CTL, 0); 1234 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1235 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1236 } 1237 } 1238 1239 static void pt_guest_exit(struct vcpu_vmx *vmx) 1240 { 1241 if (vmx_pt_mode_is_system()) 1242 return; 1243 1244 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1245 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1246 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1247 } 1248 1249 /* 1250 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1251 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 1252 */ 1253 if (vmx->pt_desc.host.ctl) 1254 wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1255 } 1256 1257 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1258 unsigned long fs_base, unsigned long gs_base) 1259 { 1260 if (unlikely(fs_sel != host->fs_sel)) { 1261 if (!(fs_sel & 7)) 1262 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1263 else 1264 vmcs_write16(HOST_FS_SELECTOR, 0); 1265 host->fs_sel = fs_sel; 1266 } 1267 if (unlikely(gs_sel != host->gs_sel)) { 1268 if (!(gs_sel & 7)) 1269 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1270 else 1271 vmcs_write16(HOST_GS_SELECTOR, 0); 1272 host->gs_sel = gs_sel; 1273 } 1274 if (unlikely(fs_base != host->fs_base)) { 1275 vmcs_writel(HOST_FS_BASE, fs_base); 1276 host->fs_base = fs_base; 1277 } 1278 if (unlikely(gs_base != host->gs_base)) { 1279 vmcs_writel(HOST_GS_BASE, gs_base); 1280 host->gs_base = gs_base; 1281 } 1282 } 1283 1284 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1285 { 1286 struct vcpu_vmx *vmx = to_vmx(vcpu); 1287 struct vcpu_vt *vt = to_vt(vcpu); 1288 struct vmcs_host_state *host_state; 1289 #ifdef CONFIG_X86_64 1290 int cpu = raw_smp_processor_id(); 1291 #endif 1292 unsigned long fs_base, 
gs_base; 1293 u16 fs_sel, gs_sel; 1294 int i; 1295 1296 /* 1297 * Note that guest MSRs to be saved/restored can also be changed 1298 * when guest state is loaded. This happens when guest transitions 1299 * to/from long-mode by setting MSR_EFER.LMA. 1300 */ 1301 if (!vmx->guest_uret_msrs_loaded) { 1302 vmx->guest_uret_msrs_loaded = true; 1303 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1304 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1305 continue; 1306 1307 kvm_set_user_return_msr(i, 1308 vmx->guest_uret_msrs[i].data, 1309 vmx->guest_uret_msrs[i].mask); 1310 } 1311 } 1312 1313 if (vmx->nested.need_vmcs12_to_shadow_sync) 1314 nested_sync_vmcs12_to_shadow(vcpu); 1315 1316 if (vt->guest_state_loaded) 1317 return; 1318 1319 host_state = &vmx->loaded_vmcs->host_state; 1320 1321 /* 1322 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1323 * allow segment selectors with cpl > 0 or ti == 1. 1324 */ 1325 host_state->ldt_sel = kvm_read_ldt(); 1326 1327 #ifdef CONFIG_X86_64 1328 savesegment(ds, host_state->ds_sel); 1329 savesegment(es, host_state->es_sel); 1330 1331 gs_base = cpu_kernelmode_gs_base(cpu); 1332 if (likely(is_64bit_mm(current->mm))) { 1333 current_save_fsgs(); 1334 fs_sel = current->thread.fsindex; 1335 gs_sel = current->thread.gsindex; 1336 fs_base = current->thread.fsbase; 1337 vt->msr_host_kernel_gs_base = current->thread.gsbase; 1338 } else { 1339 savesegment(fs, fs_sel); 1340 savesegment(gs, gs_sel); 1341 fs_base = read_msr(MSR_FS_BASE); 1342 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1343 } 1344 1345 wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1346 #else 1347 savesegment(fs, fs_sel); 1348 savesegment(gs, gs_sel); 1349 fs_base = segment_base(fs_sel); 1350 gs_base = segment_base(gs_sel); 1351 #endif 1352 1353 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1354 vt->guest_state_loaded = true; 1355 } 1356 1357 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1358 { 1359 struct 
vmcs_host_state *host_state; 1360 1361 if (!vmx->vt.guest_state_loaded) 1362 return; 1363 1364 host_state = &vmx->loaded_vmcs->host_state; 1365 1366 ++vmx->vcpu.stat.host_state_reload; 1367 1368 #ifdef CONFIG_X86_64 1369 rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1370 #endif 1371 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1372 kvm_load_ldt(host_state->ldt_sel); 1373 #ifdef CONFIG_X86_64 1374 load_gs_index(host_state->gs_sel); 1375 #else 1376 loadsegment(gs, host_state->gs_sel); 1377 #endif 1378 } 1379 if (host_state->fs_sel & 7) 1380 loadsegment(fs, host_state->fs_sel); 1381 #ifdef CONFIG_X86_64 1382 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1383 loadsegment(ds, host_state->ds_sel); 1384 loadsegment(es, host_state->es_sel); 1385 } 1386 #endif 1387 invalidate_tss_limit(); 1388 #ifdef CONFIG_X86_64 1389 wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); 1390 #endif 1391 load_fixmap_gdt(raw_smp_processor_id()); 1392 vmx->vt.guest_state_loaded = false; 1393 vmx->guest_uret_msrs_loaded = false; 1394 } 1395 1396 #ifdef CONFIG_X86_64 1397 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1398 { 1399 preempt_disable(); 1400 if (vmx->vt.guest_state_loaded) 1401 rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1402 preempt_enable(); 1403 return vmx->msr_guest_kernel_gs_base; 1404 } 1405 1406 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 1407 { 1408 preempt_disable(); 1409 if (vmx->vt.guest_state_loaded) 1410 wrmsrq(MSR_KERNEL_GS_BASE, data); 1411 preempt_enable(); 1412 vmx->msr_guest_kernel_gs_base = data; 1413 } 1414 #endif 1415 1416 static void grow_ple_window(struct kvm_vcpu *vcpu) 1417 { 1418 struct vcpu_vmx *vmx = to_vmx(vcpu); 1419 unsigned int old = vmx->ple_window; 1420 1421 vmx->ple_window = __grow_ple_window(old, ple_window, 1422 ple_window_grow, 1423 ple_window_max); 1424 1425 if (vmx->ple_window != old) { 1426 vmx->ple_window_dirty = true; 1427 
trace_kvm_ple_window_update(vcpu->vcpu_id, 1428 vmx->ple_window, old); 1429 } 1430 } 1431 1432 static void shrink_ple_window(struct kvm_vcpu *vcpu) 1433 { 1434 struct vcpu_vmx *vmx = to_vmx(vcpu); 1435 unsigned int old = vmx->ple_window; 1436 1437 vmx->ple_window = __shrink_ple_window(old, ple_window, 1438 ple_window_shrink, 1439 ple_window); 1440 1441 if (vmx->ple_window != old) { 1442 vmx->ple_window_dirty = true; 1443 trace_kvm_ple_window_update(vcpu->vcpu_id, 1444 vmx->ple_window, old); 1445 } 1446 } 1447 1448 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, 1449 struct loaded_vmcs *buddy) 1450 { 1451 struct vcpu_vmx *vmx = to_vmx(vcpu); 1452 bool already_loaded = vmx->loaded_vmcs->cpu == cpu; 1453 struct vmcs *prev; 1454 1455 if (!already_loaded) { 1456 loaded_vmcs_clear(vmx->loaded_vmcs); 1457 local_irq_disable(); 1458 1459 /* 1460 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to 1461 * this cpu's percpu list, otherwise it may not yet be deleted 1462 * from its previous cpu's percpu list. Pairs with the 1463 * smb_wmb() in __loaded_vmcs_clear(). 1464 */ 1465 smp_rmb(); 1466 1467 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1468 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1469 local_irq_enable(); 1470 } 1471 1472 prev = per_cpu(current_vmcs, cpu); 1473 if (prev != vmx->loaded_vmcs->vmcs) { 1474 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1475 vmcs_load(vmx->loaded_vmcs->vmcs); 1476 1477 /* 1478 * No indirect branch prediction barrier needed when switching 1479 * the active VMCS within a vCPU, unless IBRS is advertised to 1480 * the vCPU. To minimize the number of IBPBs executed, KVM 1481 * performs IBPB on nested VM-Exit (a single nested transition 1482 * may switch the active VMCS multiple times). 
1483 */ 1484 if (static_branch_likely(&switch_vcpu_ibpb) && 1485 (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) 1486 indirect_branch_prediction_barrier(); 1487 } 1488 1489 if (!already_loaded) { 1490 void *gdt = get_current_gdt_ro(); 1491 1492 /* 1493 * Flush all EPTP/VPID contexts, the new pCPU may have stale 1494 * TLB entries from its previous association with the vCPU. 1495 */ 1496 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1497 1498 /* 1499 * Linux uses per-cpu TSS and GDT, so set these when switching 1500 * processors. See 22.2.4. 1501 */ 1502 vmcs_writel(HOST_TR_BASE, 1503 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1504 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1505 1506 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { 1507 /* 22.2.3 */ 1508 vmcs_writel(HOST_IA32_SYSENTER_ESP, 1509 (unsigned long)(cpu_entry_stack(cpu) + 1)); 1510 } 1511 1512 vmx->loaded_vmcs->cpu = cpu; 1513 } 1514 } 1515 1516 /* 1517 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1518 * vcpu mutex is already taken. 
1519 */ 1520 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1521 { 1522 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1523 shrink_ple_window(vcpu); 1524 1525 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1526 1527 vmx_vcpu_pi_load(vcpu, cpu); 1528 } 1529 1530 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1531 { 1532 vmx_vcpu_pi_put(vcpu); 1533 1534 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1535 } 1536 1537 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1538 { 1539 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1540 } 1541 1542 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1543 { 1544 struct vcpu_vmx *vmx = to_vmx(vcpu); 1545 unsigned long rflags, save_rflags; 1546 1547 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1548 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1549 rflags = vmcs_readl(GUEST_RFLAGS); 1550 if (vmx->rmode.vm86_active) { 1551 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1552 save_rflags = vmx->rmode.save_rflags; 1553 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1554 } 1555 vmx->rflags = rflags; 1556 } 1557 return vmx->rflags; 1558 } 1559 1560 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1561 { 1562 struct vcpu_vmx *vmx = to_vmx(vcpu); 1563 unsigned long old_rflags; 1564 1565 /* 1566 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1567 * is an unrestricted guest in order to mark L2 as needing emulation 1568 * if L1 runs L2 as a restricted guest. 
1569 */ 1570 if (is_unrestricted_guest(vcpu)) { 1571 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1572 vmx->rflags = rflags; 1573 vmcs_writel(GUEST_RFLAGS, rflags); 1574 return; 1575 } 1576 1577 old_rflags = vmx_get_rflags(vcpu); 1578 vmx->rflags = rflags; 1579 if (vmx->rmode.vm86_active) { 1580 vmx->rmode.save_rflags = rflags; 1581 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1582 } 1583 vmcs_writel(GUEST_RFLAGS, rflags); 1584 1585 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1586 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 1587 } 1588 1589 bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1590 { 1591 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1592 } 1593 1594 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1595 { 1596 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1597 int ret = 0; 1598 1599 if (interruptibility & GUEST_INTR_STATE_STI) 1600 ret |= KVM_X86_SHADOW_INT_STI; 1601 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1602 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1603 1604 return ret; 1605 } 1606 1607 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1608 { 1609 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1610 u32 interruptibility = interruptibility_old; 1611 1612 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1613 1614 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1615 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1616 else if (mask & KVM_X86_SHADOW_INT_STI) 1617 interruptibility |= GUEST_INTR_STATE_STI; 1618 1619 if ((interruptibility != interruptibility_old)) 1620 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1621 } 1622 1623 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1624 { 1625 struct vcpu_vmx *vmx = to_vmx(vcpu); 1626 unsigned long value; 1627 1628 /* 1629 * Any MSR write that attempts to change bits marked reserved will 1630 * case a #GP fault. 
1631 */ 1632 if (data & vmx->pt_desc.ctl_bitmask) 1633 return 1; 1634 1635 /* 1636 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1637 * result in a #GP unless the same write also clears TraceEn. 1638 */ 1639 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1640 (data & RTIT_CTL_TRACEEN) && 1641 data != vmx->pt_desc.guest.ctl) 1642 return 1; 1643 1644 /* 1645 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1646 * and FabricEn would cause #GP, if 1647 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1648 */ 1649 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1650 !(data & RTIT_CTL_FABRIC_EN) && 1651 !intel_pt_validate_cap(vmx->pt_desc.caps, 1652 PT_CAP_single_range_output)) 1653 return 1; 1654 1655 /* 1656 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1657 * utilize encodings marked reserved will cause a #GP fault. 1658 */ 1659 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1660 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1661 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1662 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1663 return 1; 1664 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1665 PT_CAP_cycle_thresholds); 1666 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1667 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1668 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1669 return 1; 1670 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1671 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1672 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1673 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1674 return 1; 1675 1676 /* 1677 * If ADDRx_CFG is reserved or the encodings is >2 will 1678 * cause a #GP fault. 
1679 */ 1680 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1681 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1682 return 1; 1683 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1684 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1685 return 1; 1686 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1687 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1688 return 1; 1689 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1690 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1691 return 1; 1692 1693 return 0; 1694 } 1695 1696 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1697 void *insn, int insn_len) 1698 { 1699 /* 1700 * Emulation of instructions in SGX enclaves is impossible as RIP does 1701 * not point at the failing instruction, and even if it did, the code 1702 * stream is inaccessible. Inject #UD instead of exiting to userspace 1703 * so that guest userspace can't DoS the guest simply by triggering 1704 * emulation (enclaves are CPL3 only). 1705 */ 1706 if (vmx_get_exit_reason(vcpu).enclave_mode) { 1707 kvm_queue_exception(vcpu, UD_VECTOR); 1708 return X86EMUL_PROPAGATE_FAULT; 1709 } 1710 1711 /* Check that emulation is possible during event vectoring */ 1712 if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 1713 !kvm_can_emulate_event_vectoring(emul_type)) 1714 return X86EMUL_UNHANDLEABLE_VECTORING; 1715 1716 return X86EMUL_CONTINUE; 1717 } 1718 1719 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1720 { 1721 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); 1722 unsigned long rip, orig_rip; 1723 u32 instr_len; 1724 1725 /* 1726 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1727 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1728 * set when EPT misconfig occurs. 
In practice, real hardware updates 1729 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1730 * (namely Hyper-V) don't set it due to it being undefined behavior, 1731 * i.e. we end up advancing IP with some random value. 1732 */ 1733 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1734 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1735 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1736 1737 /* 1738 * Emulating an enclave's instructions isn't supported as KVM 1739 * cannot access the enclave's memory or its true RIP, e.g. the 1740 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1741 * the RIP that actually triggered the VM-Exit. But, because 1742 * most instructions that cause VM-Exit will #UD in an enclave, 1743 * most instruction-based VM-Exits simply do not occur. 1744 * 1745 * There are a few exceptions, notably the debug instructions 1746 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1747 * and generate #DB/#BP as expected, which KVM might intercept. 1748 * But again, the CPU does the dirty work and saves an instr 1749 * length of zero so VMMs don't shoot themselves in the foot. 1750 * WARN if KVM tries to skip a non-zero length instruction on 1751 * a VM-Exit from an enclave. 1752 */ 1753 if (!instr_len) 1754 goto rip_updated; 1755 1756 WARN_ONCE(exit_reason.enclave_mode, 1757 "skipping instruction after SGX enclave VM-Exit"); 1758 1759 orig_rip = kvm_rip_read(vcpu); 1760 rip = orig_rip + instr_len; 1761 #ifdef CONFIG_X86_64 1762 /* 1763 * We need to mask out the high 32 bits of RIP if not in 64-bit 1764 * mode, but just finding out that we are in 64-bit mode is 1765 * quite expensive. Only do it if there was a carry. 
1766 */ 1767 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1768 rip = (u32)rip; 1769 #endif 1770 kvm_rip_write(vcpu, rip); 1771 } else { 1772 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1773 return 0; 1774 } 1775 1776 rip_updated: 1777 /* skipping an emulated instruction also counts */ 1778 vmx_set_interrupt_shadow(vcpu, 0); 1779 1780 return 1; 1781 } 1782 1783 /* 1784 * Recognizes a pending MTF VM-exit and records the nested state for later 1785 * delivery. 1786 */ 1787 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1788 { 1789 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1790 struct vcpu_vmx *vmx = to_vmx(vcpu); 1791 1792 if (!is_guest_mode(vcpu)) 1793 return; 1794 1795 /* 1796 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1797 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1798 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1799 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1800 * as ICEBP is higher priority than both. As instruction emulation is 1801 * completed at this point (i.e. KVM is at the instruction boundary), 1802 * any #DB exception pending delivery must be a debug-trap of lower 1803 * priority than MTF. Record the pending MTF state to be delivered in 1804 * vmx_check_nested_events(). 
1805 */ 1806 if (nested_cpu_has_mtf(vmcs12) && 1807 (!vcpu->arch.exception.pending || 1808 vcpu->arch.exception.vector == DB_VECTOR) && 1809 (!vcpu->arch.exception_vmexit.pending || 1810 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1811 vmx->nested.mtf_pending = true; 1812 kvm_make_request(KVM_REQ_EVENT, vcpu); 1813 } else { 1814 vmx->nested.mtf_pending = false; 1815 } 1816 } 1817 1818 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1819 { 1820 vmx_update_emulated_instruction(vcpu); 1821 return skip_emulated_instruction(vcpu); 1822 } 1823 1824 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1825 { 1826 /* 1827 * Ensure that we clear the HLT state in the VMCS. We don't need to 1828 * explicitly skip the instruction because if the HLT state is set, 1829 * then the instruction is already executing and RIP has already been 1830 * advanced. 1831 */ 1832 if (kvm_hlt_in_guest(vcpu->kvm) && 1833 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1834 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1835 } 1836 1837 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1838 { 1839 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1840 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1841 struct vcpu_vmx *vmx = to_vmx(vcpu); 1842 1843 kvm_deliver_exception_payload(vcpu, ex); 1844 1845 if (ex->has_error_code) { 1846 /* 1847 * Despite the error code being architecturally defined as 32 1848 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1849 * VMX don't actually supporting setting bits 31:16. Hardware 1850 * will (should) never provide a bogus error code, but AMD CPUs 1851 * do generate error codes with bits 31:16 set, and so KVM's 1852 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1853 * the upper bits to avoid VM-Fail, losing information that 1854 * doesn't really exist is preferable to killing the VM. 
1855 */ 1856 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1857 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1858 } 1859 1860 if (vmx->rmode.vm86_active) { 1861 int inc_eip = 0; 1862 if (kvm_exception_is_soft(ex->vector)) 1863 inc_eip = vcpu->arch.event_exit_inst_len; 1864 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1865 return; 1866 } 1867 1868 WARN_ON_ONCE(vmx->vt.emulation_required); 1869 1870 if (kvm_exception_is_soft(ex->vector)) { 1871 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1872 vmx->vcpu.arch.event_exit_inst_len); 1873 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1874 } else 1875 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1876 1877 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1878 1879 vmx_clear_hlt(vcpu); 1880 } 1881 1882 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1883 bool load_into_hardware) 1884 { 1885 struct vmx_uret_msr *uret_msr; 1886 1887 uret_msr = vmx_find_uret_msr(vmx, msr); 1888 if (!uret_msr) 1889 return; 1890 1891 uret_msr->load_into_hardware = load_into_hardware; 1892 } 1893 1894 /* 1895 * Configuring user return MSRs to automatically save, load, and restore MSRs 1896 * that need to be shoved into hardware when running the guest. Note, omitting 1897 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1898 * loaded into hardware when running the guest. 1899 */ 1900 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1901 { 1902 #ifdef CONFIG_X86_64 1903 bool load_syscall_msrs; 1904 1905 /* 1906 * The SYSCALL MSRs are only needed on long mode guests, and only 1907 * when EFER.SCE is set. 
1908 */ 1909 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1910 (vmx->vcpu.arch.efer & EFER_SCE); 1911 1912 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1913 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1914 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1915 #endif 1916 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1917 1918 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1919 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1920 guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1921 1922 /* 1923 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1924 * kernel and old userspace. If those guests run on a tsx=off host, do 1925 * allow guests to use TSX_CTRL, but don't change the value in hardware 1926 * so that TSX remains always disabled. 1927 */ 1928 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1929 1930 /* 1931 * The set of MSRs to load may have changed, reload MSRs before the 1932 * next VM-Enter. 
1933 */ 1934 vmx->guest_uret_msrs_loaded = false; 1935 } 1936 1937 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1938 { 1939 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1940 1941 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1942 return vmcs12->tsc_offset; 1943 1944 return 0; 1945 } 1946 1947 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1948 { 1949 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1950 1951 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1952 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1953 return vmcs12->tsc_multiplier; 1954 1955 return kvm_caps.default_tsc_scaling_ratio; 1956 } 1957 1958 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1959 { 1960 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1961 } 1962 1963 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1964 { 1965 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1966 } 1967 1968 /* 1969 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1970 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1971 * backwards compatibility even though KVM doesn't support emulating SMX. And 1972 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1973 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1974 */ 1975 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1976 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1977 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1978 FEAT_CTL_SGX_LC_ENABLED | \ 1979 FEAT_CTL_SGX_ENABLED | \ 1980 FEAT_CTL_LMCE_ENABLED) 1981 1982 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1983 struct msr_data *msr) 1984 { 1985 uint64_t valid_bits; 1986 1987 /* 1988 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1989 * exposed to the guest. 
1990 */ 1991 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1992 ~KVM_SUPPORTED_FEATURE_CONTROL); 1993 1994 if (!msr->host_initiated && 1995 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1996 return false; 1997 1998 if (msr->host_initiated) 1999 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 2000 else 2001 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 2002 2003 return !(msr->data & ~valid_bits); 2004 } 2005 2006 int vmx_get_feature_msr(u32 msr, u64 *data) 2007 { 2008 switch (msr) { 2009 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2010 if (!nested) 2011 return 1; 2012 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2013 default: 2014 return KVM_MSR_RET_UNSUPPORTED; 2015 } 2016 } 2017 2018 /* 2019 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 2020 * Returns 0 on success, non-0 otherwise. 2021 * Assumes vcpu_load() was already called. 2022 */ 2023 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2024 { 2025 struct vcpu_vmx *vmx = to_vmx(vcpu); 2026 struct vmx_uret_msr *msr; 2027 u32 index; 2028 2029 switch (msr_info->index) { 2030 #ifdef CONFIG_X86_64 2031 case MSR_FS_BASE: 2032 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2033 break; 2034 case MSR_GS_BASE: 2035 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2036 break; 2037 case MSR_KERNEL_GS_BASE: 2038 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2039 break; 2040 #endif 2041 case MSR_EFER: 2042 return kvm_get_msr_common(vcpu, msr_info); 2043 case MSR_IA32_TSX_CTRL: 2044 if (!msr_info->host_initiated && 2045 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2046 return 1; 2047 goto find_uret_msr; 2048 case MSR_IA32_UMWAIT_CONTROL: 2049 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2050 return 1; 2051 2052 msr_info->data = vmx->msr_ia32_umwait_control; 2053 break; 2054 case MSR_IA32_SPEC_CTRL: 2055 if (!msr_info->host_initiated && 2056 !guest_has_spec_ctrl_msr(vcpu)) 2057 return 1; 2058 2059 msr_info->data = 
to_vmx(vcpu)->spec_ctrl; 2060 break; 2061 case MSR_IA32_SYSENTER_CS: 2062 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2063 break; 2064 case MSR_IA32_SYSENTER_EIP: 2065 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2066 break; 2067 case MSR_IA32_SYSENTER_ESP: 2068 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2069 break; 2070 case MSR_IA32_BNDCFGS: 2071 if (!kvm_mpx_supported() || 2072 (!msr_info->host_initiated && 2073 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2074 return 1; 2075 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2076 break; 2077 case MSR_IA32_MCG_EXT_CTL: 2078 if (!msr_info->host_initiated && 2079 !(vmx->msr_ia32_feature_control & 2080 FEAT_CTL_LMCE_ENABLED)) 2081 return 1; 2082 msr_info->data = vcpu->arch.mcg_ext_ctl; 2083 break; 2084 case MSR_IA32_FEAT_CTL: 2085 msr_info->data = vmx->msr_ia32_feature_control; 2086 break; 2087 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2088 if (!msr_info->host_initiated && 2089 !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 2090 return 1; 2091 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2092 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2093 break; 2094 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2095 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2096 return 1; 2097 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2098 &msr_info->data)) 2099 return 1; 2100 #ifdef CONFIG_KVM_HYPERV 2101 /* 2102 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2103 * instead of just ignoring the features, different Hyper-V 2104 * versions are either trying to use them and fail or do some 2105 * sanity checking and refuse to boot. Filter all unsupported 2106 * features out. 
2107 */ 2108 if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) 2109 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2110 &msr_info->data); 2111 #endif 2112 break; 2113 case MSR_IA32_RTIT_CTL: 2114 if (!vmx_pt_mode_is_host_guest()) 2115 return 1; 2116 msr_info->data = vmx->pt_desc.guest.ctl; 2117 break; 2118 case MSR_IA32_RTIT_STATUS: 2119 if (!vmx_pt_mode_is_host_guest()) 2120 return 1; 2121 msr_info->data = vmx->pt_desc.guest.status; 2122 break; 2123 case MSR_IA32_RTIT_CR3_MATCH: 2124 if (!vmx_pt_mode_is_host_guest() || 2125 !intel_pt_validate_cap(vmx->pt_desc.caps, 2126 PT_CAP_cr3_filtering)) 2127 return 1; 2128 msr_info->data = vmx->pt_desc.guest.cr3_match; 2129 break; 2130 case MSR_IA32_RTIT_OUTPUT_BASE: 2131 if (!vmx_pt_mode_is_host_guest() || 2132 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2133 PT_CAP_topa_output) && 2134 !intel_pt_validate_cap(vmx->pt_desc.caps, 2135 PT_CAP_single_range_output))) 2136 return 1; 2137 msr_info->data = vmx->pt_desc.guest.output_base; 2138 break; 2139 case MSR_IA32_RTIT_OUTPUT_MASK: 2140 if (!vmx_pt_mode_is_host_guest() || 2141 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2142 PT_CAP_topa_output) && 2143 !intel_pt_validate_cap(vmx->pt_desc.caps, 2144 PT_CAP_single_range_output))) 2145 return 1; 2146 msr_info->data = vmx->pt_desc.guest.output_mask; 2147 break; 2148 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2149 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2150 if (!vmx_pt_mode_is_host_guest() || 2151 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2152 return 1; 2153 if (index % 2) 2154 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2155 else 2156 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2157 break; 2158 case MSR_IA32_DEBUGCTLMSR: 2159 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2160 break; 2161 default: 2162 find_uret_msr: 2163 msr = vmx_find_uret_msr(vmx, msr_info->index); 2164 if (msr) { 2165 msr_info->data = msr->data; 2166 break; 2167 } 2168 return kvm_get_msr_common(vcpu, msr_info); 2169 } 2170 2171 return 0; 2172 } 2173 2174 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2175 u64 data) 2176 { 2177 #ifdef CONFIG_X86_64 2178 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) 2179 return (u32)data; 2180 #endif 2181 return (unsigned long)data; 2182 } 2183 2184 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2185 { 2186 u64 debugctl = 0; 2187 2188 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2189 (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2190 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2191 2192 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2193 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2194 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2195 2196 return debugctl; 2197 } 2198 2199 /* 2200 * Writes msr value into the appropriate "register". 2201 * Returns 0 on success, non-0 otherwise. 2202 * Assumes vcpu_load() was already called. 
2203 */ 2204 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2205 { 2206 struct vcpu_vmx *vmx = to_vmx(vcpu); 2207 struct vmx_uret_msr *msr; 2208 int ret = 0; 2209 u32 msr_index = msr_info->index; 2210 u64 data = msr_info->data; 2211 u32 index; 2212 2213 switch (msr_index) { 2214 case MSR_EFER: 2215 ret = kvm_set_msr_common(vcpu, msr_info); 2216 break; 2217 #ifdef CONFIG_X86_64 2218 case MSR_FS_BASE: 2219 vmx_segment_cache_clear(vmx); 2220 vmcs_writel(GUEST_FS_BASE, data); 2221 break; 2222 case MSR_GS_BASE: 2223 vmx_segment_cache_clear(vmx); 2224 vmcs_writel(GUEST_GS_BASE, data); 2225 break; 2226 case MSR_KERNEL_GS_BASE: 2227 vmx_write_guest_kernel_gs_base(vmx, data); 2228 break; 2229 case MSR_IA32_XFD: 2230 ret = kvm_set_msr_common(vcpu, msr_info); 2231 /* 2232 * Always intercepting WRMSR could incur non-negligible 2233 * overhead given xfd might be changed frequently in 2234 * guest context switch. Disable write interception 2235 * upon the first write with a non-zero value (indicating 2236 * potential usage on dynamic xfeatures). Also update 2237 * exception bitmap to trap #NM for proper virtualization 2238 * of guest xfd_err. 
2239 */ 2240 if (!ret && data) { 2241 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2242 MSR_TYPE_RW); 2243 vcpu->arch.xfd_no_write_intercept = true; 2244 vmx_update_exception_bitmap(vcpu); 2245 } 2246 break; 2247 #endif 2248 case MSR_IA32_SYSENTER_CS: 2249 if (is_guest_mode(vcpu)) 2250 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2251 vmcs_write32(GUEST_SYSENTER_CS, data); 2252 break; 2253 case MSR_IA32_SYSENTER_EIP: 2254 if (is_guest_mode(vcpu)) { 2255 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2256 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2257 } 2258 vmcs_writel(GUEST_SYSENTER_EIP, data); 2259 break; 2260 case MSR_IA32_SYSENTER_ESP: 2261 if (is_guest_mode(vcpu)) { 2262 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2263 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2264 } 2265 vmcs_writel(GUEST_SYSENTER_ESP, data); 2266 break; 2267 case MSR_IA32_DEBUGCTLMSR: { 2268 u64 invalid; 2269 2270 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2271 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2272 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2273 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2274 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2275 } 2276 2277 if (invalid) 2278 return 1; 2279 2280 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2281 VM_EXIT_SAVE_DEBUG_CONTROLS) 2282 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2283 2284 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2285 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2286 (data & DEBUGCTLMSR_LBR)) 2287 intel_pmu_create_guest_lbr_event(vcpu); 2288 return 0; 2289 } 2290 case MSR_IA32_BNDCFGS: 2291 if (!kvm_mpx_supported() || 2292 (!msr_info->host_initiated && 2293 !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) 2294 return 1; 2295 if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || 2296 (data & MSR_IA32_BNDCFGS_RSVD)) 2297 return 1; 2298 2299 if (is_guest_mode(vcpu) && 2300 ((vmx->nested.msrs.entry_ctls_high & 
VM_ENTRY_LOAD_BNDCFGS) || 2301 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2302 get_vmcs12(vcpu)->guest_bndcfgs = data; 2303 2304 vmcs_write64(GUEST_BNDCFGS, data); 2305 break; 2306 case MSR_IA32_UMWAIT_CONTROL: 2307 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2308 return 1; 2309 2310 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2311 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2312 return 1; 2313 2314 vmx->msr_ia32_umwait_control = data; 2315 break; 2316 case MSR_IA32_SPEC_CTRL: 2317 if (!msr_info->host_initiated && 2318 !guest_has_spec_ctrl_msr(vcpu)) 2319 return 1; 2320 2321 if (kvm_spec_ctrl_test_value(data)) 2322 return 1; 2323 2324 vmx->spec_ctrl = data; 2325 if (!data) 2326 break; 2327 2328 /* 2329 * For non-nested: 2330 * When it's written (to non-zero) for the first time, pass 2331 * it through. 2332 * 2333 * For nested: 2334 * The handling of the MSR bitmap for L2 guests is done in 2335 * nested_vmx_prepare_msr_bitmap. We should not touch the 2336 * vmcs02.msr_bitmap here since it gets completely overwritten 2337 * in the merging. We update the vmcs01 here for L1 as well 2338 * since it will end up touching the MSR anyway now. 
2339 */ 2340 vmx_disable_intercept_for_msr(vcpu, 2341 MSR_IA32_SPEC_CTRL, 2342 MSR_TYPE_RW); 2343 break; 2344 case MSR_IA32_TSX_CTRL: 2345 if (!msr_info->host_initiated && 2346 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2347 return 1; 2348 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2349 return 1; 2350 goto find_uret_msr; 2351 case MSR_IA32_CR_PAT: 2352 ret = kvm_set_msr_common(vcpu, msr_info); 2353 if (ret) 2354 break; 2355 2356 if (is_guest_mode(vcpu) && 2357 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2358 get_vmcs12(vcpu)->guest_ia32_pat = data; 2359 2360 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2361 vmcs_write64(GUEST_IA32_PAT, data); 2362 break; 2363 case MSR_IA32_MCG_EXT_CTL: 2364 if ((!msr_info->host_initiated && 2365 !(to_vmx(vcpu)->msr_ia32_feature_control & 2366 FEAT_CTL_LMCE_ENABLED)) || 2367 (data & ~MCG_EXT_CTL_LMCE_EN)) 2368 return 1; 2369 vcpu->arch.mcg_ext_ctl = data; 2370 break; 2371 case MSR_IA32_FEAT_CTL: 2372 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2373 return 1; 2374 2375 vmx->msr_ia32_feature_control = data; 2376 if (msr_info->host_initiated && data == 0) 2377 vmx_leave_nested(vcpu); 2378 2379 /* SGX may be enabled/disabled by guest's firmware */ 2380 vmx_write_encls_bitmap(vcpu, NULL); 2381 break; 2382 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2383 /* 2384 * On real hardware, the LE hash MSRs are writable before 2385 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2386 * at which point SGX related bits in IA32_FEATURE_CONTROL 2387 * become writable. 2388 * 2389 * KVM does not emulate SGX activation for simplicity, so 2390 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2391 * is unlocked. This is technically not architectural 2392 * behavior, but it's close enough. 
2393 */ 2394 if (!msr_info->host_initiated && 2395 (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || 2396 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2397 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2398 return 1; 2399 vmx->msr_ia32_sgxlepubkeyhash 2400 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2401 break; 2402 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2403 if (!msr_info->host_initiated) 2404 return 1; /* they are read-only */ 2405 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 2406 return 1; 2407 return vmx_set_vmx_msr(vcpu, msr_index, data); 2408 case MSR_IA32_RTIT_CTL: 2409 if (!vmx_pt_mode_is_host_guest() || 2410 vmx_rtit_ctl_check(vcpu, data) || 2411 vmx->nested.vmxon) 2412 return 1; 2413 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2414 vmx->pt_desc.guest.ctl = data; 2415 pt_update_intercept_for_msr(vcpu); 2416 break; 2417 case MSR_IA32_RTIT_STATUS: 2418 if (!pt_can_write_msr(vmx)) 2419 return 1; 2420 if (data & MSR_IA32_RTIT_STATUS_MASK) 2421 return 1; 2422 vmx->pt_desc.guest.status = data; 2423 break; 2424 case MSR_IA32_RTIT_CR3_MATCH: 2425 if (!pt_can_write_msr(vmx)) 2426 return 1; 2427 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2428 PT_CAP_cr3_filtering)) 2429 return 1; 2430 vmx->pt_desc.guest.cr3_match = data; 2431 break; 2432 case MSR_IA32_RTIT_OUTPUT_BASE: 2433 if (!pt_can_write_msr(vmx)) 2434 return 1; 2435 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2436 PT_CAP_topa_output) && 2437 !intel_pt_validate_cap(vmx->pt_desc.caps, 2438 PT_CAP_single_range_output)) 2439 return 1; 2440 if (!pt_output_base_valid(vcpu, data)) 2441 return 1; 2442 vmx->pt_desc.guest.output_base = data; 2443 break; 2444 case MSR_IA32_RTIT_OUTPUT_MASK: 2445 if (!pt_can_write_msr(vmx)) 2446 return 1; 2447 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2448 PT_CAP_topa_output) && 2449 !intel_pt_validate_cap(vmx->pt_desc.caps, 2450 PT_CAP_single_range_output)) 2451 return 1; 2452 vmx->pt_desc.guest.output_mask = data; 2453 break; 
2454 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2455 if (!pt_can_write_msr(vmx)) 2456 return 1; 2457 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2458 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2459 return 1; 2460 if (is_noncanonical_msr_address(data, vcpu)) 2461 return 1; 2462 if (index % 2) 2463 vmx->pt_desc.guest.addr_b[index / 2] = data; 2464 else 2465 vmx->pt_desc.guest.addr_a[index / 2] = data; 2466 break; 2467 case MSR_IA32_PERF_CAPABILITIES: 2468 if (data & PMU_CAP_LBR_FMT) { 2469 if ((data & PMU_CAP_LBR_FMT) != 2470 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2471 return 1; 2472 if (!cpuid_model_is_consistent(vcpu)) 2473 return 1; 2474 } 2475 if (data & PERF_CAP_PEBS_FORMAT) { 2476 if ((data & PERF_CAP_PEBS_MASK) != 2477 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2478 return 1; 2479 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) 2480 return 1; 2481 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) 2482 return 1; 2483 if (!cpuid_model_is_consistent(vcpu)) 2484 return 1; 2485 } 2486 ret = kvm_set_msr_common(vcpu, msr_info); 2487 break; 2488 2489 default: 2490 find_uret_msr: 2491 msr = vmx_find_uret_msr(vmx, msr_index); 2492 if (msr) 2493 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2494 else 2495 ret = kvm_set_msr_common(vcpu, msr_info); 2496 } 2497 2498 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2499 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2500 vmx_update_fb_clear_dis(vcpu, vmx); 2501 2502 return ret; 2503 } 2504 2505 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2506 { 2507 unsigned long guest_owned_bits; 2508 2509 kvm_register_mark_available(vcpu, reg); 2510 2511 switch (reg) { 2512 case VCPU_REGS_RSP: 2513 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2514 break; 2515 case VCPU_REGS_RIP: 2516 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2517 break; 2518 case VCPU_EXREG_PDPTR: 2519 if (enable_ept) 2520 ept_save_pdptrs(vcpu); 2521 break; 2522 case 
VCPU_EXREG_CR0: 2523 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2524 2525 vcpu->arch.cr0 &= ~guest_owned_bits; 2526 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2527 break; 2528 case VCPU_EXREG_CR3: 2529 /* 2530 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2531 * CR3 is loaded into hardware, not the guest's CR3. 2532 */ 2533 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2534 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2535 break; 2536 case VCPU_EXREG_CR4: 2537 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2538 2539 vcpu->arch.cr4 &= ~guest_owned_bits; 2540 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2541 break; 2542 default: 2543 KVM_BUG_ON(1, vcpu->kvm); 2544 break; 2545 } 2546 } 2547 2548 /* 2549 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2550 * directly instead of going through cpu_has(), to ensure KVM is trapping 2551 * ENCLS whenever it's supported in hardware. It does not matter whether 2552 * the host OS supports or has enabled SGX. 2553 */ 2554 static bool cpu_has_sgx(void) 2555 { 2556 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2557 } 2558 2559 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2560 { 2561 u32 vmx_msr_low, vmx_msr_high; 2562 u32 ctl = ctl_min | ctl_opt; 2563 2564 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2565 2566 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2567 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2568 2569 /* Ensure minimum (required) set of control bits are supported. 
*/ 2570 if (ctl_min & ~ctl) 2571 return -EIO; 2572 2573 *result = ctl; 2574 return 0; 2575 } 2576 2577 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2578 { 2579 u64 allowed; 2580 2581 rdmsrq(msr, allowed); 2582 2583 return ctl_opt & allowed; 2584 } 2585 2586 #define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ 2587 ({ \ 2588 int i, r = 0; \ 2589 \ 2590 BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ 2591 BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ 2592 \ 2593 for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ 2594 typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ 2595 typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ 2596 \ 2597 if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ 2598 continue; \ 2599 \ 2600 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ 2601 "entry = %llx (%llx), exit = %llx (%llx)\n", \ 2602 (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ 2603 (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ 2604 \ 2605 if (error_on_inconsistent_vmcs_config) \ 2606 r = -EIO; \ 2607 \ 2608 entry_controls &= ~n_ctrl; \ 2609 exit_controls &= ~x_ctrl; \ 2610 } \ 2611 r; \ 2612 }) 2613 2614 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2615 struct vmx_capability *vmx_cap) 2616 { 2617 u32 _pin_based_exec_control = 0; 2618 u32 _cpu_based_exec_control = 0; 2619 u32 _cpu_based_2nd_exec_control = 0; 2620 u64 _cpu_based_3rd_exec_control = 0; 2621 u32 _vmexit_control = 0; 2622 u32 _vmentry_control = 0; 2623 u64 basic_msr; 2624 u64 misc_msr; 2625 2626 /* 2627 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2628 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2629 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
2630 */ 2631 struct { 2632 u32 entry_control; 2633 u32 exit_control; 2634 } const vmcs_entry_exit_pairs[] = { 2635 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2636 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2637 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2638 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2639 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2640 }; 2641 2642 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2643 2644 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2645 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2646 MSR_IA32_VMX_PROCBASED_CTLS, 2647 &_cpu_based_exec_control)) 2648 return -EIO; 2649 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2650 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2651 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2652 MSR_IA32_VMX_PROCBASED_CTLS2, 2653 &_cpu_based_2nd_exec_control)) 2654 return -EIO; 2655 } 2656 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2657 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2658 2659 #ifndef CONFIG_X86_64 2660 if (!(_cpu_based_2nd_exec_control & 2661 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2662 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2663 #endif 2664 2665 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2666 _cpu_based_2nd_exec_control &= ~( 2667 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2668 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2669 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2670 2671 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2672 &vmx_cap->ept, &vmx_cap->vpid); 2673 2674 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2675 vmx_cap->ept) { 2676 pr_warn_once("EPT CAP should not exist if not support " 2677 "1-setting enable EPT VM-execution control\n"); 2678 2679 if (error_on_inconsistent_vmcs_config) 2680 return -EIO; 2681 2682 vmx_cap->ept = 0; 2683 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2684 
} 2685 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2686 vmx_cap->vpid) { 2687 pr_warn_once("VPID CAP should not exist if not support " 2688 "1-setting enable VPID VM-execution control\n"); 2689 2690 if (error_on_inconsistent_vmcs_config) 2691 return -EIO; 2692 2693 vmx_cap->vpid = 0; 2694 } 2695 2696 if (!cpu_has_sgx()) 2697 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2698 2699 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2700 _cpu_based_3rd_exec_control = 2701 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2702 MSR_IA32_VMX_PROCBASED_CTLS3); 2703 2704 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2705 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2706 MSR_IA32_VMX_EXIT_CTLS, 2707 &_vmexit_control)) 2708 return -EIO; 2709 2710 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2711 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2712 MSR_IA32_VMX_PINBASED_CTLS, 2713 &_pin_based_exec_control)) 2714 return -EIO; 2715 2716 if (cpu_has_broken_vmx_preemption_timer()) 2717 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2718 if (!(_cpu_based_2nd_exec_control & 2719 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2720 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2721 2722 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2723 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2724 MSR_IA32_VMX_ENTRY_CTLS, 2725 &_vmentry_control)) 2726 return -EIO; 2727 2728 if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, 2729 _vmentry_control, _vmexit_control)) 2730 return -EIO; 2731 2732 /* 2733 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2734 * can't be used due to an errata where VM Exit may incorrectly clear 2735 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2736 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 
2737 */ 2738 switch (boot_cpu_data.x86_vfm) { 2739 case INTEL_NEHALEM_EP: /* AAK155 */ 2740 case INTEL_NEHALEM: /* AAP115 */ 2741 case INTEL_WESTMERE: /* AAT100 */ 2742 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2743 case INTEL_NEHALEM_EX: /* BA97 */ 2744 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2745 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2746 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2747 "does not work properly. Using workaround\n"); 2748 break; 2749 default: 2750 break; 2751 } 2752 2753 rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); 2754 2755 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2756 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2757 return -EIO; 2758 2759 #ifdef CONFIG_X86_64 2760 /* 2761 * KVM expects to be able to shove all legal physical addresses into 2762 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2763 * 0 for processors that support Intel 64 architecture". 2764 */ 2765 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2766 return -EIO; 2767 #endif 2768 2769 /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ 2770 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2771 return -EIO; 2772 2773 rdmsrq(MSR_IA32_VMX_MISC, misc_msr); 2774 2775 vmcs_conf->basic = basic_msr; 2776 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2777 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2778 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2779 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2780 vmcs_conf->vmexit_ctrl = _vmexit_control; 2781 vmcs_conf->vmentry_ctrl = _vmentry_control; 2782 vmcs_conf->misc = misc_msr; 2783 2784 #if IS_ENABLED(CONFIG_HYPERV) 2785 if (enlightened_vmcs) 2786 evmcs_sanitize_exec_ctrls(vmcs_conf); 2787 #endif 2788 2789 return 0; 2790 } 2791 2792 static bool __kvm_is_vmx_supported(void) 2793 { 2794 int cpu = smp_processor_id(); 2795 2796 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2797 pr_err("VMX not supported by CPU %d\n", cpu); 2798 return false; 2799 } 2800 2801 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2802 !this_cpu_has(X86_FEATURE_VMX)) { 2803 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2804 return false; 2805 } 2806 2807 return true; 2808 } 2809 2810 static bool kvm_is_vmx_supported(void) 2811 { 2812 bool supported; 2813 2814 migrate_disable(); 2815 supported = __kvm_is_vmx_supported(); 2816 migrate_enable(); 2817 2818 return supported; 2819 } 2820 2821 int vmx_check_processor_compat(void) 2822 { 2823 int cpu = raw_smp_processor_id(); 2824 struct vmcs_config vmcs_conf; 2825 struct vmx_capability vmx_cap; 2826 2827 if (!__kvm_is_vmx_supported()) 2828 return -EIO; 2829 2830 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2831 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2832 return -EIO; 2833 } 2834 if (nested) 2835 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2836 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2837 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2838 return -EIO; 2839 } 2840 return 0; 
2841 } 2842 2843 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2844 { 2845 u64 msr; 2846 2847 cr4_set_bits(X86_CR4_VMXE); 2848 2849 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2850 _ASM_EXTABLE(1b, %l[fault]) 2851 : : [vmxon_pointer] "m"(vmxon_pointer) 2852 : : fault); 2853 return 0; 2854 2855 fault: 2856 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2857 rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 2858 cr4_clear_bits(X86_CR4_VMXE); 2859 2860 return -EFAULT; 2861 } 2862 2863 int vmx_enable_virtualization_cpu(void) 2864 { 2865 int cpu = raw_smp_processor_id(); 2866 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2867 int r; 2868 2869 if (cr4_read_shadow() & X86_CR4_VMXE) 2870 return -EBUSY; 2871 2872 /* 2873 * This can happen if we hot-added a CPU but failed to allocate 2874 * VP assist page for it. 2875 */ 2876 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2877 return -EFAULT; 2878 2879 intel_pt_handle_vmx(1); 2880 2881 r = kvm_cpu_vmxon(phys_addr); 2882 if (r) { 2883 intel_pt_handle_vmx(0); 2884 return r; 2885 } 2886 2887 return 0; 2888 } 2889 2890 static void vmclear_local_loaded_vmcss(void) 2891 { 2892 int cpu = raw_smp_processor_id(); 2893 struct loaded_vmcs *v, *n; 2894 2895 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2896 loaded_vmcss_on_cpu_link) 2897 __loaded_vmcs_clear(v); 2898 } 2899 2900 void vmx_disable_virtualization_cpu(void) 2901 { 2902 vmclear_local_loaded_vmcss(); 2903 2904 if (kvm_cpu_vmxoff()) 2905 kvm_spurious_fault(); 2906 2907 hv_reset_evmcs(); 2908 2909 intel_pt_handle_vmx(0); 2910 } 2911 2912 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2913 { 2914 int node = cpu_to_node(cpu); 2915 struct page *pages; 2916 struct vmcs *vmcs; 2917 2918 pages = __alloc_pages_node(node, flags, 0); 2919 if (!pages) 2920 return NULL; 2921 vmcs = page_address(pages); 2922 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2923 2924 /* KVM supports Enlightened VMCS v1 only */ 2925 
if (kvm_is_using_evmcs()) 2926 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2927 else 2928 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2929 2930 if (shadow) 2931 vmcs->hdr.shadow_vmcs = 1; 2932 return vmcs; 2933 } 2934 2935 void free_vmcs(struct vmcs *vmcs) 2936 { 2937 free_page((unsigned long)vmcs); 2938 } 2939 2940 /* 2941 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2942 */ 2943 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2944 { 2945 if (!loaded_vmcs->vmcs) 2946 return; 2947 loaded_vmcs_clear(loaded_vmcs); 2948 free_vmcs(loaded_vmcs->vmcs); 2949 loaded_vmcs->vmcs = NULL; 2950 if (loaded_vmcs->msr_bitmap) 2951 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2952 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2953 } 2954 2955 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2956 { 2957 loaded_vmcs->vmcs = alloc_vmcs(false); 2958 if (!loaded_vmcs->vmcs) 2959 return -ENOMEM; 2960 2961 vmcs_clear(loaded_vmcs->vmcs); 2962 2963 loaded_vmcs->shadow_vmcs = NULL; 2964 loaded_vmcs->hv_timer_soft_disabled = false; 2965 loaded_vmcs->cpu = -1; 2966 loaded_vmcs->launched = 0; 2967 2968 if (cpu_has_vmx_msr_bitmap()) { 2969 loaded_vmcs->msr_bitmap = (unsigned long *) 2970 __get_free_page(GFP_KERNEL_ACCOUNT); 2971 if (!loaded_vmcs->msr_bitmap) 2972 goto out_vmcs; 2973 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2974 } 2975 2976 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2977 memset(&loaded_vmcs->controls_shadow, 0, 2978 sizeof(struct vmcs_controls_shadow)); 2979 2980 return 0; 2981 2982 out_vmcs: 2983 free_loaded_vmcs(loaded_vmcs); 2984 return -ENOMEM; 2985 } 2986 2987 static void free_kvm_area(void) 2988 { 2989 int cpu; 2990 2991 for_each_possible_cpu(cpu) { 2992 free_vmcs(per_cpu(vmxarea, cpu)); 2993 per_cpu(vmxarea, cpu) = NULL; 2994 } 2995 } 2996 2997 static __init int alloc_kvm_area(void) 2998 { 2999 int cpu; 3000 3001 for_each_possible_cpu(cpu) { 3002 struct vmcs *vmcs; 
3003 3004 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 3005 if (!vmcs) { 3006 free_kvm_area(); 3007 return -ENOMEM; 3008 } 3009 3010 /* 3011 * When eVMCS is enabled, alloc_vmcs_cpu() sets 3012 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 3013 * revision_id reported by MSR_IA32_VMX_BASIC. 3014 * 3015 * However, even though not explicitly documented by 3016 * TLFS, VMXArea passed as VMXON argument should 3017 * still be marked with revision_id reported by 3018 * physical CPU. 3019 */ 3020 if (kvm_is_using_evmcs()) 3021 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3022 3023 per_cpu(vmxarea, cpu) = vmcs; 3024 } 3025 return 0; 3026 } 3027 3028 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3029 struct kvm_segment *save) 3030 { 3031 if (!emulate_invalid_guest_state) { 3032 /* 3033 * CS and SS RPL should be equal during guest entry according 3034 * to VMX spec, but in reality it is not always so. Since vcpu 3035 * is in the middle of the transition from real mode to 3036 * protected mode it is safe to assume that RPL 0 is a good 3037 * default value. 3038 */ 3039 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3040 save->selector &= ~SEGMENT_RPL_MASK; 3041 save->dpl = save->selector & SEGMENT_RPL_MASK; 3042 save->s = 1; 3043 } 3044 __vmx_set_segment(vcpu, save, seg); 3045 } 3046 3047 static void enter_pmode(struct kvm_vcpu *vcpu) 3048 { 3049 unsigned long flags; 3050 struct vcpu_vmx *vmx = to_vmx(vcpu); 3051 3052 /* 3053 * Update real mode segment cache. It may be not up-to-date if segment 3054 * register was written while vcpu was in a guest mode. 
3055 */ 3056 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3057 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3058 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3059 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3060 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3061 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3062 3063 vmx->rmode.vm86_active = 0; 3064 3065 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3066 3067 flags = vmcs_readl(GUEST_RFLAGS); 3068 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3069 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3070 vmcs_writel(GUEST_RFLAGS, flags); 3071 3072 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3073 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3074 3075 vmx_update_exception_bitmap(vcpu); 3076 3077 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3078 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3079 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3080 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3081 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3082 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3083 } 3084 3085 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3086 { 3087 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3088 struct kvm_segment var = *save; 3089 3090 var.dpl = 0x3; 3091 if (seg == VCPU_SREG_CS) 3092 var.type = 0x3; 3093 3094 if (!emulate_invalid_guest_state) { 3095 var.selector = var.base >> 4; 3096 var.base = var.base & 0xffff0; 3097 var.limit = 0xffff; 3098 var.g = 0; 3099 var.db = 0; 3100 var.present = 1; 3101 var.s = 1; 3102 var.l = 0; 3103 var.unusable = 0; 3104 var.type = 0x3; 3105 var.avl = 0; 3106 if (save->base & 0xf) 3107 pr_warn_once("segment 
base is not paragraph aligned " 3108 "when entering protected mode (seg=%d)", seg); 3109 } 3110 3111 vmcs_write16(sf->selector, var.selector); 3112 vmcs_writel(sf->base, var.base); 3113 vmcs_write32(sf->limit, var.limit); 3114 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3115 } 3116 3117 static void enter_rmode(struct kvm_vcpu *vcpu) 3118 { 3119 unsigned long flags; 3120 struct vcpu_vmx *vmx = to_vmx(vcpu); 3121 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3122 3123 /* 3124 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3125 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3126 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3127 * should VM-Fail and KVM should reject userspace attempts to stuff 3128 * CR0.PG=0 when L2 is active. 3129 */ 3130 WARN_ON_ONCE(is_guest_mode(vcpu)); 3131 3132 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3133 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3134 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3135 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3136 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3137 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3138 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3139 3140 vmx->rmode.vm86_active = 1; 3141 3142 vmx_segment_cache_clear(vmx); 3143 3144 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3145 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3146 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3147 3148 flags = vmcs_readl(GUEST_RFLAGS); 3149 vmx->rmode.save_rflags = flags; 3150 3151 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3152 3153 vmcs_writel(GUEST_RFLAGS, flags); 3154 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3155 vmx_update_exception_bitmap(vcpu); 3156 3157 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3158 
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3159 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3160 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3161 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3162 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3163 } 3164 3165 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3166 { 3167 struct vcpu_vmx *vmx = to_vmx(vcpu); 3168 3169 /* Nothing to do if hardware doesn't support EFER. */ 3170 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3171 return 0; 3172 3173 vcpu->arch.efer = efer; 3174 #ifdef CONFIG_X86_64 3175 if (efer & EFER_LMA) 3176 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3177 else 3178 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3179 #else 3180 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3181 return 1; 3182 #endif 3183 3184 vmx_setup_uret_msrs(vmx); 3185 return 0; 3186 } 3187 3188 #ifdef CONFIG_X86_64 3189 3190 static void enter_lmode(struct kvm_vcpu *vcpu) 3191 { 3192 u32 guest_tr_ar; 3193 3194 vmx_segment_cache_clear(to_vmx(vcpu)); 3195 3196 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3197 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3198 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3199 __func__); 3200 vmcs_write32(GUEST_TR_AR_BYTES, 3201 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3202 | VMX_AR_TYPE_BUSY_64_TSS); 3203 } 3204 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3205 } 3206 3207 static void exit_lmode(struct kvm_vcpu *vcpu) 3208 { 3209 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3210 } 3211 3212 #endif 3213 3214 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3215 { 3216 struct vcpu_vmx *vmx = to_vmx(vcpu); 3217 3218 /* 3219 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3220 * the CPU is not required to invalidate guest-physical mappings on 3221 * VM-Entry, even if VPID is disabled. 
Guest-physical mappings are 3222 * associated with the root EPT structure and not any particular VPID 3223 * (INVVPID also isn't required to invalidate guest-physical mappings). 3224 */ 3225 if (enable_ept) { 3226 ept_sync_global(); 3227 } else if (enable_vpid) { 3228 if (cpu_has_vmx_invvpid_global()) { 3229 vpid_sync_vcpu_global(); 3230 } else { 3231 vpid_sync_vcpu_single(vmx->vpid); 3232 vpid_sync_vcpu_single(vmx->nested.vpid02); 3233 } 3234 } 3235 } 3236 3237 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3238 { 3239 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3240 return nested_get_vpid02(vcpu); 3241 return to_vmx(vcpu)->vpid; 3242 } 3243 3244 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3245 { 3246 struct kvm_mmu *mmu = vcpu->arch.mmu; 3247 u64 root_hpa = mmu->root.hpa; 3248 3249 /* No flush required if the current context is invalid. */ 3250 if (!VALID_PAGE(root_hpa)) 3251 return; 3252 3253 if (enable_ept) 3254 ept_sync_context(construct_eptp(vcpu, root_hpa, 3255 mmu->root_role.level)); 3256 else 3257 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3258 } 3259 3260 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3261 { 3262 /* 3263 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3264 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3265 */ 3266 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3267 } 3268 3269 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3270 { 3271 /* 3272 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3273 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3274 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3275 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3276 * i.e. no explicit INVVPID is necessary. 
3277 */ 3278 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3279 } 3280 3281 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3282 { 3283 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3284 3285 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3286 return; 3287 3288 if (is_pae_paging(vcpu)) { 3289 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3290 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3291 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3292 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3293 } 3294 } 3295 3296 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3297 { 3298 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3299 3300 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3301 return; 3302 3303 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3304 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3305 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3306 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3307 3308 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3309 } 3310 3311 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3312 CPU_BASED_CR3_STORE_EXITING) 3313 3314 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3315 { 3316 if (is_guest_mode(vcpu)) 3317 return nested_guest_cr0_valid(vcpu, cr0); 3318 3319 if (to_vmx(vcpu)->nested.vmxon) 3320 return nested_host_cr0_valid(vcpu, cr0); 3321 3322 return true; 3323 } 3324 3325 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3326 { 3327 struct vcpu_vmx *vmx = to_vmx(vcpu); 3328 unsigned long hw_cr0, old_cr0_pg; 3329 u32 tmp; 3330 3331 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3332 3333 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3334 if (enable_unrestricted_guest) 3335 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3336 else { 3337 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3338 if (!enable_ept) 3339 hw_cr0 |= X86_CR0_WP; 3340 3341 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3342 enter_pmode(vcpu); 3343 3344 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3345 enter_rmode(vcpu); 3346 } 3347 3348 
vmcs_writel(CR0_READ_SHADOW, cr0); 3349 vmcs_writel(GUEST_CR0, hw_cr0); 3350 vcpu->arch.cr0 = cr0; 3351 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3352 3353 #ifdef CONFIG_X86_64 3354 if (vcpu->arch.efer & EFER_LME) { 3355 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3356 enter_lmode(vcpu); 3357 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3358 exit_lmode(vcpu); 3359 } 3360 #endif 3361 3362 if (enable_ept && !enable_unrestricted_guest) { 3363 /* 3364 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3365 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3366 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3367 * KVM's CR3 is installed. 3368 */ 3369 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3370 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3371 3372 /* 3373 * When running with EPT but not unrestricted guest, KVM must 3374 * intercept CR3 accesses when paging is _disabled_. This is 3375 * necessary because restricted guests can't actually run with 3376 * paging disabled, and so KVM stuffs its own CR3 in order to 3377 * run the guest when identity mapped page tables. 3378 * 3379 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3380 * update, it may be stale with respect to CR3 interception, 3381 * e.g. after nested VM-Enter. 3382 * 3383 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3384 * stores to forward them to L1, even if KVM does not need to 3385 * intercept them to preserve its identity mapped page tables. 3386 */ 3387 if (!(cr0 & X86_CR0_PG)) { 3388 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3389 } else if (!is_guest_mode(vcpu)) { 3390 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3391 } else { 3392 tmp = exec_controls_get(vmx); 3393 tmp &= ~CR3_EXITING_BITS; 3394 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3395 exec_controls_set(vmx, tmp); 3396 } 3397 3398 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. 
*/ 3399 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3400 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3401 3402 /* 3403 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3404 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3405 */ 3406 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3407 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3408 } 3409 3410 /* depends on vcpu->arch.cr0 to be set to a new value */ 3411 vmx->vt.emulation_required = vmx_emulation_required(vcpu); 3412 } 3413 3414 static int vmx_get_max_ept_level(void) 3415 { 3416 if (cpu_has_vmx_ept_5levels()) 3417 return 5; 3418 return 4; 3419 } 3420 3421 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3422 { 3423 u64 eptp = VMX_EPTP_MT_WB; 3424 3425 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3426 3427 if (enable_ept_ad_bits && 3428 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3429 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3430 eptp |= root_hpa; 3431 3432 return eptp; 3433 } 3434 3435 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3436 { 3437 struct kvm *kvm = vcpu->kvm; 3438 bool update_guest_cr3 = true; 3439 unsigned long guest_cr3; 3440 u64 eptp; 3441 3442 if (enable_ept) { 3443 eptp = construct_eptp(vcpu, root_hpa, root_level); 3444 vmcs_write64(EPT_POINTER, eptp); 3445 3446 hv_track_root_tdp(vcpu, root_hpa); 3447 3448 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3449 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3450 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3451 guest_cr3 = vcpu->arch.cr3; 3452 else /* vmcs.GUEST_CR3 is already up-to-date. 
*/ 3453 update_guest_cr3 = false; 3454 vmx_ept_load_pdptrs(vcpu); 3455 } else { 3456 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3457 kvm_get_active_cr3_lam_bits(vcpu); 3458 } 3459 3460 if (update_guest_cr3) 3461 vmcs_writel(GUEST_CR3, guest_cr3); 3462 } 3463 3464 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3465 { 3466 /* 3467 * We operate under the default treatment of SMM, so VMX cannot be 3468 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3469 * i.e. is a reserved bit, is handled by common x86 code. 3470 */ 3471 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3472 return false; 3473 3474 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3475 return false; 3476 3477 return true; 3478 } 3479 3480 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3481 { 3482 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3483 struct vcpu_vmx *vmx = to_vmx(vcpu); 3484 unsigned long hw_cr4; 3485 3486 /* 3487 * Pass through host's Machine Check Enable value to hw_cr4, which 3488 * is in force while we are in guest mode. Do not let guests control 3489 * this bit, even if host CR4.MCE == 0. 
3490 */ 3491 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3492 if (enable_unrestricted_guest) 3493 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3494 else if (vmx->rmode.vm86_active) 3495 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3496 else 3497 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3498 3499 if (vmx_umip_emulated()) { 3500 if (cr4 & X86_CR4_UMIP) { 3501 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3502 hw_cr4 &= ~X86_CR4_UMIP; 3503 } else if (!is_guest_mode(vcpu) || 3504 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3505 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3506 } 3507 } 3508 3509 vcpu->arch.cr4 = cr4; 3510 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3511 3512 if (!enable_unrestricted_guest) { 3513 if (enable_ept) { 3514 if (!is_paging(vcpu)) { 3515 hw_cr4 &= ~X86_CR4_PAE; 3516 hw_cr4 |= X86_CR4_PSE; 3517 } else if (!(cr4 & X86_CR4_PAE)) { 3518 hw_cr4 &= ~X86_CR4_PAE; 3519 } 3520 } 3521 3522 /* 3523 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3524 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3525 * to be manually disabled when guest switches to non-paging 3526 * mode. 3527 * 3528 * If !enable_unrestricted_guest, the CPU is always running 3529 * with CR0.PG=1 and CR4 needs to be modified. 3530 * If enable_unrestricted_guest, the CPU automatically 3531 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 
3532 */ 3533 if (!is_paging(vcpu)) 3534 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3535 } 3536 3537 vmcs_writel(CR4_READ_SHADOW, cr4); 3538 vmcs_writel(GUEST_CR4, hw_cr4); 3539 3540 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3541 vcpu->arch.cpuid_dynamic_bits_dirty = true; 3542 } 3543 3544 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3545 { 3546 struct vcpu_vmx *vmx = to_vmx(vcpu); 3547 u32 ar; 3548 3549 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3550 *var = vmx->rmode.segs[seg]; 3551 if (seg == VCPU_SREG_TR 3552 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3553 return; 3554 var->base = vmx_read_guest_seg_base(vmx, seg); 3555 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3556 return; 3557 } 3558 var->base = vmx_read_guest_seg_base(vmx, seg); 3559 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3560 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3561 ar = vmx_read_guest_seg_ar(vmx, seg); 3562 var->unusable = (ar >> 16) & 1; 3563 var->type = ar & 15; 3564 var->s = (ar >> 4) & 1; 3565 var->dpl = (ar >> 5) & 3; 3566 /* 3567 * Some userspaces do not preserve unusable property. Since usable 3568 * segment has to be present according to VMX spec we can use present 3569 * property to amend userspace bug by making unusable segment always 3570 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3571 * segment as unusable. 
3572 */ 3573 var->present = !var->unusable; 3574 var->avl = (ar >> 12) & 1; 3575 var->l = (ar >> 13) & 1; 3576 var->db = (ar >> 14) & 1; 3577 var->g = (ar >> 15) & 1; 3578 } 3579 3580 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3581 { 3582 struct kvm_segment s; 3583 3584 if (to_vmx(vcpu)->rmode.vm86_active) { 3585 vmx_get_segment(vcpu, &s, seg); 3586 return s.base; 3587 } 3588 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3589 } 3590 3591 static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) 3592 { 3593 struct vcpu_vmx *vmx = to_vmx(vcpu); 3594 int ar; 3595 3596 if (unlikely(vmx->rmode.vm86_active)) 3597 return 0; 3598 3599 if (no_cache) 3600 ar = vmcs_read32(GUEST_SS_AR_BYTES); 3601 else 3602 ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3603 return VMX_AR_DPL(ar); 3604 } 3605 3606 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3607 { 3608 return __vmx_get_cpl(vcpu, false); 3609 } 3610 3611 int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) 3612 { 3613 return __vmx_get_cpl(vcpu, true); 3614 } 3615 3616 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3617 { 3618 u32 ar; 3619 3620 ar = var->type & 15; 3621 ar |= (var->s & 1) << 4; 3622 ar |= (var->dpl & 3) << 5; 3623 ar |= (var->present & 1) << 7; 3624 ar |= (var->avl & 1) << 12; 3625 ar |= (var->l & 1) << 13; 3626 ar |= (var->db & 1) << 14; 3627 ar |= (var->g & 1) << 15; 3628 ar |= (var->unusable || !var->present) << 16; 3629 3630 return ar; 3631 } 3632 3633 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3634 { 3635 struct vcpu_vmx *vmx = to_vmx(vcpu); 3636 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3637 3638 vmx_segment_cache_clear(vmx); 3639 3640 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3641 vmx->rmode.segs[seg] = *var; 3642 if (seg == VCPU_SREG_TR) 3643 vmcs_write16(sf->selector, var->selector); 3644 else if (var->s) 3645 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3646 return; 3647 } 3648 3649 
vmcs_writel(sf->base, var->base); 3650 vmcs_write32(sf->limit, var->limit); 3651 vmcs_write16(sf->selector, var->selector); 3652 3653 /* 3654 * Fix the "Accessed" bit in AR field of segment registers for older 3655 * qemu binaries. 3656 * IA32 arch specifies that at the time of processor reset the 3657 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3658 * is setting it to 0 in the userland code. This causes invalid guest 3659 * state vmexit when "unrestricted guest" mode is turned on. 3660 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3661 * tree. Newer qemu binaries with that qemu fix would not need this 3662 * kvm hack. 3663 */ 3664 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3665 var->type |= 0x1; /* Accessed */ 3666 3667 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3668 } 3669 3670 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3671 { 3672 __vmx_set_segment(vcpu, var, seg); 3673 3674 to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); 3675 } 3676 3677 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3678 { 3679 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3680 3681 *db = (ar >> 14) & 1; 3682 *l = (ar >> 13) & 1; 3683 } 3684 3685 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3686 { 3687 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3688 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3689 } 3690 3691 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3692 { 3693 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3694 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3695 } 3696 3697 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3698 { 3699 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3700 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3701 } 3702 3703 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3704 { 3705 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3706 vmcs_writel(GUEST_GDTR_BASE, 
dt->address); 3707 } 3708 3709 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3710 { 3711 struct kvm_segment var; 3712 u32 ar; 3713 3714 vmx_get_segment(vcpu, &var, seg); 3715 var.dpl = 0x3; 3716 if (seg == VCPU_SREG_CS) 3717 var.type = 0x3; 3718 ar = vmx_segment_access_rights(&var); 3719 3720 if (var.base != (var.selector << 4)) 3721 return false; 3722 if (var.limit != 0xffff) 3723 return false; 3724 if (ar != 0xf3) 3725 return false; 3726 3727 return true; 3728 } 3729 3730 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3731 { 3732 struct kvm_segment cs; 3733 unsigned int cs_rpl; 3734 3735 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3736 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3737 3738 if (cs.unusable) 3739 return false; 3740 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3741 return false; 3742 if (!cs.s) 3743 return false; 3744 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3745 if (cs.dpl > cs_rpl) 3746 return false; 3747 } else { 3748 if (cs.dpl != cs_rpl) 3749 return false; 3750 } 3751 if (!cs.present) 3752 return false; 3753 3754 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3755 return true; 3756 } 3757 3758 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3759 { 3760 struct kvm_segment ss; 3761 unsigned int ss_rpl; 3762 3763 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3764 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3765 3766 if (ss.unusable) 3767 return true; 3768 if (ss.type != 3 && ss.type != 7) 3769 return false; 3770 if (!ss.s) 3771 return false; 3772 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3773 return false; 3774 if (!ss.present) 3775 return false; 3776 3777 return true; 3778 } 3779 3780 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3781 { 3782 struct kvm_segment var; 3783 unsigned int rpl; 3784 3785 vmx_get_segment(vcpu, &var, seg); 3786 rpl = var.selector & SEGMENT_RPL_MASK; 3787 3788 if (var.unusable) 3789 return true; 3790 if (!var.s) 
3791 return false; 3792 if (!var.present) 3793 return false; 3794 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3795 if (var.dpl < rpl) /* DPL < RPL */ 3796 return false; 3797 } 3798 3799 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3800 * rights flags 3801 */ 3802 return true; 3803 } 3804 3805 static bool tr_valid(struct kvm_vcpu *vcpu) 3806 { 3807 struct kvm_segment tr; 3808 3809 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3810 3811 if (tr.unusable) 3812 return false; 3813 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3814 return false; 3815 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3816 return false; 3817 if (!tr.present) 3818 return false; 3819 3820 return true; 3821 } 3822 3823 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3824 { 3825 struct kvm_segment ldtr; 3826 3827 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3828 3829 if (ldtr.unusable) 3830 return true; 3831 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3832 return false; 3833 if (ldtr.type != 2) 3834 return false; 3835 if (!ldtr.present) 3836 return false; 3837 3838 return true; 3839 } 3840 3841 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3842 { 3843 struct kvm_segment cs, ss; 3844 3845 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3846 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3847 3848 return ((cs.selector & SEGMENT_RPL_MASK) == 3849 (ss.selector & SEGMENT_RPL_MASK)); 3850 } 3851 3852 /* 3853 * Check if guest state is valid. Returns true if valid, false if 3854 * not. 
3855 * We assume that registers are always usable 3856 */ 3857 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3858 { 3859 /* real mode guest state checks */ 3860 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3861 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3862 return false; 3863 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3864 return false; 3865 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3866 return false; 3867 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3868 return false; 3869 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3870 return false; 3871 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3872 return false; 3873 } else { 3874 /* protected mode guest state checks */ 3875 if (!cs_ss_rpl_check(vcpu)) 3876 return false; 3877 if (!code_segment_valid(vcpu)) 3878 return false; 3879 if (!stack_segment_valid(vcpu)) 3880 return false; 3881 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3882 return false; 3883 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3884 return false; 3885 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3886 return false; 3887 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3888 return false; 3889 if (!tr_valid(vcpu)) 3890 return false; 3891 if (!ldtr_valid(vcpu)) 3892 return false; 3893 } 3894 /* TODO: 3895 * - Add checks on RIP 3896 * - Add checks on RFLAGS 3897 */ 3898 3899 return true; 3900 } 3901 3902 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3903 { 3904 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3905 u16 data; 3906 int i; 3907 3908 for (i = 0; i < 3; i++) { 3909 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3910 return -EFAULT; 3911 } 3912 3913 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3914 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3915 return -EFAULT; 3916 3917 data = ~0; 3918 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3919 return -EFAULT; 3920 3921 return 0; 3922 } 3923 3924 static int 
init_rmode_identity_map(struct kvm *kvm) 3925 { 3926 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3927 int i, r = 0; 3928 void __user *uaddr; 3929 u32 tmp; 3930 3931 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 3932 mutex_lock(&kvm->slots_lock); 3933 3934 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3935 goto out; 3936 3937 if (!kvm_vmx->ept_identity_map_addr) 3938 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3939 3940 uaddr = __x86_set_memory_region(kvm, 3941 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3942 kvm_vmx->ept_identity_map_addr, 3943 PAGE_SIZE); 3944 if (IS_ERR(uaddr)) { 3945 r = PTR_ERR(uaddr); 3946 goto out; 3947 } 3948 3949 /* Set up identity-mapping pagetable for EPT in real mode */ 3950 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3951 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3952 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3953 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3954 r = -EFAULT; 3955 goto out; 3956 } 3957 } 3958 kvm_vmx->ept_identity_pagetable_done = true; 3959 3960 out: 3961 mutex_unlock(&kvm->slots_lock); 3962 return r; 3963 } 3964 3965 static void seg_setup(int seg) 3966 { 3967 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3968 unsigned int ar; 3969 3970 vmcs_write16(sf->selector, 0); 3971 vmcs_writel(sf->base, 0); 3972 vmcs_write32(sf->limit, 0xffff); 3973 ar = 0x93; 3974 if (seg == VCPU_SREG_CS) 3975 ar |= 0x08; /* code segment */ 3976 3977 vmcs_write32(sf->ar_bytes, ar); 3978 } 3979 3980 int allocate_vpid(void) 3981 { 3982 int vpid; 3983 3984 if (!enable_vpid) 3985 return 0; 3986 spin_lock(&vmx_vpid_lock); 3987 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3988 if (vpid < VMX_NR_VPIDS) 3989 __set_bit(vpid, vmx_vpid_bitmap); 3990 else 3991 vpid = 0; 3992 spin_unlock(&vmx_vpid_lock); 3993 return vpid; 3994 } 3995 3996 void free_vpid(int vpid) 3997 { 3998 if (!enable_vpid || vpid == 0) 3999 return; 4000 spin_lock(&vmx_vpid_lock); 
4001 __clear_bit(vpid, vmx_vpid_bitmap); 4002 spin_unlock(&vmx_vpid_lock); 4003 } 4004 4005 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 4006 { 4007 /* 4008 * When KVM is a nested hypervisor on top of Hyper-V and uses 4009 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 4010 * bitmap has changed. 4011 */ 4012 if (kvm_is_using_evmcs()) { 4013 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 4014 4015 if (evmcs->hv_enlightenments_control.msr_bitmap) 4016 evmcs->hv_clean_fields &= 4017 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 4018 } 4019 4020 vmx->nested.force_msr_bitmap_recalc = true; 4021 } 4022 4023 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4024 { 4025 struct vcpu_vmx *vmx = to_vmx(vcpu); 4026 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4027 int idx; 4028 4029 if (!cpu_has_vmx_msr_bitmap()) 4030 return; 4031 4032 vmx_msr_bitmap_l01_changed(vmx); 4033 4034 /* 4035 * Mark the desired intercept state in shadow bitmap, this is needed 4036 * for resync when the MSR filters change. 
4037 */ 4038 idx = vmx_get_passthrough_msr_slot(msr); 4039 if (idx >= 0) { 4040 if (type & MSR_TYPE_R) 4041 clear_bit(idx, vmx->shadow_msr_intercept.read); 4042 if (type & MSR_TYPE_W) 4043 clear_bit(idx, vmx->shadow_msr_intercept.write); 4044 } 4045 4046 if ((type & MSR_TYPE_R) && 4047 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 4048 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4049 type &= ~MSR_TYPE_R; 4050 } 4051 4052 if ((type & MSR_TYPE_W) && 4053 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 4054 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4055 type &= ~MSR_TYPE_W; 4056 } 4057 4058 if (type & MSR_TYPE_R) 4059 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4060 4061 if (type & MSR_TYPE_W) 4062 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4063 } 4064 4065 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4066 { 4067 struct vcpu_vmx *vmx = to_vmx(vcpu); 4068 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4069 int idx; 4070 4071 if (!cpu_has_vmx_msr_bitmap()) 4072 return; 4073 4074 vmx_msr_bitmap_l01_changed(vmx); 4075 4076 /* 4077 * Mark the desired intercept state in shadow bitmap, this is needed 4078 * for resync when the MSR filter changes. 4079 */ 4080 idx = vmx_get_passthrough_msr_slot(msr); 4081 if (idx >= 0) { 4082 if (type & MSR_TYPE_R) 4083 set_bit(idx, vmx->shadow_msr_intercept.read); 4084 if (type & MSR_TYPE_W) 4085 set_bit(idx, vmx->shadow_msr_intercept.write); 4086 } 4087 4088 if (type & MSR_TYPE_R) 4089 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4090 4091 if (type & MSR_TYPE_W) 4092 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4093 } 4094 4095 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4096 { 4097 /* 4098 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4099 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4100 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 
4101 */ 4102 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4103 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4104 struct vcpu_vmx *vmx = to_vmx(vcpu); 4105 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4106 u8 mode; 4107 4108 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4109 return; 4110 4111 if (cpu_has_secondary_exec_ctrls() && 4112 (secondary_exec_controls_get(vmx) & 4113 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4114 mode = MSR_BITMAP_MODE_X2APIC; 4115 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4116 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4117 } else { 4118 mode = 0; 4119 } 4120 4121 if (mode == vmx->x2apic_msr_bitmap_mode) 4122 return; 4123 4124 vmx->x2apic_msr_bitmap_mode = mode; 4125 4126 /* 4127 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4128 * registers (0x840 and above) intercepted, KVM doesn't support them. 4129 * Intercept all writes by default and poke holes as needed. Pass 4130 * through reads for all valid registers by default in x2APIC+APICv 4131 * mode, only the current timer count needs on-demand emulation by KVM. 4132 */ 4133 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4134 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4135 else 4136 msr_bitmap[read_idx] = ~0ull; 4137 msr_bitmap[write_idx] = ~0ull; 4138 4139 /* 4140 * TPR reads and writes can be virtualized even if virtual interrupt 4141 * delivery is not in use. 
4142 */ 4143 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4144 !(mode & MSR_BITMAP_MODE_X2APIC)); 4145 4146 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4147 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4148 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4149 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4150 if (enable_ipiv) 4151 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4152 } 4153 } 4154 4155 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4156 { 4157 struct vcpu_vmx *vmx = to_vmx(vcpu); 4158 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4159 u32 i; 4160 4161 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4162 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4163 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4164 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4165 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4166 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4167 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4168 } 4169 } 4170 4171 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4172 { 4173 struct vcpu_vmx *vmx = to_vmx(vcpu); 4174 u32 i; 4175 4176 if (!cpu_has_vmx_msr_bitmap()) 4177 return; 4178 4179 /* 4180 * Redo intercept permissions for MSRs that KVM is passing through to 4181 * the guest. Disabling interception will check the new MSR filter and 4182 * ensure that KVM enables interception if usersepace wants to filter 4183 * the MSR. MSRs that KVM is already intercepting don't need to be 4184 * refreshed since KVM is going to intercept them regardless of what 4185 * userspace wants. 
4186 */ 4187 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4188 u32 msr = vmx_possible_passthrough_msrs[i]; 4189 4190 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4191 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4192 4193 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4194 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4195 } 4196 4197 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4198 if (vmx_pt_mode_is_host_guest()) 4199 pt_update_intercept_for_msr(vcpu); 4200 } 4201 4202 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4203 int vector) 4204 { 4205 struct vcpu_vmx *vmx = to_vmx(vcpu); 4206 4207 /* 4208 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4209 * and freed, and must not be accessed outside of vcpu->mutex. The 4210 * vCPU's cached PI NV is valid if and only if posted interrupts 4211 * enabled in its vmcs12, i.e. checking the vector also checks that 4212 * L1 has enabled posted interrupts for L2. 4213 */ 4214 if (is_guest_mode(vcpu) && 4215 vector == vmx->nested.posted_intr_nv) { 4216 /* 4217 * If a posted intr is not recognized by hardware, 4218 * we will accomplish it in the next vmentry. 4219 */ 4220 vmx->nested.pi_pending = true; 4221 kvm_make_request(KVM_REQ_EVENT, vcpu); 4222 4223 /* 4224 * This pairs with the smp_mb_*() after setting vcpu->mode in 4225 * vcpu_enter_guest() to guarantee the vCPU sees the event 4226 * request if triggering a posted interrupt "fails" because 4227 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4228 * the smb_wmb() in kvm_make_request() only ensures everything 4229 * done before making the request is visible when the request 4230 * is visible, it doesn't ensure ordering between the store to 4231 * vcpu->requests and the load from vcpu->mode. 4232 */ 4233 smp_mb__after_atomic(); 4234 4235 /* the PIR and ON have been set by L1. 
*/ 4236 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4237 return 0; 4238 } 4239 return -1; 4240 } 4241 /* 4242 * Send interrupt to vcpu via posted interrupt way. 4243 * 1. If target vcpu is running(non-root mode), send posted interrupt 4244 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4245 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4246 * interrupt from PIR in next vmentry. 4247 */ 4248 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4249 { 4250 struct vcpu_vt *vt = to_vt(vcpu); 4251 int r; 4252 4253 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4254 if (!r) 4255 return 0; 4256 4257 /* Note, this is called iff the local APIC is in-kernel. */ 4258 if (!vcpu->arch.apic->apicv_active) 4259 return -1; 4260 4261 __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); 4262 return 0; 4263 } 4264 4265 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4266 int trig_mode, int vector) 4267 { 4268 struct kvm_vcpu *vcpu = apic->vcpu; 4269 4270 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4271 kvm_lapic_set_irr(vector, apic); 4272 kvm_make_request(KVM_REQ_EVENT, vcpu); 4273 kvm_vcpu_kick(vcpu); 4274 } else { 4275 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4276 trig_mode, vector); 4277 } 4278 } 4279 4280 /* 4281 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4282 * will not change in the lifetime of the guest. 4283 * Note that host-state that does change is set elsewhere. E.g., host-state 4284 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 
4285 */ 4286 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4287 { 4288 u32 low32, high32; 4289 unsigned long tmpl; 4290 unsigned long cr0, cr3, cr4; 4291 4292 cr0 = read_cr0(); 4293 WARN_ON(cr0 & X86_CR0_TS); 4294 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4295 4296 /* 4297 * Save the most likely value for this task's CR3 in the VMCS. 4298 * We can't use __get_current_cr3_fast() because we're not atomic. 4299 */ 4300 cr3 = __read_cr3(); 4301 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4302 vmx->loaded_vmcs->host_state.cr3 = cr3; 4303 4304 /* Save the most likely value for this task's CR4 in the VMCS. */ 4305 cr4 = cr4_read_shadow(); 4306 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4307 vmx->loaded_vmcs->host_state.cr4 = cr4; 4308 4309 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4310 #ifdef CONFIG_X86_64 4311 /* 4312 * Load null selectors, so we can avoid reloading them in 4313 * vmx_prepare_switch_to_host(), in case userspace uses 4314 * the null selectors too (the expected case). 4315 */ 4316 vmcs_write16(HOST_DS_SELECTOR, 0); 4317 vmcs_write16(HOST_ES_SELECTOR, 0); 4318 #else 4319 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4320 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4321 #endif 4322 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4323 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4324 4325 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4326 4327 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4328 4329 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4330 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4331 4332 /* 4333 * SYSENTER is used for 32-bit system calls on either 32-bit or 4334 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4335 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4336 * have already done so!). 
4337 */ 4338 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4339 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4340 4341 rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); 4342 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4343 4344 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4345 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4346 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4347 } 4348 4349 if (cpu_has_load_ia32_efer()) 4350 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4351 } 4352 4353 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4354 { 4355 struct kvm_vcpu *vcpu = &vmx->vcpu; 4356 4357 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4358 ~vcpu->arch.cr4_guest_rsvd_bits; 4359 if (!enable_ept) { 4360 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4361 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4362 } 4363 if (is_guest_mode(&vmx->vcpu)) 4364 vcpu->arch.cr4_guest_owned_bits &= 4365 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4366 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4367 } 4368 4369 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4370 { 4371 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4372 4373 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4374 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4375 4376 if (!enable_vnmi) 4377 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4378 4379 if (!enable_preemption_timer) 4380 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4381 4382 return pin_based_exec_ctrl; 4383 } 4384 4385 static u32 vmx_vmentry_ctrl(void) 4386 { 4387 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4388 4389 if (vmx_pt_mode_is_system()) 4390 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4391 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4392 /* 4393 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 
4394 */ 4395 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4396 VM_ENTRY_LOAD_IA32_EFER | 4397 VM_ENTRY_IA32E_MODE); 4398 4399 return vmentry_ctrl; 4400 } 4401 4402 static u32 vmx_vmexit_ctrl(void) 4403 { 4404 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4405 4406 /* 4407 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4408 * nested virtualization and thus allowed to be set in vmcs12. 4409 */ 4410 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4411 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4412 4413 if (vmx_pt_mode_is_system()) 4414 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4415 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4416 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4417 return vmexit_ctrl & 4418 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4419 } 4420 4421 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4422 { 4423 struct vcpu_vmx *vmx = to_vmx(vcpu); 4424 4425 if (is_guest_mode(vcpu)) { 4426 vmx->nested.update_vmcs01_apicv_status = true; 4427 return; 4428 } 4429 4430 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4431 4432 if (kvm_vcpu_apicv_active(vcpu)) { 4433 secondary_exec_controls_setbit(vmx, 4434 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4435 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4436 if (enable_ipiv) 4437 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4438 } else { 4439 secondary_exec_controls_clearbit(vmx, 4440 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4441 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4442 if (enable_ipiv) 4443 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4444 } 4445 4446 vmx_update_msr_bitmap_x2apic(vcpu); 4447 } 4448 4449 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4450 { 4451 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4452 4453 /* 4454 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4455 * vmcs12 and propagated to vmcs02 when set in vmcs12. 
4456 */ 4457 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4458 CPU_BASED_USE_IO_BITMAPS | 4459 CPU_BASED_MONITOR_TRAP_FLAG | 4460 CPU_BASED_PAUSE_EXITING); 4461 4462 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4463 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4464 CPU_BASED_NMI_WINDOW_EXITING); 4465 4466 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4467 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4468 4469 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4470 exec_control &= ~CPU_BASED_TPR_SHADOW; 4471 4472 #ifdef CONFIG_X86_64 4473 if (exec_control & CPU_BASED_TPR_SHADOW) 4474 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4475 CPU_BASED_CR8_STORE_EXITING); 4476 else 4477 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4478 CPU_BASED_CR8_LOAD_EXITING; 4479 #endif 4480 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4481 if (enable_ept) 4482 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4483 CPU_BASED_CR3_STORE_EXITING | 4484 CPU_BASED_INVLPG_EXITING); 4485 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4486 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4487 CPU_BASED_MONITOR_EXITING); 4488 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4489 exec_control &= ~CPU_BASED_HLT_EXITING; 4490 return exec_control; 4491 } 4492 4493 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4494 { 4495 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4496 4497 /* 4498 * IPI virtualization relies on APICv. Disable IPI virtualization if 4499 * APICv is inhibited. 4500 */ 4501 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4502 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4503 4504 return exec_control; 4505 } 4506 4507 /* 4508 * Adjust a single secondary execution control bit to intercept/allow an 4509 * instruction in the guest. This is usually done based on whether or not a 4510 * feature has been exposed to the guest in order to correctly emulate faults. 
4511 */ 4512 static inline void 4513 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4514 u32 control, bool enabled, bool exiting) 4515 { 4516 /* 4517 * If the control is for an opt-in feature, clear the control if the 4518 * feature is not exposed to the guest, i.e. not enabled. If the 4519 * control is opt-out, i.e. an exiting control, clear the control if 4520 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4521 * disabled for the associated instruction. Note, the caller is 4522 * responsible presetting exec_control to set all supported bits. 4523 */ 4524 if (enabled == exiting) 4525 *exec_control &= ~control; 4526 4527 /* 4528 * Update the nested MSR settings so that a nested VMM can/can't set 4529 * controls for features that are/aren't exposed to the guest. 4530 */ 4531 if (nested && 4532 kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { 4533 /* 4534 * All features that can be added or removed to VMX MSRs must 4535 * be supported in the first place for nested virtualization. 4536 */ 4537 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4538 enabled = false; 4539 4540 if (enabled) 4541 vmx->nested.msrs.secondary_ctls_high |= control; 4542 else 4543 vmx->nested.msrs.secondary_ctls_high &= ~control; 4544 } 4545 } 4546 4547 /* 4548 * Wrapper macro for the common case of adjusting a secondary execution control 4549 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4550 * verifies that the control is actually supported by KVM and hardware. 
4551 */ 4552 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4553 ({ \ 4554 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4555 bool __enabled; \ 4556 \ 4557 if (cpu_has_vmx_##name()) { \ 4558 __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ 4559 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4560 __enabled, exiting); \ 4561 } \ 4562 }) 4563 4564 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4565 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4566 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4567 4568 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4569 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4570 4571 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4572 { 4573 struct kvm_vcpu *vcpu = &vmx->vcpu; 4574 4575 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4576 4577 if (vmx_pt_mode_is_system()) 4578 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4579 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4580 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4581 if (vmx->vpid == 0) 4582 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4583 if (!enable_ept) { 4584 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4585 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4586 enable_unrestricted_guest = 0; 4587 } 4588 if (!enable_unrestricted_guest) 4589 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4590 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4591 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4592 if (!kvm_vcpu_apicv_active(vcpu)) 4593 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4594 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4595 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4596 4597 /* 4598 * KVM doesn't support VMFUNC for L1, but the control is set in 
KVM's 4599 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4600 */ 4601 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4602 4603 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4604 * in vmx_set_cr4. */ 4605 exec_control &= ~SECONDARY_EXEC_DESC; 4606 4607 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4608 (handle_vmptrld). 4609 We can NOT enable shadow_vmcs here because we don't have yet 4610 a current VMCS12 4611 */ 4612 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4613 4614 /* 4615 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4616 * it needs to be set here when dirty logging is already active, e.g. 4617 * if this vCPU was created after dirty logging was enabled. 4618 */ 4619 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4620 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4621 4622 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4623 4624 /* 4625 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4626 * feature is exposed to the guest. This creates a virtualization hole 4627 * if both are supported in hardware but only one is exposed to the 4628 * guest, but letting the guest execute RDTSCP or RDPID when either one 4629 * is advertised is preferable to emulating the advertised instruction 4630 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 
4631 */ 4632 if (cpu_has_vmx_rdtscp()) { 4633 bool rdpid_or_rdtscp_enabled = 4634 guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || 4635 guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); 4636 4637 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4638 SECONDARY_EXEC_ENABLE_RDTSCP, 4639 rdpid_or_rdtscp_enabled, false); 4640 } 4641 4642 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4643 4644 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4645 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4646 4647 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4648 ENABLE_USR_WAIT_PAUSE, false); 4649 4650 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4651 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4652 4653 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4654 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4655 4656 return exec_control; 4657 } 4658 4659 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4660 { 4661 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4662 } 4663 4664 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4665 { 4666 struct page *pages; 4667 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4668 4669 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4670 return 0; 4671 4672 if (kvm_vmx->pid_table) 4673 return 0; 4674 4675 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4676 vmx_get_pid_table_order(kvm)); 4677 if (!pages) 4678 return -ENOMEM; 4679 4680 kvm_vmx->pid_table = (void *)page_address(pages); 4681 return 0; 4682 } 4683 4684 int vmx_vcpu_precreate(struct kvm *kvm) 4685 { 4686 return vmx_alloc_ipiv_pid_table(kvm); 4687 } 4688 4689 #define VMX_XSS_EXIT_BITMAP 0 4690 4691 static void init_vmcs(struct vcpu_vmx *vmx) 4692 { 4693 struct kvm *kvm = vmx->vcpu.kvm; 4694 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4695 4696 if (nested) 4697 nested_vmx_set_vmcs_shadowing_bitmap(); 4698 4699 if (cpu_has_vmx_msr_bitmap()) 4700 vmcs_write64(MSR_BITMAP, 
__pa(vmx->vmcs01.msr_bitmap)); 4701 4702 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4703 4704 /* Control */ 4705 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4706 4707 exec_controls_set(vmx, vmx_exec_control(vmx)); 4708 4709 if (cpu_has_secondary_exec_ctrls()) { 4710 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4711 if (vmx->ve_info) 4712 vmcs_write64(VE_INFORMATION_ADDRESS, 4713 __pa(vmx->ve_info)); 4714 } 4715 4716 if (cpu_has_tertiary_exec_ctrls()) 4717 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4718 4719 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4720 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4721 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4722 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4723 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4724 4725 vmcs_write16(GUEST_INTR_STATUS, 0); 4726 4727 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4728 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); 4729 } 4730 4731 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4732 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4733 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4734 } 4735 4736 if (!kvm_pause_in_guest(kvm)) { 4737 vmcs_write32(PLE_GAP, ple_gap); 4738 vmx->ple_window = ple_window; 4739 vmx->ple_window_dirty = true; 4740 } 4741 4742 if (kvm_notify_vmexit_enabled(kvm)) 4743 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4744 4745 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4746 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4747 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4748 4749 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4750 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4751 vmx_set_constant_host_state(vmx); 4752 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4753 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4754 4755 if (cpu_has_vmx_vmfunc()) 4756 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4757 4758 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4759 
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4760 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4761 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4762 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4763 4764 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4765 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4766 4767 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4768 4769 /* 22.2.1, 20.8.1 */ 4770 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4771 4772 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4773 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4774 4775 set_cr4_guest_host_mask(vmx); 4776 4777 if (vmx->vpid != 0) 4778 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4779 4780 if (cpu_has_vmx_xsaves()) 4781 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4782 4783 if (enable_pml) { 4784 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4785 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 4786 } 4787 4788 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4789 4790 if (vmx_pt_mode_is_host_guest()) { 4791 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4792 /* Bit[6~0] are forced to 1, writes are ignored. 
*/ 4793 vmx->pt_desc.guest.output_mask = 0x7F; 4794 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4795 } 4796 4797 vmcs_write32(GUEST_SYSENTER_CS, 0); 4798 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4799 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4800 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4801 4802 if (cpu_has_vmx_tpr_shadow()) { 4803 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4804 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4805 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4806 __pa(vmx->vcpu.arch.apic->regs)); 4807 vmcs_write32(TPR_THRESHOLD, 0); 4808 } 4809 4810 vmx_setup_uret_msrs(vmx); 4811 } 4812 4813 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4814 { 4815 struct vcpu_vmx *vmx = to_vmx(vcpu); 4816 4817 init_vmcs(vmx); 4818 4819 if (nested && 4820 kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4821 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4822 4823 vcpu_setup_sgx_lepubkeyhash(vcpu); 4824 4825 vmx->nested.posted_intr_nv = -1; 4826 vmx->nested.vmxon_ptr = INVALID_GPA; 4827 vmx->nested.current_vmptr = INVALID_GPA; 4828 4829 #ifdef CONFIG_KVM_HYPERV 4830 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4831 #endif 4832 4833 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) 4834 vcpu->arch.microcode_version = 0x100000000ULL; 4835 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4836 4837 /* 4838 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4839 * or POSTED_INTR_WAKEUP_VECTOR. 
4840 */ 4841 vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; 4842 __pi_set_sn(&vmx->vt.pi_desc); 4843 } 4844 4845 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4846 { 4847 struct vcpu_vmx *vmx = to_vmx(vcpu); 4848 4849 if (!init_event) 4850 __vmx_vcpu_reset(vcpu); 4851 4852 vmx->rmode.vm86_active = 0; 4853 vmx->spec_ctrl = 0; 4854 4855 vmx->msr_ia32_umwait_control = 0; 4856 4857 vmx->hv_deadline_tsc = -1; 4858 kvm_set_cr8(vcpu, 0); 4859 4860 seg_setup(VCPU_SREG_CS); 4861 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4862 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4863 4864 seg_setup(VCPU_SREG_DS); 4865 seg_setup(VCPU_SREG_ES); 4866 seg_setup(VCPU_SREG_FS); 4867 seg_setup(VCPU_SREG_GS); 4868 seg_setup(VCPU_SREG_SS); 4869 4870 vmcs_write16(GUEST_TR_SELECTOR, 0); 4871 vmcs_writel(GUEST_TR_BASE, 0); 4872 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4873 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4874 4875 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4876 vmcs_writel(GUEST_LDTR_BASE, 0); 4877 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4878 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4879 4880 vmcs_writel(GUEST_GDTR_BASE, 0); 4881 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4882 4883 vmcs_writel(GUEST_IDTR_BASE, 0); 4884 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4885 4886 vmx_segment_cache_clear(vmx); 4887 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4888 4889 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4890 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4891 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4892 if (kvm_mpx_supported()) 4893 vmcs_write64(GUEST_BNDCFGS, 0); 4894 4895 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4896 4897 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4898 4899 vpid_sync_context(vmx->vpid); 4900 4901 vmx_update_fb_clear_dis(vcpu, vmx); 4902 } 4903 4904 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4905 { 4906 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4907 } 4908 4909 void 
vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4910 { 4911 if (!enable_vnmi || 4912 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4913 vmx_enable_irq_window(vcpu); 4914 return; 4915 } 4916 4917 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4918 } 4919 4920 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4921 { 4922 struct vcpu_vmx *vmx = to_vmx(vcpu); 4923 uint32_t intr; 4924 int irq = vcpu->arch.interrupt.nr; 4925 4926 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4927 4928 ++vcpu->stat.irq_injections; 4929 if (vmx->rmode.vm86_active) { 4930 int inc_eip = 0; 4931 if (vcpu->arch.interrupt.soft) 4932 inc_eip = vcpu->arch.event_exit_inst_len; 4933 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4934 return; 4935 } 4936 intr = irq | INTR_INFO_VALID_MASK; 4937 if (vcpu->arch.interrupt.soft) { 4938 intr |= INTR_TYPE_SOFT_INTR; 4939 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4940 vmx->vcpu.arch.event_exit_inst_len); 4941 } else 4942 intr |= INTR_TYPE_EXT_INTR; 4943 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4944 4945 vmx_clear_hlt(vcpu); 4946 } 4947 4948 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4949 { 4950 struct vcpu_vmx *vmx = to_vmx(vcpu); 4951 4952 if (!enable_vnmi) { 4953 /* 4954 * Tracking the NMI-blocked state in software is built upon 4955 * finding the next open IRQ window. This, in turn, depends on 4956 * well-behaving guests: They have to keep IRQs disabled at 4957 * least as long as the NMI handler runs. Otherwise we may 4958 * cause NMI nesting, maybe breaking the guest. But as this is 4959 * highly unlikely, we can live with the residual risk. 
4960 */ 4961 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4962 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4963 } 4964 4965 ++vcpu->stat.nmi_injections; 4966 vmx->loaded_vmcs->nmi_known_unmasked = false; 4967 4968 if (vmx->rmode.vm86_active) { 4969 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4970 return; 4971 } 4972 4973 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4974 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4975 4976 vmx_clear_hlt(vcpu); 4977 } 4978 4979 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4980 { 4981 struct vcpu_vmx *vmx = to_vmx(vcpu); 4982 bool masked; 4983 4984 if (!enable_vnmi) 4985 return vmx->loaded_vmcs->soft_vnmi_blocked; 4986 if (vmx->loaded_vmcs->nmi_known_unmasked) 4987 return false; 4988 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4989 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4990 return masked; 4991 } 4992 4993 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4994 { 4995 struct vcpu_vmx *vmx = to_vmx(vcpu); 4996 4997 if (!enable_vnmi) { 4998 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 4999 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5000 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5001 } 5002 } else { 5003 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5004 if (masked) 5005 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5006 GUEST_INTR_STATE_NMI); 5007 else 5008 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5009 GUEST_INTR_STATE_NMI); 5010 } 5011 } 5012 5013 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5014 { 5015 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5016 return false; 5017 5018 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5019 return true; 5020 5021 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5022 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5023 GUEST_INTR_STATE_NMI)); 5024 } 5025 5026 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5027 { 5028 if (to_vmx(vcpu)->nested.nested_run_pending) 5029 return -EBUSY; 5030 
5031 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5032 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5033 return -EBUSY; 5034 5035 return !vmx_nmi_blocked(vcpu); 5036 } 5037 5038 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5039 { 5040 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5041 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5042 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5043 } 5044 5045 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5046 { 5047 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5048 return false; 5049 5050 return __vmx_interrupt_blocked(vcpu); 5051 } 5052 5053 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5054 { 5055 if (to_vmx(vcpu)->nested.nested_run_pending) 5056 return -EBUSY; 5057 5058 /* 5059 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5060 * e.g. if the IRQ arrived asynchronously after checking nested events. 5061 */ 5062 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5063 return -EBUSY; 5064 5065 return !vmx_interrupt_blocked(vcpu); 5066 } 5067 5068 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5069 { 5070 void __user *ret; 5071 5072 if (enable_unrestricted_guest) 5073 return 0; 5074 5075 mutex_lock(&kvm->slots_lock); 5076 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5077 PAGE_SIZE * 3); 5078 mutex_unlock(&kvm->slots_lock); 5079 5080 if (IS_ERR(ret)) 5081 return PTR_ERR(ret); 5082 5083 to_kvm_vmx(kvm)->tss_addr = addr; 5084 5085 return init_rmode_tss(kvm, ret); 5086 } 5087 5088 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5089 { 5090 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5091 return 0; 5092 } 5093 5094 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5095 { 5096 switch (vec) { 5097 case BP_VECTOR: 5098 /* 5099 * Update instruction length as we may reinject the exception 5100 * from user space while in guest debugging 
mode. 5101 */ 5102 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5103 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5104 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5105 return false; 5106 fallthrough; 5107 case DB_VECTOR: 5108 return !(vcpu->guest_debug & 5109 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5110 case DE_VECTOR: 5111 case OF_VECTOR: 5112 case BR_VECTOR: 5113 case UD_VECTOR: 5114 case DF_VECTOR: 5115 case SS_VECTOR: 5116 case GP_VECTOR: 5117 case MF_VECTOR: 5118 return true; 5119 } 5120 return false; 5121 } 5122 5123 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5124 int vec, u32 err_code) 5125 { 5126 /* 5127 * Instruction with address size override prefix opcode 0x67 5128 * Cause the #SS fault with 0 error code in VM86 mode. 5129 */ 5130 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5131 if (kvm_emulate_instruction(vcpu, 0)) { 5132 if (vcpu->arch.halt_request) { 5133 vcpu->arch.halt_request = 0; 5134 return kvm_emulate_halt_noskip(vcpu); 5135 } 5136 return 1; 5137 } 5138 return 0; 5139 } 5140 5141 /* 5142 * Forward all other exceptions that are valid in real mode. 5143 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5144 * the required debugging infrastructure rework. 5145 */ 5146 kvm_queue_exception(vcpu, vec); 5147 return 1; 5148 } 5149 5150 static int handle_machine_check(struct kvm_vcpu *vcpu) 5151 { 5152 /* handled by vmx_vcpu_run() */ 5153 return 1; 5154 } 5155 5156 /* 5157 * If the host has split lock detection disabled, then #AC is 5158 * unconditionally injected into the guest, which is the pre split lock 5159 * detection behaviour. 
5160 * 5161 * If the host has split lock detection enabled then #AC is 5162 * only injected into the guest when: 5163 * - Guest CPL == 3 (user mode) 5164 * - Guest has #AC detection enabled in CR0 5165 * - Guest EFLAGS has AC bit set 5166 */ 5167 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5168 { 5169 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5170 return true; 5171 5172 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5173 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5174 } 5175 5176 static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) 5177 { 5178 return vcpu->arch.guest_fpu.fpstate->xfd && 5179 !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); 5180 } 5181 5182 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5183 { 5184 struct vcpu_vmx *vmx = to_vmx(vcpu); 5185 struct kvm_run *kvm_run = vcpu->run; 5186 u32 intr_info, ex_no, error_code; 5187 unsigned long cr2, dr6; 5188 u32 vect_info; 5189 5190 vect_info = vmx->idt_vectoring_info; 5191 intr_info = vmx_get_intr_info(vcpu); 5192 5193 /* 5194 * Machine checks are handled by handle_exception_irqoff(), or by 5195 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5196 * vmx_vcpu_enter_exit(). 5197 */ 5198 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5199 return 1; 5200 5201 /* 5202 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5203 * This ensures the nested_vmx check is not skipped so vmexit can 5204 * be reflected to L1 (when it intercepts #NM) before reaching this 5205 * point. 5206 */ 5207 if (is_nm_fault(intr_info)) { 5208 kvm_queue_exception_p(vcpu, NM_VECTOR, 5209 is_xfd_nm_fault(vcpu) ? 
vcpu->arch.guest_fpu.xfd_err : 0); 5210 return 1; 5211 } 5212 5213 if (is_invalid_opcode(intr_info)) 5214 return handle_ud(vcpu); 5215 5216 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5217 struct vmx_ve_information *ve_info = vmx->ve_info; 5218 5219 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5220 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5221 dump_vmcs(vcpu); 5222 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5223 return 1; 5224 } 5225 5226 error_code = 0; 5227 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5228 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5229 5230 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5231 WARN_ON_ONCE(!enable_vmware_backdoor); 5232 5233 /* 5234 * VMware backdoor emulation on #GP interception only handles 5235 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5236 * error code on #GP. 5237 */ 5238 if (error_code) { 5239 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5240 return 1; 5241 } 5242 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5243 } 5244 5245 /* 5246 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5247 * MMIO, it is better to report an internal error. 5248 * See the comments in vmx_handle_exit. 
5249 */ 5250 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5251 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5252 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5253 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5254 vcpu->run->internal.ndata = 4; 5255 vcpu->run->internal.data[0] = vect_info; 5256 vcpu->run->internal.data[1] = intr_info; 5257 vcpu->run->internal.data[2] = error_code; 5258 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5259 return 0; 5260 } 5261 5262 if (is_page_fault(intr_info)) { 5263 cr2 = vmx_get_exit_qual(vcpu); 5264 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5265 /* 5266 * EPT will cause page fault only if we need to 5267 * detect illegal GPAs. 5268 */ 5269 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5270 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5271 return 1; 5272 } else 5273 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5274 } 5275 5276 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5277 5278 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5279 return handle_rmode_exception(vcpu, ex_no, error_code); 5280 5281 switch (ex_no) { 5282 case DB_VECTOR: 5283 dr6 = vmx_get_exit_qual(vcpu); 5284 if (!(vcpu->guest_debug & 5285 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5286 /* 5287 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5288 * instruction. ICEBP generates a trap-like #DB, but 5289 * despite its interception control being tied to #DB, 5290 * is an instruction intercept, i.e. the VM-Exit occurs 5291 * on the ICEBP itself. Use the inner "skip" helper to 5292 * avoid single-step #DB and MTF updates, as ICEBP is 5293 * higher priority. Note, skipping ICEBP still clears 5294 * STI and MOVSS blocking. 5295 * 5296 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5297 * if single-step is enabled in RFLAGS and STI or MOVSS 5298 * blocking is active, as the CPU doesn't set the bit 5299 * on VM-Exit due to #DB interception. 
VM-Entry has a 5300 * consistency check that a single-step #DB is pending 5301 * in this scenario as the previous instruction cannot 5302 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5303 * don't modify RFLAGS), therefore the one instruction 5304 * delay when activating single-step breakpoints must 5305 * have already expired. Note, the CPU sets/clears BS 5306 * as appropriate for all other VM-Exits types. 5307 */ 5308 if (is_icebp(intr_info)) 5309 WARN_ON(!skip_emulated_instruction(vcpu)); 5310 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5311 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5312 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5313 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5314 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5315 5316 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5317 return 1; 5318 } 5319 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5320 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5321 fallthrough; 5322 case BP_VECTOR: 5323 /* 5324 * Update instruction length as we may reinject #BP from 5325 * user space while in guest debugging mode. Reading it for 5326 * #DB as well causes no harm, it is not used in that case. 5327 */ 5328 vmx->vcpu.arch.event_exit_inst_len = 5329 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5330 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5331 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5332 kvm_run->debug.arch.exception = ex_no; 5333 break; 5334 case AC_VECTOR: 5335 if (vmx_guest_inject_ac(vcpu)) { 5336 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5337 return 1; 5338 } 5339 5340 /* 5341 * Handle split lock. Depending on detection mode this will 5342 * either warn and disable split lock detection for this 5343 * task or force SIGBUS on it. 
5344 */ 5345 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5346 return 1; 5347 fallthrough; 5348 default: 5349 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5350 kvm_run->ex.exception = ex_no; 5351 kvm_run->ex.error_code = error_code; 5352 break; 5353 } 5354 return 0; 5355 } 5356 5357 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5358 { 5359 ++vcpu->stat.irq_exits; 5360 return 1; 5361 } 5362 5363 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5364 { 5365 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5366 vcpu->mmio_needed = 0; 5367 return 0; 5368 } 5369 5370 static int handle_io(struct kvm_vcpu *vcpu) 5371 { 5372 unsigned long exit_qualification; 5373 int size, in, string; 5374 unsigned port; 5375 5376 exit_qualification = vmx_get_exit_qual(vcpu); 5377 string = (exit_qualification & 16) != 0; 5378 5379 ++vcpu->stat.io_exits; 5380 5381 if (string) 5382 return kvm_emulate_instruction(vcpu, 0); 5383 5384 port = exit_qualification >> 16; 5385 size = (exit_qualification & 7) + 1; 5386 in = (exit_qualification & 8) != 0; 5387 5388 return kvm_fast_pio(vcpu, size, port, in); 5389 } 5390 5391 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5392 { 5393 /* 5394 * Patch in the VMCALL instruction: 5395 */ 5396 hypercall[0] = 0x0f; 5397 hypercall[1] = 0x01; 5398 hypercall[2] = 0xc1; 5399 } 5400 5401 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5402 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5403 { 5404 if (is_guest_mode(vcpu)) { 5405 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5406 unsigned long orig_val = val; 5407 5408 /* 5409 * We get here when L2 changed cr0 in a way that did not change 5410 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5411 * but did change L0 shadowed bits. So we first calculate the 5412 * effective cr0 value that L1 would like to write into the 5413 * hardware. 
It consists of the L2-owned bits from the new 5414 * value combined with the L1-owned bits from L1's guest_cr0. 5415 */ 5416 val = (val & ~vmcs12->cr0_guest_host_mask) | 5417 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5418 5419 if (kvm_set_cr0(vcpu, val)) 5420 return 1; 5421 vmcs_writel(CR0_READ_SHADOW, orig_val); 5422 return 0; 5423 } else { 5424 return kvm_set_cr0(vcpu, val); 5425 } 5426 } 5427 5428 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5429 { 5430 if (is_guest_mode(vcpu)) { 5431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5432 unsigned long orig_val = val; 5433 5434 /* analogously to handle_set_cr0 */ 5435 val = (val & ~vmcs12->cr4_guest_host_mask) | 5436 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5437 if (kvm_set_cr4(vcpu, val)) 5438 return 1; 5439 vmcs_writel(CR4_READ_SHADOW, orig_val); 5440 return 0; 5441 } else 5442 return kvm_set_cr4(vcpu, val); 5443 } 5444 5445 static int handle_desc(struct kvm_vcpu *vcpu) 5446 { 5447 /* 5448 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5449 * and other code needs to be updated if UMIP can be guest owned. 
5450 */ 5451 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5452 5453 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5454 return kvm_emulate_instruction(vcpu, 0); 5455 } 5456 5457 static int handle_cr(struct kvm_vcpu *vcpu) 5458 { 5459 unsigned long exit_qualification, val; 5460 int cr; 5461 int reg; 5462 int err; 5463 int ret; 5464 5465 exit_qualification = vmx_get_exit_qual(vcpu); 5466 cr = exit_qualification & 15; 5467 reg = (exit_qualification >> 8) & 15; 5468 switch ((exit_qualification >> 4) & 3) { 5469 case 0: /* mov to cr */ 5470 val = kvm_register_read(vcpu, reg); 5471 trace_kvm_cr_write(cr, val); 5472 switch (cr) { 5473 case 0: 5474 err = handle_set_cr0(vcpu, val); 5475 return kvm_complete_insn_gp(vcpu, err); 5476 case 3: 5477 WARN_ON_ONCE(enable_unrestricted_guest); 5478 5479 err = kvm_set_cr3(vcpu, val); 5480 return kvm_complete_insn_gp(vcpu, err); 5481 case 4: 5482 err = handle_set_cr4(vcpu, val); 5483 return kvm_complete_insn_gp(vcpu, err); 5484 case 8: { 5485 u8 cr8_prev = kvm_get_cr8(vcpu); 5486 u8 cr8 = (u8)val; 5487 err = kvm_set_cr8(vcpu, cr8); 5488 ret = kvm_complete_insn_gp(vcpu, err); 5489 if (lapic_in_kernel(vcpu)) 5490 return ret; 5491 if (cr8_prev <= cr8) 5492 return ret; 5493 /* 5494 * TODO: we might be squashing a 5495 * KVM_GUESTDBG_SINGLESTEP-triggered 5496 * KVM_EXIT_DEBUG here. 
5497 */ 5498 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5499 return 0; 5500 } 5501 } 5502 break; 5503 case 2: /* clts */ 5504 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5505 return -EIO; 5506 case 1: /*mov from cr*/ 5507 switch (cr) { 5508 case 3: 5509 WARN_ON_ONCE(enable_unrestricted_guest); 5510 5511 val = kvm_read_cr3(vcpu); 5512 kvm_register_write(vcpu, reg, val); 5513 trace_kvm_cr_read(cr, val); 5514 return kvm_skip_emulated_instruction(vcpu); 5515 case 8: 5516 val = kvm_get_cr8(vcpu); 5517 kvm_register_write(vcpu, reg, val); 5518 trace_kvm_cr_read(cr, val); 5519 return kvm_skip_emulated_instruction(vcpu); 5520 } 5521 break; 5522 case 3: /* lmsw */ 5523 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5524 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5525 kvm_lmsw(vcpu, val); 5526 5527 return kvm_skip_emulated_instruction(vcpu); 5528 default: 5529 break; 5530 } 5531 vcpu->run->exit_reason = 0; 5532 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5533 (int)(exit_qualification >> 4) & 3, cr); 5534 return 0; 5535 } 5536 5537 static int handle_dr(struct kvm_vcpu *vcpu) 5538 { 5539 unsigned long exit_qualification; 5540 int dr, dr7, reg; 5541 int err = 1; 5542 5543 exit_qualification = vmx_get_exit_qual(vcpu); 5544 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5545 5546 /* First, if DR does not exist, trigger UD */ 5547 if (!kvm_require_dr(vcpu, dr)) 5548 return 1; 5549 5550 if (vmx_get_cpl(vcpu) > 0) 5551 goto out; 5552 5553 dr7 = vmcs_readl(GUEST_DR7); 5554 if (dr7 & DR7_GD) { 5555 /* 5556 * As the vm-exit takes precedence over the debug trap, we 5557 * need to emulate the latter, either for the host or the 5558 * guest debugging itself. 
5559 */ 5560 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5561 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5562 vcpu->run->debug.arch.dr7 = dr7; 5563 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5564 vcpu->run->debug.arch.exception = DB_VECTOR; 5565 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5566 return 0; 5567 } else { 5568 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5569 return 1; 5570 } 5571 } 5572 5573 if (vcpu->guest_debug == 0) { 5574 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5575 5576 /* 5577 * No more DR vmexits; force a reload of the debug registers 5578 * and reenter on this instruction. The next vmexit will 5579 * retrieve the full state of the debug registers. 5580 */ 5581 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5582 return 1; 5583 } 5584 5585 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5586 if (exit_qualification & TYPE_MOV_FROM_DR) { 5587 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5588 err = 0; 5589 } else { 5590 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5591 } 5592 5593 out: 5594 return kvm_complete_insn_gp(vcpu, err); 5595 } 5596 5597 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5598 { 5599 get_debugreg(vcpu->arch.db[0], 0); 5600 get_debugreg(vcpu->arch.db[1], 1); 5601 get_debugreg(vcpu->arch.db[2], 2); 5602 get_debugreg(vcpu->arch.db[3], 3); 5603 get_debugreg(vcpu->arch.dr6, 6); 5604 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5605 5606 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5607 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5608 5609 /* 5610 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5611 * a stale dr6 from the guest. 
5612 */ 5613 set_debugreg(DR6_RESERVED, 6); 5614 } 5615 5616 void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5617 { 5618 lockdep_assert_irqs_disabled(); 5619 set_debugreg(vcpu->arch.dr6, 6); 5620 } 5621 5622 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5623 { 5624 vmcs_writel(GUEST_DR7, val); 5625 } 5626 5627 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5628 { 5629 kvm_apic_update_ppr(vcpu); 5630 return 1; 5631 } 5632 5633 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5634 { 5635 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5636 5637 kvm_make_request(KVM_REQ_EVENT, vcpu); 5638 5639 ++vcpu->stat.irq_window_exits; 5640 return 1; 5641 } 5642 5643 static int handle_invlpg(struct kvm_vcpu *vcpu) 5644 { 5645 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5646 5647 kvm_mmu_invlpg(vcpu, exit_qualification); 5648 return kvm_skip_emulated_instruction(vcpu); 5649 } 5650 5651 static int handle_apic_access(struct kvm_vcpu *vcpu) 5652 { 5653 if (likely(fasteoi)) { 5654 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5655 int access_type, offset; 5656 5657 access_type = exit_qualification & APIC_ACCESS_TYPE; 5658 offset = exit_qualification & APIC_ACCESS_OFFSET; 5659 /* 5660 * Sane guest uses MOV to write EOI, with written value 5661 * not cared. So make a short-circuit here by avoiding 5662 * heavy instruction emulation. 
5663 */ 5664 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5665 (offset == APIC_EOI)) { 5666 kvm_lapic_set_eoi(vcpu); 5667 return kvm_skip_emulated_instruction(vcpu); 5668 } 5669 } 5670 return kvm_emulate_instruction(vcpu, 0); 5671 } 5672 5673 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5674 { 5675 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5676 int vector = exit_qualification & 0xff; 5677 5678 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5679 kvm_apic_set_eoi_accelerated(vcpu, vector); 5680 return 1; 5681 } 5682 5683 static int handle_apic_write(struct kvm_vcpu *vcpu) 5684 { 5685 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5686 5687 /* 5688 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5689 * hardware has done any necessary aliasing, offset adjustments, etc... 5690 * for the access. I.e. the correct value has already been written to 5691 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5692 * retrieve the register value and emulate the access. 
5693 */ 5694 u32 offset = exit_qualification & 0xff0; 5695 5696 kvm_apic_write_nodecode(vcpu, offset); 5697 return 1; 5698 } 5699 5700 static int handle_task_switch(struct kvm_vcpu *vcpu) 5701 { 5702 struct vcpu_vmx *vmx = to_vmx(vcpu); 5703 unsigned long exit_qualification; 5704 bool has_error_code = false; 5705 u32 error_code = 0; 5706 u16 tss_selector; 5707 int reason, type, idt_v, idt_index; 5708 5709 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5710 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5711 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5712 5713 exit_qualification = vmx_get_exit_qual(vcpu); 5714 5715 reason = (u32)exit_qualification >> 30; 5716 if (reason == TASK_SWITCH_GATE && idt_v) { 5717 switch (type) { 5718 case INTR_TYPE_NMI_INTR: 5719 vcpu->arch.nmi_injected = false; 5720 vmx_set_nmi_mask(vcpu, true); 5721 break; 5722 case INTR_TYPE_EXT_INTR: 5723 case INTR_TYPE_SOFT_INTR: 5724 kvm_clear_interrupt_queue(vcpu); 5725 break; 5726 case INTR_TYPE_HARD_EXCEPTION: 5727 if (vmx->idt_vectoring_info & 5728 VECTORING_INFO_DELIVER_CODE_MASK) { 5729 has_error_code = true; 5730 error_code = 5731 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5732 } 5733 fallthrough; 5734 case INTR_TYPE_SOFT_EXCEPTION: 5735 kvm_clear_exception_queue(vcpu); 5736 break; 5737 default: 5738 break; 5739 } 5740 } 5741 tss_selector = exit_qualification; 5742 5743 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5744 type != INTR_TYPE_EXT_INTR && 5745 type != INTR_TYPE_NMI_INTR)) 5746 WARN_ON(!skip_emulated_instruction(vcpu)); 5747 5748 /* 5749 * TODO: What about debug traps on tss switch? 5750 * Are we supposed to inject them and update dr6? 5751 */ 5752 return kvm_task_switch(vcpu, tss_selector, 5753 type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, 5754 reason, has_error_code, error_code); 5755 } 5756 5757 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5758 { 5759 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5760 gpa_t gpa; 5761 5762 /* 5763 * EPT violation happened while executing iret from NMI, 5764 * "blocked by NMI" bit has to be set before next VM entry. 5765 * There are errata that may cause this bit to not be set: 5766 * AAK134, BY25. 5767 */ 5768 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5769 enable_vnmi && 5770 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5771 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5772 5773 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5774 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5775 5776 /* 5777 * Check that the GPA doesn't exceed physical memory limits, as that is 5778 * a guest page fault. We have to emulate the instruction here, because 5779 * if the illegal address is that of a paging structure, then 5780 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5781 * would also use advanced VM-exit information for EPT violations to 5782 * reconstruct the page fault error code. 5783 */ 5784 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 5785 return kvm_emulate_instruction(vcpu, 0); 5786 5787 return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); 5788 } 5789 5790 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5791 { 5792 gpa_t gpa; 5793 5794 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5795 return 1; 5796 5797 /* 5798 * A nested guest cannot optimize MMIO vmexits, because we have an 5799 * nGPA here instead of the required GPA. 
5800 */ 5801 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5802 if (!is_guest_mode(vcpu) && 5803 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5804 trace_kvm_fast_mmio(gpa); 5805 return kvm_skip_emulated_instruction(vcpu); 5806 } 5807 5808 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5809 } 5810 5811 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5812 { 5813 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5814 return -EIO; 5815 5816 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5817 ++vcpu->stat.nmi_window_exits; 5818 kvm_make_request(KVM_REQ_EVENT, vcpu); 5819 5820 return 1; 5821 } 5822 5823 /* 5824 * Returns true if emulation is required (due to the vCPU having invalid state 5825 * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the 5826 * current vCPU state. 5827 */ 5828 static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) 5829 { 5830 struct vcpu_vmx *vmx = to_vmx(vcpu); 5831 5832 if (!vmx->vt.emulation_required) 5833 return false; 5834 5835 /* 5836 * It is architecturally impossible for emulation to be required when a 5837 * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if 5838 * guest state is invalid and unrestricted guest is disabled, i.e. KVM 5839 * should synthesize VM-Fail instead emulation L2 code. This path is 5840 * only reachable if userspace modifies L2 guest state after KVM has 5841 * performed the nested VM-Enter consistency checks. 5842 */ 5843 if (vmx->nested.nested_run_pending) 5844 return true; 5845 5846 /* 5847 * KVM only supports emulating exceptions if the vCPU is in Real Mode. 5848 * If emulation is required, KVM can't perform a successful VM-Enter to 5849 * inject the exception. 
5850 */ 5851 return !vmx->rmode.vm86_active && 5852 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5853 } 5854 5855 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5856 { 5857 struct vcpu_vmx *vmx = to_vmx(vcpu); 5858 bool intr_window_requested; 5859 unsigned count = 130; 5860 5861 intr_window_requested = exec_controls_get(vmx) & 5862 CPU_BASED_INTR_WINDOW_EXITING; 5863 5864 while (vmx->vt.emulation_required && count-- != 0) { 5865 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5866 return handle_interrupt_window(&vmx->vcpu); 5867 5868 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5869 return 1; 5870 5871 if (!kvm_emulate_instruction(vcpu, 0)) 5872 return 0; 5873 5874 if (vmx_unhandleable_emulation_required(vcpu)) { 5875 kvm_prepare_emulation_failure_exit(vcpu); 5876 return 0; 5877 } 5878 5879 if (vcpu->arch.halt_request) { 5880 vcpu->arch.halt_request = 0; 5881 return kvm_emulate_halt_noskip(vcpu); 5882 } 5883 5884 /* 5885 * Note, return 1 and not 0, vcpu_run() will invoke 5886 * xfer_to_guest_mode() which will create a proper return 5887 * code. 5888 */ 5889 if (__xfer_to_guest_mode_work_pending()) 5890 return 1; 5891 } 5892 5893 return 1; 5894 } 5895 5896 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5897 { 5898 if (vmx_unhandleable_emulation_required(vcpu)) { 5899 kvm_prepare_emulation_failure_exit(vcpu); 5900 return 0; 5901 } 5902 5903 return 1; 5904 } 5905 5906 /* 5907 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5908 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5909 */ 5910 static int handle_pause(struct kvm_vcpu *vcpu) 5911 { 5912 if (!kvm_pause_in_guest(vcpu->kvm)) 5913 grow_ple_window(vcpu); 5914 5915 /* 5916 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5917 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5918 * never set PAUSE_EXITING and just set PLE if supported, 5919 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 
5920 */ 5921 kvm_vcpu_on_spin(vcpu, true); 5922 return kvm_skip_emulated_instruction(vcpu); 5923 } 5924 5925 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5926 { 5927 return 1; 5928 } 5929 5930 static int handle_invpcid(struct kvm_vcpu *vcpu) 5931 { 5932 u32 vmx_instruction_info; 5933 unsigned long type; 5934 gva_t gva; 5935 struct { 5936 u64 pcid; 5937 u64 gla; 5938 } operand; 5939 int gpr_index; 5940 5941 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { 5942 kvm_queue_exception(vcpu, UD_VECTOR); 5943 return 1; 5944 } 5945 5946 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5947 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5948 type = kvm_register_read(vcpu, gpr_index); 5949 5950 /* According to the Intel instruction reference, the memory operand 5951 * is read even if it isn't needed (e.g., for type==all) 5952 */ 5953 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5954 vmx_instruction_info, false, 5955 sizeof(operand), &gva)) 5956 return 1; 5957 5958 return kvm_handle_invpcid(vcpu, type, gva); 5959 } 5960 5961 static int handle_pml_full(struct kvm_vcpu *vcpu) 5962 { 5963 unsigned long exit_qualification; 5964 5965 trace_kvm_pml_full(vcpu->vcpu_id); 5966 5967 exit_qualification = vmx_get_exit_qual(vcpu); 5968 5969 /* 5970 * PML buffer FULL happened while executing iret from NMI, 5971 * "blocked by NMI" bit has to be set before next VM entry. 5972 */ 5973 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5974 enable_vnmi && 5975 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5976 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5977 GUEST_INTR_STATE_NMI); 5978 5979 /* 5980 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5981 * here.., and there's no userspace involvement needed for PML. 
5982 */ 5983 return 1; 5984 } 5985 5986 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 5987 bool force_immediate_exit) 5988 { 5989 struct vcpu_vmx *vmx = to_vmx(vcpu); 5990 5991 /* 5992 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 5993 * due to the timer expiring while it was "soft" disabled, just eat the 5994 * exit and re-enter the guest. 5995 */ 5996 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 5997 return EXIT_FASTPATH_REENTER_GUEST; 5998 5999 /* 6000 * If the timer expired because KVM used it to force an immediate exit, 6001 * then mission accomplished. 6002 */ 6003 if (force_immediate_exit) 6004 return EXIT_FASTPATH_EXIT_HANDLED; 6005 6006 /* 6007 * If L2 is active, go down the slow path as emulating the guest timer 6008 * expiration likely requires synthesizing a nested VM-Exit. 6009 */ 6010 if (is_guest_mode(vcpu)) 6011 return EXIT_FASTPATH_NONE; 6012 6013 kvm_lapic_expired_hv_timer(vcpu); 6014 return EXIT_FASTPATH_REENTER_GUEST; 6015 } 6016 6017 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6018 { 6019 /* 6020 * This non-fastpath handler is reached if and only if the preemption 6021 * timer was being used to emulate a guest timer while L2 is active. 6022 * All other scenarios are supposed to be handled in the fastpath. 6023 */ 6024 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6025 kvm_lapic_expired_hv_timer(vcpu); 6026 return 1; 6027 } 6028 6029 /* 6030 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6031 * are overwritten by nested_vmx_hardware_setup() when nested=1. 6032 */ 6033 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6034 { 6035 kvm_queue_exception(vcpu, UD_VECTOR); 6036 return 1; 6037 } 6038 6039 #ifndef CONFIG_X86_SGX_KVM 6040 static int handle_encls(struct kvm_vcpu *vcpu) 6041 { 6042 /* 6043 * SGX virtualization is disabled. 
There is no software enable bit for 6044 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6045 * the guest from executing ENCLS (when SGX is supported by hardware). 6046 */ 6047 kvm_queue_exception(vcpu, UD_VECTOR); 6048 return 1; 6049 } 6050 #endif /* CONFIG_X86_SGX_KVM */ 6051 6052 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6053 { 6054 /* 6055 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6056 * VM-Exits. Unconditionally set the flag here and leave the handling to 6057 * vmx_handle_exit(). 6058 */ 6059 to_vt(vcpu)->exit_reason.bus_lock_detected = true; 6060 return 1; 6061 } 6062 6063 static int handle_notify(struct kvm_vcpu *vcpu) 6064 { 6065 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6066 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6067 6068 ++vcpu->stat.notify_window_exits; 6069 6070 /* 6071 * Notify VM exit happened while executing iret from NMI, 6072 * "blocked by NMI" bit has to be set before next VM entry. 6073 */ 6074 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6075 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6076 GUEST_INTR_STATE_NMI); 6077 6078 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6079 context_invalid) { 6080 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6081 vcpu->run->notify.flags = context_invalid ? 6082 KVM_NOTIFY_CONTEXT_INVALID : 0; 6083 return 0; 6084 } 6085 6086 return 1; 6087 } 6088 6089 /* 6090 * The exit handlers return 1 if the exit was handled fully and guest execution 6091 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6092 * to be done to userspace and return 0. 
6093 */ 6094 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6095 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6096 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6097 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6098 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6099 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6100 [EXIT_REASON_CR_ACCESS] = handle_cr, 6101 [EXIT_REASON_DR_ACCESS] = handle_dr, 6102 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6103 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6104 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6105 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6106 [EXIT_REASON_HLT] = kvm_emulate_halt, 6107 [EXIT_REASON_INVD] = kvm_emulate_invd, 6108 [EXIT_REASON_INVLPG] = handle_invlpg, 6109 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6110 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6111 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6112 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6113 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6114 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6115 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6116 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6117 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6118 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6119 [EXIT_REASON_VMON] = handle_vmx_instruction, 6120 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6121 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6122 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6123 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6124 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6125 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6126 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6127 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6128 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6129 [EXIT_REASON_LDTR_TR] = handle_desc, 6130 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6131 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6132 
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6133 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6134 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6135 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6136 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6137 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6138 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6139 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6140 [EXIT_REASON_PML_FULL] = handle_pml_full, 6141 [EXIT_REASON_INVPCID] = handle_invpcid, 6142 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6143 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6144 [EXIT_REASON_ENCLS] = handle_encls, 6145 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6146 [EXIT_REASON_NOTIFY] = handle_notify, 6147 }; 6148 6149 static const int kvm_vmx_max_exit_handlers = 6150 ARRAY_SIZE(kvm_vmx_exit_handlers); 6151 6152 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6153 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6154 { 6155 struct vcpu_vmx *vmx = to_vmx(vcpu); 6156 6157 *reason = vmx->vt.exit_reason.full; 6158 *info1 = vmx_get_exit_qual(vcpu); 6159 if (!(vmx->vt.exit_reason.failed_vmentry)) { 6160 *info2 = vmx->idt_vectoring_info; 6161 *intr_info = vmx_get_intr_info(vcpu); 6162 if (is_exception_with_error_code(*intr_info)) 6163 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6164 else 6165 *error_code = 0; 6166 } else { 6167 *info2 = 0; 6168 *intr_info = 0; 6169 *error_code = 0; 6170 } 6171 } 6172 6173 void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) 6174 { 6175 *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 6176 if (is_exception_with_error_code(*intr_info)) 6177 *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); 6178 else 6179 *error_code = 0; 6180 } 6181 6182 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6183 { 6184 if (vmx->pml_pg) { 6185 __free_page(vmx->pml_pg); 6186 vmx->pml_pg = NULL; 6187 } 6188 } 6189 6190 static 
void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6191 { 6192 struct vcpu_vmx *vmx = to_vmx(vcpu); 6193 u16 pml_idx, pml_tail_index; 6194 u64 *pml_buf; 6195 int i; 6196 6197 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6198 6199 /* Do nothing if PML buffer is empty */ 6200 if (pml_idx == PML_HEAD_INDEX) 6201 return; 6202 /* 6203 * PML index always points to the next available PML buffer entity 6204 * unless PML log has just overflowed. 6205 */ 6206 pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; 6207 6208 /* 6209 * PML log is written backwards: the CPU first writes the entry 511 6210 * then the entry 510, and so on. 6211 * 6212 * Read the entries in the same order they were written, to ensure that 6213 * the dirty ring is filled in the same order the CPU wrote them. 6214 */ 6215 pml_buf = page_address(vmx->pml_pg); 6216 6217 for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { 6218 u64 gpa; 6219 6220 gpa = pml_buf[i]; 6221 WARN_ON(gpa & (PAGE_SIZE - 1)); 6222 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6223 } 6224 6225 /* reset PML index */ 6226 vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); 6227 } 6228 6229 static void vmx_dump_sel(char *name, uint32_t sel) 6230 { 6231 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6232 name, vmcs_read16(sel), 6233 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6234 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6235 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6236 } 6237 6238 static void vmx_dump_dtsel(char *name, uint32_t limit) 6239 { 6240 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6241 name, vmcs_read32(limit), 6242 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6243 } 6244 6245 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6246 { 6247 unsigned int i; 6248 struct vmx_msr_entry *e; 6249 6250 pr_err("MSR %s:\n", name); 6251 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6252 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, 
/*
 * Dump the current VMCS to the kernel log: guest state, host state and the
 * control fields.  Nothing is dumped unless the dump_invalid_vmcs knob is
 * set; in that case only a rate-limited hint on how to enable it is printed.
 */
void dump_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmentry_ctl, vmexit_ctl;
	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
	u64 tertiary_exec_control;
	unsigned long cr4;
	int efer_slot;

	if (!dump_invalid_vmcs) {
		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
		return;
	}

	/*
	 * Snapshot the control fields up front; they decide which of the
	 * optional fields below are dumped.
	 */
	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	cr4 = vmcs_readl(GUEST_CR4);

	/* Only read the secondary/tertiary controls if the CPU has them. */
	if (cpu_has_secondary_exec_ctrls())
		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	else
		secondary_exec_control = 0;

	if (cpu_has_tertiary_exec_ctrls())
		tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL);
	else
		tertiary_exec_control = 0;

	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
	       vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
	pr_err("*** Guest State ***\n");
	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
	       vmcs_readl(CR0_GUEST_HOST_MASK));
	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
	if (cpu_has_vmx_ept()) {
		pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
		pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
	}
	pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
	pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(GUEST_SYSENTER_ESP),
	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
	vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
	vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
	vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
	vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
	vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
	vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
	vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
	/*
	 * Report EFER from wherever it comes from: the dedicated VMCS field
	 * if VM-Entry loads it, else the guest MSR autoload list if it has an
	 * EFER slot, else the vCPU's EFER with LMA/LME forced to agree with
	 * the "IA-32e mode guest" entry control.
	 */
	efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER);
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
	else if (efer_slot >= 0)
		pr_err("EFER= 0x%016llx (autoload)\n",
		       vmx->msr_autoload.guest.val[efer_slot].value);
	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer | (EFER_LMA | EFER_LME));
	else
		pr_err("EFER= 0x%016llx (effective)\n",
		       vcpu->arch.efer & ~(EFER_LMA | EFER_LME));
	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
	pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
	       vmcs_read64(GUEST_IA32_DEBUGCTL),
	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
	pr_err("Interruptibility = %08x ActivityState = %08x\n",
	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
	       vmcs_read32(GUEST_ACTIVITY_STATE));
	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
		pr_err("InterruptStatus = %04x\n",
		       vmcs_read16(GUEST_INTR_STATUS));
	/* Dump the MSR lists only when the VMCS says they are non-empty. */
	if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest);
	if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0)
		vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest);

	pr_err("*** Host State ***\n");
	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
	       vmcs_read16(HOST_TR_SELECTOR));
	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
	       vmcs_readl(HOST_TR_BASE));
	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
	       vmcs_readl(HOST_CR4));
	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
	       vmcs_read32(HOST_IA32_SYSENTER_CS),
	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
		pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
		pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
	if (cpu_has_load_perf_global_ctrl() &&
	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		pr_err("PerfGlobCtl = 0x%016llx\n",
		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
	if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0)
		vmx_dump_msrs("host autoload", &vmx->msr_autoload.host);

	pr_err("*** Control State ***\n");
	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
	       cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
	pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
	       pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
	       vmcs_read32(EXCEPTION_BITMAP),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
	       vmcs_read32(VM_EXIT_INTR_INFO),
	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
	pr_err(" reason=%08x qualification=%016lx\n",
	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
		pr_err("TSC Multiplier = 0x%016llx\n",
		       vmcs_read64(TSC_MULTIPLIER));
	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
		/*
		 * NOTE(review): each pr_cont() below continues a pr_err() that
		 * is only issued conditionally.  When virtual-interrupt
		 * delivery (resp. APIC-access virtualization) is off, the
		 * pr_cont() has no line from this block to continue - confirm
		 * the resulting log output is acceptable.
		 */
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
			/* High byte is SVI, low byte is RVI. */
			u16 status = vmcs_read16(GUEST_INTR_STATUS);
			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
		}
		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
	}
	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
		pr_err("PLE Gap=%08x Window=%08x\n",
		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
		pr_err("Virtual processor ID = 0x%04x\n",
		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
	if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
		struct vmx_ve_information *ve_info = vmx->ve_info;
		u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);

		/*
		 * If KVM is dumping the VMCS, then something has gone wrong
		 * already.  Dereferencing an address from the VMCS, which
		 * could very well be corrupted, is a terrible idea.  The
		 * virtual address is known so use it, and merely flag a
		 * mismatch between the VMCS field and the expected physical
		 * address.
		 */
		pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
		       ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
		pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
		       ve_info->exit_reason, ve_info->delivery,
		       ve_info->exit_qualification,
		       ve_info->guest_linear_address,
		       ve_info->guest_physical_address, ve_info->eptp_index);
	}
}
/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 *
 * Returns -EIO if a nested VM-Enter is unexpectedly pending, 0 on the paths
 * below that fill vcpu->run with an exit for userspace (failed entry,
 * unhandled event vectoring, unexpected exit reason), 1 on the paths that are
 * completed here, and otherwise whatever the dispatched exit handler returns.
 */
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
	u32 vectoring_info = vmx->idt_vectoring_info;
	u16 exit_handler_index;

	/*
	 * Flush the GPAs logged in the PML buffer so that dirty_bitmap is as
	 * up to date as possible.  A further benefit is that
	 * kvm_vm_ioctl_get_dirty_log only needs to kick all vCPUs out of
	 * guest mode before querying dirty_bitmap: if a vCPU is in root mode,
	 * its PML buffer must already have been flushed.  Note, PML is never
	 * enabled in hardware while running L2.
	 */
	if (enable_pml && !is_guest_mode(vcpu))
		vmx_flush_pml_buffer(vcpu);

	/*
	 * KVM should never reach this point with a pending nested VM-Enter.
	 * More specifically, short-circuiting VM-Entry to emulate L2 due to
	 * invalid guest state should never happen as that means KVM knowingly
	 * allowed a nested VM-Enter with an invalid vmcs12.  More below.
	 */
	if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
		return -EIO;

	if (is_guest_mode(vcpu)) {
		/*
		 * PML is never enabled when running L2, bail immediately if a
		 * PML full exit occurs as something is horribly wrong.
		 */
		if (exit_reason.basic == EXIT_REASON_PML_FULL)
			goto unexpected_vmexit;

		/*
		 * The host physical addresses of some pages of guest memory
		 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
		 * Page).  The CPU may write to these pages via their host
		 * physical address while L2 is running, bypassing any
		 * address-translation-based dirty tracking (e.g. EPT write
		 * protection).
		 *
		 * Mark them dirty on every exit from L2 to prevent them from
		 * getting out of sync with dirty tracking.
		 */
		nested_mark_vmcs12_pages_dirty(vcpu);

		/*
		 * Synthesize a triple fault if L2 state is invalid.  In normal
		 * operation, nested VM-Enter rejects any attempt to enter L2
		 * with invalid state.  However, those checks are skipped if
		 * state is being stuffed via RSM or KVM_SET_NESTED_STATE.  If
		 * L2 state is invalid, it means either L1 modified SMRAM state
		 * or userspace provided bad state.  Synthesize TRIPLE_FAULT as
		 * doing so is architecturally allowed in the RSM case, and is
		 * the least awful solution for the userspace case without
		 * risking false positives.
		 */
		if (vmx->vt.emulation_required) {
			nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
			return 1;
		}

		/* A non-zero return means the exit was reflected to L1. */
		if (nested_vmx_reflect_vmexit(vcpu))
			return 1;
	}

	/* If guest state is invalid, start emulating.  L2 is handled above. */
	if (vmx->vt.emulation_required)
		return handle_invalid_guest_state(vcpu);

	/* VM-Entry failed with an exit reason: dump state, report to userspace. */
	if (exit_reason.failed_vmentry) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason.full;
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/* The entry failed: report the VM-instruction error instead. */
	if (unlikely(vmx->fail)) {
		dump_vmcs(vcpu);
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
		return 0;
	}

	/*
	 * An exit that interrupted event delivery (valid IDT-vectoring info)
	 * is only handled for the exit reasons listed here; for any other
	 * reason, hand the situation to userspace.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
	     exit_reason.basic != EXIT_REASON_PML_FULL &&
	     exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
	     exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
	     exit_reason.basic != EXIT_REASON_NOTIFY &&
	     exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
		kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (!vmx_interrupt_blocked(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't support us in finding the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled.  So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	/* Nothing more to do if the exit was already handled in the fastpath. */
	if (exit_fastpath != EXIT_FASTPATH_NONE)
		return 1;

	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
		goto unexpected_vmexit;
#ifdef CONFIG_MITIGATION_RETPOLINE
	/*
	 * Call the handlers for these exit reasons directly; presumably this
	 * avoids the indirect call through the handler table below when
	 * retpolines are enabled.
	 */
	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
		return kvm_emulate_wrmsr(vcpu);
	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
		return handle_preemption_timer(vcpu);
	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
		return handle_interrupt_window(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
		return handle_external_interrupt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_HLT)
		return kvm_emulate_halt(vcpu);
	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
		return handle_ept_misconfig(vcpu);
#endif

	/* Clamp the index under speculation before using it on the table. */
	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
						kvm_vmx_max_exit_handlers);
	if (!kvm_vmx_exit_handlers[exit_handler_index])
		goto unexpected_vmexit;

	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);

unexpected_vmexit:
	/* Report the exit reason and last VM-Entry CPU to userspace. */
	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
		    exit_reason.full);
	dump_vmcs(vcpu);
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
	vcpu->run->internal.ndata = 2;
	vcpu->run->internal.data[0] = exit_reason.full;
	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
	return 0;
}
6620 */ 6621 if (vmx_get_exit_reason(vcpu).bus_lock_detected) { 6622 if (ret > 0) 6623 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6624 6625 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6626 return 0; 6627 } 6628 return ret; 6629 } 6630 6631 /* 6632 * Software based L1D cache flush which is used when microcode providing 6633 * the cache control MSR is not loaded. 6634 * 6635 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6636 * flush it is required to read in 64 KiB because the replacement algorithm 6637 * is not exactly LRU. This could be sized at runtime via topology 6638 * information but as all relevant affected CPUs have 32KiB L1D cache size 6639 * there is no point in doing so. 6640 */ 6641 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6642 { 6643 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6644 6645 /* 6646 * This code is only executed when the flush mode is 'cond' or 6647 * 'always' 6648 */ 6649 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6650 bool flush_l1d; 6651 6652 /* 6653 * Clear the per-vcpu flush bit, it gets set again if the vCPU 6654 * is reloaded, i.e. if the vCPU is scheduled out or if KVM 6655 * exits to userspace, or if KVM reaches one of the unsafe 6656 * VMEXIT handlers, e.g. if KVM calls into the emulator. 6657 */ 6658 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6659 vcpu->arch.l1tf_flush_l1d = false; 6660 6661 /* 6662 * Clear the per-cpu flush bit, it gets set again from 6663 * the interrupt handlers. 
/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 *
 * If the CPU advertises X86_FEATURE_FLUSH_L1D, the flush is done with a
 * single write to MSR_IA32_FLUSH_CMD instead of the software loop.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	/* Number of bytes of vmx_l1d_flush_pages walked by the loops below. */
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		bool flush_l1d;

		/*
		 * Clear the per-vcpu flush bit, it gets set again if the vCPU
		 * is reloaded, i.e. if the vCPU is scheduled out or if KVM
		 * exits to userspace, or if KVM reaches one of the unsafe
		 * VMEXIT handlers, e.g. if KVM calls into the emulator.
		 */
		flush_l1d = vcpu->arch.l1tf_flush_l1d;
		vcpu->arch.l1tf_flush_l1d = false;

		/*
		 * Clear the per-cpu flush bit, it gets set again from
		 * the interrupt handlers.
		 */
		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
		kvm_clear_cpu_l1tf_flush_l1d();

		/* Conditional mode: skip the flush if neither bit was set. */
		if (!flush_l1d)
			return;
	}

	vcpu->stat.l1d_flush++;

	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	/*
	 * Two passes over the flush buffer, using eax as the byte offset:
	 * the first touches one byte every 4096 bytes, the second one byte
	 * every 64 bytes.  cpuid is executed between the passes, which is
	 * why ebx/ecx/edx are in the clobber list alongside eax.
	 */
	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lpopulate_tlb\n\t"
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}

/*
 * Recompute the TPR threshold from the current TPR (@tpr) and the highest
 * pending interrupt (@irr, -1 if none).  The threshold is 0 when nothing is
 * pending or when @tpr is below @irr, otherwise it is @irr.
 */
void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	int tpr_threshold;

	/* Nothing to do while running L2 if vmcs12 uses the TPR shadow. */
	if (is_guest_mode(vcpu) &&
		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
	/*
	 * While L2 is active, stash the value in the nested state instead of
	 * writing it to the current VMCS.
	 */
	if (is_guest_mode(vcpu))
		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
	else
		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}
*/ 6731 if (is_guest_mode(vcpu)) { 6732 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6733 return; 6734 } 6735 6736 sec_exec_control = secondary_exec_controls_get(vmx); 6737 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6738 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6739 6740 switch (kvm_get_apic_mode(vcpu)) { 6741 case LAPIC_MODE_INVALID: 6742 WARN_ONCE(true, "Invalid local APIC state"); 6743 break; 6744 case LAPIC_MODE_DISABLED: 6745 break; 6746 case LAPIC_MODE_XAPIC: 6747 if (flexpriority_enabled) { 6748 sec_exec_control |= 6749 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6750 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6751 6752 /* 6753 * Flush the TLB, reloading the APIC access page will 6754 * only do so if its physical address has changed, but 6755 * the guest may have inserted a non-APIC mapping into 6756 * the TLB while the APIC access page was disabled. 6757 */ 6758 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6759 } 6760 break; 6761 case LAPIC_MODE_X2APIC: 6762 if (cpu_has_vmx_virtualize_x2apic_mode()) 6763 sec_exec_control |= 6764 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6765 break; 6766 } 6767 secondary_exec_controls_set(vmx, sec_exec_control); 6768 6769 vmx_update_msr_bitmap_x2apic(vcpu); 6770 } 6771 6772 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6773 { 6774 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6775 struct kvm *kvm = vcpu->kvm; 6776 struct kvm_memslots *slots = kvm_memslots(kvm); 6777 struct kvm_memory_slot *slot; 6778 struct page *refcounted_page; 6779 unsigned long mmu_seq; 6780 kvm_pfn_t pfn; 6781 bool writable; 6782 6783 /* Defer reload until vmcs01 is the current VMCS. 
/*
 * Point the VMCS APIC-access address at the host page that currently backs
 * KVM's private APIC-access memslot.  If the mapping is invalidated while
 * the page is being looked up, request another reload instead of writing a
 * possibly stale address.
 */
void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
	const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *slot;
	struct page *refcounted_page;
	unsigned long mmu_seq;
	kvm_pfn_t pfn;
	bool writable;

	/* Defer reload until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
		return;
	}

	/* Nothing to program if APIC accesses aren't being virtualized. */
	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

	/*
	 * Explicitly grab the memslot using KVM's internal slot ID to ensure
	 * KVM doesn't unintentionally grab a userspace memslot.  It _should_
	 * be impossible for userspace to create a memslot for the APIC when
	 * APICv is enabled, but paranoia won't hurt in this case.
	 *
	 * No need to retry if the memslot does not exist or is invalid.  KVM
	 * controls the APIC-access page memslot, and only deletes the memslot
	 * if APICv is permanently inhibited, i.e. the memslot won't reappear.
	 */
	slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return;

	/*
	 * Ensure that the mmu_notifier sequence count is read before KVM
	 * retrieves the pfn from the primary MMU.  Note, the memslot is
	 * protected by SRCU, not the mmu_notifier.  Pairs with the smp_wmb()
	 * in kvm_mmu_invalidate_end().
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	/* Fault in the backing page; give up silently if that fails. */
	pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return;

	/*
	 * Under mmu_lock, either commit the new address or, if an
	 * invalidation raced with the lookup above, ask for another reload.
	 */
	read_lock(&vcpu->kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
		kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
	else
		vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));

	/*
	 * Do not pin the APIC access page in memory so that it can be freely
	 * migrated, the MMU notifier will call us again if it is migrated or
	 * swapped out.  KVM backs the memslot with anonymous memory, the pfn
	 * should always point at a refcounted page (if the pfn is valid).
	 */
	if (!WARN_ON_ONCE(!refcounted_page))
		kvm_release_page_clean(refcounted_page);

	/*
	 * No need for a manual TLB flush at this point, KVM has already done a
	 * flush if there were SPTEs pointing at the previous page.
	 */
	read_unlock(&vcpu->kvm->mmu_lock);
}
6862 */ 6863 WARN_ON_ONCE(vcpu->wants_to_run && 6864 nested_cpu_has_vid(get_vmcs12(vcpu))); 6865 to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; 6866 return; 6867 } 6868 6869 if (max_isr == -1) 6870 max_isr = 0; 6871 6872 status = vmcs_read16(GUEST_INTR_STATUS); 6873 old = status >> 8; 6874 if (max_isr != old) { 6875 status &= 0xff; 6876 status |= max_isr << 8; 6877 vmcs_write16(GUEST_INTR_STATUS, status); 6878 } 6879 } 6880 6881 static void vmx_set_rvi(int vector) 6882 { 6883 u16 status; 6884 u8 old; 6885 6886 if (vector == -1) 6887 vector = 0; 6888 6889 status = vmcs_read16(GUEST_INTR_STATUS); 6890 old = (u8)status & 0xff; 6891 if ((u8)vector != old) { 6892 status &= ~0xff; 6893 status |= (u8)vector; 6894 vmcs_write16(GUEST_INTR_STATUS, status); 6895 } 6896 } 6897 6898 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6899 { 6900 struct vcpu_vt *vt = to_vt(vcpu); 6901 int max_irr; 6902 bool got_posted_interrupt; 6903 6904 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6905 return -EIO; 6906 6907 if (pi_test_on(&vt->pi_desc)) { 6908 pi_clear_on(&vt->pi_desc); 6909 /* 6910 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6911 * But on x86 this is just a compiler barrier anyway. 6912 */ 6913 smp_mb__after_atomic(); 6914 got_posted_interrupt = 6915 kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); 6916 } else { 6917 max_irr = kvm_lapic_find_highest_irr(vcpu); 6918 got_posted_interrupt = false; 6919 } 6920 6921 /* 6922 * Newly recognized interrupts are injected via either virtual interrupt 6923 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6924 * disabled in two cases: 6925 * 6926 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6927 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6928 * VM-Exit to L1. 
If L1 doesn't want to exit, the interrupt is injected 6929 * into L2, but KVM doesn't use virtual interrupt delivery to inject 6930 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 6931 * 6932 * 2) If APICv is disabled for this vCPU, assigned devices may still 6933 * attempt to post interrupts. The posted interrupt vector will cause 6934 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 6935 */ 6936 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6937 vmx_set_rvi(max_irr); 6938 else if (got_posted_interrupt) 6939 kvm_make_request(KVM_REQ_EVENT, vcpu); 6940 6941 return max_irr; 6942 } 6943 6944 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6945 { 6946 if (!kvm_vcpu_apicv_active(vcpu)) 6947 return; 6948 6949 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6950 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6951 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6952 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6953 } 6954 6955 void vmx_do_interrupt_irqoff(unsigned long entry); 6956 void vmx_do_nmi_irqoff(void); 6957 6958 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6959 { 6960 /* 6961 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6962 * MSR value is not clobbered by the host activity before the guest 6963 * has chance to consume it. 6964 * 6965 * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM 6966 * interception may have been caused by L1 interception. Per the SDM, 6967 * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. 6968 * 6969 * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. 6970 * unlike CR2 and DR6, the value is not a payload that is attached to 6971 * the #NM exception. 
6972 */ 6973 if (is_xfd_nm_fault(vcpu)) 6974 rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6975 } 6976 6977 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 6978 { 6979 /* if exit due to PF check for async PF */ 6980 if (is_page_fault(intr_info)) 6981 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6982 /* if exit due to NM, handle before interrupts are enabled */ 6983 else if (is_nm_fault(intr_info)) 6984 handle_nm_fault_irqoff(vcpu); 6985 /* Handle machine checks before interrupts are enabled */ 6986 else if (is_machine_check(intr_info)) 6987 kvm_machine_check(); 6988 } 6989 6990 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 6991 u32 intr_info) 6992 { 6993 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6994 6995 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 6996 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6997 return; 6998 6999 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7000 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7001 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7002 else 7003 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7004 kvm_after_interrupt(vcpu); 7005 7006 vcpu->arch.at_instruction_boundary = true; 7007 } 7008 7009 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7010 { 7011 if (to_vt(vcpu)->emulation_required) 7012 return; 7013 7014 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7015 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7016 else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) 7017 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7018 } 7019 7020 /* 7021 * The kvm parameter can be NULL (module initialization, or invocation before 7022 * VM creation). Be sure to check the kvm parameter before using it. 
7023 */ 7024 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7025 { 7026 switch (index) { 7027 case MSR_IA32_SMBASE: 7028 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7029 return false; 7030 /* 7031 * We cannot do SMM unless we can run the guest in big 7032 * real mode. 7033 */ 7034 return enable_unrestricted_guest || emulate_invalid_guest_state; 7035 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7036 return nested; 7037 case MSR_AMD64_VIRT_SPEC_CTRL: 7038 case MSR_AMD64_TSC_RATIO: 7039 /* This is AMD only. */ 7040 return false; 7041 default: 7042 return true; 7043 } 7044 } 7045 7046 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7047 { 7048 u32 exit_intr_info; 7049 bool unblock_nmi; 7050 u8 vector; 7051 bool idtv_info_valid; 7052 7053 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7054 7055 if (enable_vnmi) { 7056 if (vmx->loaded_vmcs->nmi_known_unmasked) 7057 return; 7058 7059 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7060 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7061 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7062 /* 7063 * SDM 3: 27.7.1.2 (September 2008) 7064 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7065 * a guest IRET fault. 7066 * SDM 3: 23.2.2 (September 2008) 7067 * Bit 12 is undefined in any of the following cases: 7068 * If the VM exit sets the valid bit in the IDT-vectoring 7069 * information field. 7070 * If the VM exit is due to a double fault. 
7071 */ 7072 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7073 vector != DF_VECTOR && !idtv_info_valid) 7074 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7075 GUEST_INTR_STATE_NMI); 7076 else 7077 vmx->loaded_vmcs->nmi_known_unmasked = 7078 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7079 & GUEST_INTR_STATE_NMI); 7080 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7081 vmx->loaded_vmcs->vnmi_blocked_time += 7082 ktime_to_ns(ktime_sub(ktime_get(), 7083 vmx->loaded_vmcs->entry_time)); 7084 } 7085 7086 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7087 u32 idt_vectoring_info, 7088 int instr_len_field, 7089 int error_code_field) 7090 { 7091 u8 vector; 7092 int type; 7093 bool idtv_info_valid; 7094 7095 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7096 7097 vcpu->arch.nmi_injected = false; 7098 kvm_clear_exception_queue(vcpu); 7099 kvm_clear_interrupt_queue(vcpu); 7100 7101 if (!idtv_info_valid) 7102 return; 7103 7104 kvm_make_request(KVM_REQ_EVENT, vcpu); 7105 7106 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7107 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7108 7109 switch (type) { 7110 case INTR_TYPE_NMI_INTR: 7111 vcpu->arch.nmi_injected = true; 7112 /* 7113 * SDM 3: 27.7.1.2 (September 2008) 7114 * Clear bit "block by NMI" before VM entry if a NMI 7115 * delivery faulted. 
7116 */ 7117 vmx_set_nmi_mask(vcpu, false); 7118 break; 7119 case INTR_TYPE_SOFT_EXCEPTION: 7120 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7121 fallthrough; 7122 case INTR_TYPE_HARD_EXCEPTION: { 7123 u32 error_code = 0; 7124 7125 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) 7126 error_code = vmcs_read32(error_code_field); 7127 7128 kvm_requeue_exception(vcpu, vector, 7129 idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, 7130 error_code); 7131 break; 7132 } 7133 case INTR_TYPE_SOFT_INTR: 7134 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7135 fallthrough; 7136 case INTR_TYPE_EXT_INTR: 7137 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7138 break; 7139 default: 7140 break; 7141 } 7142 } 7143 7144 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7145 { 7146 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7147 VM_EXIT_INSTRUCTION_LEN, 7148 IDT_VECTORING_ERROR_CODE); 7149 } 7150 7151 void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7152 { 7153 __vmx_complete_interrupts(vcpu, 7154 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7155 VM_ENTRY_INSTRUCTION_LEN, 7156 VM_ENTRY_EXCEPTION_ERROR_CODE); 7157 7158 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7159 } 7160 7161 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7162 { 7163 int i, nr_msrs; 7164 struct perf_guest_switch_msr *msrs; 7165 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7166 7167 pmu->host_cross_mapped_mask = 0; 7168 if (pmu->pebs_enable & pmu->global_ctrl) 7169 intel_pmu_cross_mapped_check(pmu); 7170 7171 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
*/ 7172 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7173 if (!msrs) 7174 return; 7175 7176 for (i = 0; i < nr_msrs; i++) 7177 if (msrs[i].host == msrs[i].guest) 7178 clear_atomic_switch_msr(vmx, msrs[i].msr); 7179 else 7180 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7181 msrs[i].host, false); 7182 } 7183 7184 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7185 { 7186 struct vcpu_vmx *vmx = to_vmx(vcpu); 7187 u64 tscl; 7188 u32 delta_tsc; 7189 7190 if (force_immediate_exit) { 7191 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7192 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7193 } else if (vmx->hv_deadline_tsc != -1) { 7194 tscl = rdtsc(); 7195 if (vmx->hv_deadline_tsc > tscl) 7196 /* set_hv_timer ensures the delta fits in 32-bits */ 7197 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7198 cpu_preemption_timer_multi); 7199 else 7200 delta_tsc = 0; 7201 7202 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7203 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7204 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7205 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7206 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7207 } 7208 } 7209 7210 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7211 { 7212 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7213 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7214 vmcs_writel(HOST_RSP, host_rsp); 7215 } 7216 } 7217 7218 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7219 unsigned int flags) 7220 { 7221 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7222 7223 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7224 return; 7225 7226 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7227 vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); 7228 7229 /* 7230 * If the guest/host SPEC_CTRL values differ, restore the host value. 
7231 * 7232 * For legacy IBRS, the IBRS bit always needs to be written after 7233 * transitioning from a less privileged predictor mode, regardless of 7234 * whether the guest/host values differ. 7235 */ 7236 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7237 vmx->spec_ctrl != hostval) 7238 native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); 7239 7240 barrier_nospec(); 7241 } 7242 7243 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, 7244 bool force_immediate_exit) 7245 { 7246 /* 7247 * If L2 is active, some VMX preemption timer exits can be handled in 7248 * the fastpath even, all other exits must use the slow path. 7249 */ 7250 if (is_guest_mode(vcpu) && 7251 vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) 7252 return EXIT_FASTPATH_NONE; 7253 7254 switch (vmx_get_exit_reason(vcpu).basic) { 7255 case EXIT_REASON_MSR_WRITE: 7256 return handle_fastpath_set_msr_irqoff(vcpu); 7257 case EXIT_REASON_PREEMPTION_TIMER: 7258 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7259 case EXIT_REASON_HLT: 7260 return handle_fastpath_hlt(vcpu); 7261 default: 7262 return EXIT_FASTPATH_NONE; 7263 } 7264 } 7265 7266 noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) 7267 { 7268 if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || 7269 !is_nmi(vmx_get_intr_info(vcpu))) 7270 return; 7271 7272 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 7273 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7274 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); 7275 else 7276 vmx_do_nmi_irqoff(); 7277 kvm_after_interrupt(vcpu); 7278 } 7279 7280 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7281 unsigned int flags) 7282 { 7283 struct vcpu_vmx *vmx = to_vmx(vcpu); 7284 7285 guest_state_enter_irqoff(); 7286 7287 /* 7288 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7289 * mitigation for MDS is done late in VMentry and is still 7290 * executed in spite of L1D Flush. 
This is because an extra VERW 7291 * should not matter much after the big hammer L1D Flush. 7292 * 7293 * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, 7294 * and is affected by MMIO Stale Data. In such cases mitigation in only 7295 * needed against an MMIO capable guest. 7296 */ 7297 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7298 vmx_l1d_flush(vcpu); 7299 else if (static_branch_unlikely(&cpu_buf_vm_clear) && 7300 kvm_arch_has_assigned_device(vcpu->kvm)) 7301 mds_clear_cpu_buffers(); 7302 7303 vmx_disable_fb_clear(vmx); 7304 7305 if (vcpu->arch.cr2 != native_read_cr2()) 7306 native_write_cr2(vcpu->arch.cr2); 7307 7308 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7309 flags); 7310 7311 vcpu->arch.cr2 = native_read_cr2(); 7312 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7313 7314 vmx->idt_vectoring_info = 0; 7315 7316 vmx_enable_fb_clear(vmx); 7317 7318 if (unlikely(vmx->fail)) { 7319 vmx->vt.exit_reason.full = 0xdead; 7320 goto out; 7321 } 7322 7323 vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7324 if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) 7325 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7326 7327 vmx_handle_nmi(vcpu); 7328 7329 out: 7330 guest_state_exit_irqoff(); 7331 } 7332 7333 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7334 { 7335 struct vcpu_vmx *vmx = to_vmx(vcpu); 7336 unsigned long cr3, cr4; 7337 7338 /* Record the guest's net vcpu time for enforced NMI injections. */ 7339 if (unlikely(!enable_vnmi && 7340 vmx->loaded_vmcs->soft_vnmi_blocked)) 7341 vmx->loaded_vmcs->entry_time = ktime_get(); 7342 7343 /* 7344 * Don't enter VMX if guest state is invalid, let the exit handler 7345 * start emulation until we arrive back to a valid state. Synthesize a 7346 * consistency check VM-Exit due to invalid guest state and bail. 
7347 */ 7348 if (unlikely(vmx->vt.emulation_required)) { 7349 vmx->fail = 0; 7350 7351 vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; 7352 vmx->vt.exit_reason.failed_vmentry = 1; 7353 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7354 vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; 7355 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7356 vmx->vt.exit_intr_info = 0; 7357 return EXIT_FASTPATH_NONE; 7358 } 7359 7360 trace_kvm_entry(vcpu, force_immediate_exit); 7361 7362 if (vmx->ple_window_dirty) { 7363 vmx->ple_window_dirty = false; 7364 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7365 } 7366 7367 /* 7368 * We did this in prepare_switch_to_guest, because it needs to 7369 * be within srcu_read_lock. 7370 */ 7371 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7372 7373 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7374 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7375 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7376 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7377 vcpu->arch.regs_dirty = 0; 7378 7379 /* 7380 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7381 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7382 * it switches back to the current->mm, which can occur in KVM context 7383 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7384 * toggles a static key while handling a VM-Exit. 7385 */ 7386 cr3 = __get_current_cr3_fast(); 7387 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7388 vmcs_writel(HOST_CR3, cr3); 7389 vmx->loaded_vmcs->host_state.cr3 = cr3; 7390 } 7391 7392 cr4 = cr4_read_shadow(); 7393 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7394 vmcs_writel(HOST_CR4, cr4); 7395 vmx->loaded_vmcs->host_state.cr4 = cr4; 7396 } 7397 7398 /* When single-stepping over STI and MOV SS, we must clear the 7399 * corresponding interruptibility bits in the guest state. 
Otherwise 7400 * vmentry fails as it then expects bit 14 (BS) in pending debug 7401 * exceptions being set, but that's not correct for the guest debugging 7402 * case. */ 7403 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7404 vmx_set_interrupt_shadow(vcpu, 0); 7405 7406 kvm_load_guest_xsave_state(vcpu); 7407 7408 pt_guest_enter(vmx); 7409 7410 atomic_switch_perf_msrs(vmx); 7411 if (intel_pmu_lbr_is_enabled(vcpu)) 7412 vmx_passthrough_lbr_msrs(vcpu); 7413 7414 if (enable_preemption_timer) 7415 vmx_update_hv_timer(vcpu, force_immediate_exit); 7416 else if (force_immediate_exit) 7417 smp_send_reschedule(vcpu->cpu); 7418 7419 kvm_wait_lapic_expire(vcpu); 7420 7421 /* The actual VMENTER/EXIT is in the .noinstr.text section. */ 7422 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); 7423 7424 /* All fields are clean at this point */ 7425 if (kvm_is_using_evmcs()) { 7426 current_evmcs->hv_clean_fields |= 7427 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 7428 7429 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); 7430 } 7431 7432 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7433 if (vcpu->arch.host_debugctl) 7434 update_debugctlmsr(vcpu->arch.host_debugctl); 7435 7436 #ifndef CONFIG_X86_64 7437 /* 7438 * The sysexit path does not restore ds/es, so we must set them to 7439 * a reasonable value ourselves. 7440 * 7441 * We can't defer this to vmx_prepare_switch_to_host() since that 7442 * function may be executed in interrupt context, which saves and 7443 * restore segments around it, nullifying its effect. 7444 */ 7445 loadsegment(ds, __USER_DS); 7446 loadsegment(es, __USER_DS); 7447 #endif 7448 7449 pt_guest_exit(vmx); 7450 7451 kvm_load_host_xsave_state(vcpu); 7452 7453 if (is_guest_mode(vcpu)) { 7454 /* 7455 * Track VMLAUNCH/VMRESUME that have made past guest state 7456 * checking. 
7457 */ 7458 if (vmx->nested.nested_run_pending && 7459 !vmx_get_exit_reason(vcpu).failed_vmentry) 7460 ++vcpu->stat.nested_run; 7461 7462 vmx->nested.nested_run_pending = 0; 7463 } 7464 7465 if (unlikely(vmx->fail)) 7466 return EXIT_FASTPATH_NONE; 7467 7468 if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7469 kvm_machine_check(); 7470 7471 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7472 7473 if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) 7474 return EXIT_FASTPATH_NONE; 7475 7476 vmx->loaded_vmcs->launched = 1; 7477 7478 vmx_recover_nmi_blocking(vmx); 7479 vmx_complete_interrupts(vmx); 7480 7481 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); 7482 } 7483 7484 void vmx_vcpu_free(struct kvm_vcpu *vcpu) 7485 { 7486 struct vcpu_vmx *vmx = to_vmx(vcpu); 7487 7488 if (enable_pml) 7489 vmx_destroy_pml_buffer(vmx); 7490 free_vpid(vmx->vpid); 7491 nested_vmx_free_vcpu(vcpu); 7492 free_loaded_vmcs(vmx->loaded_vmcs); 7493 free_page((unsigned long)vmx->ve_info); 7494 } 7495 7496 int vmx_vcpu_create(struct kvm_vcpu *vcpu) 7497 { 7498 struct vmx_uret_msr *tsx_ctrl; 7499 struct vcpu_vmx *vmx; 7500 int i, err; 7501 7502 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 7503 vmx = to_vmx(vcpu); 7504 7505 INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); 7506 7507 err = -ENOMEM; 7508 7509 vmx->vpid = allocate_vpid(); 7510 7511 /* 7512 * If PML is turned on, failure on enabling PML just results in failure 7513 * of creating the vcpu, therefore we can simplify PML logic (by 7514 * avoiding dealing with cases, such as enabling PML partially on vcpus 7515 * for the guest), etc. 7516 */ 7517 if (enable_pml) { 7518 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7519 if (!vmx->pml_pg) 7520 goto free_vpid; 7521 } 7522 7523 for (i = 0; i < kvm_nr_uret_msrs; ++i) 7524 vmx->guest_uret_msrs[i].mask = -1ull; 7525 if (boot_cpu_has(X86_FEATURE_RTM)) { 7526 /* 7527 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 
7528 * Keep the host value unchanged to avoid changing CPUID bits 7529 * under the host kernel's feet. 7530 */ 7531 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7532 if (tsx_ctrl) 7533 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 7534 } 7535 7536 err = alloc_loaded_vmcs(&vmx->vmcs01); 7537 if (err < 0) 7538 goto free_pml; 7539 7540 /* 7541 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a 7542 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the 7543 * feature only for vmcs01, KVM currently isn't equipped to realize any 7544 * performance benefits from enabling it for vmcs02. 7545 */ 7546 if (kvm_is_using_evmcs() && 7547 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7548 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7549 7550 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7551 } 7552 7553 /* The MSR bitmap starts with all ones */ 7554 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7555 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7556 7557 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7558 #ifdef CONFIG_X86_64 7559 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7560 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7561 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7562 #endif 7563 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7564 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7565 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7566 if (kvm_cstate_in_guest(vcpu->kvm)) { 7567 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7568 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7569 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7570 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7571 } 7572 7573 
vmx->loaded_vmcs = &vmx->vmcs01; 7574 7575 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7576 err = kvm_alloc_apic_access_page(vcpu->kvm); 7577 if (err) 7578 goto free_vmcs; 7579 } 7580 7581 if (enable_ept && !enable_unrestricted_guest) { 7582 err = init_rmode_identity_map(vcpu->kvm); 7583 if (err) 7584 goto free_vmcs; 7585 } 7586 7587 err = -ENOMEM; 7588 if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { 7589 struct page *page; 7590 7591 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); 7592 7593 /* ve_info must be page aligned. */ 7594 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7595 if (!page) 7596 goto free_vmcs; 7597 7598 vmx->ve_info = page_to_virt(page); 7599 } 7600 7601 if (vmx_can_use_ipiv(vcpu)) 7602 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7603 __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); 7604 7605 return 0; 7606 7607 free_vmcs: 7608 free_loaded_vmcs(vmx->loaded_vmcs); 7609 free_pml: 7610 vmx_destroy_pml_buffer(vmx); 7611 free_vpid: 7612 free_vpid(vmx->vpid); 7613 return err; 7614 } 7615 7616 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7617 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7618 7619 int vmx_vm_init(struct kvm *kvm) 7620 { 7621 if (!ple_gap) 7622 kvm->arch.pause_in_guest = true; 7623 7624 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7625 switch (l1tf_mitigation) { 7626 case L1TF_MITIGATION_OFF: 7627 case L1TF_MITIGATION_FLUSH_NOWARN: 7628 /* 'I explicitly don't care' is set */ 7629 break; 7630 case L1TF_MITIGATION_AUTO: 7631 case L1TF_MITIGATION_FLUSH: 7632 case L1TF_MITIGATION_FLUSH_NOSMT: 7633 case L1TF_MITIGATION_FULL: 7634 /* 7635 * Warn upon starting the first VM in a potentially 7636 * insecure environment. 7637 */ 7638 if (sched_smt_active()) 7639 pr_warn_once(L1TF_MSG_SMT); 7640 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7641 pr_warn_once(L1TF_MSG_L1D); 7642 break; 7643 case L1TF_MITIGATION_FULL_FORCE: 7644 /* Flush is enforced */ 7645 break; 7646 } 7647 } 7648 7649 if (enable_pml) 7650 kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; 7651 return 0; 7652 } 7653 7654 static inline bool vmx_ignore_guest_pat(struct kvm *kvm) 7655 { 7656 /* 7657 * Non-coherent DMA devices need the guest to flush CPU properly. 7658 * In that case it is not possible to map all guest RAM as WB, so 7659 * always trust guest PAT. 7660 */ 7661 return !kvm_arch_has_noncoherent_dma(kvm) && 7662 kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); 7663 } 7664 7665 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7666 { 7667 /* 7668 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7669 * with cacheable accesses will result in Machine Checks. 
7670 */ 7671 if (is_mmio) 7672 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7673 7674 /* Force WB if ignoring guest PAT */ 7675 if (vmx_ignore_guest_pat(vcpu->kvm)) 7676 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7677 7678 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7679 } 7680 7681 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7682 { 7683 /* 7684 * These bits in the secondary execution controls field 7685 * are dynamic, the others are mostly based on the hypervisor 7686 * architecture and the guest's CPUID. Do not touch the 7687 * dynamic bits. 7688 */ 7689 u32 mask = 7690 SECONDARY_EXEC_SHADOW_VMCS | 7691 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7692 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7693 SECONDARY_EXEC_DESC; 7694 7695 u32 cur_ctl = secondary_exec_controls_get(vmx); 7696 7697 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7698 } 7699 7700 /* 7701 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 7702 * (indicating "allowed-1") if they are supported in the guest's CPUID. 
7703 */ 7704 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7705 { 7706 struct vcpu_vmx *vmx = to_vmx(vcpu); 7707 struct kvm_cpuid_entry2 *entry; 7708 7709 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7710 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7711 7712 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7713 if (entry && (entry->_reg & (_cpuid_mask))) \ 7714 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7715 } while (0) 7716 7717 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7718 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7719 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7720 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7721 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7722 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7723 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7724 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7725 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7726 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7727 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7728 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7729 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7730 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7731 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7732 7733 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7734 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7735 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7736 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7737 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7738 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7739 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7740 7741 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7742 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); 7743 7744 #undef cr4_fixed1_update 7745 } 7746 7747 static void 
update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7748 { 7749 struct vcpu_vmx *vmx = to_vmx(vcpu); 7750 struct kvm_cpuid_entry2 *best = NULL; 7751 int i; 7752 7753 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7754 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7755 if (!best) 7756 return; 7757 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7758 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7759 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7760 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7761 } 7762 7763 /* Get the number of configurable Address Ranges for filtering */ 7764 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7765 PT_CAP_num_address_ranges); 7766 7767 /* Initialize and clear the no dependency bits */ 7768 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7769 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7770 RTIT_CTL_BRANCH_EN); 7771 7772 /* 7773 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7774 * will inject an #GP 7775 */ 7776 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7777 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7778 7779 /* 7780 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7781 * PSBFreq can be set 7782 */ 7783 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7784 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7785 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 7786 7787 /* 7788 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7789 */ 7790 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7791 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 7792 RTIT_CTL_MTC_RANGE); 7793 7794 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7795 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7796 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7797 RTIT_CTL_PTW_EN); 7798 7799 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn 
can be set */ 7800 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7801 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7802 7803 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7804 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7805 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7806 7807 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7808 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7809 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7810 7811 /* unmask address range configure area */ 7812 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7813 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7814 } 7815 7816 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7817 { 7818 struct vcpu_vmx *vmx = to_vmx(vcpu); 7819 7820 /* 7821 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7822 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7823 * set if and only if XSAVE is supported. 
7824 */ 7825 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) 7826 guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); 7827 7828 vmx_setup_uret_msrs(vmx); 7829 7830 if (cpu_has_secondary_exec_ctrls()) 7831 vmcs_set_secondary_exec_control(vmx, 7832 vmx_secondary_exec_control(vmx)); 7833 7834 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7835 vmx->msr_ia32_feature_control_valid_bits |= 7836 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7837 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7838 else 7839 vmx->msr_ia32_feature_control_valid_bits &= 7840 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7841 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7842 7843 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 7844 nested_vmx_cr_fixed1_bits_update(vcpu); 7845 7846 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7847 guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) 7848 update_intel_pt_cfg(vcpu); 7849 7850 if (boot_cpu_has(X86_FEATURE_RTM)) { 7851 struct vmx_uret_msr *msr; 7852 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7853 if (msr) { 7854 bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); 7855 vmx_set_guest_uret_msr(vmx, msr, enabled ? 
0 : TSX_CTRL_RTM_DISABLE); 7856 } 7857 } 7858 7859 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7860 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7861 !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); 7862 7863 if (boot_cpu_has(X86_FEATURE_IBPB)) 7864 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 7865 !guest_has_pred_cmd_msr(vcpu)); 7866 7867 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7868 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7869 !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7870 7871 set_cr4_guest_host_mask(vmx); 7872 7873 vmx_write_encls_bitmap(vcpu, NULL); 7874 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) 7875 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7876 else 7877 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7878 7879 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) 7880 vmx->msr_ia32_feature_control_valid_bits |= 7881 FEAT_CTL_SGX_LC_ENABLED; 7882 else 7883 vmx->msr_ia32_feature_control_valid_bits &= 7884 ~FEAT_CTL_SGX_LC_ENABLED; 7885 7886 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7887 vmx_update_exception_bitmap(vcpu); 7888 } 7889 7890 static __init u64 vmx_get_perf_capabilities(void) 7891 { 7892 u64 perf_cap = PMU_CAP_FW_WRITES; 7893 u64 host_perf_cap = 0; 7894 7895 if (!enable_pmu) 7896 return 0; 7897 7898 if (boot_cpu_has(X86_FEATURE_PDCM)) 7899 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7900 7901 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7902 x86_perf_get_lbr(&vmx_lbr_caps); 7903 7904 /* 7905 * KVM requires LBR callstack support, as the overhead due to 7906 * context switching LBRs without said support is too high. 7907 * See intel_pmu_create_guest_lbr_event() for more info. 
7908 */ 7909 if (!vmx_lbr_caps.has_callstack) 7910 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 7911 else if (vmx_lbr_caps.nr) 7912 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7913 } 7914 7915 if (vmx_pebs_supported()) { 7916 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7917 7918 /* 7919 * Disallow adaptive PEBS as it is functionally broken, can be 7920 * used by the guest to read *host* LBRs, and can be used to 7921 * bypass userspace event filters. To correctly and safely 7922 * support adaptive PEBS, KVM needs to: 7923 * 7924 * 1. Account for the ADAPTIVE flag when (re)programming fixed 7925 * counters. 7926 * 7927 * 2. Gain support from perf (or take direct control of counter 7928 * programming) to support events without adaptive PEBS 7929 * enabled for the hardware counter. 7930 * 7931 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 7932 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 7933 * 7934 * 4. Document which PMU events are effectively exposed to the 7935 * guest via adaptive PEBS, and make adaptive PEBS mutually 7936 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
7937 */ 7938 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 7939 } 7940 7941 return perf_cap; 7942 } 7943 7944 static __init void vmx_set_cpu_caps(void) 7945 { 7946 kvm_set_cpu_caps(); 7947 7948 /* CPUID 0x1 */ 7949 if (nested) 7950 kvm_cpu_cap_set(X86_FEATURE_VMX); 7951 7952 /* CPUID 0x7 */ 7953 if (kvm_mpx_supported()) 7954 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7955 if (!cpu_has_vmx_invpcid()) 7956 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7957 if (vmx_pt_mode_is_host_guest()) 7958 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7959 if (vmx_pebs_supported()) { 7960 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7961 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7962 } 7963 7964 if (!enable_pmu) 7965 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7966 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 7967 7968 if (!enable_sgx) { 7969 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7970 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7971 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7972 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7973 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 7974 } 7975 7976 if (vmx_umip_emulated()) 7977 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7978 7979 /* CPUID 0xD.1 */ 7980 kvm_caps.supported_xss = 0; 7981 if (!cpu_has_vmx_xsaves()) 7982 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7983 7984 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7985 if (!cpu_has_vmx_rdtscp()) { 7986 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7987 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7988 } 7989 7990 if (cpu_has_vmx_waitpkg()) 7991 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7992 } 7993 7994 static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, 7995 struct x86_instruction_info *info, 7996 unsigned long *exit_qualification) 7997 { 7998 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7999 unsigned short port; 8000 int size; 8001 bool imm; 8002 8003 /* 8004 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 8005 * VM-exits depend on the 'unconditional IO exiting' VM-execution 8006 * control. 
8007 * 8008 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 8009 */ 8010 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8011 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 8012 8013 if (info->intercept == x86_intercept_in || 8014 info->intercept == x86_intercept_ins) { 8015 port = info->src_val; 8016 size = info->dst_bytes; 8017 imm = info->src_type == OP_IMM; 8018 } else { 8019 port = info->dst_val; 8020 size = info->src_bytes; 8021 imm = info->dst_type == OP_IMM; 8022 } 8023 8024 8025 *exit_qualification = ((unsigned long)port << 16) | (size - 1); 8026 8027 if (info->intercept == x86_intercept_ins || 8028 info->intercept == x86_intercept_outs) 8029 *exit_qualification |= BIT(4); 8030 8031 if (info->rep_prefix) 8032 *exit_qualification |= BIT(5); 8033 8034 if (imm) 8035 *exit_qualification |= BIT(6); 8036 8037 return nested_vmx_check_io_bitmaps(vcpu, port, size); 8038 } 8039 8040 int vmx_check_intercept(struct kvm_vcpu *vcpu, 8041 struct x86_instruction_info *info, 8042 enum x86_intercept_stage stage, 8043 struct x86_exception *exception) 8044 { 8045 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8046 unsigned long exit_qualification = 0; 8047 u32 vm_exit_reason; 8048 u64 exit_insn_len; 8049 8050 switch (info->intercept) { 8051 case x86_intercept_rdpid: 8052 /* 8053 * RDPID causes #UD if not enabled through secondary execution 8054 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to 8055 * TSC_AUX is NOT subject to interception, i.e. checking only 8056 * the dedicated execution control is architecturally correct. 
8057 */ 8058 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8059 exception->vector = UD_VECTOR; 8060 exception->error_code_valid = false; 8061 return X86EMUL_PROPAGATE_FAULT; 8062 } 8063 return X86EMUL_CONTINUE; 8064 8065 case x86_intercept_in: 8066 case x86_intercept_ins: 8067 case x86_intercept_out: 8068 case x86_intercept_outs: 8069 if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) 8070 return X86EMUL_CONTINUE; 8071 8072 vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; 8073 break; 8074 8075 case x86_intercept_lgdt: 8076 case x86_intercept_lidt: 8077 case x86_intercept_lldt: 8078 case x86_intercept_ltr: 8079 case x86_intercept_sgdt: 8080 case x86_intercept_sidt: 8081 case x86_intercept_sldt: 8082 case x86_intercept_str: 8083 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8084 return X86EMUL_CONTINUE; 8085 8086 if (info->intercept == x86_intercept_lldt || 8087 info->intercept == x86_intercept_ltr || 8088 info->intercept == x86_intercept_sldt || 8089 info->intercept == x86_intercept_str) 8090 vm_exit_reason = EXIT_REASON_LDTR_TR; 8091 else 8092 vm_exit_reason = EXIT_REASON_GDTR_IDTR; 8093 /* 8094 * FIXME: Decode the ModR/M to generate the correct exit 8095 * qualification for memory operands. 8096 */ 8097 break; 8098 8099 case x86_intercept_hlt: 8100 if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) 8101 return X86EMUL_CONTINUE; 8102 8103 vm_exit_reason = EXIT_REASON_HLT; 8104 break; 8105 8106 case x86_intercept_pause: 8107 /* 8108 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8109 * with vanilla NOPs in the emulator. Apply the interception 8110 * check only to actual PAUSE instructions. Don't check 8111 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8112 * exit, i.e. KVM is within its rights to allow L2 to execute 8113 * the PAUSE. 
8114 */ 8115 if ((info->rep_prefix != REPE_PREFIX) || 8116 !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) 8117 return X86EMUL_CONTINUE; 8118 8119 vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; 8120 break; 8121 8122 /* TODO: check more intercepts... */ 8123 default: 8124 return X86EMUL_UNHANDLEABLE; 8125 } 8126 8127 exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); 8128 if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) 8129 return X86EMUL_UNHANDLEABLE; 8130 8131 __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, 8132 exit_insn_len); 8133 return X86EMUL_INTERCEPTED; 8134 } 8135 8136 #ifdef CONFIG_X86_64 8137 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8138 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8139 u64 divisor, u64 *result) 8140 { 8141 u64 low = a << shift, high = a >> (64 - shift); 8142 8143 /* To avoid the overflow on divq */ 8144 if (high >= divisor) 8145 return 1; 8146 8147 /* Low hold the result, high hold rem which is discarded */ 8148 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8149 "rm" (divisor), "0" (low), "1" (high)); 8150 *result = low; 8151 8152 return 0; 8153 } 8154 8155 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8156 bool *expired) 8157 { 8158 struct vcpu_vmx *vmx; 8159 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8160 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8161 8162 vmx = to_vmx(vcpu); 8163 tscl = rdtsc(); 8164 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8165 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8166 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8167 ktimer->timer_advance_ns); 8168 8169 if (delta_tsc > lapic_timer_advance_cycles) 8170 delta_tsc -= lapic_timer_advance_cycles; 8171 else 8172 delta_tsc = 0; 8173 8174 /* Convert to host delta tsc if tsc scaling is enabled */ 8175 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8176 delta_tsc && 
u64_shl_div_u64(delta_tsc, 8177 kvm_caps.tsc_scaling_ratio_frac_bits, 8178 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8179 return -ERANGE; 8180 8181 /* 8182 * If the delta tsc can't fit in the 32 bit after the multi shift, 8183 * we can't use the preemption timer. 8184 * It's possible that it fits on later vmentries, but checking 8185 * on every vmentry is costly so we just use an hrtimer. 8186 */ 8187 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8188 return -ERANGE; 8189 8190 vmx->hv_deadline_tsc = tscl + delta_tsc; 8191 *expired = !delta_tsc; 8192 return 0; 8193 } 8194 8195 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8196 { 8197 to_vmx(vcpu)->hv_deadline_tsc = -1; 8198 } 8199 #endif 8200 8201 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8202 { 8203 struct vcpu_vmx *vmx = to_vmx(vcpu); 8204 8205 if (WARN_ON_ONCE(!enable_pml)) 8206 return; 8207 8208 if (is_guest_mode(vcpu)) { 8209 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8210 return; 8211 } 8212 8213 /* 8214 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8215 * code, but in that case another update request will be made and so 8216 * the guest will never run with a stale PML value. 
8217 */ 8218 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8219 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8220 else 8221 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8222 } 8223 8224 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8225 { 8226 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8227 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8228 FEAT_CTL_LMCE_ENABLED; 8229 else 8230 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8231 ~FEAT_CTL_LMCE_ENABLED; 8232 } 8233 8234 #ifdef CONFIG_KVM_SMM 8235 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8236 { 8237 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8238 if (to_vmx(vcpu)->nested.nested_run_pending) 8239 return -EBUSY; 8240 return !is_smm(vcpu); 8241 } 8242 8243 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8244 { 8245 struct vcpu_vmx *vmx = to_vmx(vcpu); 8246 8247 /* 8248 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8249 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8250 * SMI and RSM only modify state that is saved and restored via SMRAM. 8251 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8252 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 
8253 */ 8254 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8255 if (vmx->nested.smm.guest_mode) 8256 nested_vmx_vmexit(vcpu, -1, 0, 0); 8257 8258 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8259 vmx->nested.vmxon = false; 8260 vmx_clear_hlt(vcpu); 8261 return 0; 8262 } 8263 8264 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8265 { 8266 struct vcpu_vmx *vmx = to_vmx(vcpu); 8267 int ret; 8268 8269 if (vmx->nested.smm.vmxon) { 8270 vmx->nested.vmxon = true; 8271 vmx->nested.smm.vmxon = false; 8272 } 8273 8274 if (vmx->nested.smm.guest_mode) { 8275 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8276 if (ret) 8277 return ret; 8278 8279 vmx->nested.nested_run_pending = 1; 8280 vmx->nested.smm.guest_mode = false; 8281 } 8282 return 0; 8283 } 8284 8285 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8286 { 8287 /* RSM will cause a vmexit anyway. */ 8288 } 8289 #endif 8290 8291 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8292 { 8293 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8294 } 8295 8296 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8297 { 8298 if (is_guest_mode(vcpu)) { 8299 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8300 8301 if (hrtimer_try_to_cancel(timer) == 1) 8302 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8303 } 8304 } 8305 8306 void vmx_hardware_unsetup(void) 8307 { 8308 kvm_set_posted_intr_wakeup_handler(NULL); 8309 8310 if (nested) 8311 nested_vmx_hardware_unsetup(); 8312 8313 free_kvm_area(); 8314 } 8315 8316 void vmx_vm_destroy(struct kvm *kvm) 8317 { 8318 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8319 8320 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8321 } 8322 8323 /* 8324 * Note, the SDM states that the linear address is masked *after* the modified 8325 * canonicality check, whereas KVM masks (untags) the address and then performs 8326 * a "normal" canonicality check. 
Functionally, the two methods are identical, 8327 * and when the masking occurs relative to the canonicality check isn't visible 8328 * to software, i.e. KVM's behavior doesn't violate the SDM. 8329 */ 8330 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8331 { 8332 int lam_bit; 8333 unsigned long cr3_bits; 8334 8335 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8336 return gva; 8337 8338 if (!is_64_bit_mode(vcpu)) 8339 return gva; 8340 8341 /* 8342 * Bit 63 determines if the address should be treated as user address 8343 * or a supervisor address. 8344 */ 8345 if (!(gva & BIT_ULL(63))) { 8346 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8347 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8348 return gva; 8349 8350 /* LAM_U48 is ignored if LAM_U57 is set. */ 8351 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8352 } else { 8353 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8354 return gva; 8355 8356 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8357 } 8358 8359 /* 8360 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8361 * Bit 63 is retained from the raw virtual address so that untagging 8362 * doesn't change a user access to a supervisor access, and vice versa. 8363 */ 8364 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8365 } 8366 8367 static unsigned int vmx_handle_intel_pt_intr(void) 8368 { 8369 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8370 8371 /* '0' on failure so that the !PT case can use a RET0 static call. 
*/ 8372 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8373 return 0; 8374 8375 kvm_make_request(KVM_REQ_PMI, vcpu); 8376 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8377 (unsigned long *)&vcpu->arch.pmu.global_status); 8378 return 1; 8379 } 8380 8381 static __init void vmx_setup_user_return_msrs(void) 8382 { 8383 8384 /* 8385 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8386 * will emulate SYSCALL in legacy mode if the vendor string in guest 8387 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8388 * support this emulation, MSR_STAR is included in the list for i386, 8389 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8390 * into hardware and is here purely for emulation purposes. 8391 */ 8392 const u32 vmx_uret_msrs_list[] = { 8393 #ifdef CONFIG_X86_64 8394 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8395 #endif 8396 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8397 MSR_IA32_TSX_CTRL, 8398 }; 8399 int i; 8400 8401 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8402 8403 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8404 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8405 } 8406 8407 static void __init vmx_setup_me_spte_mask(void) 8408 { 8409 u64 me_mask = 0; 8410 8411 /* 8412 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8413 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8414 * boot_cpu_data.x86_phys_bits holds the actual physical address 8415 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8416 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8417 */ 8418 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8419 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8420 kvm_host.maxphyaddr - 1); 8421 8422 /* 8423 * Unlike SME, host kernel doesn't support setting up any 8424 * MKTME KeyID on Intel platforms. No memory encryption 8425 * bits should be included into the SPTE. 
8426 */ 8427 kvm_mmu_set_me_spte_mask(0, me_mask); 8428 } 8429 8430 __init int vmx_hardware_setup(void) 8431 { 8432 unsigned long host_bndcfgs; 8433 struct desc_ptr dt; 8434 int r; 8435 8436 store_idt(&dt); 8437 host_idt_base = dt.address; 8438 8439 vmx_setup_user_return_msrs(); 8440 8441 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8442 return -EIO; 8443 8444 if (boot_cpu_has(X86_FEATURE_NX)) 8445 kvm_enable_efer_bits(EFER_NX); 8446 8447 if (boot_cpu_has(X86_FEATURE_MPX)) { 8448 rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); 8449 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8450 } 8451 8452 if (!cpu_has_vmx_mpx()) 8453 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8454 XFEATURE_MASK_BNDCSR); 8455 8456 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8457 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8458 enable_vpid = 0; 8459 8460 if (!cpu_has_vmx_ept() || 8461 !cpu_has_vmx_ept_4levels() || 8462 !cpu_has_vmx_ept_mt_wb() || 8463 !cpu_has_vmx_invept_global()) 8464 enable_ept = 0; 8465 8466 /* NX support is required for shadow paging. */ 8467 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8468 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8469 return -EOPNOTSUPP; 8470 } 8471 8472 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8473 enable_ept_ad_bits = 0; 8474 8475 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8476 enable_unrestricted_guest = 0; 8477 8478 if (!cpu_has_vmx_flexpriority()) 8479 flexpriority_enabled = 0; 8480 8481 if (!cpu_has_virtual_nmis()) 8482 enable_vnmi = 0; 8483 8484 #ifdef CONFIG_X86_SGX_KVM 8485 if (!cpu_has_vmx_encls_vmexit()) 8486 enable_sgx = false; 8487 #endif 8488 8489 /* 8490 * set_apic_access_page_addr() is used to reload apic access 8491 * page upon invalidation. No need to do anything if not 8492 * using the APIC_ACCESS_ADDR VMCS field. 
8493 */ 8494 if (!flexpriority_enabled) 8495 vt_x86_ops.set_apic_access_page_addr = NULL; 8496 8497 if (!cpu_has_vmx_tpr_shadow()) 8498 vt_x86_ops.update_cr8_intercept = NULL; 8499 8500 #if IS_ENABLED(CONFIG_HYPERV) 8501 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8502 && enable_ept) { 8503 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8504 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8505 } 8506 #endif 8507 8508 if (!cpu_has_vmx_ple()) { 8509 ple_gap = 0; 8510 ple_window = 0; 8511 ple_window_grow = 0; 8512 ple_window_max = 0; 8513 ple_window_shrink = 0; 8514 } 8515 8516 if (!cpu_has_vmx_apicv()) 8517 enable_apicv = 0; 8518 if (!enable_apicv) 8519 vt_x86_ops.sync_pir_to_irr = NULL; 8520 8521 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8522 enable_ipiv = false; 8523 8524 if (cpu_has_vmx_tsc_scaling()) 8525 kvm_caps.has_tsc_control = true; 8526 8527 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8528 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8529 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8530 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8531 8532 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8533 8534 if (enable_ept) 8535 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8536 cpu_has_vmx_ept_execute_only()); 8537 else 8538 vt_x86_ops.get_mt_mask = NULL; 8539 8540 /* 8541 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8542 * bits to shadow_zero_check. 8543 */ 8544 vmx_setup_me_spte_mask(); 8545 8546 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8547 ept_caps_to_lpage_level(vmx_capability.ept)); 8548 8549 /* 8550 * Only enable PML when hardware supports PML feature, and both EPT 8551 * and EPT A/D bit features are enabled -- PML depends on them to work. 
8552 */ 8553 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8554 enable_pml = 0; 8555 8556 if (!cpu_has_vmx_preemption_timer()) 8557 enable_preemption_timer = false; 8558 8559 if (enable_preemption_timer) { 8560 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8561 8562 cpu_preemption_timer_multi = 8563 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8564 8565 if (tsc_khz) 8566 use_timer_freq = (u64)tsc_khz * 1000; 8567 use_timer_freq >>= cpu_preemption_timer_multi; 8568 8569 /* 8570 * KVM "disables" the preemption timer by setting it to its max 8571 * value. Don't use the timer if it might cause spurious exits 8572 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8573 */ 8574 if (use_timer_freq > 0xffffffffu / 10) 8575 enable_preemption_timer = false; 8576 } 8577 8578 if (!enable_preemption_timer) { 8579 vt_x86_ops.set_hv_timer = NULL; 8580 vt_x86_ops.cancel_hv_timer = NULL; 8581 } 8582 8583 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8584 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8585 8586 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8587 return -EINVAL; 8588 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8589 pt_mode = PT_MODE_SYSTEM; 8590 if (pt_mode == PT_MODE_HOST_GUEST) 8591 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8592 else 8593 vt_init_ops.handle_intel_pt_intr = NULL; 8594 8595 setup_default_sgx_lepubkeyhash(); 8596 8597 if (nested) { 8598 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8599 8600 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8601 if (r) 8602 return r; 8603 } 8604 8605 vmx_set_cpu_caps(); 8606 8607 r = alloc_kvm_area(); 8608 if (r && nested) 8609 nested_vmx_hardware_unsetup(); 8610 8611 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8612 8613 /* 8614 * On Intel CPUs that lack self-snoop feature, letting the guest control 8615 * memory types may result in unexpected behavior. 
So always ignore guest 8616 * PAT on those CPUs and map VM as writeback, not allowing userspace to 8617 * disable the quirk. 8618 * 8619 * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is 8620 * supported, UC is slow enough to cause issues with some older guests (e.g. 8621 * an old version of bochs driver uses ioremap() instead of ioremap_wc() to 8622 * map the video RAM, causing wayland desktop to fail to get started 8623 * correctly). To avoid breaking those older guests that rely on KVM to force 8624 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the 8625 * safer (for performance) default behavior. 8626 * 8627 * On top of this, non-coherent DMA devices need the guest to flush CPU 8628 * caches properly. This also requires honoring guest PAT, and is forced 8629 * independent of the quirk in vmx_ignore_guest_pat(). 8630 */ 8631 if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) 8632 kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8633 kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; 8634 return r; 8635 } 8636 8637 static void vmx_cleanup_l1d_flush(void) 8638 { 8639 if (vmx_l1d_flush_pages) { 8640 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8641 vmx_l1d_flush_pages = NULL; 8642 } 8643 /* Restore state so sysfs ignores VMX */ 8644 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8645 } 8646 8647 void vmx_exit(void) 8648 { 8649 allow_smaller_maxphyaddr = false; 8650 8651 vmx_cleanup_l1d_flush(); 8652 8653 kvm_x86_vendor_exit(); 8654 } 8655 8656 int __init vmx_init(void) 8657 { 8658 int r, cpu; 8659 8660 if (!kvm_is_vmx_supported()) 8661 return -EOPNOTSUPP; 8662 8663 /* 8664 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8665 * to unwind if a later step fails. 8666 */ 8667 hv_init_evmcs(); 8668 8669 r = kvm_x86_vendor_init(&vt_init_ops); 8670 if (r) 8671 return r; 8672 8673 /* 8674 * Must be called after common x86 init so enable_ept is properly set 8675 * up. 
Hand the parameter mitigation value in which was stored in 8676 * the pre module init parser. If no parameter was given, it will 8677 * contain 'auto' which will be turned into the default 'cond' 8678 * mitigation mode. 8679 */ 8680 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8681 if (r) 8682 goto err_l1d_flush; 8683 8684 for_each_possible_cpu(cpu) { 8685 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8686 8687 pi_init_cpu(cpu); 8688 } 8689 8690 vmx_check_vmcs12_offsets(); 8691 8692 /* 8693 * Shadow paging doesn't have a (further) performance penalty 8694 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8695 * by default 8696 */ 8697 if (!enable_ept) 8698 allow_smaller_maxphyaddr = true; 8699 8700 return 0; 8701 8702 err_l1d_flush: 8703 kvm_x86_vendor_exit(); 8704 return r; 8705 } 8706