1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 #include <linux/entry-kvm.h> 32 33 #include <asm/apic.h> 34 #include <asm/asm.h> 35 #include <asm/cpu.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/fred.h> 42 #include <asm/idtentry.h> 43 #include <asm/io.h> 44 #include <asm/irq_remapping.h> 45 #include <asm/reboot.h> 46 #include <asm/perf_event.h> 47 #include <asm/mmu_context.h> 48 #include <asm/mshyperv.h> 49 #include <asm/mwait.h> 50 #include <asm/spec-ctrl.h> 51 #include <asm/vmx.h> 52 53 #include <trace/events/ipi.h> 54 55 #include "capabilities.h" 56 #include "cpuid.h" 57 #include "hyperv.h" 58 #include "kvm_onhyperv.h" 59 #include "irq.h" 60 #include "kvm_cache_regs.h" 61 #include "lapic.h" 62 #include "mmu.h" 63 #include "nested.h" 64 #include "pmu.h" 65 #include "sgx.h" 66 #include "trace.h" 67 #include "vmcs.h" 68 #include "vmcs12.h" 69 #include "vmx.h" 70 #include "x86.h" 71 #include "x86_ops.h" 72 #include "smm.h" 73 #include "vmx_onhyperv.h" 74 #include "posted_intr.h" 75 76 MODULE_AUTHOR("Qumranet"); 77 MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); 78 MODULE_LICENSE("GPL"); 79 80 #ifdef MODULE 81 static const struct x86_cpu_id vmx_cpu_id[] = { 82 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 83 {} 84 }; 85 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 86 #endif 87 88 bool __read_mostly enable_vpid = 1; 89 module_param_named(vpid, enable_vpid, bool, 0444); 90 91 static bool __read_mostly enable_vnmi = 1; 92 module_param_named(vnmi, enable_vnmi, bool, 0444); 93 94 bool __read_mostly flexpriority_enabled = 1; 95 module_param_named(flexpriority, flexpriority_enabled, bool, 0444); 96 97 bool __read_mostly enable_ept = 1; 98 module_param_named(ept, enable_ept, bool, 0444); 99 100 bool __read_mostly enable_unrestricted_guest = 1; 101 module_param_named(unrestricted_guest, 102 enable_unrestricted_guest, bool, 0444); 103 104 bool __read_mostly enable_ept_ad_bits = 1; 105 module_param_named(eptad, enable_ept_ad_bits, bool, 0444); 106 107 static bool __read_mostly emulate_invalid_guest_state = true; 108 module_param(emulate_invalid_guest_state, bool, 0444); 109 110 static bool __read_mostly fasteoi = 1; 111 module_param(fasteoi, bool, 0444); 112 113 module_param(enable_apicv, bool, 0444); 114 115 bool __read_mostly enable_ipiv = true; 116 module_param(enable_ipiv, bool, 0444); 117 118 /* 119 * If nested=1, nested virtualization is supported, i.e., guests may use 120 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 121 * use VMX instructions. 122 */ 123 static bool __read_mostly nested = 1; 124 module_param(nested, bool, 0444); 125 126 bool __read_mostly enable_pml = 1; 127 module_param_named(pml, enable_pml, bool, 0444); 128 129 static bool __read_mostly error_on_inconsistent_vmcs_config = true; 130 module_param(error_on_inconsistent_vmcs_config, bool, 0444); 131 132 static bool __read_mostly dump_invalid_vmcs = 0; 133 module_param(dump_invalid_vmcs, bool, 0644); 134 135 #define MSR_BITMAP_MODE_X2APIC 1 136 #define MSR_BITMAP_MODE_X2APIC_APICV 2 137 138 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 139 140 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ 141 static int __read_mostly cpu_preemption_timer_multi; 142 static bool __read_mostly enable_preemption_timer = 1; 143 #ifdef CONFIG_X86_64 144 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 145 #endif 146 147 extern bool __read_mostly allow_smaller_maxphyaddr; 148 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 149 150 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 151 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 152 #define KVM_VM_CR0_ALWAYS_ON \ 153 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 154 155 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 156 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 157 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 158 159 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 160 161 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 162 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 163 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 164 RTIT_STATUS_BYTECNT)) 165 166 /* 167 * List of MSRs that can be directly passed to the guest. 168 * In addition to these x2apic, PT and LBR MSRs are handled specially. 169 */ 170 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { 171 MSR_IA32_SPEC_CTRL, 172 MSR_IA32_PRED_CMD, 173 MSR_IA32_FLUSH_CMD, 174 MSR_IA32_TSC, 175 #ifdef CONFIG_X86_64 176 MSR_FS_BASE, 177 MSR_GS_BASE, 178 MSR_KERNEL_GS_BASE, 179 MSR_IA32_XFD, 180 MSR_IA32_XFD_ERR, 181 #endif 182 MSR_IA32_SYSENTER_CS, 183 MSR_IA32_SYSENTER_ESP, 184 MSR_IA32_SYSENTER_EIP, 185 MSR_CORE_C1_RES, 186 MSR_CORE_C3_RESIDENCY, 187 MSR_CORE_C6_RESIDENCY, 188 MSR_CORE_C7_RESIDENCY, 189 }; 190 191 /* 192 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 193 * ple_gap: upper bound on the amount of time between two successive 194 * executions of PAUSE in a loop. Also indicate if ple enabled. 195 * According to test, this time is usually smaller than 128 cycles. 196 * ple_window: upper bound on the amount of time a guest is allowed to execute 197 * in a PAUSE loop. Tests indicate that most spinlocks are held for 198 * less than 2^12 cycles 199 * Time is measured based on a counter that runs at the same rate as the TSC, 200 * refer SDM volume 3b section 21.6.13 & 22.1.3. 201 */ 202 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 203 module_param(ple_gap, uint, 0444); 204 205 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 206 module_param(ple_window, uint, 0444); 207 208 /* Default doubles per-vcpu window every exit. */ 209 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 210 module_param(ple_window_grow, uint, 0444); 211 212 /* Default resets per-vcpu window every exit to ple_window. */ 213 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 214 module_param(ple_window_shrink, uint, 0444); 215 216 /* Default is to compute the maximum so we can never overflow. */ 217 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 218 module_param(ple_window_max, uint, 0444); 219 220 /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ 221 int __read_mostly pt_mode = PT_MODE_SYSTEM; 222 #ifdef CONFIG_BROKEN 223 module_param(pt_mode, int, S_IRUGO); 224 #endif 225 226 struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; 227 228 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 229 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 230 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 231 232 /* Storage for pre module init parameter parsing */ 233 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 234 235 static const struct { 236 const char *option; 237 bool for_parse; 238 } vmentry_l1d_param[] = { 239 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 240 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 241 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 242 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 243 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 244 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 245 }; 246 247 #define L1D_CACHE_ORDER 4 248 static void *vmx_l1d_flush_pages; 249 250 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 251 { 252 struct page *page; 253 unsigned int i; 254 255 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 256 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 257 return 0; 258 } 259 260 if (!enable_ept) { 261 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 262 return 0; 263 } 264 265 if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 266 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 267 return 0; 268 } 269 270 /* If set to auto use the default l1tf mitigation method */ 271 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 272 switch (l1tf_mitigation) { 273 case L1TF_MITIGATION_OFF: 274 l1tf = VMENTER_L1D_FLUSH_NEVER; 275 break; 276 case L1TF_MITIGATION_FLUSH_NOWARN: 277 case L1TF_MITIGATION_FLUSH: 278 case L1TF_MITIGATION_FLUSH_NOSMT: 279 l1tf = VMENTER_L1D_FLUSH_COND; 280 break; 281 case L1TF_MITIGATION_FULL: 282 case L1TF_MITIGATION_FULL_FORCE: 283 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 284 break; 285 } 286 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 287 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 288 } 289 290 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 291 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 292 /* 293 * This allocation for vmx_l1d_flush_pages is not tied to a VM 294 * lifetime and so should not be charged to a memcg. 295 */ 296 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 297 if (!page) 298 return -ENOMEM; 299 vmx_l1d_flush_pages = page_address(page); 300 301 /* 302 * Initialize each page with a different pattern in 303 * order to protect against KSM in the nested 304 * virtualization case. 305 */ 306 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 307 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 308 PAGE_SIZE); 309 } 310 } 311 312 l1tf_vmx_mitigation = l1tf; 313 314 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 315 static_branch_enable(&vmx_l1d_should_flush); 316 else 317 static_branch_disable(&vmx_l1d_should_flush); 318 319 if (l1tf == VMENTER_L1D_FLUSH_COND) 320 static_branch_enable(&vmx_l1d_flush_cond); 321 else 322 static_branch_disable(&vmx_l1d_flush_cond); 323 return 0; 324 } 325 326 static int vmentry_l1d_flush_parse(const char *s) 327 { 328 unsigned int i; 329 330 if (s) { 331 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 332 if (vmentry_l1d_param[i].for_parse && 333 sysfs_streq(s, vmentry_l1d_param[i].option)) 334 return i; 335 } 336 } 337 return -EINVAL; 338 } 339 340 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 341 { 342 int l1tf, ret; 343 344 l1tf = vmentry_l1d_flush_parse(s); 345 if (l1tf < 0) 346 return l1tf; 347 348 if (!boot_cpu_has(X86_BUG_L1TF)) 349 return 0; 350 351 /* 352 * Has vmx_init() run already? If not then this is the pre init 353 * parameter parsing. In that case just store the value and let 354 * vmx_init() do the proper setup after enable_ept has been 355 * established. 356 */ 357 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 358 vmentry_l1d_flush_param = l1tf; 359 return 0; 360 } 361 362 mutex_lock(&vmx_l1d_flush_mutex); 363 ret = vmx_setup_l1d_flush(l1tf); 364 mutex_unlock(&vmx_l1d_flush_mutex); 365 return ret; 366 } 367 368 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 369 { 370 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 371 return sysfs_emit(s, "???\n"); 372 373 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 374 } 375 376 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 377 { 378 u64 msr; 379 380 if (!vmx->disable_fb_clear) 381 return; 382 383 msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); 384 msr |= FB_CLEAR_DIS; 385 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); 386 /* Cache the MSR value to avoid reading it later */ 387 vmx->msr_ia32_mcu_opt_ctrl = msr; 388 } 389 390 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 391 { 392 if (!vmx->disable_fb_clear) 393 return; 394 395 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 396 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 397 } 398 399 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 400 { 401 /* 402 * Disable VERW's behavior of clearing CPU buffers for the guest if the 403 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 404 * the mitigation. Disabling the clearing behavior provides a 405 * performance boost for guests that aren't aware that manually clearing 406 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 407 * and VM-Exit. 408 */ 409 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 410 (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 411 !boot_cpu_has_bug(X86_BUG_MDS) && 412 !boot_cpu_has_bug(X86_BUG_TAA); 413 414 /* 415 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 416 * at VMEntry. Skip the MSR read/write when a guest has no use case to 417 * execute VERW. 418 */ 419 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 420 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 421 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 422 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 423 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 424 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 425 vmx->disable_fb_clear = false; 426 } 427 428 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 429 .set = vmentry_l1d_flush_set, 430 .get = vmentry_l1d_flush_get, 431 }; 432 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 433 434 static u32 vmx_segment_access_rights(struct kvm_segment *var); 435 436 void vmx_vmexit(void); 437 438 #define vmx_insn_failed(fmt...) \ 439 do { \ 440 WARN_ONCE(1, fmt); \ 441 pr_warn_ratelimited(fmt); \ 442 } while (0) 443 444 noinline void vmread_error(unsigned long field) 445 { 446 vmx_insn_failed("vmread failed: field=%lx\n", field); 447 } 448 449 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 450 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 451 { 452 if (fault) { 453 kvm_spurious_fault(); 454 } else { 455 instrumentation_begin(); 456 vmread_error(field); 457 instrumentation_end(); 458 } 459 } 460 #endif 461 462 noinline void vmwrite_error(unsigned long field, unsigned long value) 463 { 464 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 465 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 466 } 467 468 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 469 { 470 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 471 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 472 } 473 474 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 475 { 476 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 477 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 478 } 479 480 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 481 { 482 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 483 ext, vpid, gva); 484 } 485 486 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) 487 { 488 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", 489 ext, eptp, gpa); 490 } 491 492 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 493 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 494 /* 495 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 496 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 497 */ 498 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 499 500 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 501 static DEFINE_SPINLOCK(vmx_vpid_lock); 502 503 struct vmcs_config vmcs_config __ro_after_init; 504 struct vmx_capability vmx_capability __ro_after_init; 505 506 #define VMX_SEGMENT_FIELD(seg) \ 507 [VCPU_SREG_##seg] = { \ 508 .selector = GUEST_##seg##_SELECTOR, \ 509 .base = GUEST_##seg##_BASE, \ 510 .limit = GUEST_##seg##_LIMIT, \ 511 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 512 } 513 514 static const struct kvm_vmx_segment_field { 515 unsigned selector; 516 unsigned base; 517 unsigned limit; 518 unsigned ar_bytes; 519 } kvm_vmx_segment_fields[] = { 520 VMX_SEGMENT_FIELD(CS), 521 VMX_SEGMENT_FIELD(DS), 522 VMX_SEGMENT_FIELD(ES), 523 VMX_SEGMENT_FIELD(FS), 524 VMX_SEGMENT_FIELD(GS), 525 VMX_SEGMENT_FIELD(SS), 526 VMX_SEGMENT_FIELD(TR), 527 VMX_SEGMENT_FIELD(LDTR), 528 }; 529 530 531 static unsigned long host_idt_base; 532 533 #if IS_ENABLED(CONFIG_HYPERV) 534 static bool __read_mostly enlightened_vmcs = true; 535 module_param(enlightened_vmcs, bool, 0444); 536 537 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 538 { 539 struct hv_enlightened_vmcs *evmcs; 540 hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu); 541 542 if (partition_assist_page == INVALID_PAGE) 543 return -ENOMEM; 544 545 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 546 547 evmcs->partition_assist_page = partition_assist_page; 548 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 549 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 550 551 return 0; 552 } 553 554 static __init void hv_init_evmcs(void) 555 { 556 int cpu; 557 558 if (!enlightened_vmcs) 559 return; 560 561 /* 562 * Enlightened VMCS usage should be recommended and the host needs 563 * to support eVMCS v1 or above. 564 */ 565 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 566 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 567 KVM_EVMCS_VERSION) { 568 569 /* Check that we have assist pages on all online CPUs */ 570 for_each_online_cpu(cpu) { 571 if (!hv_get_vp_assist_page(cpu)) { 572 enlightened_vmcs = false; 573 break; 574 } 575 } 576 577 if (enlightened_vmcs) { 578 pr_info("Using Hyper-V Enlightened VMCS\n"); 579 static_branch_enable(&__kvm_is_using_evmcs); 580 } 581 582 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 583 vt_x86_ops.enable_l2_tlb_flush 584 = hv_enable_l2_tlb_flush; 585 } else { 586 enlightened_vmcs = false; 587 } 588 } 589 590 static void hv_reset_evmcs(void) 591 { 592 struct hv_vp_assist_page *vp_ap; 593 594 if (!kvm_is_using_evmcs()) 595 return; 596 597 /* 598 * KVM should enable eVMCS if and only if all CPUs have a VP assist 599 * page, and should reject CPU onlining if eVMCS is enabled the CPU 600 * doesn't have a VP assist page allocated. 601 */ 602 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 603 if (WARN_ON_ONCE(!vp_ap)) 604 return; 605 606 /* 607 * Reset everything to support using non-enlightened VMCS access later 608 * (e.g. when we reload the module with enlightened_vmcs=0) 609 */ 610 vp_ap->nested_control.features.directhypercall = 0; 611 vp_ap->current_nested_vmcs = 0; 612 vp_ap->enlighten_vmentry = 0; 613 } 614 615 #else /* IS_ENABLED(CONFIG_HYPERV) */ 616 static void hv_init_evmcs(void) {} 617 static void hv_reset_evmcs(void) {} 618 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 619 620 /* 621 * Comment's format: document - errata name - stepping - processor name. 622 * Refer from 623 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 624 */ 625 static u32 vmx_preemption_cpu_tfms[] = { 626 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 627 0x000206E6, 628 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 629 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 630 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 631 0x00020652, 632 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 633 0x00020655, 634 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 635 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 636 /* 637 * 320767.pdf - AAP86 - B1 - 638 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 639 */ 640 0x000106E5, 641 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 642 0x000106A0, 643 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 644 0x000106A1, 645 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 646 0x000106A4, 647 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 648 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 649 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 650 0x000106A5, 651 /* Xeon E3-1220 V2 */ 652 0x000306A8, 653 }; 654 655 static inline bool cpu_has_broken_vmx_preemption_timer(void) 656 { 657 u32 eax = cpuid_eax(0x00000001), i; 658 659 /* Clear the reserved bits */ 660 eax &= ~(0x3U << 14 | 0xfU << 28); 661 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 662 if (eax == vmx_preemption_cpu_tfms[i]) 663 return true; 664 665 return false; 666 } 667 668 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 669 { 670 return flexpriority_enabled && lapic_in_kernel(vcpu); 671 } 672 673 static int vmx_get_passthrough_msr_slot(u32 msr) 674 { 675 int i; 676 677 switch (msr) { 678 case 0x800 ... 0x8ff: 679 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ 680 return -ENOENT; 681 case MSR_IA32_RTIT_STATUS: 682 case MSR_IA32_RTIT_OUTPUT_BASE: 683 case MSR_IA32_RTIT_OUTPUT_MASK: 684 case MSR_IA32_RTIT_CR3_MATCH: 685 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 686 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 687 case MSR_LBR_SELECT: 688 case MSR_LBR_TOS: 689 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 690 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 691 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 692 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 693 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 694 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ 695 return -ENOENT; 696 } 697 698 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 699 if (vmx_possible_passthrough_msrs[i] == msr) 700 return i; 701 } 702 703 WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 704 return -ENOENT; 705 } 706 707 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 708 { 709 int i; 710 711 i = kvm_find_user_return_msr(msr); 712 if (i >= 0) 713 return &vmx->guest_uret_msrs[i]; 714 return NULL; 715 } 716 717 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 718 struct vmx_uret_msr *msr, u64 data) 719 { 720 unsigned int slot = msr - vmx->guest_uret_msrs; 721 int ret = 0; 722 723 if (msr->load_into_hardware) { 724 preempt_disable(); 725 ret = kvm_set_user_return_msr(slot, data, msr->mask); 726 preempt_enable(); 727 } 728 if (!ret) 729 msr->data = data; 730 return ret; 731 } 732 733 /* 734 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 735 * 736 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 737 * atomically track post-VMXON state, e.g. this may be called in NMI context. 738 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 739 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 740 * magically in RM, VM86, compat mode, or at CPL>0. 741 */ 742 static int kvm_cpu_vmxoff(void) 743 { 744 asm goto("1: vmxoff\n\t" 745 _ASM_EXTABLE(1b, %l[fault]) 746 ::: "cc", "memory" : fault); 747 748 cr4_clear_bits(X86_CR4_VMXE); 749 return 0; 750 751 fault: 752 cr4_clear_bits(X86_CR4_VMXE); 753 return -EIO; 754 } 755 756 void vmx_emergency_disable_virtualization_cpu(void) 757 { 758 int cpu = raw_smp_processor_id(); 759 struct loaded_vmcs *v; 760 761 kvm_rebooting = true; 762 763 /* 764 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 765 * set in task context. If this races with VMX is disabled by an NMI, 766 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 767 * kvm_rebooting set. 768 */ 769 if (!(__read_cr4() & X86_CR4_VMXE)) 770 return; 771 772 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 773 loaded_vmcss_on_cpu_link) 774 vmcs_clear(v->vmcs); 775 776 kvm_cpu_vmxoff(); 777 } 778 779 static void __loaded_vmcs_clear(void *arg) 780 { 781 struct loaded_vmcs *loaded_vmcs = arg; 782 int cpu = raw_smp_processor_id(); 783 784 if (loaded_vmcs->cpu != cpu) 785 return; /* vcpu migration can race with cpu offline */ 786 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 787 per_cpu(current_vmcs, cpu) = NULL; 788 789 vmcs_clear(loaded_vmcs->vmcs); 790 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 791 vmcs_clear(loaded_vmcs->shadow_vmcs); 792 793 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 794 795 /* 796 * Ensure all writes to loaded_vmcs, including deleting it from its 797 * current percpu list, complete before setting loaded_vmcs->cpu to 798 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 799 * and add loaded_vmcs to its percpu list before it's deleted from this 800 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 801 */ 802 smp_wmb(); 803 804 loaded_vmcs->cpu = -1; 805 loaded_vmcs->launched = 0; 806 } 807 808 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 809 { 810 int cpu = loaded_vmcs->cpu; 811 812 if (cpu != -1) 813 smp_call_function_single(cpu, 814 __loaded_vmcs_clear, loaded_vmcs, 1); 815 } 816 817 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 818 unsigned field) 819 { 820 bool ret; 821 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 822 823 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 824 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 825 vmx->segment_cache.bitmask = 0; 826 } 827 ret = vmx->segment_cache.bitmask & mask; 828 vmx->segment_cache.bitmask |= mask; 829 return ret; 830 } 831 832 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 833 { 834 u16 *p = &vmx->segment_cache.seg[seg].selector; 835 836 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 837 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 838 return *p; 839 } 840 841 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 842 { 843 ulong *p = &vmx->segment_cache.seg[seg].base; 844 845 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 846 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 847 return *p; 848 } 849 850 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 851 { 852 u32 *p = &vmx->segment_cache.seg[seg].limit; 853 854 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 855 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 856 return *p; 857 } 858 859 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 860 { 861 u32 *p = &vmx->segment_cache.seg[seg].ar; 862 863 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 864 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 865 return *p; 866 } 867 868 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 869 { 870 u32 eb; 871 872 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 873 (1u << DB_VECTOR) | (1u << AC_VECTOR); 874 /* 875 * #VE isn't used for VMX. To test against unexpected changes 876 * related to #VE for VMX, intercept unexpected #VE and warn on it. 877 */ 878 if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 879 eb |= 1u << VE_VECTOR; 880 /* 881 * Guest access to VMware backdoor ports could legitimately 882 * trigger #GP because of TSS I/O permission bitmap. 883 * We intercept those #GP and allow access to them anyway 884 * as VMware does. 885 */ 886 if (enable_vmware_backdoor) 887 eb |= (1u << GP_VECTOR); 888 if ((vcpu->guest_debug & 889 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 890 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 891 eb |= 1u << BP_VECTOR; 892 if (to_vmx(vcpu)->rmode.vm86_active) 893 eb = ~0; 894 if (!vmx_need_pf_intercept(vcpu)) 895 eb &= ~(1u << PF_VECTOR); 896 897 /* When we are running a nested L2 guest and L1 specified for it a 898 * certain exception bitmap, we must trap the same exceptions and pass 899 * them to L1. When running L2, we will only handle the exceptions 900 * specified above if L1 did not want them. 901 */ 902 if (is_guest_mode(vcpu)) 903 eb |= get_vmcs12(vcpu)->exception_bitmap; 904 else { 905 int mask = 0, match = 0; 906 907 if (enable_ept && (eb & (1u << PF_VECTOR))) { 908 /* 909 * If EPT is enabled, #PF is currently only intercepted 910 * if MAXPHYADDR is smaller on the guest than on the 911 * host. In that case we only care about present, 912 * non-reserved faults. For vmcs02, however, PFEC_MASK 913 * and PFEC_MATCH are set in prepare_vmcs02_rare. 914 */ 915 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 916 match = PFERR_PRESENT_MASK; 917 } 918 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 919 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 920 } 921 922 /* 923 * Disabling xfd interception indicates that dynamic xfeatures 924 * might be used in the guest. Always trap #NM in this case 925 * to save guest xfd_err timely. 926 */ 927 if (vcpu->arch.xfd_no_write_intercept) 928 eb |= (1u << NM_VECTOR); 929 930 vmcs_write32(EXCEPTION_BITMAP, eb); 931 } 932 933 /* 934 * Check if MSR is intercepted for currently loaded MSR bitmap. 935 */ 936 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 937 { 938 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 939 return true; 940 941 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 942 } 943 944 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 945 { 946 unsigned int flags = 0; 947 948 if (vmx->loaded_vmcs->launched) 949 flags |= VMX_RUN_VMRESUME; 950 951 /* 952 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 953 * to change it directly without causing a vmexit. In that case read 954 * it after vmexit and store it in vmx->spec_ctrl. 955 */ 956 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 957 flags |= VMX_RUN_SAVE_SPEC_CTRL; 958 959 return flags; 960 } 961 962 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 963 unsigned long entry, unsigned long exit) 964 { 965 vm_entry_controls_clearbit(vmx, entry); 966 vm_exit_controls_clearbit(vmx, exit); 967 } 968 969 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 970 { 971 unsigned int i; 972 973 for (i = 0; i < m->nr; ++i) { 974 if (m->val[i].index == msr) 975 return i; 976 } 977 return -ENOENT; 978 } 979 980 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 981 { 982 int i; 983 struct msr_autoload *m = &vmx->msr_autoload; 984 985 switch (msr) { 986 case MSR_EFER: 987 if (cpu_has_load_ia32_efer()) { 988 clear_atomic_switch_msr_special(vmx, 989 VM_ENTRY_LOAD_IA32_EFER, 990 VM_EXIT_LOAD_IA32_EFER); 991 return; 992 } 993 break; 994 case MSR_CORE_PERF_GLOBAL_CTRL: 995 if (cpu_has_load_perf_global_ctrl()) { 996 clear_atomic_switch_msr_special(vmx, 997 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 998 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 999 return; 1000 } 1001 break; 1002 } 1003 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1004 if (i < 0) 1005 goto skip_guest; 1006 --m->guest.nr; 1007 m->guest.val[i] = m->guest.val[m->guest.nr]; 1008 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1009 1010 skip_guest: 1011 i = vmx_find_loadstore_msr_slot(&m->host, msr); 1012 if (i < 0) 1013 return; 1014 1015 --m->host.nr; 1016 m->host.val[i] = m->host.val[m->host.nr]; 1017 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1018 } 1019 1020 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1021 unsigned long entry, unsigned long exit, 1022 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1023 u64 guest_val, u64 host_val) 1024 { 1025 vmcs_write64(guest_val_vmcs, guest_val); 1026 if (host_val_vmcs != HOST_IA32_EFER) 1027 vmcs_write64(host_val_vmcs, host_val); 1028 vm_entry_controls_setbit(vmx, entry); 1029 vm_exit_controls_setbit(vmx, exit); 1030 } 1031 1032 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1033 u64 guest_val, u64 host_val, bool entry_only) 1034 { 1035 int i, j = 0; 1036 struct msr_autoload *m = &vmx->msr_autoload; 1037 1038 switch (msr) { 1039 case MSR_EFER: 1040 if (cpu_has_load_ia32_efer()) { 1041 add_atomic_switch_msr_special(vmx, 1042 VM_ENTRY_LOAD_IA32_EFER, 1043 VM_EXIT_LOAD_IA32_EFER, 1044 GUEST_IA32_EFER, 1045 HOST_IA32_EFER, 1046 guest_val, host_val); 1047 return; 1048 } 1049 break; 1050 case MSR_CORE_PERF_GLOBAL_CTRL: 1051 if (cpu_has_load_perf_global_ctrl()) { 1052 add_atomic_switch_msr_special(vmx, 1053 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1054 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1055 GUEST_IA32_PERF_GLOBAL_CTRL, 1056 HOST_IA32_PERF_GLOBAL_CTRL, 1057 guest_val, host_val); 1058 return; 1059 } 1060 break; 1061 case MSR_IA32_PEBS_ENABLE: 1062 /* PEBS needs a quiescent period after being disabled (to write 1063 * a record). Disabling PEBS through VMX MSR swapping doesn't 1064 * provide that period, so a CPU could write host's record into 1065 * guest's memory. 1066 */ 1067 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 1068 } 1069 1070 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1071 if (!entry_only) 1072 j = vmx_find_loadstore_msr_slot(&m->host, msr); 1073 1074 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1075 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1076 printk_once(KERN_WARNING "Not enough msr switch entries. " 1077 "Can't add msr %x\n", msr); 1078 return; 1079 } 1080 if (i < 0) { 1081 i = m->guest.nr++; 1082 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1083 } 1084 m->guest.val[i].index = msr; 1085 m->guest.val[i].value = guest_val; 1086 1087 if (entry_only) 1088 return; 1089 1090 if (j < 0) { 1091 j = m->host.nr++; 1092 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1093 } 1094 m->host.val[j].index = msr; 1095 m->host.val[j].value = host_val; 1096 } 1097 1098 static bool update_transition_efer(struct vcpu_vmx *vmx) 1099 { 1100 u64 guest_efer = vmx->vcpu.arch.efer; 1101 u64 ignore_bits = 0; 1102 int i; 1103 1104 /* Shadow paging assumes NX to be available. */ 1105 if (!enable_ept) 1106 guest_efer |= EFER_NX; 1107 1108 /* 1109 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1110 */ 1111 ignore_bits |= EFER_SCE; 1112 #ifdef CONFIG_X86_64 1113 ignore_bits |= EFER_LMA | EFER_LME; 1114 /* SCE is meaningful only in long mode on Intel */ 1115 if (guest_efer & EFER_LMA) 1116 ignore_bits &= ~(u64)EFER_SCE; 1117 #endif 1118 1119 /* 1120 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1121 * On CPUs that support "load IA32_EFER", always switch EFER 1122 * atomically, since it's faster than switching it manually. 1123 */ 1124 if (cpu_has_load_ia32_efer() || 1125 (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { 1126 if (!(guest_efer & EFER_LMA)) 1127 guest_efer &= ~EFER_LME; 1128 if (guest_efer != kvm_host.efer) 1129 add_atomic_switch_msr(vmx, MSR_EFER, 1130 guest_efer, kvm_host.efer, false); 1131 else 1132 clear_atomic_switch_msr(vmx, MSR_EFER); 1133 return false; 1134 } 1135 1136 i = kvm_find_user_return_msr(MSR_EFER); 1137 if (i < 0) 1138 return false; 1139 1140 clear_atomic_switch_msr(vmx, MSR_EFER); 1141 1142 guest_efer &= ~ignore_bits; 1143 guest_efer |= kvm_host.efer & ignore_bits; 1144 1145 vmx->guest_uret_msrs[i].data = guest_efer; 1146 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1147 1148 return true; 1149 } 1150 1151 #ifdef CONFIG_X86_32 1152 /* 1153 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1154 * VMCS rather than the segment table. KVM uses this helper to figure 1155 * out the current bases to poke them into the VMCS before entry. 1156 */ 1157 static unsigned long segment_base(u16 selector) 1158 { 1159 struct desc_struct *table; 1160 unsigned long v; 1161 1162 if (!(selector & ~SEGMENT_RPL_MASK)) 1163 return 0; 1164 1165 table = get_current_gdt_ro(); 1166 1167 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1168 u16 ldt_selector = kvm_read_ldt(); 1169 1170 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1171 return 0; 1172 1173 table = (struct desc_struct *)segment_base(ldt_selector); 1174 } 1175 v = get_desc_base(&table[selector >> 3]); 1176 return v; 1177 } 1178 #endif 1179 1180 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1181 { 1182 return vmx_pt_mode_is_host_guest() && 1183 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1184 } 1185 1186 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1187 { 1188 /* The base must be 128-byte aligned and a legal physical address. */ 1189 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1190 } 1191 1192 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1193 { 1194 u32 i; 1195 1196 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1197 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1198 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1199 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1200 for (i = 0; i < addr_range; i++) { 1201 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1202 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1203 } 1204 } 1205 1206 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1207 { 1208 u32 i; 1209 1210 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1211 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1212 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1213 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1214 for (i = 0; i < addr_range; i++) { 1215 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1216 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1217 } 1218 } 1219 1220 static void pt_guest_enter(struct vcpu_vmx *vmx) 1221 { 1222 if (vmx_pt_mode_is_system()) 1223 return; 1224 1225 /* 1226 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1227 * Save host state before VM entry. 1228 */ 1229 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1230 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1231 wrmsrl(MSR_IA32_RTIT_CTL, 0); 1232 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1233 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1234 } 1235 } 1236 1237 static void pt_guest_exit(struct vcpu_vmx *vmx) 1238 { 1239 if (vmx_pt_mode_is_system()) 1240 return; 1241 1242 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1243 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1244 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1245 } 1246 1247 /* 1248 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1249 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 1250 */ 1251 if (vmx->pt_desc.host.ctl) 1252 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1253 } 1254 1255 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1256 unsigned long fs_base, unsigned long gs_base) 1257 { 1258 if (unlikely(fs_sel != host->fs_sel)) { 1259 if (!(fs_sel & 7)) 1260 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1261 else 1262 vmcs_write16(HOST_FS_SELECTOR, 0); 1263 host->fs_sel = fs_sel; 1264 } 1265 if (unlikely(gs_sel != host->gs_sel)) { 1266 if (!(gs_sel & 7)) 1267 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1268 else 1269 vmcs_write16(HOST_GS_SELECTOR, 0); 1270 host->gs_sel = gs_sel; 1271 } 1272 if (unlikely(fs_base != host->fs_base)) { 1273 vmcs_writel(HOST_FS_BASE, fs_base); 1274 host->fs_base = fs_base; 1275 } 1276 if (unlikely(gs_base != host->gs_base)) { 1277 vmcs_writel(HOST_GS_BASE, gs_base); 1278 host->gs_base = gs_base; 1279 } 1280 } 1281 1282 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1283 { 1284 struct vcpu_vmx *vmx = to_vmx(vcpu); 1285 struct vmcs_host_state *host_state; 1286 #ifdef CONFIG_X86_64 1287 int cpu = raw_smp_processor_id(); 1288 #endif 1289 unsigned long fs_base, gs_base; 1290 u16 fs_sel, gs_sel; 1291 int i; 1292 1293 /* 1294 * Note that guest MSRs to be saved/restored can also be changed 1295 * when guest state is loaded. This happens when guest transitions 1296 * to/from long-mode by setting MSR_EFER.LMA. 1297 */ 1298 if (!vmx->guest_uret_msrs_loaded) { 1299 vmx->guest_uret_msrs_loaded = true; 1300 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1301 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1302 continue; 1303 1304 kvm_set_user_return_msr(i, 1305 vmx->guest_uret_msrs[i].data, 1306 vmx->guest_uret_msrs[i].mask); 1307 } 1308 } 1309 1310 if (vmx->nested.need_vmcs12_to_shadow_sync) 1311 nested_sync_vmcs12_to_shadow(vcpu); 1312 1313 if (vmx->guest_state_loaded) 1314 return; 1315 1316 host_state = &vmx->loaded_vmcs->host_state; 1317 1318 /* 1319 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1320 * allow segment selectors with cpl > 0 or ti == 1. 1321 */ 1322 host_state->ldt_sel = kvm_read_ldt(); 1323 1324 #ifdef CONFIG_X86_64 1325 savesegment(ds, host_state->ds_sel); 1326 savesegment(es, host_state->es_sel); 1327 1328 gs_base = cpu_kernelmode_gs_base(cpu); 1329 if (likely(is_64bit_mm(current->mm))) { 1330 current_save_fsgs(); 1331 fs_sel = current->thread.fsindex; 1332 gs_sel = current->thread.gsindex; 1333 fs_base = current->thread.fsbase; 1334 vmx->msr_host_kernel_gs_base = current->thread.gsbase; 1335 } else { 1336 savesegment(fs, fs_sel); 1337 savesegment(gs, gs_sel); 1338 fs_base = read_msr(MSR_FS_BASE); 1339 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1340 } 1341 1342 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1343 #else 1344 savesegment(fs, fs_sel); 1345 savesegment(gs, gs_sel); 1346 fs_base = segment_base(fs_sel); 1347 gs_base = segment_base(gs_sel); 1348 #endif 1349 1350 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1351 vmx->guest_state_loaded = true; 1352 } 1353 1354 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1355 { 1356 struct vmcs_host_state *host_state; 1357 1358 if (!vmx->guest_state_loaded) 1359 return; 1360 1361 host_state = &vmx->loaded_vmcs->host_state; 1362 1363 ++vmx->vcpu.stat.host_state_reload; 1364 1365 #ifdef CONFIG_X86_64 1366 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1367 #endif 1368 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1369 kvm_load_ldt(host_state->ldt_sel); 1370 #ifdef CONFIG_X86_64 1371 load_gs_index(host_state->gs_sel); 1372 #else 1373 loadsegment(gs, host_state->gs_sel); 1374 #endif 1375 } 1376 if (host_state->fs_sel & 7) 1377 loadsegment(fs, host_state->fs_sel); 1378 #ifdef CONFIG_X86_64 1379 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1380 loadsegment(ds, host_state->ds_sel); 1381 loadsegment(es, host_state->es_sel); 1382 } 1383 #endif 1384 invalidate_tss_limit(); 1385 #ifdef CONFIG_X86_64 1386 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1387 #endif 1388 load_fixmap_gdt(raw_smp_processor_id()); 1389 vmx->guest_state_loaded = false; 1390 vmx->guest_uret_msrs_loaded = false; 1391 } 1392 1393 #ifdef CONFIG_X86_64 1394 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1395 { 1396 preempt_disable(); 1397 if (vmx->guest_state_loaded) 1398 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1399 preempt_enable(); 1400 return vmx->msr_guest_kernel_gs_base; 1401 } 1402 1403 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 1404 { 1405 preempt_disable(); 1406 if (vmx->guest_state_loaded) 1407 wrmsrl(MSR_KERNEL_GS_BASE, data); 1408 preempt_enable(); 1409 vmx->msr_guest_kernel_gs_base = data; 1410 } 1411 #endif 1412 1413 static void grow_ple_window(struct kvm_vcpu *vcpu) 1414 { 1415 struct vcpu_vmx *vmx = to_vmx(vcpu); 1416 unsigned int old = vmx->ple_window; 1417 1418 vmx->ple_window = __grow_ple_window(old, ple_window, 1419 ple_window_grow, 1420 ple_window_max); 1421 1422 if (vmx->ple_window != old) { 1423 vmx->ple_window_dirty = true; 1424 trace_kvm_ple_window_update(vcpu->vcpu_id, 1425 vmx->ple_window, old); 1426 } 1427 } 1428 1429 static void shrink_ple_window(struct kvm_vcpu *vcpu) 1430 { 1431 struct vcpu_vmx *vmx = to_vmx(vcpu); 1432 unsigned int old = vmx->ple_window; 1433 1434 vmx->ple_window = __shrink_ple_window(old, ple_window, 1435 ple_window_shrink, 1436 ple_window); 1437 1438 if (vmx->ple_window != old) { 1439 vmx->ple_window_dirty = true; 1440 trace_kvm_ple_window_update(vcpu->vcpu_id, 1441 vmx->ple_window, old); 1442 } 1443 } 1444 1445 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, 1446 struct loaded_vmcs *buddy) 1447 { 1448 struct vcpu_vmx *vmx = to_vmx(vcpu); 1449 bool already_loaded = vmx->loaded_vmcs->cpu == cpu; 1450 struct vmcs *prev; 1451 1452 if (!already_loaded) { 1453 loaded_vmcs_clear(vmx->loaded_vmcs); 1454 local_irq_disable(); 1455 1456 /* 1457 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to 1458 * this cpu's percpu list, otherwise it may not yet be deleted 1459 * from its previous cpu's percpu list. Pairs with the 1460 * smb_wmb() in __loaded_vmcs_clear(). 1461 */ 1462 smp_rmb(); 1463 1464 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1465 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1466 local_irq_enable(); 1467 } 1468 1469 prev = per_cpu(current_vmcs, cpu); 1470 if (prev != vmx->loaded_vmcs->vmcs) { 1471 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1472 vmcs_load(vmx->loaded_vmcs->vmcs); 1473 1474 /* 1475 * No indirect branch prediction barrier needed when switching 1476 * the active VMCS within a vCPU, unless IBRS is advertised to 1477 * the vCPU. To minimize the number of IBPBs executed, KVM 1478 * performs IBPB on nested VM-Exit (a single nested transition 1479 * may switch the active VMCS multiple times). 1480 */ 1481 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) 1482 indirect_branch_prediction_barrier(); 1483 } 1484 1485 if (!already_loaded) { 1486 void *gdt = get_current_gdt_ro(); 1487 1488 /* 1489 * Flush all EPTP/VPID contexts, the new pCPU may have stale 1490 * TLB entries from its previous association with the vCPU. 1491 */ 1492 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1493 1494 /* 1495 * Linux uses per-cpu TSS and GDT, so set these when switching 1496 * processors. See 22.2.4. 1497 */ 1498 vmcs_writel(HOST_TR_BASE, 1499 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1500 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1501 1502 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { 1503 /* 22.2.3 */ 1504 vmcs_writel(HOST_IA32_SYSENTER_ESP, 1505 (unsigned long)(cpu_entry_stack(cpu) + 1)); 1506 } 1507 1508 vmx->loaded_vmcs->cpu = cpu; 1509 } 1510 } 1511 1512 /* 1513 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1514 * vcpu mutex is already taken. 1515 */ 1516 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1517 { 1518 struct vcpu_vmx *vmx = to_vmx(vcpu); 1519 1520 if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) 1521 shrink_ple_window(vcpu); 1522 1523 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1524 1525 vmx_vcpu_pi_load(vcpu, cpu); 1526 1527 vmx->host_debugctlmsr = get_debugctlmsr(); 1528 } 1529 1530 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1531 { 1532 vmx_vcpu_pi_put(vcpu); 1533 1534 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1535 } 1536 1537 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1538 { 1539 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1540 } 1541 1542 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1543 { 1544 struct vcpu_vmx *vmx = to_vmx(vcpu); 1545 unsigned long rflags, save_rflags; 1546 1547 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1548 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1549 rflags = vmcs_readl(GUEST_RFLAGS); 1550 if (vmx->rmode.vm86_active) { 1551 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1552 save_rflags = vmx->rmode.save_rflags; 1553 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1554 } 1555 vmx->rflags = rflags; 1556 } 1557 return vmx->rflags; 1558 } 1559 1560 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1561 { 1562 struct vcpu_vmx *vmx = to_vmx(vcpu); 1563 unsigned long old_rflags; 1564 1565 /* 1566 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1567 * is an unrestricted guest in order to mark L2 as needing emulation 1568 * if L1 runs L2 as a restricted guest. 1569 */ 1570 if (is_unrestricted_guest(vcpu)) { 1571 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1572 vmx->rflags = rflags; 1573 vmcs_writel(GUEST_RFLAGS, rflags); 1574 return; 1575 } 1576 1577 old_rflags = vmx_get_rflags(vcpu); 1578 vmx->rflags = rflags; 1579 if (vmx->rmode.vm86_active) { 1580 vmx->rmode.save_rflags = rflags; 1581 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1582 } 1583 vmcs_writel(GUEST_RFLAGS, rflags); 1584 1585 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1586 vmx->emulation_required = vmx_emulation_required(vcpu); 1587 } 1588 1589 bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1590 { 1591 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1592 } 1593 1594 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1595 { 1596 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1597 int ret = 0; 1598 1599 if (interruptibility & GUEST_INTR_STATE_STI) 1600 ret |= KVM_X86_SHADOW_INT_STI; 1601 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1602 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1603 1604 return ret; 1605 } 1606 1607 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1608 { 1609 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1610 u32 interruptibility = interruptibility_old; 1611 1612 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1613 1614 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1615 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1616 else if (mask & KVM_X86_SHADOW_INT_STI) 1617 interruptibility |= GUEST_INTR_STATE_STI; 1618 1619 if ((interruptibility != interruptibility_old)) 1620 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1621 } 1622 1623 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1624 { 1625 struct vcpu_vmx *vmx = to_vmx(vcpu); 1626 unsigned long value; 1627 1628 /* 1629 * Any MSR write that attempts to change bits marked reserved will 1630 * case a #GP fault. 1631 */ 1632 if (data & vmx->pt_desc.ctl_bitmask) 1633 return 1; 1634 1635 /* 1636 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1637 * result in a #GP unless the same write also clears TraceEn. 1638 */ 1639 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1640 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) 1641 return 1; 1642 1643 /* 1644 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1645 * and FabricEn would cause #GP, if 1646 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1647 */ 1648 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1649 !(data & RTIT_CTL_FABRIC_EN) && 1650 !intel_pt_validate_cap(vmx->pt_desc.caps, 1651 PT_CAP_single_range_output)) 1652 return 1; 1653 1654 /* 1655 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1656 * utilize encodings marked reserved will cause a #GP fault. 1657 */ 1658 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1659 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1660 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1661 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1662 return 1; 1663 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1664 PT_CAP_cycle_thresholds); 1665 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1666 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1667 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1668 return 1; 1669 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1670 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1671 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1672 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1673 return 1; 1674 1675 /* 1676 * If ADDRx_CFG is reserved or the encodings is >2 will 1677 * cause a #GP fault. 1678 */ 1679 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1680 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1681 return 1; 1682 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1683 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1684 return 1; 1685 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1686 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1687 return 1; 1688 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1689 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1690 return 1; 1691 1692 return 0; 1693 } 1694 1695 int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1696 void *insn, int insn_len) 1697 { 1698 /* 1699 * Emulation of instructions in SGX enclaves is impossible as RIP does 1700 * not point at the failing instruction, and even if it did, the code 1701 * stream is inaccessible. Inject #UD instead of exiting to userspace 1702 * so that guest userspace can't DoS the guest simply by triggering 1703 * emulation (enclaves are CPL3 only). 1704 */ 1705 if (to_vmx(vcpu)->exit_reason.enclave_mode) { 1706 kvm_queue_exception(vcpu, UD_VECTOR); 1707 return X86EMUL_PROPAGATE_FAULT; 1708 } 1709 return X86EMUL_CONTINUE; 1710 } 1711 1712 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1713 { 1714 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; 1715 unsigned long rip, orig_rip; 1716 u32 instr_len; 1717 1718 /* 1719 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1720 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1721 * set when EPT misconfig occurs. In practice, real hardware updates 1722 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1723 * (namely Hyper-V) don't set it due to it being undefined behavior, 1724 * i.e. we end up advancing IP with some random value. 1725 */ 1726 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1727 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1728 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1729 1730 /* 1731 * Emulating an enclave's instructions isn't supported as KVM 1732 * cannot access the enclave's memory or its true RIP, e.g. the 1733 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1734 * the RIP that actually triggered the VM-Exit. But, because 1735 * most instructions that cause VM-Exit will #UD in an enclave, 1736 * most instruction-based VM-Exits simply do not occur. 1737 * 1738 * There are a few exceptions, notably the debug instructions 1739 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1740 * and generate #DB/#BP as expected, which KVM might intercept. 1741 * But again, the CPU does the dirty work and saves an instr 1742 * length of zero so VMMs don't shoot themselves in the foot. 1743 * WARN if KVM tries to skip a non-zero length instruction on 1744 * a VM-Exit from an enclave. 1745 */ 1746 if (!instr_len) 1747 goto rip_updated; 1748 1749 WARN_ONCE(exit_reason.enclave_mode, 1750 "skipping instruction after SGX enclave VM-Exit"); 1751 1752 orig_rip = kvm_rip_read(vcpu); 1753 rip = orig_rip + instr_len; 1754 #ifdef CONFIG_X86_64 1755 /* 1756 * We need to mask out the high 32 bits of RIP if not in 64-bit 1757 * mode, but just finding out that we are in 64-bit mode is 1758 * quite expensive. Only do it if there was a carry. 1759 */ 1760 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1761 rip = (u32)rip; 1762 #endif 1763 kvm_rip_write(vcpu, rip); 1764 } else { 1765 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1766 return 0; 1767 } 1768 1769 rip_updated: 1770 /* skipping an emulated instruction also counts */ 1771 vmx_set_interrupt_shadow(vcpu, 0); 1772 1773 return 1; 1774 } 1775 1776 /* 1777 * Recognizes a pending MTF VM-exit and records the nested state for later 1778 * delivery. 1779 */ 1780 void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1781 { 1782 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1783 struct vcpu_vmx *vmx = to_vmx(vcpu); 1784 1785 if (!is_guest_mode(vcpu)) 1786 return; 1787 1788 /* 1789 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1790 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1791 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1792 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1793 * as ICEBP is higher priority than both. As instruction emulation is 1794 * completed at this point (i.e. KVM is at the instruction boundary), 1795 * any #DB exception pending delivery must be a debug-trap of lower 1796 * priority than MTF. Record the pending MTF state to be delivered in 1797 * vmx_check_nested_events(). 1798 */ 1799 if (nested_cpu_has_mtf(vmcs12) && 1800 (!vcpu->arch.exception.pending || 1801 vcpu->arch.exception.vector == DB_VECTOR) && 1802 (!vcpu->arch.exception_vmexit.pending || 1803 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1804 vmx->nested.mtf_pending = true; 1805 kvm_make_request(KVM_REQ_EVENT, vcpu); 1806 } else { 1807 vmx->nested.mtf_pending = false; 1808 } 1809 } 1810 1811 int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1812 { 1813 vmx_update_emulated_instruction(vcpu); 1814 return skip_emulated_instruction(vcpu); 1815 } 1816 1817 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1818 { 1819 /* 1820 * Ensure that we clear the HLT state in the VMCS. We don't need to 1821 * explicitly skip the instruction because if the HLT state is set, 1822 * then the instruction is already executing and RIP has already been 1823 * advanced. 1824 */ 1825 if (kvm_hlt_in_guest(vcpu->kvm) && 1826 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1827 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1828 } 1829 1830 void vmx_inject_exception(struct kvm_vcpu *vcpu) 1831 { 1832 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1833 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1834 struct vcpu_vmx *vmx = to_vmx(vcpu); 1835 1836 kvm_deliver_exception_payload(vcpu, ex); 1837 1838 if (ex->has_error_code) { 1839 /* 1840 * Despite the error code being architecturally defined as 32 1841 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1842 * VMX don't actually supporting setting bits 31:16. Hardware 1843 * will (should) never provide a bogus error code, but AMD CPUs 1844 * do generate error codes with bits 31:16 set, and so KVM's 1845 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1846 * the upper bits to avoid VM-Fail, losing information that 1847 * doesn't really exist is preferable to killing the VM. 1848 */ 1849 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1850 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1851 } 1852 1853 if (vmx->rmode.vm86_active) { 1854 int inc_eip = 0; 1855 if (kvm_exception_is_soft(ex->vector)) 1856 inc_eip = vcpu->arch.event_exit_inst_len; 1857 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1858 return; 1859 } 1860 1861 WARN_ON_ONCE(vmx->emulation_required); 1862 1863 if (kvm_exception_is_soft(ex->vector)) { 1864 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1865 vmx->vcpu.arch.event_exit_inst_len); 1866 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1867 } else 1868 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1869 1870 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1871 1872 vmx_clear_hlt(vcpu); 1873 } 1874 1875 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1876 bool load_into_hardware) 1877 { 1878 struct vmx_uret_msr *uret_msr; 1879 1880 uret_msr = vmx_find_uret_msr(vmx, msr); 1881 if (!uret_msr) 1882 return; 1883 1884 uret_msr->load_into_hardware = load_into_hardware; 1885 } 1886 1887 /* 1888 * Configuring user return MSRs to automatically save, load, and restore MSRs 1889 * that need to be shoved into hardware when running the guest. Note, omitting 1890 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1891 * loaded into hardware when running the guest. 1892 */ 1893 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1894 { 1895 #ifdef CONFIG_X86_64 1896 bool load_syscall_msrs; 1897 1898 /* 1899 * The SYSCALL MSRs are only needed on long mode guests, and only 1900 * when EFER.SCE is set. 1901 */ 1902 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1903 (vmx->vcpu.arch.efer & EFER_SCE); 1904 1905 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1906 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1907 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1908 #endif 1909 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1910 1911 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1912 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1913 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1914 1915 /* 1916 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1917 * kernel and old userspace. If those guests run on a tsx=off host, do 1918 * allow guests to use TSX_CTRL, but don't change the value in hardware 1919 * so that TSX remains always disabled. 1920 */ 1921 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1922 1923 /* 1924 * The set of MSRs to load may have changed, reload MSRs before the 1925 * next VM-Enter. 1926 */ 1927 vmx->guest_uret_msrs_loaded = false; 1928 } 1929 1930 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1931 { 1932 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1933 1934 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1935 return vmcs12->tsc_offset; 1936 1937 return 0; 1938 } 1939 1940 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1941 { 1942 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1943 1944 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1945 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1946 return vmcs12->tsc_multiplier; 1947 1948 return kvm_caps.default_tsc_scaling_ratio; 1949 } 1950 1951 void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1952 { 1953 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1954 } 1955 1956 void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1957 { 1958 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1959 } 1960 1961 /* 1962 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1963 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1964 * backwards compatibility even though KVM doesn't support emulating SMX. And 1965 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1966 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1967 */ 1968 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1969 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1970 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1971 FEAT_CTL_SGX_LC_ENABLED | \ 1972 FEAT_CTL_SGX_ENABLED | \ 1973 FEAT_CTL_LMCE_ENABLED) 1974 1975 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1976 struct msr_data *msr) 1977 { 1978 uint64_t valid_bits; 1979 1980 /* 1981 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1982 * exposed to the guest. 1983 */ 1984 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1985 ~KVM_SUPPORTED_FEATURE_CONTROL); 1986 1987 if (!msr->host_initiated && 1988 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1989 return false; 1990 1991 if (msr->host_initiated) 1992 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 1993 else 1994 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 1995 1996 return !(msr->data & ~valid_bits); 1997 } 1998 1999 int vmx_get_feature_msr(u32 msr, u64 *data) 2000 { 2001 switch (msr) { 2002 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2003 if (!nested) 2004 return 1; 2005 return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); 2006 default: 2007 return KVM_MSR_RET_UNSUPPORTED; 2008 } 2009 } 2010 2011 /* 2012 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 2013 * Returns 0 on success, non-0 otherwise. 2014 * Assumes vcpu_load() was already called. 2015 */ 2016 int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2017 { 2018 struct vcpu_vmx *vmx = to_vmx(vcpu); 2019 struct vmx_uret_msr *msr; 2020 u32 index; 2021 2022 switch (msr_info->index) { 2023 #ifdef CONFIG_X86_64 2024 case MSR_FS_BASE: 2025 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2026 break; 2027 case MSR_GS_BASE: 2028 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2029 break; 2030 case MSR_KERNEL_GS_BASE: 2031 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2032 break; 2033 #endif 2034 case MSR_EFER: 2035 return kvm_get_msr_common(vcpu, msr_info); 2036 case MSR_IA32_TSX_CTRL: 2037 if (!msr_info->host_initiated && 2038 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2039 return 1; 2040 goto find_uret_msr; 2041 case MSR_IA32_UMWAIT_CONTROL: 2042 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2043 return 1; 2044 2045 msr_info->data = vmx->msr_ia32_umwait_control; 2046 break; 2047 case MSR_IA32_SPEC_CTRL: 2048 if (!msr_info->host_initiated && 2049 !guest_has_spec_ctrl_msr(vcpu)) 2050 return 1; 2051 2052 msr_info->data = to_vmx(vcpu)->spec_ctrl; 2053 break; 2054 case MSR_IA32_SYSENTER_CS: 2055 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2056 break; 2057 case MSR_IA32_SYSENTER_EIP: 2058 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2059 break; 2060 case MSR_IA32_SYSENTER_ESP: 2061 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2062 break; 2063 case MSR_IA32_BNDCFGS: 2064 if (!kvm_mpx_supported() || 2065 (!msr_info->host_initiated && 2066 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2067 return 1; 2068 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2069 break; 2070 case MSR_IA32_MCG_EXT_CTL: 2071 if (!msr_info->host_initiated && 2072 !(vmx->msr_ia32_feature_control & 2073 FEAT_CTL_LMCE_ENABLED)) 2074 return 1; 2075 msr_info->data = vcpu->arch.mcg_ext_ctl; 2076 break; 2077 case MSR_IA32_FEAT_CTL: 2078 msr_info->data = vmx->msr_ia32_feature_control; 2079 break; 2080 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2081 if (!msr_info->host_initiated && 2082 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 2083 return 1; 2084 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2085 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2086 break; 2087 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2088 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 2089 return 1; 2090 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2091 &msr_info->data)) 2092 return 1; 2093 #ifdef CONFIG_KVM_HYPERV 2094 /* 2095 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2096 * instead of just ignoring the features, different Hyper-V 2097 * versions are either trying to use them and fail or do some 2098 * sanity checking and refuse to boot. Filter all unsupported 2099 * features out. 2100 */ 2101 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) 2102 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2103 &msr_info->data); 2104 #endif 2105 break; 2106 case MSR_IA32_RTIT_CTL: 2107 if (!vmx_pt_mode_is_host_guest()) 2108 return 1; 2109 msr_info->data = vmx->pt_desc.guest.ctl; 2110 break; 2111 case MSR_IA32_RTIT_STATUS: 2112 if (!vmx_pt_mode_is_host_guest()) 2113 return 1; 2114 msr_info->data = vmx->pt_desc.guest.status; 2115 break; 2116 case MSR_IA32_RTIT_CR3_MATCH: 2117 if (!vmx_pt_mode_is_host_guest() || 2118 !intel_pt_validate_cap(vmx->pt_desc.caps, 2119 PT_CAP_cr3_filtering)) 2120 return 1; 2121 msr_info->data = vmx->pt_desc.guest.cr3_match; 2122 break; 2123 case MSR_IA32_RTIT_OUTPUT_BASE: 2124 if (!vmx_pt_mode_is_host_guest() || 2125 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2126 PT_CAP_topa_output) && 2127 !intel_pt_validate_cap(vmx->pt_desc.caps, 2128 PT_CAP_single_range_output))) 2129 return 1; 2130 msr_info->data = vmx->pt_desc.guest.output_base; 2131 break; 2132 case MSR_IA32_RTIT_OUTPUT_MASK: 2133 if (!vmx_pt_mode_is_host_guest() || 2134 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2135 PT_CAP_topa_output) && 2136 !intel_pt_validate_cap(vmx->pt_desc.caps, 2137 PT_CAP_single_range_output))) 2138 return 1; 2139 msr_info->data = vmx->pt_desc.guest.output_mask; 2140 break; 2141 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2142 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2143 if (!vmx_pt_mode_is_host_guest() || 2144 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2145 return 1; 2146 if (index % 2) 2147 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2148 else 2149 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2150 break; 2151 case MSR_IA32_DEBUGCTLMSR: 2152 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2153 break; 2154 default: 2155 find_uret_msr: 2156 msr = vmx_find_uret_msr(vmx, msr_info->index); 2157 if (msr) { 2158 msr_info->data = msr->data; 2159 break; 2160 } 2161 return kvm_get_msr_common(vcpu, msr_info); 2162 } 2163 2164 return 0; 2165 } 2166 2167 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2168 u64 data) 2169 { 2170 #ifdef CONFIG_X86_64 2171 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) 2172 return (u32)data; 2173 #endif 2174 return (unsigned long)data; 2175 } 2176 2177 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2178 { 2179 u64 debugctl = 0; 2180 2181 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2182 (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2183 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2184 2185 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2186 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2187 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2188 2189 return debugctl; 2190 } 2191 2192 /* 2193 * Writes msr value into the appropriate "register". 2194 * Returns 0 on success, non-0 otherwise. 2195 * Assumes vcpu_load() was already called. 2196 */ 2197 int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2198 { 2199 struct vcpu_vmx *vmx = to_vmx(vcpu); 2200 struct vmx_uret_msr *msr; 2201 int ret = 0; 2202 u32 msr_index = msr_info->index; 2203 u64 data = msr_info->data; 2204 u32 index; 2205 2206 switch (msr_index) { 2207 case MSR_EFER: 2208 ret = kvm_set_msr_common(vcpu, msr_info); 2209 break; 2210 #ifdef CONFIG_X86_64 2211 case MSR_FS_BASE: 2212 vmx_segment_cache_clear(vmx); 2213 vmcs_writel(GUEST_FS_BASE, data); 2214 break; 2215 case MSR_GS_BASE: 2216 vmx_segment_cache_clear(vmx); 2217 vmcs_writel(GUEST_GS_BASE, data); 2218 break; 2219 case MSR_KERNEL_GS_BASE: 2220 vmx_write_guest_kernel_gs_base(vmx, data); 2221 break; 2222 case MSR_IA32_XFD: 2223 ret = kvm_set_msr_common(vcpu, msr_info); 2224 /* 2225 * Always intercepting WRMSR could incur non-negligible 2226 * overhead given xfd might be changed frequently in 2227 * guest context switch. Disable write interception 2228 * upon the first write with a non-zero value (indicating 2229 * potential usage on dynamic xfeatures). Also update 2230 * exception bitmap to trap #NM for proper virtualization 2231 * of guest xfd_err. 2232 */ 2233 if (!ret && data) { 2234 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2235 MSR_TYPE_RW); 2236 vcpu->arch.xfd_no_write_intercept = true; 2237 vmx_update_exception_bitmap(vcpu); 2238 } 2239 break; 2240 #endif 2241 case MSR_IA32_SYSENTER_CS: 2242 if (is_guest_mode(vcpu)) 2243 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2244 vmcs_write32(GUEST_SYSENTER_CS, data); 2245 break; 2246 case MSR_IA32_SYSENTER_EIP: 2247 if (is_guest_mode(vcpu)) { 2248 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2249 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2250 } 2251 vmcs_writel(GUEST_SYSENTER_EIP, data); 2252 break; 2253 case MSR_IA32_SYSENTER_ESP: 2254 if (is_guest_mode(vcpu)) { 2255 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2256 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2257 } 2258 vmcs_writel(GUEST_SYSENTER_ESP, data); 2259 break; 2260 case MSR_IA32_DEBUGCTLMSR: { 2261 u64 invalid; 2262 2263 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2264 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2265 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2266 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2267 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2268 } 2269 2270 if (invalid) 2271 return 1; 2272 2273 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2274 VM_EXIT_SAVE_DEBUG_CONTROLS) 2275 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2276 2277 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2278 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2279 (data & DEBUGCTLMSR_LBR)) 2280 intel_pmu_create_guest_lbr_event(vcpu); 2281 return 0; 2282 } 2283 case MSR_IA32_BNDCFGS: 2284 if (!kvm_mpx_supported() || 2285 (!msr_info->host_initiated && 2286 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2287 return 1; 2288 if (is_noncanonical_address(data & PAGE_MASK, vcpu) || 2289 (data & MSR_IA32_BNDCFGS_RSVD)) 2290 return 1; 2291 2292 if (is_guest_mode(vcpu) && 2293 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2294 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2295 get_vmcs12(vcpu)->guest_bndcfgs = data; 2296 2297 vmcs_write64(GUEST_BNDCFGS, data); 2298 break; 2299 case MSR_IA32_UMWAIT_CONTROL: 2300 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2301 return 1; 2302 2303 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2304 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2305 return 1; 2306 2307 vmx->msr_ia32_umwait_control = data; 2308 break; 2309 case MSR_IA32_SPEC_CTRL: 2310 if (!msr_info->host_initiated && 2311 !guest_has_spec_ctrl_msr(vcpu)) 2312 return 1; 2313 2314 if (kvm_spec_ctrl_test_value(data)) 2315 return 1; 2316 2317 vmx->spec_ctrl = data; 2318 if (!data) 2319 break; 2320 2321 /* 2322 * For non-nested: 2323 * When it's written (to non-zero) for the first time, pass 2324 * it through. 2325 * 2326 * For nested: 2327 * The handling of the MSR bitmap for L2 guests is done in 2328 * nested_vmx_prepare_msr_bitmap. We should not touch the 2329 * vmcs02.msr_bitmap here since it gets completely overwritten 2330 * in the merging. We update the vmcs01 here for L1 as well 2331 * since it will end up touching the MSR anyway now. 2332 */ 2333 vmx_disable_intercept_for_msr(vcpu, 2334 MSR_IA32_SPEC_CTRL, 2335 MSR_TYPE_RW); 2336 break; 2337 case MSR_IA32_TSX_CTRL: 2338 if (!msr_info->host_initiated && 2339 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2340 return 1; 2341 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2342 return 1; 2343 goto find_uret_msr; 2344 case MSR_IA32_CR_PAT: 2345 ret = kvm_set_msr_common(vcpu, msr_info); 2346 if (ret) 2347 break; 2348 2349 if (is_guest_mode(vcpu) && 2350 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2351 get_vmcs12(vcpu)->guest_ia32_pat = data; 2352 2353 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2354 vmcs_write64(GUEST_IA32_PAT, data); 2355 break; 2356 case MSR_IA32_MCG_EXT_CTL: 2357 if ((!msr_info->host_initiated && 2358 !(to_vmx(vcpu)->msr_ia32_feature_control & 2359 FEAT_CTL_LMCE_ENABLED)) || 2360 (data & ~MCG_EXT_CTL_LMCE_EN)) 2361 return 1; 2362 vcpu->arch.mcg_ext_ctl = data; 2363 break; 2364 case MSR_IA32_FEAT_CTL: 2365 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2366 return 1; 2367 2368 vmx->msr_ia32_feature_control = data; 2369 if (msr_info->host_initiated && data == 0) 2370 vmx_leave_nested(vcpu); 2371 2372 /* SGX may be enabled/disabled by guest's firmware */ 2373 vmx_write_encls_bitmap(vcpu, NULL); 2374 break; 2375 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2376 /* 2377 * On real hardware, the LE hash MSRs are writable before 2378 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2379 * at which point SGX related bits in IA32_FEATURE_CONTROL 2380 * become writable. 2381 * 2382 * KVM does not emulate SGX activation for simplicity, so 2383 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2384 * is unlocked. This is technically not architectural 2385 * behavior, but it's close enough. 2386 */ 2387 if (!msr_info->host_initiated && 2388 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || 2389 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2390 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2391 return 1; 2392 vmx->msr_ia32_sgxlepubkeyhash 2393 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2394 break; 2395 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2396 if (!msr_info->host_initiated) 2397 return 1; /* they are read-only */ 2398 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 2399 return 1; 2400 return vmx_set_vmx_msr(vcpu, msr_index, data); 2401 case MSR_IA32_RTIT_CTL: 2402 if (!vmx_pt_mode_is_host_guest() || 2403 vmx_rtit_ctl_check(vcpu, data) || 2404 vmx->nested.vmxon) 2405 return 1; 2406 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2407 vmx->pt_desc.guest.ctl = data; 2408 pt_update_intercept_for_msr(vcpu); 2409 break; 2410 case MSR_IA32_RTIT_STATUS: 2411 if (!pt_can_write_msr(vmx)) 2412 return 1; 2413 if (data & MSR_IA32_RTIT_STATUS_MASK) 2414 return 1; 2415 vmx->pt_desc.guest.status = data; 2416 break; 2417 case MSR_IA32_RTIT_CR3_MATCH: 2418 if (!pt_can_write_msr(vmx)) 2419 return 1; 2420 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2421 PT_CAP_cr3_filtering)) 2422 return 1; 2423 vmx->pt_desc.guest.cr3_match = data; 2424 break; 2425 case MSR_IA32_RTIT_OUTPUT_BASE: 2426 if (!pt_can_write_msr(vmx)) 2427 return 1; 2428 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2429 PT_CAP_topa_output) && 2430 !intel_pt_validate_cap(vmx->pt_desc.caps, 2431 PT_CAP_single_range_output)) 2432 return 1; 2433 if (!pt_output_base_valid(vcpu, data)) 2434 return 1; 2435 vmx->pt_desc.guest.output_base = data; 2436 break; 2437 case MSR_IA32_RTIT_OUTPUT_MASK: 2438 if (!pt_can_write_msr(vmx)) 2439 return 1; 2440 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2441 PT_CAP_topa_output) && 2442 !intel_pt_validate_cap(vmx->pt_desc.caps, 2443 PT_CAP_single_range_output)) 2444 return 1; 2445 vmx->pt_desc.guest.output_mask = data; 2446 break; 2447 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2448 if (!pt_can_write_msr(vmx)) 2449 return 1; 2450 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2451 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2452 return 1; 2453 if (is_noncanonical_address(data, vcpu)) 2454 return 1; 2455 if (index % 2) 2456 vmx->pt_desc.guest.addr_b[index / 2] = data; 2457 else 2458 vmx->pt_desc.guest.addr_a[index / 2] = data; 2459 break; 2460 case MSR_IA32_PERF_CAPABILITIES: 2461 if (data && !vcpu_to_pmu(vcpu)->version) 2462 return 1; 2463 if (data & PMU_CAP_LBR_FMT) { 2464 if ((data & PMU_CAP_LBR_FMT) != 2465 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2466 return 1; 2467 if (!cpuid_model_is_consistent(vcpu)) 2468 return 1; 2469 } 2470 if (data & PERF_CAP_PEBS_FORMAT) { 2471 if ((data & PERF_CAP_PEBS_MASK) != 2472 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2473 return 1; 2474 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) 2475 return 1; 2476 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) 2477 return 1; 2478 if (!cpuid_model_is_consistent(vcpu)) 2479 return 1; 2480 } 2481 ret = kvm_set_msr_common(vcpu, msr_info); 2482 break; 2483 2484 default: 2485 find_uret_msr: 2486 msr = vmx_find_uret_msr(vmx, msr_index); 2487 if (msr) 2488 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2489 else 2490 ret = kvm_set_msr_common(vcpu, msr_info); 2491 } 2492 2493 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2494 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2495 vmx_update_fb_clear_dis(vcpu, vmx); 2496 2497 return ret; 2498 } 2499 2500 void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2501 { 2502 unsigned long guest_owned_bits; 2503 2504 kvm_register_mark_available(vcpu, reg); 2505 2506 switch (reg) { 2507 case VCPU_REGS_RSP: 2508 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2509 break; 2510 case VCPU_REGS_RIP: 2511 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2512 break; 2513 case VCPU_EXREG_PDPTR: 2514 if (enable_ept) 2515 ept_save_pdptrs(vcpu); 2516 break; 2517 case VCPU_EXREG_CR0: 2518 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2519 2520 vcpu->arch.cr0 &= ~guest_owned_bits; 2521 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2522 break; 2523 case VCPU_EXREG_CR3: 2524 /* 2525 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2526 * CR3 is loaded into hardware, not the guest's CR3. 2527 */ 2528 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2529 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2530 break; 2531 case VCPU_EXREG_CR4: 2532 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2533 2534 vcpu->arch.cr4 &= ~guest_owned_bits; 2535 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2536 break; 2537 default: 2538 KVM_BUG_ON(1, vcpu->kvm); 2539 break; 2540 } 2541 } 2542 2543 /* 2544 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2545 * directly instead of going through cpu_has(), to ensure KVM is trapping 2546 * ENCLS whenever it's supported in hardware. It does not matter whether 2547 * the host OS supports or has enabled SGX. 2548 */ 2549 static bool cpu_has_sgx(void) 2550 { 2551 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2552 } 2553 2554 /* 2555 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2556 * can't be used due to errata where VM Exit may incorrectly clear 2557 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the 2558 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2559 */ 2560 static bool cpu_has_perf_global_ctrl_bug(void) 2561 { 2562 switch (boot_cpu_data.x86_vfm) { 2563 case INTEL_NEHALEM_EP: /* AAK155 */ 2564 case INTEL_NEHALEM: /* AAP115 */ 2565 case INTEL_WESTMERE: /* AAT100 */ 2566 case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2567 case INTEL_NEHALEM_EX: /* BA97 */ 2568 return true; 2569 default: 2570 break; 2571 } 2572 2573 return false; 2574 } 2575 2576 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2577 { 2578 u32 vmx_msr_low, vmx_msr_high; 2579 u32 ctl = ctl_min | ctl_opt; 2580 2581 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2582 2583 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2584 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2585 2586 /* Ensure minimum (required) set of control bits are supported. */ 2587 if (ctl_min & ~ctl) 2588 return -EIO; 2589 2590 *result = ctl; 2591 return 0; 2592 } 2593 2594 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2595 { 2596 u64 allowed; 2597 2598 rdmsrl(msr, allowed); 2599 2600 return ctl_opt & allowed; 2601 } 2602 2603 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2604 struct vmx_capability *vmx_cap) 2605 { 2606 u32 _pin_based_exec_control = 0; 2607 u32 _cpu_based_exec_control = 0; 2608 u32 _cpu_based_2nd_exec_control = 0; 2609 u64 _cpu_based_3rd_exec_control = 0; 2610 u32 _vmexit_control = 0; 2611 u32 _vmentry_control = 0; 2612 u64 basic_msr; 2613 u64 misc_msr; 2614 int i; 2615 2616 /* 2617 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2618 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2619 * intercepts writes to PAT and EFER, i.e. never enables those controls. 2620 */ 2621 struct { 2622 u32 entry_control; 2623 u32 exit_control; 2624 } const vmcs_entry_exit_pairs[] = { 2625 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2626 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2627 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2628 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2629 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2630 }; 2631 2632 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2633 2634 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, 2635 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, 2636 MSR_IA32_VMX_PROCBASED_CTLS, 2637 &_cpu_based_exec_control)) 2638 return -EIO; 2639 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2640 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, 2641 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, 2642 MSR_IA32_VMX_PROCBASED_CTLS2, 2643 &_cpu_based_2nd_exec_control)) 2644 return -EIO; 2645 } 2646 if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) 2647 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2648 2649 #ifndef CONFIG_X86_64 2650 if (!(_cpu_based_2nd_exec_control & 2651 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2652 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2653 #endif 2654 2655 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2656 _cpu_based_2nd_exec_control &= ~( 2657 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2658 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2659 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2660 2661 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2662 &vmx_cap->ept, &vmx_cap->vpid); 2663 2664 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 2665 vmx_cap->ept) { 2666 pr_warn_once("EPT CAP should not exist if not support " 2667 "1-setting enable EPT VM-execution control\n"); 2668 2669 if (error_on_inconsistent_vmcs_config) 2670 return -EIO; 2671 2672 vmx_cap->ept = 0; 2673 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 2674 } 2675 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2676 vmx_cap->vpid) { 2677 pr_warn_once("VPID CAP should not exist if not support " 2678 "1-setting enable VPID VM-execution control\n"); 2679 2680 if (error_on_inconsistent_vmcs_config) 2681 return -EIO; 2682 2683 vmx_cap->vpid = 0; 2684 } 2685 2686 if (!cpu_has_sgx()) 2687 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; 2688 2689 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) 2690 _cpu_based_3rd_exec_control = 2691 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, 2692 MSR_IA32_VMX_PROCBASED_CTLS3); 2693 2694 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, 2695 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, 2696 MSR_IA32_VMX_EXIT_CTLS, 2697 &_vmexit_control)) 2698 return -EIO; 2699 2700 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, 2701 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, 2702 MSR_IA32_VMX_PINBASED_CTLS, 2703 &_pin_based_exec_control)) 2704 return -EIO; 2705 2706 if (cpu_has_broken_vmx_preemption_timer()) 2707 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2708 if (!(_cpu_based_2nd_exec_control & 2709 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2710 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2711 2712 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, 2713 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, 2714 MSR_IA32_VMX_ENTRY_CTLS, 2715 &_vmentry_control)) 2716 return -EIO; 2717 2718 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { 2719 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; 2720 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; 2721 2722 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) 2723 continue; 2724 2725 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", 2726 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); 2727 2728 if (error_on_inconsistent_vmcs_config) 2729 return -EIO; 2730 2731 _vmentry_control &= ~n_ctrl; 2732 _vmexit_control &= ~x_ctrl; 2733 } 2734 2735 rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); 2736 2737 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2738 if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) 2739 return -EIO; 2740 2741 #ifdef CONFIG_X86_64 2742 /* 2743 * KVM expects to be able to shove all legal physical addresses into 2744 * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always 2745 * 0 for processors that support Intel 64 architecture". 2746 */ 2747 if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 2748 return -EIO; 2749 #endif 2750 2751 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2752 if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) 2753 return -EIO; 2754 2755 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2756 2757 vmcs_conf->basic = basic_msr; 2758 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2759 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2760 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2761 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2762 vmcs_conf->vmexit_ctrl = _vmexit_control; 2763 vmcs_conf->vmentry_ctrl = _vmentry_control; 2764 vmcs_conf->misc = misc_msr; 2765 2766 #if IS_ENABLED(CONFIG_HYPERV) 2767 if (enlightened_vmcs) 2768 evmcs_sanitize_exec_ctrls(vmcs_conf); 2769 #endif 2770 2771 return 0; 2772 } 2773 2774 static bool __kvm_is_vmx_supported(void) 2775 { 2776 int cpu = smp_processor_id(); 2777 2778 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2779 pr_err("VMX not supported by CPU %d\n", cpu); 2780 return false; 2781 } 2782 2783 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2784 !this_cpu_has(X86_FEATURE_VMX)) { 2785 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2786 return false; 2787 } 2788 2789 return true; 2790 } 2791 2792 static bool kvm_is_vmx_supported(void) 2793 { 2794 bool supported; 2795 2796 migrate_disable(); 2797 supported = __kvm_is_vmx_supported(); 2798 migrate_enable(); 2799 2800 return supported; 2801 } 2802 2803 int vmx_check_processor_compat(void) 2804 { 2805 int cpu = raw_smp_processor_id(); 2806 struct vmcs_config vmcs_conf; 2807 struct vmx_capability vmx_cap; 2808 2809 if (!__kvm_is_vmx_supported()) 2810 return -EIO; 2811 2812 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2813 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2814 return -EIO; 2815 } 2816 if (nested) 2817 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2818 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2819 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2820 return -EIO; 2821 } 2822 return 0; 2823 } 2824 2825 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2826 { 2827 u64 msr; 2828 2829 cr4_set_bits(X86_CR4_VMXE); 2830 2831 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2832 _ASM_EXTABLE(1b, %l[fault]) 2833 : : [vmxon_pointer] "m"(vmxon_pointer) 2834 : : fault); 2835 return 0; 2836 2837 fault: 2838 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2839 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); 2840 cr4_clear_bits(X86_CR4_VMXE); 2841 2842 return -EFAULT; 2843 } 2844 2845 int vmx_enable_virtualization_cpu(void) 2846 { 2847 int cpu = raw_smp_processor_id(); 2848 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2849 int r; 2850 2851 if (cr4_read_shadow() & X86_CR4_VMXE) 2852 return -EBUSY; 2853 2854 /* 2855 * This can happen if we hot-added a CPU but failed to allocate 2856 * VP assist page for it. 2857 */ 2858 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2859 return -EFAULT; 2860 2861 intel_pt_handle_vmx(1); 2862 2863 r = kvm_cpu_vmxon(phys_addr); 2864 if (r) { 2865 intel_pt_handle_vmx(0); 2866 return r; 2867 } 2868 2869 return 0; 2870 } 2871 2872 static void vmclear_local_loaded_vmcss(void) 2873 { 2874 int cpu = raw_smp_processor_id(); 2875 struct loaded_vmcs *v, *n; 2876 2877 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2878 loaded_vmcss_on_cpu_link) 2879 __loaded_vmcs_clear(v); 2880 } 2881 2882 void vmx_disable_virtualization_cpu(void) 2883 { 2884 vmclear_local_loaded_vmcss(); 2885 2886 if (kvm_cpu_vmxoff()) 2887 kvm_spurious_fault(); 2888 2889 hv_reset_evmcs(); 2890 2891 intel_pt_handle_vmx(0); 2892 } 2893 2894 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2895 { 2896 int node = cpu_to_node(cpu); 2897 struct page *pages; 2898 struct vmcs *vmcs; 2899 2900 pages = __alloc_pages_node(node, flags, 0); 2901 if (!pages) 2902 return NULL; 2903 vmcs = page_address(pages); 2904 memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); 2905 2906 /* KVM supports Enlightened VMCS v1 only */ 2907 if (kvm_is_using_evmcs()) 2908 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2909 else 2910 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 2911 2912 if (shadow) 2913 vmcs->hdr.shadow_vmcs = 1; 2914 return vmcs; 2915 } 2916 2917 void free_vmcs(struct vmcs *vmcs) 2918 { 2919 free_page((unsigned long)vmcs); 2920 } 2921 2922 /* 2923 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2924 */ 2925 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2926 { 2927 if (!loaded_vmcs->vmcs) 2928 return; 2929 loaded_vmcs_clear(loaded_vmcs); 2930 free_vmcs(loaded_vmcs->vmcs); 2931 loaded_vmcs->vmcs = NULL; 2932 if (loaded_vmcs->msr_bitmap) 2933 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2934 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2935 } 2936 2937 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2938 { 2939 loaded_vmcs->vmcs = alloc_vmcs(false); 2940 if (!loaded_vmcs->vmcs) 2941 return -ENOMEM; 2942 2943 vmcs_clear(loaded_vmcs->vmcs); 2944 2945 loaded_vmcs->shadow_vmcs = NULL; 2946 loaded_vmcs->hv_timer_soft_disabled = false; 2947 loaded_vmcs->cpu = -1; 2948 loaded_vmcs->launched = 0; 2949 2950 if (cpu_has_vmx_msr_bitmap()) { 2951 loaded_vmcs->msr_bitmap = (unsigned long *) 2952 __get_free_page(GFP_KERNEL_ACCOUNT); 2953 if (!loaded_vmcs->msr_bitmap) 2954 goto out_vmcs; 2955 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2956 } 2957 2958 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2959 memset(&loaded_vmcs->controls_shadow, 0, 2960 sizeof(struct vmcs_controls_shadow)); 2961 2962 return 0; 2963 2964 out_vmcs: 2965 free_loaded_vmcs(loaded_vmcs); 2966 return -ENOMEM; 2967 } 2968 2969 static void free_kvm_area(void) 2970 { 2971 int cpu; 2972 2973 for_each_possible_cpu(cpu) { 2974 free_vmcs(per_cpu(vmxarea, cpu)); 2975 per_cpu(vmxarea, cpu) = NULL; 2976 } 2977 } 2978 2979 static __init int alloc_kvm_area(void) 2980 { 2981 int cpu; 2982 2983 for_each_possible_cpu(cpu) { 2984 struct vmcs *vmcs; 2985 2986 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2987 if (!vmcs) { 2988 free_kvm_area(); 2989 return -ENOMEM; 2990 } 2991 2992 /* 2993 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2994 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2995 * revision_id reported by MSR_IA32_VMX_BASIC. 2996 * 2997 * However, even though not explicitly documented by 2998 * TLFS, VMXArea passed as VMXON argument should 2999 * still be marked with revision_id reported by 3000 * physical CPU. 3001 */ 3002 if (kvm_is_using_evmcs()) 3003 vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); 3004 3005 per_cpu(vmxarea, cpu) = vmcs; 3006 } 3007 return 0; 3008 } 3009 3010 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 3011 struct kvm_segment *save) 3012 { 3013 if (!emulate_invalid_guest_state) { 3014 /* 3015 * CS and SS RPL should be equal during guest entry according 3016 * to VMX spec, but in reality it is not always so. Since vcpu 3017 * is in the middle of the transition from real mode to 3018 * protected mode it is safe to assume that RPL 0 is a good 3019 * default value. 3020 */ 3021 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 3022 save->selector &= ~SEGMENT_RPL_MASK; 3023 save->dpl = save->selector & SEGMENT_RPL_MASK; 3024 save->s = 1; 3025 } 3026 __vmx_set_segment(vcpu, save, seg); 3027 } 3028 3029 static void enter_pmode(struct kvm_vcpu *vcpu) 3030 { 3031 unsigned long flags; 3032 struct vcpu_vmx *vmx = to_vmx(vcpu); 3033 3034 /* 3035 * Update real mode segment cache. It may be not up-to-date if segment 3036 * register was written while vcpu was in a guest mode. 3037 */ 3038 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3039 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3040 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3041 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3042 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3043 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3044 3045 vmx->rmode.vm86_active = 0; 3046 3047 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3048 3049 flags = vmcs_readl(GUEST_RFLAGS); 3050 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3051 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3052 vmcs_writel(GUEST_RFLAGS, flags); 3053 3054 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3055 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3056 3057 vmx_update_exception_bitmap(vcpu); 3058 3059 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3060 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3061 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3062 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3063 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3064 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3065 } 3066 3067 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3068 { 3069 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3070 struct kvm_segment var = *save; 3071 3072 var.dpl = 0x3; 3073 if (seg == VCPU_SREG_CS) 3074 var.type = 0x3; 3075 3076 if (!emulate_invalid_guest_state) { 3077 var.selector = var.base >> 4; 3078 var.base = var.base & 0xffff0; 3079 var.limit = 0xffff; 3080 var.g = 0; 3081 var.db = 0; 3082 var.present = 1; 3083 var.s = 1; 3084 var.l = 0; 3085 var.unusable = 0; 3086 var.type = 0x3; 3087 var.avl = 0; 3088 if (save->base & 0xf) 3089 pr_warn_once("segment base is not paragraph aligned " 3090 "when entering protected mode (seg=%d)", seg); 3091 } 3092 3093 vmcs_write16(sf->selector, var.selector); 3094 vmcs_writel(sf->base, var.base); 3095 vmcs_write32(sf->limit, var.limit); 3096 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3097 } 3098 3099 static void enter_rmode(struct kvm_vcpu *vcpu) 3100 { 3101 unsigned long flags; 3102 struct vcpu_vmx *vmx = to_vmx(vcpu); 3103 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3104 3105 /* 3106 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3107 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3108 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3109 * should VM-Fail and KVM should reject userspace attempts to stuff 3110 * CR0.PG=0 when L2 is active. 3111 */ 3112 WARN_ON_ONCE(is_guest_mode(vcpu)); 3113 3114 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3115 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3116 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3117 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3118 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3119 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3120 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3121 3122 vmx->rmode.vm86_active = 1; 3123 3124 vmx_segment_cache_clear(vmx); 3125 3126 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3127 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3128 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3129 3130 flags = vmcs_readl(GUEST_RFLAGS); 3131 vmx->rmode.save_rflags = flags; 3132 3133 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3134 3135 vmcs_writel(GUEST_RFLAGS, flags); 3136 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3137 vmx_update_exception_bitmap(vcpu); 3138 3139 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3140 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3141 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3142 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3143 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3144 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3145 } 3146 3147 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3148 { 3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3150 3151 /* Nothing to do if hardware doesn't support EFER. */ 3152 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3153 return 0; 3154 3155 vcpu->arch.efer = efer; 3156 #ifdef CONFIG_X86_64 3157 if (efer & EFER_LMA) 3158 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3159 else 3160 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3161 #else 3162 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3163 return 1; 3164 #endif 3165 3166 vmx_setup_uret_msrs(vmx); 3167 return 0; 3168 } 3169 3170 #ifdef CONFIG_X86_64 3171 3172 static void enter_lmode(struct kvm_vcpu *vcpu) 3173 { 3174 u32 guest_tr_ar; 3175 3176 vmx_segment_cache_clear(to_vmx(vcpu)); 3177 3178 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3179 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3180 pr_debug_ratelimited("%s: tss fixup for long mode. \n", 3181 __func__); 3182 vmcs_write32(GUEST_TR_AR_BYTES, 3183 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3184 | VMX_AR_TYPE_BUSY_64_TSS); 3185 } 3186 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3187 } 3188 3189 static void exit_lmode(struct kvm_vcpu *vcpu) 3190 { 3191 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3192 } 3193 3194 #endif 3195 3196 void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3197 { 3198 struct vcpu_vmx *vmx = to_vmx(vcpu); 3199 3200 /* 3201 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3202 * the CPU is not required to invalidate guest-physical mappings on 3203 * VM-Entry, even if VPID is disabled. Guest-physical mappings are 3204 * associated with the root EPT structure and not any particular VPID 3205 * (INVVPID also isn't required to invalidate guest-physical mappings). 3206 */ 3207 if (enable_ept) { 3208 ept_sync_global(); 3209 } else if (enable_vpid) { 3210 if (cpu_has_vmx_invvpid_global()) { 3211 vpid_sync_vcpu_global(); 3212 } else { 3213 vpid_sync_vcpu_single(vmx->vpid); 3214 vpid_sync_vcpu_single(vmx->nested.vpid02); 3215 } 3216 } 3217 } 3218 3219 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3220 { 3221 if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) 3222 return nested_get_vpid02(vcpu); 3223 return to_vmx(vcpu)->vpid; 3224 } 3225 3226 void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3227 { 3228 struct kvm_mmu *mmu = vcpu->arch.mmu; 3229 u64 root_hpa = mmu->root.hpa; 3230 3231 /* No flush required if the current context is invalid. */ 3232 if (!VALID_PAGE(root_hpa)) 3233 return; 3234 3235 if (enable_ept) 3236 ept_sync_context(construct_eptp(vcpu, root_hpa, 3237 mmu->root_role.level)); 3238 else 3239 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3240 } 3241 3242 void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3243 { 3244 /* 3245 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3246 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3247 */ 3248 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3249 } 3250 3251 void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3252 { 3253 /* 3254 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3255 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3256 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3257 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3258 * i.e. no explicit INVVPID is necessary. 3259 */ 3260 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3261 } 3262 3263 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3264 { 3265 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3266 3267 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3268 return; 3269 3270 if (is_pae_paging(vcpu)) { 3271 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3272 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3273 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3274 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3275 } 3276 } 3277 3278 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3279 { 3280 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3281 3282 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3283 return; 3284 3285 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3286 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3287 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3288 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3289 3290 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3291 } 3292 3293 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3294 CPU_BASED_CR3_STORE_EXITING) 3295 3296 bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3297 { 3298 if (is_guest_mode(vcpu)) 3299 return nested_guest_cr0_valid(vcpu, cr0); 3300 3301 if (to_vmx(vcpu)->nested.vmxon) 3302 return nested_host_cr0_valid(vcpu, cr0); 3303 3304 return true; 3305 } 3306 3307 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3308 { 3309 struct vcpu_vmx *vmx = to_vmx(vcpu); 3310 unsigned long hw_cr0, old_cr0_pg; 3311 u32 tmp; 3312 3313 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3314 3315 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3316 if (enable_unrestricted_guest) 3317 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3318 else { 3319 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3320 if (!enable_ept) 3321 hw_cr0 |= X86_CR0_WP; 3322 3323 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3324 enter_pmode(vcpu); 3325 3326 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3327 enter_rmode(vcpu); 3328 } 3329 3330 vmcs_writel(CR0_READ_SHADOW, cr0); 3331 vmcs_writel(GUEST_CR0, hw_cr0); 3332 vcpu->arch.cr0 = cr0; 3333 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3334 3335 #ifdef CONFIG_X86_64 3336 if (vcpu->arch.efer & EFER_LME) { 3337 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3338 enter_lmode(vcpu); 3339 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3340 exit_lmode(vcpu); 3341 } 3342 #endif 3343 3344 if (enable_ept && !enable_unrestricted_guest) { 3345 /* 3346 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3347 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3348 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3349 * KVM's CR3 is installed. 3350 */ 3351 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3352 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3353 3354 /* 3355 * When running with EPT but not unrestricted guest, KVM must 3356 * intercept CR3 accesses when paging is _disabled_. This is 3357 * necessary because restricted guests can't actually run with 3358 * paging disabled, and so KVM stuffs its own CR3 in order to 3359 * run the guest when identity mapped page tables. 3360 * 3361 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3362 * update, it may be stale with respect to CR3 interception, 3363 * e.g. after nested VM-Enter. 3364 * 3365 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3366 * stores to forward them to L1, even if KVM does not need to 3367 * intercept them to preserve its identity mapped page tables. 3368 */ 3369 if (!(cr0 & X86_CR0_PG)) { 3370 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3371 } else if (!is_guest_mode(vcpu)) { 3372 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3373 } else { 3374 tmp = exec_controls_get(vmx); 3375 tmp &= ~CR3_EXITING_BITS; 3376 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3377 exec_controls_set(vmx, tmp); 3378 } 3379 3380 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ 3381 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3382 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3383 3384 /* 3385 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3386 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3387 */ 3388 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3389 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3390 } 3391 3392 /* depends on vcpu->arch.cr0 to be set to a new value */ 3393 vmx->emulation_required = vmx_emulation_required(vcpu); 3394 } 3395 3396 static int vmx_get_max_ept_level(void) 3397 { 3398 if (cpu_has_vmx_ept_5levels()) 3399 return 5; 3400 return 4; 3401 } 3402 3403 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3404 { 3405 u64 eptp = VMX_EPTP_MT_WB; 3406 3407 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3408 3409 if (enable_ept_ad_bits && 3410 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3411 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3412 eptp |= root_hpa; 3413 3414 return eptp; 3415 } 3416 3417 void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3418 { 3419 struct kvm *kvm = vcpu->kvm; 3420 bool update_guest_cr3 = true; 3421 unsigned long guest_cr3; 3422 u64 eptp; 3423 3424 if (enable_ept) { 3425 eptp = construct_eptp(vcpu, root_hpa, root_level); 3426 vmcs_write64(EPT_POINTER, eptp); 3427 3428 hv_track_root_tdp(vcpu, root_hpa); 3429 3430 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3431 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3432 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3433 guest_cr3 = vcpu->arch.cr3; 3434 else /* vmcs.GUEST_CR3 is already up-to-date. */ 3435 update_guest_cr3 = false; 3436 vmx_ept_load_pdptrs(vcpu); 3437 } else { 3438 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu) | 3439 kvm_get_active_cr3_lam_bits(vcpu); 3440 } 3441 3442 if (update_guest_cr3) 3443 vmcs_writel(GUEST_CR3, guest_cr3); 3444 } 3445 3446 bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3447 { 3448 /* 3449 * We operate under the default treatment of SMM, so VMX cannot be 3450 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3451 * i.e. is a reserved bit, is handled by common x86 code. 3452 */ 3453 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3454 return false; 3455 3456 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3457 return false; 3458 3459 return true; 3460 } 3461 3462 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3463 { 3464 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3465 struct vcpu_vmx *vmx = to_vmx(vcpu); 3466 unsigned long hw_cr4; 3467 3468 /* 3469 * Pass through host's Machine Check Enable value to hw_cr4, which 3470 * is in force while we are in guest mode. Do not let guests control 3471 * this bit, even if host CR4.MCE == 0. 3472 */ 3473 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3474 if (enable_unrestricted_guest) 3475 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3476 else if (vmx->rmode.vm86_active) 3477 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3478 else 3479 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3480 3481 if (vmx_umip_emulated()) { 3482 if (cr4 & X86_CR4_UMIP) { 3483 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3484 hw_cr4 &= ~X86_CR4_UMIP; 3485 } else if (!is_guest_mode(vcpu) || 3486 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3487 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3488 } 3489 } 3490 3491 vcpu->arch.cr4 = cr4; 3492 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3493 3494 if (!enable_unrestricted_guest) { 3495 if (enable_ept) { 3496 if (!is_paging(vcpu)) { 3497 hw_cr4 &= ~X86_CR4_PAE; 3498 hw_cr4 |= X86_CR4_PSE; 3499 } else if (!(cr4 & X86_CR4_PAE)) { 3500 hw_cr4 &= ~X86_CR4_PAE; 3501 } 3502 } 3503 3504 /* 3505 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3506 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3507 * to be manually disabled when guest switches to non-paging 3508 * mode. 3509 * 3510 * If !enable_unrestricted_guest, the CPU is always running 3511 * with CR0.PG=1 and CR4 needs to be modified. 3512 * If enable_unrestricted_guest, the CPU automatically 3513 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 3514 */ 3515 if (!is_paging(vcpu)) 3516 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3517 } 3518 3519 vmcs_writel(CR4_READ_SHADOW, cr4); 3520 vmcs_writel(GUEST_CR4, hw_cr4); 3521 3522 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3523 kvm_update_cpuid_runtime(vcpu); 3524 } 3525 3526 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3527 { 3528 struct vcpu_vmx *vmx = to_vmx(vcpu); 3529 u32 ar; 3530 3531 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3532 *var = vmx->rmode.segs[seg]; 3533 if (seg == VCPU_SREG_TR 3534 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3535 return; 3536 var->base = vmx_read_guest_seg_base(vmx, seg); 3537 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3538 return; 3539 } 3540 var->base = vmx_read_guest_seg_base(vmx, seg); 3541 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3542 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3543 ar = vmx_read_guest_seg_ar(vmx, seg); 3544 var->unusable = (ar >> 16) & 1; 3545 var->type = ar & 15; 3546 var->s = (ar >> 4) & 1; 3547 var->dpl = (ar >> 5) & 3; 3548 /* 3549 * Some userspaces do not preserve unusable property. Since usable 3550 * segment has to be present according to VMX spec we can use present 3551 * property to amend userspace bug by making unusable segment always 3552 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3553 * segment as unusable. 3554 */ 3555 var->present = !var->unusable; 3556 var->avl = (ar >> 12) & 1; 3557 var->l = (ar >> 13) & 1; 3558 var->db = (ar >> 14) & 1; 3559 var->g = (ar >> 15) & 1; 3560 } 3561 3562 u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3563 { 3564 struct kvm_segment s; 3565 3566 if (to_vmx(vcpu)->rmode.vm86_active) { 3567 vmx_get_segment(vcpu, &s, seg); 3568 return s.base; 3569 } 3570 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3571 } 3572 3573 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3574 { 3575 struct vcpu_vmx *vmx = to_vmx(vcpu); 3576 3577 if (unlikely(vmx->rmode.vm86_active)) 3578 return 0; 3579 else { 3580 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3581 return VMX_AR_DPL(ar); 3582 } 3583 } 3584 3585 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3586 { 3587 u32 ar; 3588 3589 ar = var->type & 15; 3590 ar |= (var->s & 1) << 4; 3591 ar |= (var->dpl & 3) << 5; 3592 ar |= (var->present & 1) << 7; 3593 ar |= (var->avl & 1) << 12; 3594 ar |= (var->l & 1) << 13; 3595 ar |= (var->db & 1) << 14; 3596 ar |= (var->g & 1) << 15; 3597 ar |= (var->unusable || !var->present) << 16; 3598 3599 return ar; 3600 } 3601 3602 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3603 { 3604 struct vcpu_vmx *vmx = to_vmx(vcpu); 3605 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3606 3607 vmx_segment_cache_clear(vmx); 3608 3609 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3610 vmx->rmode.segs[seg] = *var; 3611 if (seg == VCPU_SREG_TR) 3612 vmcs_write16(sf->selector, var->selector); 3613 else if (var->s) 3614 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3615 return; 3616 } 3617 3618 vmcs_writel(sf->base, var->base); 3619 vmcs_write32(sf->limit, var->limit); 3620 vmcs_write16(sf->selector, var->selector); 3621 3622 /* 3623 * Fix the "Accessed" bit in AR field of segment registers for older 3624 * qemu binaries. 3625 * IA32 arch specifies that at the time of processor reset the 3626 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3627 * is setting it to 0 in the userland code. This causes invalid guest 3628 * state vmexit when "unrestricted guest" mode is turned on. 3629 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3630 * tree. Newer qemu binaries with that qemu fix would not need this 3631 * kvm hack. 3632 */ 3633 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3634 var->type |= 0x1; /* Accessed */ 3635 3636 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3637 } 3638 3639 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3640 { 3641 __vmx_set_segment(vcpu, var, seg); 3642 3643 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3644 } 3645 3646 void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3647 { 3648 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3649 3650 *db = (ar >> 14) & 1; 3651 *l = (ar >> 13) & 1; 3652 } 3653 3654 void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3655 { 3656 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3657 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3658 } 3659 3660 void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3661 { 3662 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3663 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3664 } 3665 3666 void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3667 { 3668 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3669 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3670 } 3671 3672 void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3673 { 3674 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3675 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3676 } 3677 3678 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3679 { 3680 struct kvm_segment var; 3681 u32 ar; 3682 3683 vmx_get_segment(vcpu, &var, seg); 3684 var.dpl = 0x3; 3685 if (seg == VCPU_SREG_CS) 3686 var.type = 0x3; 3687 ar = vmx_segment_access_rights(&var); 3688 3689 if (var.base != (var.selector << 4)) 3690 return false; 3691 if (var.limit != 0xffff) 3692 return false; 3693 if (ar != 0xf3) 3694 return false; 3695 3696 return true; 3697 } 3698 3699 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3700 { 3701 struct kvm_segment cs; 3702 unsigned int cs_rpl; 3703 3704 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3705 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3706 3707 if (cs.unusable) 3708 return false; 3709 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3710 return false; 3711 if (!cs.s) 3712 return false; 3713 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3714 if (cs.dpl > cs_rpl) 3715 return false; 3716 } else { 3717 if (cs.dpl != cs_rpl) 3718 return false; 3719 } 3720 if (!cs.present) 3721 return false; 3722 3723 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3724 return true; 3725 } 3726 3727 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3728 { 3729 struct kvm_segment ss; 3730 unsigned int ss_rpl; 3731 3732 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3733 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3734 3735 if (ss.unusable) 3736 return true; 3737 if (ss.type != 3 && ss.type != 7) 3738 return false; 3739 if (!ss.s) 3740 return false; 3741 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3742 return false; 3743 if (!ss.present) 3744 return false; 3745 3746 return true; 3747 } 3748 3749 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3750 { 3751 struct kvm_segment var; 3752 unsigned int rpl; 3753 3754 vmx_get_segment(vcpu, &var, seg); 3755 rpl = var.selector & SEGMENT_RPL_MASK; 3756 3757 if (var.unusable) 3758 return true; 3759 if (!var.s) 3760 return false; 3761 if (!var.present) 3762 return false; 3763 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3764 if (var.dpl < rpl) /* DPL < RPL */ 3765 return false; 3766 } 3767 3768 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3769 * rights flags 3770 */ 3771 return true; 3772 } 3773 3774 static bool tr_valid(struct kvm_vcpu *vcpu) 3775 { 3776 struct kvm_segment tr; 3777 3778 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3779 3780 if (tr.unusable) 3781 return false; 3782 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3783 return false; 3784 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3785 return false; 3786 if (!tr.present) 3787 return false; 3788 3789 return true; 3790 } 3791 3792 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3793 { 3794 struct kvm_segment ldtr; 3795 3796 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3797 3798 if (ldtr.unusable) 3799 return true; 3800 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3801 return false; 3802 if (ldtr.type != 2) 3803 return false; 3804 if (!ldtr.present) 3805 return false; 3806 3807 return true; 3808 } 3809 3810 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3811 { 3812 struct kvm_segment cs, ss; 3813 3814 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3815 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3816 3817 return ((cs.selector & SEGMENT_RPL_MASK) == 3818 (ss.selector & SEGMENT_RPL_MASK)); 3819 } 3820 3821 /* 3822 * Check if guest state is valid. Returns true if valid, false if 3823 * not. 3824 * We assume that registers are always usable 3825 */ 3826 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3827 { 3828 /* real mode guest state checks */ 3829 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3830 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3831 return false; 3832 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3833 return false; 3834 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3835 return false; 3836 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3837 return false; 3838 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3839 return false; 3840 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3841 return false; 3842 } else { 3843 /* protected mode guest state checks */ 3844 if (!cs_ss_rpl_check(vcpu)) 3845 return false; 3846 if (!code_segment_valid(vcpu)) 3847 return false; 3848 if (!stack_segment_valid(vcpu)) 3849 return false; 3850 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3851 return false; 3852 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3853 return false; 3854 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3855 return false; 3856 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3857 return false; 3858 if (!tr_valid(vcpu)) 3859 return false; 3860 if (!ldtr_valid(vcpu)) 3861 return false; 3862 } 3863 /* TODO: 3864 * - Add checks on RIP 3865 * - Add checks on RFLAGS 3866 */ 3867 3868 return true; 3869 } 3870 3871 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3872 { 3873 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3874 u16 data; 3875 int i; 3876 3877 for (i = 0; i < 3; i++) { 3878 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3879 return -EFAULT; 3880 } 3881 3882 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3883 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3884 return -EFAULT; 3885 3886 data = ~0; 3887 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3888 return -EFAULT; 3889 3890 return 0; 3891 } 3892 3893 static int init_rmode_identity_map(struct kvm *kvm) 3894 { 3895 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3896 int i, r = 0; 3897 void __user *uaddr; 3898 u32 tmp; 3899 3900 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 3901 mutex_lock(&kvm->slots_lock); 3902 3903 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3904 goto out; 3905 3906 if (!kvm_vmx->ept_identity_map_addr) 3907 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3908 3909 uaddr = __x86_set_memory_region(kvm, 3910 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3911 kvm_vmx->ept_identity_map_addr, 3912 PAGE_SIZE); 3913 if (IS_ERR(uaddr)) { 3914 r = PTR_ERR(uaddr); 3915 goto out; 3916 } 3917 3918 /* Set up identity-mapping pagetable for EPT in real mode */ 3919 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3920 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3921 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3922 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3923 r = -EFAULT; 3924 goto out; 3925 } 3926 } 3927 kvm_vmx->ept_identity_pagetable_done = true; 3928 3929 out: 3930 mutex_unlock(&kvm->slots_lock); 3931 return r; 3932 } 3933 3934 static void seg_setup(int seg) 3935 { 3936 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3937 unsigned int ar; 3938 3939 vmcs_write16(sf->selector, 0); 3940 vmcs_writel(sf->base, 0); 3941 vmcs_write32(sf->limit, 0xffff); 3942 ar = 0x93; 3943 if (seg == VCPU_SREG_CS) 3944 ar |= 0x08; /* code segment */ 3945 3946 vmcs_write32(sf->ar_bytes, ar); 3947 } 3948 3949 int allocate_vpid(void) 3950 { 3951 int vpid; 3952 3953 if (!enable_vpid) 3954 return 0; 3955 spin_lock(&vmx_vpid_lock); 3956 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3957 if (vpid < VMX_NR_VPIDS) 3958 __set_bit(vpid, vmx_vpid_bitmap); 3959 else 3960 vpid = 0; 3961 spin_unlock(&vmx_vpid_lock); 3962 return vpid; 3963 } 3964 3965 void free_vpid(int vpid) 3966 { 3967 if (!enable_vpid || vpid == 0) 3968 return; 3969 spin_lock(&vmx_vpid_lock); 3970 __clear_bit(vpid, vmx_vpid_bitmap); 3971 spin_unlock(&vmx_vpid_lock); 3972 } 3973 3974 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 3975 { 3976 /* 3977 * When KVM is a nested hypervisor on top of Hyper-V and uses 3978 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 3979 * bitmap has changed. 3980 */ 3981 if (kvm_is_using_evmcs()) { 3982 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 3983 3984 if (evmcs->hv_enlightenments_control.msr_bitmap) 3985 evmcs->hv_clean_fields &= 3986 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 3987 } 3988 3989 vmx->nested.force_msr_bitmap_recalc = true; 3990 } 3991 3992 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 3993 { 3994 struct vcpu_vmx *vmx = to_vmx(vcpu); 3995 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3996 int idx; 3997 3998 if (!cpu_has_vmx_msr_bitmap()) 3999 return; 4000 4001 vmx_msr_bitmap_l01_changed(vmx); 4002 4003 /* 4004 * Mark the desired intercept state in shadow bitmap, this is needed 4005 * for resync when the MSR filters change. 4006 */ 4007 idx = vmx_get_passthrough_msr_slot(msr); 4008 if (idx >= 0) { 4009 if (type & MSR_TYPE_R) 4010 clear_bit(idx, vmx->shadow_msr_intercept.read); 4011 if (type & MSR_TYPE_W) 4012 clear_bit(idx, vmx->shadow_msr_intercept.write); 4013 } 4014 4015 if ((type & MSR_TYPE_R) && 4016 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 4017 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4018 type &= ~MSR_TYPE_R; 4019 } 4020 4021 if ((type & MSR_TYPE_W) && 4022 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 4023 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4024 type &= ~MSR_TYPE_W; 4025 } 4026 4027 if (type & MSR_TYPE_R) 4028 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4029 4030 if (type & MSR_TYPE_W) 4031 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4032 } 4033 4034 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4035 { 4036 struct vcpu_vmx *vmx = to_vmx(vcpu); 4037 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4038 int idx; 4039 4040 if (!cpu_has_vmx_msr_bitmap()) 4041 return; 4042 4043 vmx_msr_bitmap_l01_changed(vmx); 4044 4045 /* 4046 * Mark the desired intercept state in shadow bitmap, this is needed 4047 * for resync when the MSR filter changes. 4048 */ 4049 idx = vmx_get_passthrough_msr_slot(msr); 4050 if (idx >= 0) { 4051 if (type & MSR_TYPE_R) 4052 set_bit(idx, vmx->shadow_msr_intercept.read); 4053 if (type & MSR_TYPE_W) 4054 set_bit(idx, vmx->shadow_msr_intercept.write); 4055 } 4056 4057 if (type & MSR_TYPE_R) 4058 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4059 4060 if (type & MSR_TYPE_W) 4061 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4062 } 4063 4064 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4065 { 4066 /* 4067 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4068 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4069 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 4070 */ 4071 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4072 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4073 struct vcpu_vmx *vmx = to_vmx(vcpu); 4074 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4075 u8 mode; 4076 4077 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4078 return; 4079 4080 if (cpu_has_secondary_exec_ctrls() && 4081 (secondary_exec_controls_get(vmx) & 4082 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4083 mode = MSR_BITMAP_MODE_X2APIC; 4084 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4085 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4086 } else { 4087 mode = 0; 4088 } 4089 4090 if (mode == vmx->x2apic_msr_bitmap_mode) 4091 return; 4092 4093 vmx->x2apic_msr_bitmap_mode = mode; 4094 4095 /* 4096 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4097 * registers (0x840 and above) intercepted, KVM doesn't support them. 4098 * Intercept all writes by default and poke holes as needed. Pass 4099 * through reads for all valid registers by default in x2APIC+APICv 4100 * mode, only the current timer count needs on-demand emulation by KVM. 4101 */ 4102 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4103 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4104 else 4105 msr_bitmap[read_idx] = ~0ull; 4106 msr_bitmap[write_idx] = ~0ull; 4107 4108 /* 4109 * TPR reads and writes can be virtualized even if virtual interrupt 4110 * delivery is not in use. 4111 */ 4112 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4113 !(mode & MSR_BITMAP_MODE_X2APIC)); 4114 4115 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4116 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4117 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4118 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4119 if (enable_ipiv) 4120 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4121 } 4122 } 4123 4124 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4125 { 4126 struct vcpu_vmx *vmx = to_vmx(vcpu); 4127 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4128 u32 i; 4129 4130 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4131 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4132 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4133 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4134 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4135 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4136 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4137 } 4138 } 4139 4140 void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4141 { 4142 struct vcpu_vmx *vmx = to_vmx(vcpu); 4143 u32 i; 4144 4145 if (!cpu_has_vmx_msr_bitmap()) 4146 return; 4147 4148 /* 4149 * Redo intercept permissions for MSRs that KVM is passing through to 4150 * the guest. Disabling interception will check the new MSR filter and 4151 * ensure that KVM enables interception if usersepace wants to filter 4152 * the MSR. MSRs that KVM is already intercepting don't need to be 4153 * refreshed since KVM is going to intercept them regardless of what 4154 * userspace wants. 4155 */ 4156 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4157 u32 msr = vmx_possible_passthrough_msrs[i]; 4158 4159 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4160 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4161 4162 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4163 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4164 } 4165 4166 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4167 if (vmx_pt_mode_is_host_guest()) 4168 pt_update_intercept_for_msr(vcpu); 4169 } 4170 4171 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 4172 int pi_vec) 4173 { 4174 #ifdef CONFIG_SMP 4175 if (vcpu->mode == IN_GUEST_MODE) { 4176 /* 4177 * The vector of the virtual has already been set in the PIR. 4178 * Send a notification event to deliver the virtual interrupt 4179 * unless the vCPU is the currently running vCPU, i.e. the 4180 * event is being sent from a fastpath VM-Exit handler, in 4181 * which case the PIR will be synced to the vIRR before 4182 * re-entering the guest. 4183 * 4184 * When the target is not the running vCPU, the following 4185 * possibilities emerge: 4186 * 4187 * Case 1: vCPU stays in non-root mode. Sending a notification 4188 * event posts the interrupt to the vCPU. 4189 * 4190 * Case 2: vCPU exits to root mode and is still runnable. The 4191 * PIR will be synced to the vIRR before re-entering the guest. 4192 * Sending a notification event is ok as the host IRQ handler 4193 * will ignore the spurious event. 4194 * 4195 * Case 3: vCPU exits to root mode and is blocked. vcpu_block() 4196 * has already synced PIR to vIRR and never blocks the vCPU if 4197 * the vIRR is not empty. Therefore, a blocked vCPU here does 4198 * not wait for any requested interrupts in PIR, and sending a 4199 * notification event also results in a benign, spurious event. 4200 */ 4201 4202 if (vcpu != kvm_get_running_vcpu()) 4203 __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 4204 return; 4205 } 4206 #endif 4207 /* 4208 * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 4209 * otherwise do nothing as KVM will grab the highest priority pending 4210 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 4211 */ 4212 kvm_vcpu_wake_up(vcpu); 4213 } 4214 4215 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4216 int vector) 4217 { 4218 struct vcpu_vmx *vmx = to_vmx(vcpu); 4219 4220 /* 4221 * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated 4222 * and freed, and must not be accessed outside of vcpu->mutex. The 4223 * vCPU's cached PI NV is valid if and only if posted interrupts 4224 * enabled in its vmcs12, i.e. checking the vector also checks that 4225 * L1 has enabled posted interrupts for L2. 4226 */ 4227 if (is_guest_mode(vcpu) && 4228 vector == vmx->nested.posted_intr_nv) { 4229 /* 4230 * If a posted intr is not recognized by hardware, 4231 * we will accomplish it in the next vmentry. 4232 */ 4233 vmx->nested.pi_pending = true; 4234 kvm_make_request(KVM_REQ_EVENT, vcpu); 4235 4236 /* 4237 * This pairs with the smp_mb_*() after setting vcpu->mode in 4238 * vcpu_enter_guest() to guarantee the vCPU sees the event 4239 * request if triggering a posted interrupt "fails" because 4240 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4241 * the smb_wmb() in kvm_make_request() only ensures everything 4242 * done before making the request is visible when the request 4243 * is visible, it doesn't ensure ordering between the store to 4244 * vcpu->requests and the load from vcpu->mode. 4245 */ 4246 smp_mb__after_atomic(); 4247 4248 /* the PIR and ON have been set by L1. */ 4249 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4250 return 0; 4251 } 4252 return -1; 4253 } 4254 /* 4255 * Send interrupt to vcpu via posted interrupt way. 4256 * 1. If target vcpu is running(non-root mode), send posted interrupt 4257 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4258 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4259 * interrupt from PIR in next vmentry. 4260 */ 4261 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4262 { 4263 struct vcpu_vmx *vmx = to_vmx(vcpu); 4264 int r; 4265 4266 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4267 if (!r) 4268 return 0; 4269 4270 /* Note, this is called iff the local APIC is in-kernel. */ 4271 if (!vcpu->arch.apic->apicv_active) 4272 return -1; 4273 4274 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4275 return 0; 4276 4277 /* If a previous notification has sent the IPI, nothing to do. */ 4278 if (pi_test_and_set_on(&vmx->pi_desc)) 4279 return 0; 4280 4281 /* 4282 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() 4283 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is 4284 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4285 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 4286 */ 4287 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4288 return 0; 4289 } 4290 4291 void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4292 int trig_mode, int vector) 4293 { 4294 struct kvm_vcpu *vcpu = apic->vcpu; 4295 4296 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4297 kvm_lapic_set_irr(vector, apic); 4298 kvm_make_request(KVM_REQ_EVENT, vcpu); 4299 kvm_vcpu_kick(vcpu); 4300 } else { 4301 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4302 trig_mode, vector); 4303 } 4304 } 4305 4306 /* 4307 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4308 * will not change in the lifetime of the guest. 4309 * Note that host-state that does change is set elsewhere. E.g., host-state 4310 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4311 */ 4312 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4313 { 4314 u32 low32, high32; 4315 unsigned long tmpl; 4316 unsigned long cr0, cr3, cr4; 4317 4318 cr0 = read_cr0(); 4319 WARN_ON(cr0 & X86_CR0_TS); 4320 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4321 4322 /* 4323 * Save the most likely value for this task's CR3 in the VMCS. 4324 * We can't use __get_current_cr3_fast() because we're not atomic. 4325 */ 4326 cr3 = __read_cr3(); 4327 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4328 vmx->loaded_vmcs->host_state.cr3 = cr3; 4329 4330 /* Save the most likely value for this task's CR4 in the VMCS. */ 4331 cr4 = cr4_read_shadow(); 4332 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4333 vmx->loaded_vmcs->host_state.cr4 = cr4; 4334 4335 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4336 #ifdef CONFIG_X86_64 4337 /* 4338 * Load null selectors, so we can avoid reloading them in 4339 * vmx_prepare_switch_to_host(), in case userspace uses 4340 * the null selectors too (the expected case). 4341 */ 4342 vmcs_write16(HOST_DS_SELECTOR, 0); 4343 vmcs_write16(HOST_ES_SELECTOR, 0); 4344 #else 4345 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4346 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4347 #endif 4348 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4349 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4350 4351 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4352 4353 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4354 4355 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4356 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4357 4358 /* 4359 * SYSENTER is used for 32-bit system calls on either 32-bit or 4360 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4361 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4362 * have already done so!). 4363 */ 4364 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4365 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4366 4367 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4368 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4369 4370 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4371 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4372 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4373 } 4374 4375 if (cpu_has_load_ia32_efer()) 4376 vmcs_write64(HOST_IA32_EFER, kvm_host.efer); 4377 } 4378 4379 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4380 { 4381 struct kvm_vcpu *vcpu = &vmx->vcpu; 4382 4383 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4384 ~vcpu->arch.cr4_guest_rsvd_bits; 4385 if (!enable_ept) { 4386 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4387 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4388 } 4389 if (is_guest_mode(&vmx->vcpu)) 4390 vcpu->arch.cr4_guest_owned_bits &= 4391 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4392 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4393 } 4394 4395 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4396 { 4397 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4398 4399 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4400 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4401 4402 if (!enable_vnmi) 4403 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4404 4405 if (!enable_preemption_timer) 4406 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4407 4408 return pin_based_exec_ctrl; 4409 } 4410 4411 static u32 vmx_vmentry_ctrl(void) 4412 { 4413 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4414 4415 if (vmx_pt_mode_is_system()) 4416 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4417 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4418 /* 4419 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 4420 */ 4421 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4422 VM_ENTRY_LOAD_IA32_EFER | 4423 VM_ENTRY_IA32E_MODE); 4424 4425 if (cpu_has_perf_global_ctrl_bug()) 4426 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4427 4428 return vmentry_ctrl; 4429 } 4430 4431 static u32 vmx_vmexit_ctrl(void) 4432 { 4433 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4434 4435 /* 4436 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4437 * nested virtualization and thus allowed to be set in vmcs12. 4438 */ 4439 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4440 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4441 4442 if (vmx_pt_mode_is_system()) 4443 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4444 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4445 4446 if (cpu_has_perf_global_ctrl_bug()) 4447 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4448 4449 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4450 return vmexit_ctrl & 4451 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4452 } 4453 4454 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4455 { 4456 struct vcpu_vmx *vmx = to_vmx(vcpu); 4457 4458 if (is_guest_mode(vcpu)) { 4459 vmx->nested.update_vmcs01_apicv_status = true; 4460 return; 4461 } 4462 4463 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4464 4465 if (kvm_vcpu_apicv_active(vcpu)) { 4466 secondary_exec_controls_setbit(vmx, 4467 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4468 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4469 if (enable_ipiv) 4470 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4471 } else { 4472 secondary_exec_controls_clearbit(vmx, 4473 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4474 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4475 if (enable_ipiv) 4476 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4477 } 4478 4479 vmx_update_msr_bitmap_x2apic(vcpu); 4480 } 4481 4482 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4483 { 4484 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4485 4486 /* 4487 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4488 * vmcs12 and propagated to vmcs02 when set in vmcs12. 4489 */ 4490 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4491 CPU_BASED_USE_IO_BITMAPS | 4492 CPU_BASED_MONITOR_TRAP_FLAG | 4493 CPU_BASED_PAUSE_EXITING); 4494 4495 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4496 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4497 CPU_BASED_NMI_WINDOW_EXITING); 4498 4499 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4500 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4501 4502 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4503 exec_control &= ~CPU_BASED_TPR_SHADOW; 4504 4505 #ifdef CONFIG_X86_64 4506 if (exec_control & CPU_BASED_TPR_SHADOW) 4507 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4508 CPU_BASED_CR8_STORE_EXITING); 4509 else 4510 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4511 CPU_BASED_CR8_LOAD_EXITING; 4512 #endif 4513 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4514 if (enable_ept) 4515 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4516 CPU_BASED_CR3_STORE_EXITING | 4517 CPU_BASED_INVLPG_EXITING); 4518 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4519 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4520 CPU_BASED_MONITOR_EXITING); 4521 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4522 exec_control &= ~CPU_BASED_HLT_EXITING; 4523 return exec_control; 4524 } 4525 4526 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4527 { 4528 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4529 4530 /* 4531 * IPI virtualization relies on APICv. Disable IPI virtualization if 4532 * APICv is inhibited. 4533 */ 4534 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4535 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4536 4537 return exec_control; 4538 } 4539 4540 /* 4541 * Adjust a single secondary execution control bit to intercept/allow an 4542 * instruction in the guest. This is usually done based on whether or not a 4543 * feature has been exposed to the guest in order to correctly emulate faults. 4544 */ 4545 static inline void 4546 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4547 u32 control, bool enabled, bool exiting) 4548 { 4549 /* 4550 * If the control is for an opt-in feature, clear the control if the 4551 * feature is not exposed to the guest, i.e. not enabled. If the 4552 * control is opt-out, i.e. an exiting control, clear the control if 4553 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4554 * disabled for the associated instruction. Note, the caller is 4555 * responsible presetting exec_control to set all supported bits. 4556 */ 4557 if (enabled == exiting) 4558 *exec_control &= ~control; 4559 4560 /* 4561 * Update the nested MSR settings so that a nested VMM can/can't set 4562 * controls for features that are/aren't exposed to the guest. 4563 */ 4564 if (nested) { 4565 /* 4566 * All features that can be added or removed to VMX MSRs must 4567 * be supported in the first place for nested virtualization. 4568 */ 4569 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4570 enabled = false; 4571 4572 if (enabled) 4573 vmx->nested.msrs.secondary_ctls_high |= control; 4574 else 4575 vmx->nested.msrs.secondary_ctls_high &= ~control; 4576 } 4577 } 4578 4579 /* 4580 * Wrapper macro for the common case of adjusting a secondary execution control 4581 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4582 * verifies that the control is actually supported by KVM and hardware. 4583 */ 4584 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4585 ({ \ 4586 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4587 bool __enabled; \ 4588 \ 4589 if (cpu_has_vmx_##name()) { \ 4590 if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ 4591 __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ 4592 else \ 4593 __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ 4594 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4595 __enabled, exiting); \ 4596 } \ 4597 }) 4598 4599 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ 4600 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4601 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4602 4603 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4604 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4605 4606 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4607 { 4608 struct kvm_vcpu *vcpu = &vmx->vcpu; 4609 4610 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4611 4612 if (vmx_pt_mode_is_system()) 4613 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4614 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4615 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4616 if (vmx->vpid == 0) 4617 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4618 if (!enable_ept) { 4619 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4620 exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; 4621 enable_unrestricted_guest = 0; 4622 } 4623 if (!enable_unrestricted_guest) 4624 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4625 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4626 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4627 if (!kvm_vcpu_apicv_active(vcpu)) 4628 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4629 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4630 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4631 4632 /* 4633 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's 4634 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4635 */ 4636 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4637 4638 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4639 * in vmx_set_cr4. */ 4640 exec_control &= ~SECONDARY_EXEC_DESC; 4641 4642 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4643 (handle_vmptrld). 4644 We can NOT enable shadow_vmcs here because we don't have yet 4645 a current VMCS12 4646 */ 4647 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4648 4649 /* 4650 * PML is enabled/disabled when dirty logging of memsmlots changes, but 4651 * it needs to be set here when dirty logging is already active, e.g. 4652 * if this vCPU was created after dirty logging was enabled. 4653 */ 4654 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4655 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4656 4657 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4658 4659 /* 4660 * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either 4661 * feature is exposed to the guest. This creates a virtualization hole 4662 * if both are supported in hardware but only one is exposed to the 4663 * guest, but letting the guest execute RDTSCP or RDPID when either one 4664 * is advertised is preferable to emulating the advertised instruction 4665 * in KVM on #UD, and obviously better than incorrectly injecting #UD. 4666 */ 4667 if (cpu_has_vmx_rdtscp()) { 4668 bool rdpid_or_rdtscp_enabled = 4669 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || 4670 guest_cpuid_has(vcpu, X86_FEATURE_RDPID); 4671 4672 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4673 SECONDARY_EXEC_ENABLE_RDTSCP, 4674 rdpid_or_rdtscp_enabled, false); 4675 } 4676 4677 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4678 4679 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4680 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4681 4682 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4683 ENABLE_USR_WAIT_PAUSE, false); 4684 4685 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4686 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4687 4688 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4689 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4690 4691 return exec_control; 4692 } 4693 4694 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4695 { 4696 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4697 } 4698 4699 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4700 { 4701 struct page *pages; 4702 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4703 4704 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4705 return 0; 4706 4707 if (kvm_vmx->pid_table) 4708 return 0; 4709 4710 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4711 vmx_get_pid_table_order(kvm)); 4712 if (!pages) 4713 return -ENOMEM; 4714 4715 kvm_vmx->pid_table = (void *)page_address(pages); 4716 return 0; 4717 } 4718 4719 int vmx_vcpu_precreate(struct kvm *kvm) 4720 { 4721 return vmx_alloc_ipiv_pid_table(kvm); 4722 } 4723 4724 #define VMX_XSS_EXIT_BITMAP 0 4725 4726 static void init_vmcs(struct vcpu_vmx *vmx) 4727 { 4728 struct kvm *kvm = vmx->vcpu.kvm; 4729 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4730 4731 if (nested) 4732 nested_vmx_set_vmcs_shadowing_bitmap(); 4733 4734 if (cpu_has_vmx_msr_bitmap()) 4735 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4736 4737 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4738 4739 /* Control */ 4740 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4741 4742 exec_controls_set(vmx, vmx_exec_control(vmx)); 4743 4744 if (cpu_has_secondary_exec_ctrls()) { 4745 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4746 if (vmx->ve_info) 4747 vmcs_write64(VE_INFORMATION_ADDRESS, 4748 __pa(vmx->ve_info)); 4749 } 4750 4751 if (cpu_has_tertiary_exec_ctrls()) 4752 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4753 4754 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4755 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4756 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4757 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4758 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4759 4760 vmcs_write16(GUEST_INTR_STATUS, 0); 4761 4762 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4763 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4764 } 4765 4766 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4767 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4768 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4769 } 4770 4771 if (!kvm_pause_in_guest(kvm)) { 4772 vmcs_write32(PLE_GAP, ple_gap); 4773 vmx->ple_window = ple_window; 4774 vmx->ple_window_dirty = true; 4775 } 4776 4777 if (kvm_notify_vmexit_enabled(kvm)) 4778 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4779 4780 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4781 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4782 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4783 4784 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4785 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4786 vmx_set_constant_host_state(vmx); 4787 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4788 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4789 4790 if (cpu_has_vmx_vmfunc()) 4791 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4792 4793 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4794 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4795 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4796 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4797 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4798 4799 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4800 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4801 4802 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4803 4804 /* 22.2.1, 20.8.1 */ 4805 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4806 4807 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4808 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4809 4810 set_cr4_guest_host_mask(vmx); 4811 4812 if (vmx->vpid != 0) 4813 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4814 4815 if (cpu_has_vmx_xsaves()) 4816 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4817 4818 if (enable_pml) { 4819 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4820 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 4821 } 4822 4823 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4824 4825 if (vmx_pt_mode_is_host_guest()) { 4826 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4827 /* Bit[6~0] are forced to 1, writes are ignored. */ 4828 vmx->pt_desc.guest.output_mask = 0x7F; 4829 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4830 } 4831 4832 vmcs_write32(GUEST_SYSENTER_CS, 0); 4833 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4834 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4835 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4836 4837 if (cpu_has_vmx_tpr_shadow()) { 4838 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4839 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4840 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4841 __pa(vmx->vcpu.arch.apic->regs)); 4842 vmcs_write32(TPR_THRESHOLD, 0); 4843 } 4844 4845 vmx_setup_uret_msrs(vmx); 4846 } 4847 4848 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4849 { 4850 struct vcpu_vmx *vmx = to_vmx(vcpu); 4851 4852 init_vmcs(vmx); 4853 4854 if (nested) 4855 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4856 4857 vcpu_setup_sgx_lepubkeyhash(vcpu); 4858 4859 vmx->nested.posted_intr_nv = -1; 4860 vmx->nested.vmxon_ptr = INVALID_GPA; 4861 vmx->nested.current_vmptr = INVALID_GPA; 4862 4863 #ifdef CONFIG_KVM_HYPERV 4864 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4865 #endif 4866 4867 vcpu->arch.microcode_version = 0x100000000ULL; 4868 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4869 4870 /* 4871 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4872 * or POSTED_INTR_WAKEUP_VECTOR. 4873 */ 4874 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 4875 __pi_set_sn(&vmx->pi_desc); 4876 } 4877 4878 void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4879 { 4880 struct vcpu_vmx *vmx = to_vmx(vcpu); 4881 4882 if (!init_event) 4883 __vmx_vcpu_reset(vcpu); 4884 4885 vmx->rmode.vm86_active = 0; 4886 vmx->spec_ctrl = 0; 4887 4888 vmx->msr_ia32_umwait_control = 0; 4889 4890 vmx->hv_deadline_tsc = -1; 4891 kvm_set_cr8(vcpu, 0); 4892 4893 seg_setup(VCPU_SREG_CS); 4894 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4895 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4896 4897 seg_setup(VCPU_SREG_DS); 4898 seg_setup(VCPU_SREG_ES); 4899 seg_setup(VCPU_SREG_FS); 4900 seg_setup(VCPU_SREG_GS); 4901 seg_setup(VCPU_SREG_SS); 4902 4903 vmcs_write16(GUEST_TR_SELECTOR, 0); 4904 vmcs_writel(GUEST_TR_BASE, 0); 4905 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4906 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4907 4908 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4909 vmcs_writel(GUEST_LDTR_BASE, 0); 4910 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4911 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4912 4913 vmcs_writel(GUEST_GDTR_BASE, 0); 4914 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4915 4916 vmcs_writel(GUEST_IDTR_BASE, 0); 4917 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4918 4919 vmx_segment_cache_clear(vmx); 4920 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4921 4922 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4923 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4924 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4925 if (kvm_mpx_supported()) 4926 vmcs_write64(GUEST_BNDCFGS, 0); 4927 4928 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4929 4930 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4931 4932 vpid_sync_context(vmx->vpid); 4933 4934 vmx_update_fb_clear_dis(vcpu, vmx); 4935 } 4936 4937 void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4938 { 4939 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4940 } 4941 4942 void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4943 { 4944 if (!enable_vnmi || 4945 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4946 vmx_enable_irq_window(vcpu); 4947 return; 4948 } 4949 4950 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4951 } 4952 4953 void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4954 { 4955 struct vcpu_vmx *vmx = to_vmx(vcpu); 4956 uint32_t intr; 4957 int irq = vcpu->arch.interrupt.nr; 4958 4959 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4960 4961 ++vcpu->stat.irq_injections; 4962 if (vmx->rmode.vm86_active) { 4963 int inc_eip = 0; 4964 if (vcpu->arch.interrupt.soft) 4965 inc_eip = vcpu->arch.event_exit_inst_len; 4966 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4967 return; 4968 } 4969 intr = irq | INTR_INFO_VALID_MASK; 4970 if (vcpu->arch.interrupt.soft) { 4971 intr |= INTR_TYPE_SOFT_INTR; 4972 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4973 vmx->vcpu.arch.event_exit_inst_len); 4974 } else 4975 intr |= INTR_TYPE_EXT_INTR; 4976 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4977 4978 vmx_clear_hlt(vcpu); 4979 } 4980 4981 void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4982 { 4983 struct vcpu_vmx *vmx = to_vmx(vcpu); 4984 4985 if (!enable_vnmi) { 4986 /* 4987 * Tracking the NMI-blocked state in software is built upon 4988 * finding the next open IRQ window. This, in turn, depends on 4989 * well-behaving guests: They have to keep IRQs disabled at 4990 * least as long as the NMI handler runs. Otherwise we may 4991 * cause NMI nesting, maybe breaking the guest. But as this is 4992 * highly unlikely, we can live with the residual risk. 4993 */ 4994 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4995 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4996 } 4997 4998 ++vcpu->stat.nmi_injections; 4999 vmx->loaded_vmcs->nmi_known_unmasked = false; 5000 5001 if (vmx->rmode.vm86_active) { 5002 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 5003 return; 5004 } 5005 5006 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 5007 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 5008 5009 vmx_clear_hlt(vcpu); 5010 } 5011 5012 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 5013 { 5014 struct vcpu_vmx *vmx = to_vmx(vcpu); 5015 bool masked; 5016 5017 if (!enable_vnmi) 5018 return vmx->loaded_vmcs->soft_vnmi_blocked; 5019 if (vmx->loaded_vmcs->nmi_known_unmasked) 5020 return false; 5021 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 5022 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5023 return masked; 5024 } 5025 5026 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5027 { 5028 struct vcpu_vmx *vmx = to_vmx(vcpu); 5029 5030 if (!enable_vnmi) { 5031 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 5032 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5033 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5034 } 5035 } else { 5036 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5037 if (masked) 5038 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5039 GUEST_INTR_STATE_NMI); 5040 else 5041 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5042 GUEST_INTR_STATE_NMI); 5043 } 5044 } 5045 5046 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5047 { 5048 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5049 return false; 5050 5051 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5052 return true; 5053 5054 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5055 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5056 GUEST_INTR_STATE_NMI)); 5057 } 5058 5059 int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5060 { 5061 if (to_vmx(vcpu)->nested.nested_run_pending) 5062 return -EBUSY; 5063 5064 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5065 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5066 return -EBUSY; 5067 5068 return !vmx_nmi_blocked(vcpu); 5069 } 5070 5071 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5072 { 5073 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5074 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5075 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5076 } 5077 5078 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5079 { 5080 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5081 return false; 5082 5083 return __vmx_interrupt_blocked(vcpu); 5084 } 5085 5086 int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5087 { 5088 if (to_vmx(vcpu)->nested.nested_run_pending) 5089 return -EBUSY; 5090 5091 /* 5092 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5093 * e.g. if the IRQ arrived asynchronously after checking nested events. 5094 */ 5095 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5096 return -EBUSY; 5097 5098 return !vmx_interrupt_blocked(vcpu); 5099 } 5100 5101 int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5102 { 5103 void __user *ret; 5104 5105 if (enable_unrestricted_guest) 5106 return 0; 5107 5108 mutex_lock(&kvm->slots_lock); 5109 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5110 PAGE_SIZE * 3); 5111 mutex_unlock(&kvm->slots_lock); 5112 5113 if (IS_ERR(ret)) 5114 return PTR_ERR(ret); 5115 5116 to_kvm_vmx(kvm)->tss_addr = addr; 5117 5118 return init_rmode_tss(kvm, ret); 5119 } 5120 5121 int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5122 { 5123 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5124 return 0; 5125 } 5126 5127 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5128 { 5129 switch (vec) { 5130 case BP_VECTOR: 5131 /* 5132 * Update instruction length as we may reinject the exception 5133 * from user space while in guest debugging mode. 5134 */ 5135 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5136 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5137 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5138 return false; 5139 fallthrough; 5140 case DB_VECTOR: 5141 return !(vcpu->guest_debug & 5142 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5143 case DE_VECTOR: 5144 case OF_VECTOR: 5145 case BR_VECTOR: 5146 case UD_VECTOR: 5147 case DF_VECTOR: 5148 case SS_VECTOR: 5149 case GP_VECTOR: 5150 case MF_VECTOR: 5151 return true; 5152 } 5153 return false; 5154 } 5155 5156 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5157 int vec, u32 err_code) 5158 { 5159 /* 5160 * Instruction with address size override prefix opcode 0x67 5161 * Cause the #SS fault with 0 error code in VM86 mode. 5162 */ 5163 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5164 if (kvm_emulate_instruction(vcpu, 0)) { 5165 if (vcpu->arch.halt_request) { 5166 vcpu->arch.halt_request = 0; 5167 return kvm_emulate_halt_noskip(vcpu); 5168 } 5169 return 1; 5170 } 5171 return 0; 5172 } 5173 5174 /* 5175 * Forward all other exceptions that are valid in real mode. 5176 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5177 * the required debugging infrastructure rework. 5178 */ 5179 kvm_queue_exception(vcpu, vec); 5180 return 1; 5181 } 5182 5183 static int handle_machine_check(struct kvm_vcpu *vcpu) 5184 { 5185 /* handled by vmx_vcpu_run() */ 5186 return 1; 5187 } 5188 5189 /* 5190 * If the host has split lock detection disabled, then #AC is 5191 * unconditionally injected into the guest, which is the pre split lock 5192 * detection behaviour. 5193 * 5194 * If the host has split lock detection enabled then #AC is 5195 * only injected into the guest when: 5196 * - Guest CPL == 3 (user mode) 5197 * - Guest has #AC detection enabled in CR0 5198 * - Guest EFLAGS has AC bit set 5199 */ 5200 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5201 { 5202 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5203 return true; 5204 5205 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5206 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5207 } 5208 5209 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5210 { 5211 struct vcpu_vmx *vmx = to_vmx(vcpu); 5212 struct kvm_run *kvm_run = vcpu->run; 5213 u32 intr_info, ex_no, error_code; 5214 unsigned long cr2, dr6; 5215 u32 vect_info; 5216 5217 vect_info = vmx->idt_vectoring_info; 5218 intr_info = vmx_get_intr_info(vcpu); 5219 5220 /* 5221 * Machine checks are handled by handle_exception_irqoff(), or by 5222 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5223 * vmx_vcpu_enter_exit(). 5224 */ 5225 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5226 return 1; 5227 5228 /* 5229 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5230 * This ensures the nested_vmx check is not skipped so vmexit can 5231 * be reflected to L1 (when it intercepts #NM) before reaching this 5232 * point. 5233 */ 5234 if (is_nm_fault(intr_info)) { 5235 kvm_queue_exception(vcpu, NM_VECTOR); 5236 return 1; 5237 } 5238 5239 if (is_invalid_opcode(intr_info)) 5240 return handle_ud(vcpu); 5241 5242 if (WARN_ON_ONCE(is_ve_fault(intr_info))) { 5243 struct vmx_ve_information *ve_info = vmx->ve_info; 5244 5245 WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, 5246 "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); 5247 dump_vmcs(vcpu); 5248 kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); 5249 return 1; 5250 } 5251 5252 error_code = 0; 5253 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5254 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5255 5256 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5257 WARN_ON_ONCE(!enable_vmware_backdoor); 5258 5259 /* 5260 * VMware backdoor emulation on #GP interception only handles 5261 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5262 * error code on #GP. 5263 */ 5264 if (error_code) { 5265 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5266 return 1; 5267 } 5268 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5269 } 5270 5271 /* 5272 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5273 * MMIO, it is better to report an internal error. 5274 * See the comments in vmx_handle_exit. 5275 */ 5276 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5277 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5278 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5279 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5280 vcpu->run->internal.ndata = 4; 5281 vcpu->run->internal.data[0] = vect_info; 5282 vcpu->run->internal.data[1] = intr_info; 5283 vcpu->run->internal.data[2] = error_code; 5284 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5285 return 0; 5286 } 5287 5288 if (is_page_fault(intr_info)) { 5289 cr2 = vmx_get_exit_qual(vcpu); 5290 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5291 /* 5292 * EPT will cause page fault only if we need to 5293 * detect illegal GPAs. 5294 */ 5295 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5296 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5297 return 1; 5298 } else 5299 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5300 } 5301 5302 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5303 5304 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5305 return handle_rmode_exception(vcpu, ex_no, error_code); 5306 5307 switch (ex_no) { 5308 case DB_VECTOR: 5309 dr6 = vmx_get_exit_qual(vcpu); 5310 if (!(vcpu->guest_debug & 5311 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5312 /* 5313 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5314 * instruction. ICEBP generates a trap-like #DB, but 5315 * despite its interception control being tied to #DB, 5316 * is an instruction intercept, i.e. the VM-Exit occurs 5317 * on the ICEBP itself. Use the inner "skip" helper to 5318 * avoid single-step #DB and MTF updates, as ICEBP is 5319 * higher priority. Note, skipping ICEBP still clears 5320 * STI and MOVSS blocking. 5321 * 5322 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5323 * if single-step is enabled in RFLAGS and STI or MOVSS 5324 * blocking is active, as the CPU doesn't set the bit 5325 * on VM-Exit due to #DB interception. VM-Entry has a 5326 * consistency check that a single-step #DB is pending 5327 * in this scenario as the previous instruction cannot 5328 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5329 * don't modify RFLAGS), therefore the one instruction 5330 * delay when activating single-step breakpoints must 5331 * have already expired. Note, the CPU sets/clears BS 5332 * as appropriate for all other VM-Exits types. 5333 */ 5334 if (is_icebp(intr_info)) 5335 WARN_ON(!skip_emulated_instruction(vcpu)); 5336 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5337 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5338 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5339 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5340 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5341 5342 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5343 return 1; 5344 } 5345 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5346 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5347 fallthrough; 5348 case BP_VECTOR: 5349 /* 5350 * Update instruction length as we may reinject #BP from 5351 * user space while in guest debugging mode. Reading it for 5352 * #DB as well causes no harm, it is not used in that case. 5353 */ 5354 vmx->vcpu.arch.event_exit_inst_len = 5355 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5356 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5357 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5358 kvm_run->debug.arch.exception = ex_no; 5359 break; 5360 case AC_VECTOR: 5361 if (vmx_guest_inject_ac(vcpu)) { 5362 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5363 return 1; 5364 } 5365 5366 /* 5367 * Handle split lock. Depending on detection mode this will 5368 * either warn and disable split lock detection for this 5369 * task or force SIGBUS on it. 5370 */ 5371 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5372 return 1; 5373 fallthrough; 5374 default: 5375 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5376 kvm_run->ex.exception = ex_no; 5377 kvm_run->ex.error_code = error_code; 5378 break; 5379 } 5380 return 0; 5381 } 5382 5383 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5384 { 5385 ++vcpu->stat.irq_exits; 5386 return 1; 5387 } 5388 5389 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5390 { 5391 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5392 vcpu->mmio_needed = 0; 5393 return 0; 5394 } 5395 5396 static int handle_io(struct kvm_vcpu *vcpu) 5397 { 5398 unsigned long exit_qualification; 5399 int size, in, string; 5400 unsigned port; 5401 5402 exit_qualification = vmx_get_exit_qual(vcpu); 5403 string = (exit_qualification & 16) != 0; 5404 5405 ++vcpu->stat.io_exits; 5406 5407 if (string) 5408 return kvm_emulate_instruction(vcpu, 0); 5409 5410 port = exit_qualification >> 16; 5411 size = (exit_qualification & 7) + 1; 5412 in = (exit_qualification & 8) != 0; 5413 5414 return kvm_fast_pio(vcpu, size, port, in); 5415 } 5416 5417 void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5418 { 5419 /* 5420 * Patch in the VMCALL instruction: 5421 */ 5422 hypercall[0] = 0x0f; 5423 hypercall[1] = 0x01; 5424 hypercall[2] = 0xc1; 5425 } 5426 5427 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5428 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5429 { 5430 if (is_guest_mode(vcpu)) { 5431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5432 unsigned long orig_val = val; 5433 5434 /* 5435 * We get here when L2 changed cr0 in a way that did not change 5436 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5437 * but did change L0 shadowed bits. So we first calculate the 5438 * effective cr0 value that L1 would like to write into the 5439 * hardware. It consists of the L2-owned bits from the new 5440 * value combined with the L1-owned bits from L1's guest_cr0. 5441 */ 5442 val = (val & ~vmcs12->cr0_guest_host_mask) | 5443 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5444 5445 if (kvm_set_cr0(vcpu, val)) 5446 return 1; 5447 vmcs_writel(CR0_READ_SHADOW, orig_val); 5448 return 0; 5449 } else { 5450 return kvm_set_cr0(vcpu, val); 5451 } 5452 } 5453 5454 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5455 { 5456 if (is_guest_mode(vcpu)) { 5457 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5458 unsigned long orig_val = val; 5459 5460 /* analogously to handle_set_cr0 */ 5461 val = (val & ~vmcs12->cr4_guest_host_mask) | 5462 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5463 if (kvm_set_cr4(vcpu, val)) 5464 return 1; 5465 vmcs_writel(CR4_READ_SHADOW, orig_val); 5466 return 0; 5467 } else 5468 return kvm_set_cr4(vcpu, val); 5469 } 5470 5471 static int handle_desc(struct kvm_vcpu *vcpu) 5472 { 5473 /* 5474 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5475 * and other code needs to be updated if UMIP can be guest owned. 5476 */ 5477 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5478 5479 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5480 return kvm_emulate_instruction(vcpu, 0); 5481 } 5482 5483 static int handle_cr(struct kvm_vcpu *vcpu) 5484 { 5485 unsigned long exit_qualification, val; 5486 int cr; 5487 int reg; 5488 int err; 5489 int ret; 5490 5491 exit_qualification = vmx_get_exit_qual(vcpu); 5492 cr = exit_qualification & 15; 5493 reg = (exit_qualification >> 8) & 15; 5494 switch ((exit_qualification >> 4) & 3) { 5495 case 0: /* mov to cr */ 5496 val = kvm_register_read(vcpu, reg); 5497 trace_kvm_cr_write(cr, val); 5498 switch (cr) { 5499 case 0: 5500 err = handle_set_cr0(vcpu, val); 5501 return kvm_complete_insn_gp(vcpu, err); 5502 case 3: 5503 WARN_ON_ONCE(enable_unrestricted_guest); 5504 5505 err = kvm_set_cr3(vcpu, val); 5506 return kvm_complete_insn_gp(vcpu, err); 5507 case 4: 5508 err = handle_set_cr4(vcpu, val); 5509 return kvm_complete_insn_gp(vcpu, err); 5510 case 8: { 5511 u8 cr8_prev = kvm_get_cr8(vcpu); 5512 u8 cr8 = (u8)val; 5513 err = kvm_set_cr8(vcpu, cr8); 5514 ret = kvm_complete_insn_gp(vcpu, err); 5515 if (lapic_in_kernel(vcpu)) 5516 return ret; 5517 if (cr8_prev <= cr8) 5518 return ret; 5519 /* 5520 * TODO: we might be squashing a 5521 * KVM_GUESTDBG_SINGLESTEP-triggered 5522 * KVM_EXIT_DEBUG here. 5523 */ 5524 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5525 return 0; 5526 } 5527 } 5528 break; 5529 case 2: /* clts */ 5530 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5531 return -EIO; 5532 case 1: /*mov from cr*/ 5533 switch (cr) { 5534 case 3: 5535 WARN_ON_ONCE(enable_unrestricted_guest); 5536 5537 val = kvm_read_cr3(vcpu); 5538 kvm_register_write(vcpu, reg, val); 5539 trace_kvm_cr_read(cr, val); 5540 return kvm_skip_emulated_instruction(vcpu); 5541 case 8: 5542 val = kvm_get_cr8(vcpu); 5543 kvm_register_write(vcpu, reg, val); 5544 trace_kvm_cr_read(cr, val); 5545 return kvm_skip_emulated_instruction(vcpu); 5546 } 5547 break; 5548 case 3: /* lmsw */ 5549 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5550 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5551 kvm_lmsw(vcpu, val); 5552 5553 return kvm_skip_emulated_instruction(vcpu); 5554 default: 5555 break; 5556 } 5557 vcpu->run->exit_reason = 0; 5558 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5559 (int)(exit_qualification >> 4) & 3, cr); 5560 return 0; 5561 } 5562 5563 static int handle_dr(struct kvm_vcpu *vcpu) 5564 { 5565 unsigned long exit_qualification; 5566 int dr, dr7, reg; 5567 int err = 1; 5568 5569 exit_qualification = vmx_get_exit_qual(vcpu); 5570 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5571 5572 /* First, if DR does not exist, trigger UD */ 5573 if (!kvm_require_dr(vcpu, dr)) 5574 return 1; 5575 5576 if (vmx_get_cpl(vcpu) > 0) 5577 goto out; 5578 5579 dr7 = vmcs_readl(GUEST_DR7); 5580 if (dr7 & DR7_GD) { 5581 /* 5582 * As the vm-exit takes precedence over the debug trap, we 5583 * need to emulate the latter, either for the host or the 5584 * guest debugging itself. 5585 */ 5586 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5587 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5588 vcpu->run->debug.arch.dr7 = dr7; 5589 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5590 vcpu->run->debug.arch.exception = DB_VECTOR; 5591 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5592 return 0; 5593 } else { 5594 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5595 return 1; 5596 } 5597 } 5598 5599 if (vcpu->guest_debug == 0) { 5600 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5601 5602 /* 5603 * No more DR vmexits; force a reload of the debug registers 5604 * and reenter on this instruction. The next vmexit will 5605 * retrieve the full state of the debug registers. 5606 */ 5607 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5608 return 1; 5609 } 5610 5611 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5612 if (exit_qualification & TYPE_MOV_FROM_DR) { 5613 kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); 5614 err = 0; 5615 } else { 5616 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5617 } 5618 5619 out: 5620 return kvm_complete_insn_gp(vcpu, err); 5621 } 5622 5623 void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5624 { 5625 get_debugreg(vcpu->arch.db[0], 0); 5626 get_debugreg(vcpu->arch.db[1], 1); 5627 get_debugreg(vcpu->arch.db[2], 2); 5628 get_debugreg(vcpu->arch.db[3], 3); 5629 get_debugreg(vcpu->arch.dr6, 6); 5630 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5631 5632 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5633 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5634 5635 /* 5636 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5637 * a stale dr6 from the guest. 5638 */ 5639 set_debugreg(DR6_RESERVED, 6); 5640 } 5641 5642 void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5643 { 5644 vmcs_writel(GUEST_DR7, val); 5645 } 5646 5647 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5648 { 5649 kvm_apic_update_ppr(vcpu); 5650 return 1; 5651 } 5652 5653 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5654 { 5655 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5656 5657 kvm_make_request(KVM_REQ_EVENT, vcpu); 5658 5659 ++vcpu->stat.irq_window_exits; 5660 return 1; 5661 } 5662 5663 static int handle_invlpg(struct kvm_vcpu *vcpu) 5664 { 5665 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5666 5667 kvm_mmu_invlpg(vcpu, exit_qualification); 5668 return kvm_skip_emulated_instruction(vcpu); 5669 } 5670 5671 static int handle_apic_access(struct kvm_vcpu *vcpu) 5672 { 5673 if (likely(fasteoi)) { 5674 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5675 int access_type, offset; 5676 5677 access_type = exit_qualification & APIC_ACCESS_TYPE; 5678 offset = exit_qualification & APIC_ACCESS_OFFSET; 5679 /* 5680 * Sane guest uses MOV to write EOI, with written value 5681 * not cared. So make a short-circuit here by avoiding 5682 * heavy instruction emulation. 5683 */ 5684 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5685 (offset == APIC_EOI)) { 5686 kvm_lapic_set_eoi(vcpu); 5687 return kvm_skip_emulated_instruction(vcpu); 5688 } 5689 } 5690 return kvm_emulate_instruction(vcpu, 0); 5691 } 5692 5693 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5694 { 5695 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5696 int vector = exit_qualification & 0xff; 5697 5698 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5699 kvm_apic_set_eoi_accelerated(vcpu, vector); 5700 return 1; 5701 } 5702 5703 static int handle_apic_write(struct kvm_vcpu *vcpu) 5704 { 5705 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5706 5707 /* 5708 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5709 * hardware has done any necessary aliasing, offset adjustments, etc... 5710 * for the access. I.e. the correct value has already been written to 5711 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5712 * retrieve the register value and emulate the access. 5713 */ 5714 u32 offset = exit_qualification & 0xff0; 5715 5716 kvm_apic_write_nodecode(vcpu, offset); 5717 return 1; 5718 } 5719 5720 static int handle_task_switch(struct kvm_vcpu *vcpu) 5721 { 5722 struct vcpu_vmx *vmx = to_vmx(vcpu); 5723 unsigned long exit_qualification; 5724 bool has_error_code = false; 5725 u32 error_code = 0; 5726 u16 tss_selector; 5727 int reason, type, idt_v, idt_index; 5728 5729 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5730 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5731 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5732 5733 exit_qualification = vmx_get_exit_qual(vcpu); 5734 5735 reason = (u32)exit_qualification >> 30; 5736 if (reason == TASK_SWITCH_GATE && idt_v) { 5737 switch (type) { 5738 case INTR_TYPE_NMI_INTR: 5739 vcpu->arch.nmi_injected = false; 5740 vmx_set_nmi_mask(vcpu, true); 5741 break; 5742 case INTR_TYPE_EXT_INTR: 5743 case INTR_TYPE_SOFT_INTR: 5744 kvm_clear_interrupt_queue(vcpu); 5745 break; 5746 case INTR_TYPE_HARD_EXCEPTION: 5747 if (vmx->idt_vectoring_info & 5748 VECTORING_INFO_DELIVER_CODE_MASK) { 5749 has_error_code = true; 5750 error_code = 5751 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5752 } 5753 fallthrough; 5754 case INTR_TYPE_SOFT_EXCEPTION: 5755 kvm_clear_exception_queue(vcpu); 5756 break; 5757 default: 5758 break; 5759 } 5760 } 5761 tss_selector = exit_qualification; 5762 5763 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5764 type != INTR_TYPE_EXT_INTR && 5765 type != INTR_TYPE_NMI_INTR)) 5766 WARN_ON(!skip_emulated_instruction(vcpu)); 5767 5768 /* 5769 * TODO: What about debug traps on tss switch? 5770 * Are we supposed to inject them and update dr6? 5771 */ 5772 return kvm_task_switch(vcpu, tss_selector, 5773 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 5774 reason, has_error_code, error_code); 5775 } 5776 5777 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5778 { 5779 unsigned long exit_qualification; 5780 gpa_t gpa; 5781 u64 error_code; 5782 5783 exit_qualification = vmx_get_exit_qual(vcpu); 5784 5785 /* 5786 * EPT violation happened while executing iret from NMI, 5787 * "blocked by NMI" bit has to be set before next VM entry. 5788 * There are errata that may cause this bit to not be set: 5789 * AAK134, BY25. 5790 */ 5791 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5792 enable_vnmi && 5793 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5794 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5795 5796 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5797 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5798 5799 /* Is it a read fault? */ 5800 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5801 ? PFERR_USER_MASK : 0; 5802 /* Is it a write fault? */ 5803 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5804 ? PFERR_WRITE_MASK : 0; 5805 /* Is it a fetch fault? */ 5806 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5807 ? PFERR_FETCH_MASK : 0; 5808 /* ept page table entry is present? */ 5809 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5810 ? PFERR_PRESENT_MASK : 0; 5811 5812 if (error_code & EPT_VIOLATION_GVA_IS_VALID) 5813 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? 5814 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5815 5816 /* 5817 * Check that the GPA doesn't exceed physical memory limits, as that is 5818 * a guest page fault. We have to emulate the instruction here, because 5819 * if the illegal address is that of a paging structure, then 5820 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5821 * would also use advanced VM-exit information for EPT violations to 5822 * reconstruct the page fault error code. 5823 */ 5824 if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) 5825 return kvm_emulate_instruction(vcpu, 0); 5826 5827 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5828 } 5829 5830 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5831 { 5832 gpa_t gpa; 5833 5834 if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5835 return 1; 5836 5837 /* 5838 * A nested guest cannot optimize MMIO vmexits, because we have an 5839 * nGPA here instead of the required GPA. 5840 */ 5841 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5842 if (!is_guest_mode(vcpu) && 5843 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5844 trace_kvm_fast_mmio(gpa); 5845 return kvm_skip_emulated_instruction(vcpu); 5846 } 5847 5848 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5849 } 5850 5851 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5852 { 5853 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5854 return -EIO; 5855 5856 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5857 ++vcpu->stat.nmi_window_exits; 5858 kvm_make_request(KVM_REQ_EVENT, vcpu); 5859 5860 return 1; 5861 } 5862 5863 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) 5864 { 5865 struct vcpu_vmx *vmx = to_vmx(vcpu); 5866 5867 return vmx->emulation_required && !vmx->rmode.vm86_active && 5868 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5869 } 5870 5871 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5872 { 5873 struct vcpu_vmx *vmx = to_vmx(vcpu); 5874 bool intr_window_requested; 5875 unsigned count = 130; 5876 5877 intr_window_requested = exec_controls_get(vmx) & 5878 CPU_BASED_INTR_WINDOW_EXITING; 5879 5880 while (vmx->emulation_required && count-- != 0) { 5881 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5882 return handle_interrupt_window(&vmx->vcpu); 5883 5884 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5885 return 1; 5886 5887 if (!kvm_emulate_instruction(vcpu, 0)) 5888 return 0; 5889 5890 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5891 kvm_prepare_emulation_failure_exit(vcpu); 5892 return 0; 5893 } 5894 5895 if (vcpu->arch.halt_request) { 5896 vcpu->arch.halt_request = 0; 5897 return kvm_emulate_halt_noskip(vcpu); 5898 } 5899 5900 /* 5901 * Note, return 1 and not 0, vcpu_run() will invoke 5902 * xfer_to_guest_mode() which will create a proper return 5903 * code. 5904 */ 5905 if (__xfer_to_guest_mode_work_pending()) 5906 return 1; 5907 } 5908 5909 return 1; 5910 } 5911 5912 int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5913 { 5914 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5915 kvm_prepare_emulation_failure_exit(vcpu); 5916 return 0; 5917 } 5918 5919 return 1; 5920 } 5921 5922 /* 5923 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5924 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5925 */ 5926 static int handle_pause(struct kvm_vcpu *vcpu) 5927 { 5928 if (!kvm_pause_in_guest(vcpu->kvm)) 5929 grow_ple_window(vcpu); 5930 5931 /* 5932 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5933 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5934 * never set PAUSE_EXITING and just set PLE if supported, 5935 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 5936 */ 5937 kvm_vcpu_on_spin(vcpu, true); 5938 return kvm_skip_emulated_instruction(vcpu); 5939 } 5940 5941 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5942 { 5943 return 1; 5944 } 5945 5946 static int handle_invpcid(struct kvm_vcpu *vcpu) 5947 { 5948 u32 vmx_instruction_info; 5949 unsigned long type; 5950 gva_t gva; 5951 struct { 5952 u64 pcid; 5953 u64 gla; 5954 } operand; 5955 int gpr_index; 5956 5957 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 5958 kvm_queue_exception(vcpu, UD_VECTOR); 5959 return 1; 5960 } 5961 5962 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5963 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5964 type = kvm_register_read(vcpu, gpr_index); 5965 5966 /* According to the Intel instruction reference, the memory operand 5967 * is read even if it isn't needed (e.g., for type==all) 5968 */ 5969 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5970 vmx_instruction_info, false, 5971 sizeof(operand), &gva)) 5972 return 1; 5973 5974 return kvm_handle_invpcid(vcpu, type, gva); 5975 } 5976 5977 static int handle_pml_full(struct kvm_vcpu *vcpu) 5978 { 5979 unsigned long exit_qualification; 5980 5981 trace_kvm_pml_full(vcpu->vcpu_id); 5982 5983 exit_qualification = vmx_get_exit_qual(vcpu); 5984 5985 /* 5986 * PML buffer FULL happened while executing iret from NMI, 5987 * "blocked by NMI" bit has to be set before next VM entry. 5988 */ 5989 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5990 enable_vnmi && 5991 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5992 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5993 GUEST_INTR_STATE_NMI); 5994 5995 /* 5996 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5997 * here.., and there's no userspace involvement needed for PML. 5998 */ 5999 return 1; 6000 } 6001 6002 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, 6003 bool force_immediate_exit) 6004 { 6005 struct vcpu_vmx *vmx = to_vmx(vcpu); 6006 6007 /* 6008 * In the *extremely* unlikely scenario that this is a spurious VM-Exit 6009 * due to the timer expiring while it was "soft" disabled, just eat the 6010 * exit and re-enter the guest. 6011 */ 6012 if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) 6013 return EXIT_FASTPATH_REENTER_GUEST; 6014 6015 /* 6016 * If the timer expired because KVM used it to force an immediate exit, 6017 * then mission accomplished. 6018 */ 6019 if (force_immediate_exit) 6020 return EXIT_FASTPATH_EXIT_HANDLED; 6021 6022 /* 6023 * If L2 is active, go down the slow path as emulating the guest timer 6024 * expiration likely requires synthesizing a nested VM-Exit. 6025 */ 6026 if (is_guest_mode(vcpu)) 6027 return EXIT_FASTPATH_NONE; 6028 6029 kvm_lapic_expired_hv_timer(vcpu); 6030 return EXIT_FASTPATH_REENTER_GUEST; 6031 } 6032 6033 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6034 { 6035 /* 6036 * This non-fastpath handler is reached if and only if the preemption 6037 * timer was being used to emulate a guest timer while L2 is active. 6038 * All other scenarios are supposed to be handled in the fastpath. 6039 */ 6040 WARN_ON_ONCE(!is_guest_mode(vcpu)); 6041 kvm_lapic_expired_hv_timer(vcpu); 6042 return 1; 6043 } 6044 6045 /* 6046 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6047 * are overwritten by nested_vmx_setup() when nested=1. 6048 */ 6049 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6050 { 6051 kvm_queue_exception(vcpu, UD_VECTOR); 6052 return 1; 6053 } 6054 6055 #ifndef CONFIG_X86_SGX_KVM 6056 static int handle_encls(struct kvm_vcpu *vcpu) 6057 { 6058 /* 6059 * SGX virtualization is disabled. There is no software enable bit for 6060 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6061 * the guest from executing ENCLS (when SGX is supported by hardware). 6062 */ 6063 kvm_queue_exception(vcpu, UD_VECTOR); 6064 return 1; 6065 } 6066 #endif /* CONFIG_X86_SGX_KVM */ 6067 6068 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6069 { 6070 /* 6071 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6072 * VM-Exits. Unconditionally set the flag here and leave the handling to 6073 * vmx_handle_exit(). 6074 */ 6075 to_vmx(vcpu)->exit_reason.bus_lock_detected = true; 6076 return 1; 6077 } 6078 6079 static int handle_notify(struct kvm_vcpu *vcpu) 6080 { 6081 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6082 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6083 6084 ++vcpu->stat.notify_window_exits; 6085 6086 /* 6087 * Notify VM exit happened while executing iret from NMI, 6088 * "blocked by NMI" bit has to be set before next VM entry. 6089 */ 6090 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6091 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6092 GUEST_INTR_STATE_NMI); 6093 6094 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6095 context_invalid) { 6096 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6097 vcpu->run->notify.flags = context_invalid ? 6098 KVM_NOTIFY_CONTEXT_INVALID : 0; 6099 return 0; 6100 } 6101 6102 return 1; 6103 } 6104 6105 /* 6106 * The exit handlers return 1 if the exit was handled fully and guest execution 6107 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6108 * to be done to userspace and return 0. 6109 */ 6110 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6111 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6112 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6113 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6114 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6115 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6116 [EXIT_REASON_CR_ACCESS] = handle_cr, 6117 [EXIT_REASON_DR_ACCESS] = handle_dr, 6118 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6119 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6120 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6121 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6122 [EXIT_REASON_HLT] = kvm_emulate_halt, 6123 [EXIT_REASON_INVD] = kvm_emulate_invd, 6124 [EXIT_REASON_INVLPG] = handle_invlpg, 6125 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6126 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6127 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6128 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6129 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6130 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6131 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6132 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6133 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6134 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6135 [EXIT_REASON_VMON] = handle_vmx_instruction, 6136 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6137 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6138 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6139 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6140 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6141 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6142 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6143 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6144 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6145 [EXIT_REASON_LDTR_TR] = handle_desc, 6146 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6147 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6148 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6149 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6150 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6151 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6152 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6153 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6154 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6155 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6156 [EXIT_REASON_PML_FULL] = handle_pml_full, 6157 [EXIT_REASON_INVPCID] = handle_invpcid, 6158 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6159 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6160 [EXIT_REASON_ENCLS] = handle_encls, 6161 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6162 [EXIT_REASON_NOTIFY] = handle_notify, 6163 }; 6164 6165 static const int kvm_vmx_max_exit_handlers = 6166 ARRAY_SIZE(kvm_vmx_exit_handlers); 6167 6168 void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6169 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) 6170 { 6171 struct vcpu_vmx *vmx = to_vmx(vcpu); 6172 6173 *reason = vmx->exit_reason.full; 6174 *info1 = vmx_get_exit_qual(vcpu); 6175 if (!(vmx->exit_reason.failed_vmentry)) { 6176 *info2 = vmx->idt_vectoring_info; 6177 *intr_info = vmx_get_intr_info(vcpu); 6178 if (is_exception_with_error_code(*intr_info)) 6179 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6180 else 6181 *error_code = 0; 6182 } else { 6183 *info2 = 0; 6184 *intr_info = 0; 6185 *error_code = 0; 6186 } 6187 } 6188 6189 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6190 { 6191 if (vmx->pml_pg) { 6192 __free_page(vmx->pml_pg); 6193 vmx->pml_pg = NULL; 6194 } 6195 } 6196 6197 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6198 { 6199 struct vcpu_vmx *vmx = to_vmx(vcpu); 6200 u64 *pml_buf; 6201 u16 pml_idx; 6202 6203 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6204 6205 /* Do nothing if PML buffer is empty */ 6206 if (pml_idx == (PML_ENTITY_NUM - 1)) 6207 return; 6208 6209 /* PML index always points to next available PML buffer entity */ 6210 if (pml_idx >= PML_ENTITY_NUM) 6211 pml_idx = 0; 6212 else 6213 pml_idx++; 6214 6215 pml_buf = page_address(vmx->pml_pg); 6216 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 6217 u64 gpa; 6218 6219 gpa = pml_buf[pml_idx]; 6220 WARN_ON(gpa & (PAGE_SIZE - 1)); 6221 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6222 } 6223 6224 /* reset PML index */ 6225 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 6226 } 6227 6228 static void vmx_dump_sel(char *name, uint32_t sel) 6229 { 6230 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6231 name, vmcs_read16(sel), 6232 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6233 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6234 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6235 } 6236 6237 static void vmx_dump_dtsel(char *name, uint32_t limit) 6238 { 6239 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6240 name, vmcs_read32(limit), 6241 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6242 } 6243 6244 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6245 { 6246 unsigned int i; 6247 struct vmx_msr_entry *e; 6248 6249 pr_err("MSR %s:\n", name); 6250 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6251 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6252 } 6253 6254 void dump_vmcs(struct kvm_vcpu *vcpu) 6255 { 6256 struct vcpu_vmx *vmx = to_vmx(vcpu); 6257 u32 vmentry_ctl, vmexit_ctl; 6258 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6259 u64 tertiary_exec_control; 6260 unsigned long cr4; 6261 int efer_slot; 6262 6263 if (!dump_invalid_vmcs) { 6264 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6265 return; 6266 } 6267 6268 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6269 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6270 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6271 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6272 cr4 = vmcs_readl(GUEST_CR4); 6273 6274 if (cpu_has_secondary_exec_ctrls()) 6275 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6276 else 6277 secondary_exec_control = 0; 6278 6279 if (cpu_has_tertiary_exec_ctrls()) 6280 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6281 else 6282 tertiary_exec_control = 0; 6283 6284 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6285 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6286 pr_err("*** Guest State ***\n"); 6287 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6288 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6289 vmcs_readl(CR0_GUEST_HOST_MASK)); 6290 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6291 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6292 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6293 if (cpu_has_vmx_ept()) { 6294 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6295 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6296 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 6297 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6298 } 6299 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6300 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6301 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6302 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6303 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6304 vmcs_readl(GUEST_SYSENTER_ESP), 6305 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6306 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6307 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6308 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6309 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6310 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6311 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6312 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6313 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6314 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6315 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6316 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6317 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6318 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6319 else if (efer_slot >= 0) 6320 pr_err("EFER= 0x%016llx (autoload)\n", 6321 vmx->msr_autoload.guest.val[efer_slot].value); 6322 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6323 pr_err("EFER= 0x%016llx (effective)\n", 6324 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6325 else 6326 pr_err("EFER= 0x%016llx (effective)\n", 6327 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6328 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6329 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6330 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6331 vmcs_read64(GUEST_IA32_DEBUGCTL), 6332 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6333 if (cpu_has_load_perf_global_ctrl() && 6334 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6335 pr_err("PerfGlobCtl = 0x%016llx\n", 6336 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6337 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6338 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6339 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6340 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6341 vmcs_read32(GUEST_ACTIVITY_STATE)); 6342 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6343 pr_err("InterruptStatus = %04x\n", 6344 vmcs_read16(GUEST_INTR_STATUS)); 6345 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6346 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6347 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6348 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6349 6350 pr_err("*** Host State ***\n"); 6351 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6352 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6353 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6354 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6355 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6356 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6357 vmcs_read16(HOST_TR_SELECTOR)); 6358 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6359 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6360 vmcs_readl(HOST_TR_BASE)); 6361 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6362 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6363 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6364 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6365 vmcs_readl(HOST_CR4)); 6366 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6367 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6368 vmcs_read32(HOST_IA32_SYSENTER_CS), 6369 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6370 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6371 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6372 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6373 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6374 if (cpu_has_load_perf_global_ctrl() && 6375 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6376 pr_err("PerfGlobCtl = 0x%016llx\n", 6377 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6378 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6379 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6380 6381 pr_err("*** Control State ***\n"); 6382 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6383 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6384 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6385 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6386 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6387 vmcs_read32(EXCEPTION_BITMAP), 6388 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6389 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6390 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6391 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6392 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6393 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6394 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6395 vmcs_read32(VM_EXIT_INTR_INFO), 6396 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6397 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6398 pr_err(" reason=%08x qualification=%016lx\n", 6399 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6400 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6401 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6402 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6403 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6404 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6405 pr_err("TSC Multiplier = 0x%016llx\n", 6406 vmcs_read64(TSC_MULTIPLIER)); 6407 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6408 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6409 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6410 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6411 } 6412 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6413 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6414 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6415 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6416 } 6417 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6418 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6419 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6420 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6421 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6422 pr_err("PLE Gap=%08x Window=%08x\n", 6423 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6424 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6425 pr_err("Virtual processor ID = 0x%04x\n", 6426 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6427 if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { 6428 struct vmx_ve_information *ve_info = vmx->ve_info; 6429 u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); 6430 6431 /* 6432 * If KVM is dumping the VMCS, then something has gone wrong 6433 * already. Derefencing an address from the VMCS, which could 6434 * very well be corrupted, is a terrible idea. The virtual 6435 * address is known so use it. 6436 */ 6437 pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, 6438 ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); 6439 pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", 6440 ve_info->exit_reason, ve_info->delivery, 6441 ve_info->exit_qualification, 6442 ve_info->guest_linear_address, 6443 ve_info->guest_physical_address, ve_info->eptp_index); 6444 } 6445 } 6446 6447 /* 6448 * The guest has exited. See if we can fix it or if we need userspace 6449 * assistance. 6450 */ 6451 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6452 { 6453 struct vcpu_vmx *vmx = to_vmx(vcpu); 6454 union vmx_exit_reason exit_reason = vmx->exit_reason; 6455 u32 vectoring_info = vmx->idt_vectoring_info; 6456 u16 exit_handler_index; 6457 6458 /* 6459 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 6460 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 6461 * querying dirty_bitmap, we only need to kick all vcpus out of guest 6462 * mode as if vcpus is in root mode, the PML buffer must has been 6463 * flushed already. Note, PML is never enabled in hardware while 6464 * running L2. 6465 */ 6466 if (enable_pml && !is_guest_mode(vcpu)) 6467 vmx_flush_pml_buffer(vcpu); 6468 6469 /* 6470 * KVM should never reach this point with a pending nested VM-Enter. 6471 * More specifically, short-circuiting VM-Entry to emulate L2 due to 6472 * invalid guest state should never happen as that means KVM knowingly 6473 * allowed a nested VM-Enter with an invalid vmcs12. More below. 6474 */ 6475 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) 6476 return -EIO; 6477 6478 if (is_guest_mode(vcpu)) { 6479 /* 6480 * PML is never enabled when running L2, bail immediately if a 6481 * PML full exit occurs as something is horribly wrong. 6482 */ 6483 if (exit_reason.basic == EXIT_REASON_PML_FULL) 6484 goto unexpected_vmexit; 6485 6486 /* 6487 * The host physical addresses of some pages of guest memory 6488 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 6489 * Page). The CPU may write to these pages via their host 6490 * physical address while L2 is running, bypassing any 6491 * address-translation-based dirty tracking (e.g. EPT write 6492 * protection). 6493 * 6494 * Mark them dirty on every exit from L2 to prevent them from 6495 * getting out of sync with dirty tracking. 6496 */ 6497 nested_mark_vmcs12_pages_dirty(vcpu); 6498 6499 /* 6500 * Synthesize a triple fault if L2 state is invalid. In normal 6501 * operation, nested VM-Enter rejects any attempt to enter L2 6502 * with invalid state. However, those checks are skipped if 6503 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If 6504 * L2 state is invalid, it means either L1 modified SMRAM state 6505 * or userspace provided bad state. Synthesize TRIPLE_FAULT as 6506 * doing so is architecturally allowed in the RSM case, and is 6507 * the least awful solution for the userspace case without 6508 * risking false positives. 6509 */ 6510 if (vmx->emulation_required) { 6511 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 6512 return 1; 6513 } 6514 6515 if (nested_vmx_reflect_vmexit(vcpu)) 6516 return 1; 6517 } 6518 6519 /* If guest state is invalid, start emulating. L2 is handled above. */ 6520 if (vmx->emulation_required) 6521 return handle_invalid_guest_state(vcpu); 6522 6523 if (exit_reason.failed_vmentry) { 6524 dump_vmcs(vcpu); 6525 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6526 vcpu->run->fail_entry.hardware_entry_failure_reason 6527 = exit_reason.full; 6528 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6529 return 0; 6530 } 6531 6532 if (unlikely(vmx->fail)) { 6533 dump_vmcs(vcpu); 6534 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 6535 vcpu->run->fail_entry.hardware_entry_failure_reason 6536 = vmcs_read32(VM_INSTRUCTION_ERROR); 6537 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 6538 return 0; 6539 } 6540 6541 /* 6542 * Note: 6543 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by 6544 * delivery event since it indicates guest is accessing MMIO. 6545 * The vm-exit can be triggered again after return to guest that 6546 * will cause infinite loop. 6547 */ 6548 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6549 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && 6550 exit_reason.basic != EXIT_REASON_EPT_VIOLATION && 6551 exit_reason.basic != EXIT_REASON_PML_FULL && 6552 exit_reason.basic != EXIT_REASON_APIC_ACCESS && 6553 exit_reason.basic != EXIT_REASON_TASK_SWITCH && 6554 exit_reason.basic != EXIT_REASON_NOTIFY)) { 6555 int ndata = 3; 6556 6557 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6558 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 6559 vcpu->run->internal.data[0] = vectoring_info; 6560 vcpu->run->internal.data[1] = exit_reason.full; 6561 vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); 6562 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { 6563 vcpu->run->internal.data[ndata++] = 6564 vmcs_read64(GUEST_PHYSICAL_ADDRESS); 6565 } 6566 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; 6567 vcpu->run->internal.ndata = ndata; 6568 return 0; 6569 } 6570 6571 if (unlikely(!enable_vnmi && 6572 vmx->loaded_vmcs->soft_vnmi_blocked)) { 6573 if (!vmx_interrupt_blocked(vcpu)) { 6574 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6575 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && 6576 vcpu->arch.nmi_pending) { 6577 /* 6578 * This CPU don't support us in finding the end of an 6579 * NMI-blocked window if the guest runs with IRQs 6580 * disabled. So we pull the trigger after 1 s of 6581 * futile waiting, but inform the user about this. 6582 */ 6583 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6584 "state on VCPU %d after 1 s timeout\n", 6585 __func__, vcpu->vcpu_id); 6586 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6587 } 6588 } 6589 6590 if (exit_fastpath != EXIT_FASTPATH_NONE) 6591 return 1; 6592 6593 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6594 goto unexpected_vmexit; 6595 #ifdef CONFIG_MITIGATION_RETPOLINE 6596 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6597 return kvm_emulate_wrmsr(vcpu); 6598 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6599 return handle_preemption_timer(vcpu); 6600 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6601 return handle_interrupt_window(vcpu); 6602 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6603 return handle_external_interrupt(vcpu); 6604 else if (exit_reason.basic == EXIT_REASON_HLT) 6605 return kvm_emulate_halt(vcpu); 6606 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6607 return handle_ept_misconfig(vcpu); 6608 #endif 6609 6610 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6611 kvm_vmx_max_exit_handlers); 6612 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6613 goto unexpected_vmexit; 6614 6615 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6616 6617 unexpected_vmexit: 6618 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 6619 exit_reason.full); 6620 dump_vmcs(vcpu); 6621 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6622 vcpu->run->internal.suberror = 6623 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 6624 vcpu->run->internal.ndata = 2; 6625 vcpu->run->internal.data[0] = exit_reason.full; 6626 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 6627 return 0; 6628 } 6629 6630 int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6631 { 6632 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6633 6634 /* 6635 * Exit to user space when bus lock detected to inform that there is 6636 * a bus lock in guest. 6637 */ 6638 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { 6639 if (ret > 0) 6640 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6641 6642 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6643 return 0; 6644 } 6645 return ret; 6646 } 6647 6648 /* 6649 * Software based L1D cache flush which is used when microcode providing 6650 * the cache control MSR is not loaded. 6651 * 6652 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6653 * flush it is required to read in 64 KiB because the replacement algorithm 6654 * is not exactly LRU. This could be sized at runtime via topology 6655 * information but as all relevant affected CPUs have 32KiB L1D cache size 6656 * there is no point in doing so. 6657 */ 6658 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6659 { 6660 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6661 6662 /* 6663 * This code is only executed when the flush mode is 'cond' or 6664 * 'always' 6665 */ 6666 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6667 bool flush_l1d; 6668 6669 /* 6670 * Clear the per-vcpu flush bit, it gets set again if the vCPU 6671 * is reloaded, i.e. if the vCPU is scheduled out or if KVM 6672 * exits to userspace, or if KVM reaches one of the unsafe 6673 * VMEXIT handlers, e.g. if KVM calls into the emulator. 6674 */ 6675 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6676 vcpu->arch.l1tf_flush_l1d = false; 6677 6678 /* 6679 * Clear the per-cpu flush bit, it gets set again from 6680 * the interrupt handlers. 6681 */ 6682 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6683 kvm_clear_cpu_l1tf_flush_l1d(); 6684 6685 if (!flush_l1d) 6686 return; 6687 } 6688 6689 vcpu->stat.l1d_flush++; 6690 6691 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6692 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6693 return; 6694 } 6695 6696 asm volatile( 6697 /* First ensure the pages are in the TLB */ 6698 "xorl %%eax, %%eax\n" 6699 ".Lpopulate_tlb:\n\t" 6700 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6701 "addl $4096, %%eax\n\t" 6702 "cmpl %%eax, %[size]\n\t" 6703 "jne .Lpopulate_tlb\n\t" 6704 "xorl %%eax, %%eax\n\t" 6705 "cpuid\n\t" 6706 /* Now fill the cache */ 6707 "xorl %%eax, %%eax\n" 6708 ".Lfill_cache:\n" 6709 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6710 "addl $64, %%eax\n\t" 6711 "cmpl %%eax, %[size]\n\t" 6712 "jne .Lfill_cache\n\t" 6713 "lfence\n" 6714 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6715 [size] "r" (size) 6716 : "eax", "ebx", "ecx", "edx"); 6717 } 6718 6719 void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6720 { 6721 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6722 int tpr_threshold; 6723 6724 if (is_guest_mode(vcpu) && 6725 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6726 return; 6727 6728 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6729 if (is_guest_mode(vcpu)) 6730 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6731 else 6732 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6733 } 6734 6735 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6736 { 6737 struct vcpu_vmx *vmx = to_vmx(vcpu); 6738 u32 sec_exec_control; 6739 6740 if (!lapic_in_kernel(vcpu)) 6741 return; 6742 6743 if (!flexpriority_enabled && 6744 !cpu_has_vmx_virtualize_x2apic_mode()) 6745 return; 6746 6747 /* Postpone execution until vmcs01 is the current VMCS. */ 6748 if (is_guest_mode(vcpu)) { 6749 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6750 return; 6751 } 6752 6753 sec_exec_control = secondary_exec_controls_get(vmx); 6754 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6755 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6756 6757 switch (kvm_get_apic_mode(vcpu)) { 6758 case LAPIC_MODE_INVALID: 6759 WARN_ONCE(true, "Invalid local APIC state"); 6760 break; 6761 case LAPIC_MODE_DISABLED: 6762 break; 6763 case LAPIC_MODE_XAPIC: 6764 if (flexpriority_enabled) { 6765 sec_exec_control |= 6766 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6767 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6768 6769 /* 6770 * Flush the TLB, reloading the APIC access page will 6771 * only do so if its physical address has changed, but 6772 * the guest may have inserted a non-APIC mapping into 6773 * the TLB while the APIC access page was disabled. 6774 */ 6775 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6776 } 6777 break; 6778 case LAPIC_MODE_X2APIC: 6779 if (cpu_has_vmx_virtualize_x2apic_mode()) 6780 sec_exec_control |= 6781 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6782 break; 6783 } 6784 secondary_exec_controls_set(vmx, sec_exec_control); 6785 6786 vmx_update_msr_bitmap_x2apic(vcpu); 6787 } 6788 6789 void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6790 { 6791 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6792 struct kvm *kvm = vcpu->kvm; 6793 struct kvm_memslots *slots = kvm_memslots(kvm); 6794 struct kvm_memory_slot *slot; 6795 unsigned long mmu_seq; 6796 kvm_pfn_t pfn; 6797 6798 /* Defer reload until vmcs01 is the current VMCS. */ 6799 if (is_guest_mode(vcpu)) { 6800 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6801 return; 6802 } 6803 6804 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6805 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6806 return; 6807 6808 /* 6809 * Explicitly grab the memslot using KVM's internal slot ID to ensure 6810 * KVM doesn't unintentionally grab a userspace memslot. It _should_ 6811 * be impossible for userspace to create a memslot for the APIC when 6812 * APICv is enabled, but paranoia won't hurt in this case. 6813 */ 6814 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6815 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6816 return; 6817 6818 /* 6819 * Ensure that the mmu_notifier sequence count is read before KVM 6820 * retrieves the pfn from the primary MMU. Note, the memslot is 6821 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6822 * in kvm_mmu_invalidate_end(). 6823 */ 6824 mmu_seq = kvm->mmu_invalidate_seq; 6825 smp_rmb(); 6826 6827 /* 6828 * No need to retry if the memslot does not exist or is invalid. KVM 6829 * controls the APIC-access page memslot, and only deletes the memslot 6830 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6831 */ 6832 pfn = gfn_to_pfn_memslot(slot, gfn); 6833 if (is_error_noslot_pfn(pfn)) 6834 return; 6835 6836 read_lock(&vcpu->kvm->mmu_lock); 6837 if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { 6838 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6839 read_unlock(&vcpu->kvm->mmu_lock); 6840 goto out; 6841 } 6842 6843 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6844 read_unlock(&vcpu->kvm->mmu_lock); 6845 6846 /* 6847 * No need for a manual TLB flush at this point, KVM has already done a 6848 * flush if there were SPTEs pointing at the previous page. 6849 */ 6850 out: 6851 /* 6852 * Do not pin apic access page in memory, the MMU notifier 6853 * will call us again if it is migrated or swapped out. 6854 */ 6855 kvm_release_pfn_clean(pfn); 6856 } 6857 6858 void vmx_hwapic_isr_update(int max_isr) 6859 { 6860 u16 status; 6861 u8 old; 6862 6863 if (max_isr == -1) 6864 max_isr = 0; 6865 6866 status = vmcs_read16(GUEST_INTR_STATUS); 6867 old = status >> 8; 6868 if (max_isr != old) { 6869 status &= 0xff; 6870 status |= max_isr << 8; 6871 vmcs_write16(GUEST_INTR_STATUS, status); 6872 } 6873 } 6874 6875 static void vmx_set_rvi(int vector) 6876 { 6877 u16 status; 6878 u8 old; 6879 6880 if (vector == -1) 6881 vector = 0; 6882 6883 status = vmcs_read16(GUEST_INTR_STATUS); 6884 old = (u8)status & 0xff; 6885 if ((u8)vector != old) { 6886 status &= ~0xff; 6887 status |= (u8)vector; 6888 vmcs_write16(GUEST_INTR_STATUS, status); 6889 } 6890 } 6891 6892 void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6893 { 6894 /* 6895 * When running L2, updating RVI is only relevant when 6896 * vmcs12 virtual-interrupt-delivery enabled. 6897 * However, it can be enabled only when L1 also 6898 * intercepts external-interrupts and in that case 6899 * we should not update vmcs02 RVI but instead intercept 6900 * interrupt. Therefore, do nothing when running L2. 6901 */ 6902 if (!is_guest_mode(vcpu)) 6903 vmx_set_rvi(max_irr); 6904 } 6905 6906 int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6907 { 6908 struct vcpu_vmx *vmx = to_vmx(vcpu); 6909 int max_irr; 6910 bool got_posted_interrupt; 6911 6912 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6913 return -EIO; 6914 6915 if (pi_test_on(&vmx->pi_desc)) { 6916 pi_clear_on(&vmx->pi_desc); 6917 /* 6918 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6919 * But on x86 this is just a compiler barrier anyway. 6920 */ 6921 smp_mb__after_atomic(); 6922 got_posted_interrupt = 6923 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6924 } else { 6925 max_irr = kvm_lapic_find_highest_irr(vcpu); 6926 got_posted_interrupt = false; 6927 } 6928 6929 /* 6930 * Newly recognized interrupts are injected via either virtual interrupt 6931 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6932 * disabled in two cases: 6933 * 6934 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6935 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6936 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected 6937 * into L2, but KVM doesn't use virtual interrupt delivery to inject 6938 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 6939 * 6940 * 2) If APICv is disabled for this vCPU, assigned devices may still 6941 * attempt to post interrupts. The posted interrupt vector will cause 6942 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 6943 */ 6944 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6945 vmx_set_rvi(max_irr); 6946 else if (got_posted_interrupt) 6947 kvm_make_request(KVM_REQ_EVENT, vcpu); 6948 6949 return max_irr; 6950 } 6951 6952 void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6953 { 6954 if (!kvm_vcpu_apicv_active(vcpu)) 6955 return; 6956 6957 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6958 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6959 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6960 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6961 } 6962 6963 void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) 6964 { 6965 struct vcpu_vmx *vmx = to_vmx(vcpu); 6966 6967 pi_clear_on(&vmx->pi_desc); 6968 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6969 } 6970 6971 void vmx_do_interrupt_irqoff(unsigned long entry); 6972 void vmx_do_nmi_irqoff(void); 6973 6974 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6975 { 6976 /* 6977 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6978 * MSR value is not clobbered by the host activity before the guest 6979 * has chance to consume it. 6980 * 6981 * Do not blindly read xfd_err here, since this exception might 6982 * be caused by L1 interception on a platform which doesn't 6983 * support xfd at all. 6984 * 6985 * Do it conditionally upon guest_fpu::xfd. xfd_err matters 6986 * only when xfd contains a non-zero value. 6987 * 6988 * Queuing exception is done in vmx_handle_exit. See comment there. 6989 */ 6990 if (vcpu->arch.guest_fpu.fpstate->xfd) 6991 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6992 } 6993 6994 static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) 6995 { 6996 /* if exit due to PF check for async PF */ 6997 if (is_page_fault(intr_info)) 6998 vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6999 /* if exit due to NM, handle before interrupts are enabled */ 7000 else if (is_nm_fault(intr_info)) 7001 handle_nm_fault_irqoff(vcpu); 7002 /* Handle machine checks before interrupts are enabled */ 7003 else if (is_machine_check(intr_info)) 7004 kvm_machine_check(); 7005 } 7006 7007 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, 7008 u32 intr_info) 7009 { 7010 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 7011 7012 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 7013 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 7014 return; 7015 7016 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 7017 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7018 fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); 7019 else 7020 vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); 7021 kvm_after_interrupt(vcpu); 7022 7023 vcpu->arch.at_instruction_boundary = true; 7024 } 7025 7026 void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 7027 { 7028 struct vcpu_vmx *vmx = to_vmx(vcpu); 7029 7030 if (vmx->emulation_required) 7031 return; 7032 7033 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7034 handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7035 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) 7036 handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); 7037 } 7038 7039 /* 7040 * The kvm parameter can be NULL (module initialization, or invocation before 7041 * VM creation). Be sure to check the kvm parameter before using it. 7042 */ 7043 bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7044 { 7045 switch (index) { 7046 case MSR_IA32_SMBASE: 7047 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7048 return false; 7049 /* 7050 * We cannot do SMM unless we can run the guest in big 7051 * real mode. 7052 */ 7053 return enable_unrestricted_guest || emulate_invalid_guest_state; 7054 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7055 return nested; 7056 case MSR_AMD64_VIRT_SPEC_CTRL: 7057 case MSR_AMD64_TSC_RATIO: 7058 /* This is AMD only. */ 7059 return false; 7060 default: 7061 return true; 7062 } 7063 } 7064 7065 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7066 { 7067 u32 exit_intr_info; 7068 bool unblock_nmi; 7069 u8 vector; 7070 bool idtv_info_valid; 7071 7072 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7073 7074 if (enable_vnmi) { 7075 if (vmx->loaded_vmcs->nmi_known_unmasked) 7076 return; 7077 7078 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7079 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7080 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7081 /* 7082 * SDM 3: 27.7.1.2 (September 2008) 7083 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7084 * a guest IRET fault. 7085 * SDM 3: 23.2.2 (September 2008) 7086 * Bit 12 is undefined in any of the following cases: 7087 * If the VM exit sets the valid bit in the IDT-vectoring 7088 * information field. 7089 * If the VM exit is due to a double fault. 7090 */ 7091 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7092 vector != DF_VECTOR && !idtv_info_valid) 7093 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7094 GUEST_INTR_STATE_NMI); 7095 else 7096 vmx->loaded_vmcs->nmi_known_unmasked = 7097 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7098 & GUEST_INTR_STATE_NMI); 7099 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7100 vmx->loaded_vmcs->vnmi_blocked_time += 7101 ktime_to_ns(ktime_sub(ktime_get(), 7102 vmx->loaded_vmcs->entry_time)); 7103 } 7104 7105 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7106 u32 idt_vectoring_info, 7107 int instr_len_field, 7108 int error_code_field) 7109 { 7110 u8 vector; 7111 int type; 7112 bool idtv_info_valid; 7113 7114 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7115 7116 vcpu->arch.nmi_injected = false; 7117 kvm_clear_exception_queue(vcpu); 7118 kvm_clear_interrupt_queue(vcpu); 7119 7120 if (!idtv_info_valid) 7121 return; 7122 7123 kvm_make_request(KVM_REQ_EVENT, vcpu); 7124 7125 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7126 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7127 7128 switch (type) { 7129 case INTR_TYPE_NMI_INTR: 7130 vcpu->arch.nmi_injected = true; 7131 /* 7132 * SDM 3: 27.7.1.2 (September 2008) 7133 * Clear bit "block by NMI" before VM entry if a NMI 7134 * delivery faulted. 7135 */ 7136 vmx_set_nmi_mask(vcpu, false); 7137 break; 7138 case INTR_TYPE_SOFT_EXCEPTION: 7139 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7140 fallthrough; 7141 case INTR_TYPE_HARD_EXCEPTION: 7142 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 7143 u32 err = vmcs_read32(error_code_field); 7144 kvm_requeue_exception_e(vcpu, vector, err); 7145 } else 7146 kvm_requeue_exception(vcpu, vector); 7147 break; 7148 case INTR_TYPE_SOFT_INTR: 7149 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7150 fallthrough; 7151 case INTR_TYPE_EXT_INTR: 7152 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7153 break; 7154 default: 7155 break; 7156 } 7157 } 7158 7159 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7160 { 7161 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7162 VM_EXIT_INSTRUCTION_LEN, 7163 IDT_VECTORING_ERROR_CODE); 7164 } 7165 7166 void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7167 { 7168 __vmx_complete_interrupts(vcpu, 7169 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7170 VM_ENTRY_INSTRUCTION_LEN, 7171 VM_ENTRY_EXCEPTION_ERROR_CODE); 7172 7173 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7174 } 7175 7176 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7177 { 7178 int i, nr_msrs; 7179 struct perf_guest_switch_msr *msrs; 7180 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7181 7182 pmu->host_cross_mapped_mask = 0; 7183 if (pmu->pebs_enable & pmu->global_ctrl) 7184 intel_pmu_cross_mapped_check(pmu); 7185 7186 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ 7187 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7188 if (!msrs) 7189 return; 7190 7191 for (i = 0; i < nr_msrs; i++) 7192 if (msrs[i].host == msrs[i].guest) 7193 clear_atomic_switch_msr(vmx, msrs[i].msr); 7194 else 7195 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7196 msrs[i].host, false); 7197 } 7198 7199 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7200 { 7201 struct vcpu_vmx *vmx = to_vmx(vcpu); 7202 u64 tscl; 7203 u32 delta_tsc; 7204 7205 if (force_immediate_exit) { 7206 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7207 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7208 } else if (vmx->hv_deadline_tsc != -1) { 7209 tscl = rdtsc(); 7210 if (vmx->hv_deadline_tsc > tscl) 7211 /* set_hv_timer ensures the delta fits in 32-bits */ 7212 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7213 cpu_preemption_timer_multi); 7214 else 7215 delta_tsc = 0; 7216 7217 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7218 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7219 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7220 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7221 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7222 } 7223 } 7224 7225 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7226 { 7227 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7228 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7229 vmcs_writel(HOST_RSP, host_rsp); 7230 } 7231 } 7232 7233 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7234 unsigned int flags) 7235 { 7236 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7237 7238 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7239 return; 7240 7241 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7242 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); 7243 7244 /* 7245 * If the guest/host SPEC_CTRL values differ, restore the host value. 7246 * 7247 * For legacy IBRS, the IBRS bit always needs to be written after 7248 * transitioning from a less privileged predictor mode, regardless of 7249 * whether the guest/host values differ. 7250 */ 7251 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7252 vmx->spec_ctrl != hostval) 7253 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); 7254 7255 barrier_nospec(); 7256 } 7257 7258 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, 7259 bool force_immediate_exit) 7260 { 7261 /* 7262 * If L2 is active, some VMX preemption timer exits can be handled in 7263 * the fastpath even, all other exits must use the slow path. 7264 */ 7265 if (is_guest_mode(vcpu) && 7266 to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) 7267 return EXIT_FASTPATH_NONE; 7268 7269 switch (to_vmx(vcpu)->exit_reason.basic) { 7270 case EXIT_REASON_MSR_WRITE: 7271 return handle_fastpath_set_msr_irqoff(vcpu); 7272 case EXIT_REASON_PREEMPTION_TIMER: 7273 return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); 7274 case EXIT_REASON_HLT: 7275 return handle_fastpath_hlt(vcpu); 7276 default: 7277 return EXIT_FASTPATH_NONE; 7278 } 7279 } 7280 7281 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7282 unsigned int flags) 7283 { 7284 struct vcpu_vmx *vmx = to_vmx(vcpu); 7285 7286 guest_state_enter_irqoff(); 7287 7288 /* 7289 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7290 * mitigation for MDS is done late in VMentry and is still 7291 * executed in spite of L1D Flush. This is because an extra VERW 7292 * should not matter much after the big hammer L1D Flush. 7293 */ 7294 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7295 vmx_l1d_flush(vcpu); 7296 else if (static_branch_unlikely(&mmio_stale_data_clear) && 7297 kvm_arch_has_assigned_device(vcpu->kvm)) 7298 mds_clear_cpu_buffers(); 7299 7300 vmx_disable_fb_clear(vmx); 7301 7302 if (vcpu->arch.cr2 != native_read_cr2()) 7303 native_write_cr2(vcpu->arch.cr2); 7304 7305 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7306 flags); 7307 7308 vcpu->arch.cr2 = native_read_cr2(); 7309 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7310 7311 vmx->idt_vectoring_info = 0; 7312 7313 vmx_enable_fb_clear(vmx); 7314 7315 if (unlikely(vmx->fail)) { 7316 vmx->exit_reason.full = 0xdead; 7317 goto out; 7318 } 7319 7320 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7321 if (likely(!vmx->exit_reason.failed_vmentry)) 7322 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7323 7324 if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && 7325 is_nmi(vmx_get_intr_info(vcpu))) { 7326 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 7327 if (cpu_feature_enabled(X86_FEATURE_FRED)) 7328 fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); 7329 else 7330 vmx_do_nmi_irqoff(); 7331 kvm_after_interrupt(vcpu); 7332 } 7333 7334 out: 7335 guest_state_exit_irqoff(); 7336 } 7337 7338 fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) 7339 { 7340 struct vcpu_vmx *vmx = to_vmx(vcpu); 7341 unsigned long cr3, cr4; 7342 7343 /* Record the guest's net vcpu time for enforced NMI injections. */ 7344 if (unlikely(!enable_vnmi && 7345 vmx->loaded_vmcs->soft_vnmi_blocked)) 7346 vmx->loaded_vmcs->entry_time = ktime_get(); 7347 7348 /* 7349 * Don't enter VMX if guest state is invalid, let the exit handler 7350 * start emulation until we arrive back to a valid state. Synthesize a 7351 * consistency check VM-Exit due to invalid guest state and bail. 7352 */ 7353 if (unlikely(vmx->emulation_required)) { 7354 vmx->fail = 0; 7355 7356 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 7357 vmx->exit_reason.failed_vmentry = 1; 7358 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7359 vmx->exit_qualification = ENTRY_FAIL_DEFAULT; 7360 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7361 vmx->exit_intr_info = 0; 7362 return EXIT_FASTPATH_NONE; 7363 } 7364 7365 trace_kvm_entry(vcpu, force_immediate_exit); 7366 7367 if (vmx->ple_window_dirty) { 7368 vmx->ple_window_dirty = false; 7369 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7370 } 7371 7372 /* 7373 * We did this in prepare_switch_to_guest, because it needs to 7374 * be within srcu_read_lock. 7375 */ 7376 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7377 7378 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7379 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7380 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7381 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7382 vcpu->arch.regs_dirty = 0; 7383 7384 /* 7385 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7386 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7387 * it switches back to the current->mm, which can occur in KVM context 7388 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7389 * toggles a static key while handling a VM-Exit. 7390 */ 7391 cr3 = __get_current_cr3_fast(); 7392 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7393 vmcs_writel(HOST_CR3, cr3); 7394 vmx->loaded_vmcs->host_state.cr3 = cr3; 7395 } 7396 7397 cr4 = cr4_read_shadow(); 7398 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7399 vmcs_writel(HOST_CR4, cr4); 7400 vmx->loaded_vmcs->host_state.cr4 = cr4; 7401 } 7402 7403 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ 7404 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 7405 set_debugreg(vcpu->arch.dr6, 6); 7406 7407 /* When single-stepping over STI and MOV SS, we must clear the 7408 * corresponding interruptibility bits in the guest state. Otherwise 7409 * vmentry fails as it then expects bit 14 (BS) in pending debug 7410 * exceptions being set, but that's not correct for the guest debugging 7411 * case. */ 7412 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7413 vmx_set_interrupt_shadow(vcpu, 0); 7414 7415 kvm_load_guest_xsave_state(vcpu); 7416 7417 pt_guest_enter(vmx); 7418 7419 atomic_switch_perf_msrs(vmx); 7420 if (intel_pmu_lbr_is_enabled(vcpu)) 7421 vmx_passthrough_lbr_msrs(vcpu); 7422 7423 if (enable_preemption_timer) 7424 vmx_update_hv_timer(vcpu, force_immediate_exit); 7425 else if (force_immediate_exit) 7426 smp_send_reschedule(vcpu->cpu); 7427 7428 kvm_wait_lapic_expire(vcpu); 7429 7430 /* The actual VMENTER/EXIT is in the .noinstr.text section. */ 7431 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); 7432 7433 /* All fields are clean at this point */ 7434 if (kvm_is_using_evmcs()) { 7435 current_evmcs->hv_clean_fields |= 7436 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 7437 7438 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); 7439 } 7440 7441 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7442 if (vmx->host_debugctlmsr) 7443 update_debugctlmsr(vmx->host_debugctlmsr); 7444 7445 #ifndef CONFIG_X86_64 7446 /* 7447 * The sysexit path does not restore ds/es, so we must set them to 7448 * a reasonable value ourselves. 7449 * 7450 * We can't defer this to vmx_prepare_switch_to_host() since that 7451 * function may be executed in interrupt context, which saves and 7452 * restore segments around it, nullifying its effect. 7453 */ 7454 loadsegment(ds, __USER_DS); 7455 loadsegment(es, __USER_DS); 7456 #endif 7457 7458 pt_guest_exit(vmx); 7459 7460 kvm_load_host_xsave_state(vcpu); 7461 7462 if (is_guest_mode(vcpu)) { 7463 /* 7464 * Track VMLAUNCH/VMRESUME that have made past guest state 7465 * checking. 7466 */ 7467 if (vmx->nested.nested_run_pending && 7468 !vmx->exit_reason.failed_vmentry) 7469 ++vcpu->stat.nested_run; 7470 7471 vmx->nested.nested_run_pending = 0; 7472 } 7473 7474 if (unlikely(vmx->fail)) 7475 return EXIT_FASTPATH_NONE; 7476 7477 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7478 kvm_machine_check(); 7479 7480 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7481 7482 if (unlikely(vmx->exit_reason.failed_vmentry)) 7483 return EXIT_FASTPATH_NONE; 7484 7485 vmx->loaded_vmcs->launched = 1; 7486 7487 vmx_recover_nmi_blocking(vmx); 7488 vmx_complete_interrupts(vmx); 7489 7490 return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); 7491 } 7492 7493 void vmx_vcpu_free(struct kvm_vcpu *vcpu) 7494 { 7495 struct vcpu_vmx *vmx = to_vmx(vcpu); 7496 7497 if (enable_pml) 7498 vmx_destroy_pml_buffer(vmx); 7499 free_vpid(vmx->vpid); 7500 nested_vmx_free_vcpu(vcpu); 7501 free_loaded_vmcs(vmx->loaded_vmcs); 7502 free_page((unsigned long)vmx->ve_info); 7503 } 7504 7505 int vmx_vcpu_create(struct kvm_vcpu *vcpu) 7506 { 7507 struct vmx_uret_msr *tsx_ctrl; 7508 struct vcpu_vmx *vmx; 7509 int i, err; 7510 7511 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 7512 vmx = to_vmx(vcpu); 7513 7514 INIT_LIST_HEAD(&vmx->pi_wakeup_list); 7515 7516 err = -ENOMEM; 7517 7518 vmx->vpid = allocate_vpid(); 7519 7520 /* 7521 * If PML is turned on, failure on enabling PML just results in failure 7522 * of creating the vcpu, therefore we can simplify PML logic (by 7523 * avoiding dealing with cases, such as enabling PML partially on vcpus 7524 * for the guest), etc. 7525 */ 7526 if (enable_pml) { 7527 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7528 if (!vmx->pml_pg) 7529 goto free_vpid; 7530 } 7531 7532 for (i = 0; i < kvm_nr_uret_msrs; ++i) 7533 vmx->guest_uret_msrs[i].mask = -1ull; 7534 if (boot_cpu_has(X86_FEATURE_RTM)) { 7535 /* 7536 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 7537 * Keep the host value unchanged to avoid changing CPUID bits 7538 * under the host kernel's feet. 7539 */ 7540 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7541 if (tsx_ctrl) 7542 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 7543 } 7544 7545 err = alloc_loaded_vmcs(&vmx->vmcs01); 7546 if (err < 0) 7547 goto free_pml; 7548 7549 /* 7550 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a 7551 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the 7552 * feature only for vmcs01, KVM currently isn't equipped to realize any 7553 * performance benefits from enabling it for vmcs02. 7554 */ 7555 if (kvm_is_using_evmcs() && 7556 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7557 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7558 7559 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7560 } 7561 7562 /* The MSR bitmap starts with all ones */ 7563 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7564 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7565 7566 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7567 #ifdef CONFIG_X86_64 7568 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7569 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7570 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7571 #endif 7572 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7573 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7574 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7575 if (kvm_cstate_in_guest(vcpu->kvm)) { 7576 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7577 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7578 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7579 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7580 } 7581 7582 vmx->loaded_vmcs = &vmx->vmcs01; 7583 7584 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7585 err = kvm_alloc_apic_access_page(vcpu->kvm); 7586 if (err) 7587 goto free_vmcs; 7588 } 7589 7590 if (enable_ept && !enable_unrestricted_guest) { 7591 err = init_rmode_identity_map(vcpu->kvm); 7592 if (err) 7593 goto free_vmcs; 7594 } 7595 7596 err = -ENOMEM; 7597 if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { 7598 struct page *page; 7599 7600 BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); 7601 7602 /* ve_info must be page aligned. */ 7603 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7604 if (!page) 7605 goto free_vmcs; 7606 7607 vmx->ve_info = page_to_virt(page); 7608 } 7609 7610 if (vmx_can_use_ipiv(vcpu)) 7611 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7612 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); 7613 7614 return 0; 7615 7616 free_vmcs: 7617 free_loaded_vmcs(vmx->loaded_vmcs); 7618 free_pml: 7619 vmx_destroy_pml_buffer(vmx); 7620 free_vpid: 7621 free_vpid(vmx->vpid); 7622 return err; 7623 } 7624 7625 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7626 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7627 7628 int vmx_vm_init(struct kvm *kvm) 7629 { 7630 if (!ple_gap) 7631 kvm->arch.pause_in_guest = true; 7632 7633 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7634 switch (l1tf_mitigation) { 7635 case L1TF_MITIGATION_OFF: 7636 case L1TF_MITIGATION_FLUSH_NOWARN: 7637 /* 'I explicitly don't care' is set */ 7638 break; 7639 case L1TF_MITIGATION_FLUSH: 7640 case L1TF_MITIGATION_FLUSH_NOSMT: 7641 case L1TF_MITIGATION_FULL: 7642 /* 7643 * Warn upon starting the first VM in a potentially 7644 * insecure environment. 7645 */ 7646 if (sched_smt_active()) 7647 pr_warn_once(L1TF_MSG_SMT); 7648 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7649 pr_warn_once(L1TF_MSG_L1D); 7650 break; 7651 case L1TF_MITIGATION_FULL_FORCE: 7652 /* Flush is enforced */ 7653 break; 7654 } 7655 } 7656 return 0; 7657 } 7658 7659 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7660 { 7661 /* 7662 * Force UC for host MMIO regions, as allowing the guest to access MMIO 7663 * with cacheable accesses will result in Machine Checks. 7664 */ 7665 if (is_mmio) 7666 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7667 7668 /* 7669 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent 7670 * device attached. Letting the guest control memory types on Intel 7671 * CPUs may result in unexpected behavior, and so KVM's ABI is to trust 7672 * the guest to behave only as a last resort. 7673 */ 7674 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) 7675 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7676 7677 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); 7678 } 7679 7680 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7681 { 7682 /* 7683 * These bits in the secondary execution controls field 7684 * are dynamic, the others are mostly based on the hypervisor 7685 * architecture and the guest's CPUID. Do not touch the 7686 * dynamic bits. 7687 */ 7688 u32 mask = 7689 SECONDARY_EXEC_SHADOW_VMCS | 7690 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7691 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7692 SECONDARY_EXEC_DESC; 7693 7694 u32 cur_ctl = secondary_exec_controls_get(vmx); 7695 7696 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7697 } 7698 7699 /* 7700 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 7701 * (indicating "allowed-1") if they are supported in the guest's CPUID. 7702 */ 7703 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7704 { 7705 struct vcpu_vmx *vmx = to_vmx(vcpu); 7706 struct kvm_cpuid_entry2 *entry; 7707 7708 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7709 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7710 7711 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7712 if (entry && (entry->_reg & (_cpuid_mask))) \ 7713 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7714 } while (0) 7715 7716 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7717 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7718 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7719 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7720 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7721 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7722 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7723 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7724 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7725 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7726 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7727 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7728 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7729 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7730 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7731 7732 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7733 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7734 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7735 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7736 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7737 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7738 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7739 7740 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); 7741 cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); 7742 7743 #undef cr4_fixed1_update 7744 } 7745 7746 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7747 { 7748 struct vcpu_vmx *vmx = to_vmx(vcpu); 7749 struct kvm_cpuid_entry2 *best = NULL; 7750 int i; 7751 7752 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7753 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7754 if (!best) 7755 return; 7756 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7757 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7758 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7759 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7760 } 7761 7762 /* Get the number of configurable Address Ranges for filtering */ 7763 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7764 PT_CAP_num_address_ranges); 7765 7766 /* Initialize and clear the no dependency bits */ 7767 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7768 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7769 RTIT_CTL_BRANCH_EN); 7770 7771 /* 7772 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7773 * will inject an #GP 7774 */ 7775 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7776 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7777 7778 /* 7779 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7780 * PSBFreq can be set 7781 */ 7782 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7783 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7784 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 7785 7786 /* 7787 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7788 */ 7789 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7790 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 7791 RTIT_CTL_MTC_RANGE); 7792 7793 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7794 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7795 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7796 RTIT_CTL_PTW_EN); 7797 7798 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7799 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7800 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7801 7802 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7803 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7804 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7805 7806 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7807 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7808 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7809 7810 /* unmask address range configure area */ 7811 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7812 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7813 } 7814 7815 void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7816 { 7817 struct vcpu_vmx *vmx = to_vmx(vcpu); 7818 7819 /* 7820 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7821 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7822 * set if and only if XSAVE is supported. 7823 */ 7824 if (boot_cpu_has(X86_FEATURE_XSAVE) && 7825 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) 7826 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); 7827 7828 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); 7829 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM); 7830 7831 vmx_setup_uret_msrs(vmx); 7832 7833 if (cpu_has_secondary_exec_ctrls()) 7834 vmcs_set_secondary_exec_control(vmx, 7835 vmx_secondary_exec_control(vmx)); 7836 7837 if (guest_can_use(vcpu, X86_FEATURE_VMX)) 7838 vmx->msr_ia32_feature_control_valid_bits |= 7839 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7840 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7841 else 7842 vmx->msr_ia32_feature_control_valid_bits &= 7843 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7844 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7845 7846 if (guest_can_use(vcpu, X86_FEATURE_VMX)) 7847 nested_vmx_cr_fixed1_bits_update(vcpu); 7848 7849 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7850 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) 7851 update_intel_pt_cfg(vcpu); 7852 7853 if (boot_cpu_has(X86_FEATURE_RTM)) { 7854 struct vmx_uret_msr *msr; 7855 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7856 if (msr) { 7857 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); 7858 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); 7859 } 7860 } 7861 7862 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7863 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7864 !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); 7865 7866 if (boot_cpu_has(X86_FEATURE_IBPB)) 7867 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 7868 !guest_has_pred_cmd_msr(vcpu)); 7869 7870 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7871 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7872 !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7873 7874 set_cr4_guest_host_mask(vmx); 7875 7876 vmx_write_encls_bitmap(vcpu, NULL); 7877 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) 7878 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7879 else 7880 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7881 7882 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 7883 vmx->msr_ia32_feature_control_valid_bits |= 7884 FEAT_CTL_SGX_LC_ENABLED; 7885 else 7886 vmx->msr_ia32_feature_control_valid_bits &= 7887 ~FEAT_CTL_SGX_LC_ENABLED; 7888 7889 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7890 vmx_update_exception_bitmap(vcpu); 7891 } 7892 7893 static __init u64 vmx_get_perf_capabilities(void) 7894 { 7895 u64 perf_cap = PMU_CAP_FW_WRITES; 7896 u64 host_perf_cap = 0; 7897 7898 if (!enable_pmu) 7899 return 0; 7900 7901 if (boot_cpu_has(X86_FEATURE_PDCM)) 7902 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7903 7904 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7905 x86_perf_get_lbr(&vmx_lbr_caps); 7906 7907 /* 7908 * KVM requires LBR callstack support, as the overhead due to 7909 * context switching LBRs without said support is too high. 7910 * See intel_pmu_create_guest_lbr_event() for more info. 7911 */ 7912 if (!vmx_lbr_caps.has_callstack) 7913 memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); 7914 else if (vmx_lbr_caps.nr) 7915 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7916 } 7917 7918 if (vmx_pebs_supported()) { 7919 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7920 7921 /* 7922 * Disallow adaptive PEBS as it is functionally broken, can be 7923 * used by the guest to read *host* LBRs, and can be used to 7924 * bypass userspace event filters. To correctly and safely 7925 * support adaptive PEBS, KVM needs to: 7926 * 7927 * 1. Account for the ADAPTIVE flag when (re)programming fixed 7928 * counters. 7929 * 7930 * 2. Gain support from perf (or take direct control of counter 7931 * programming) to support events without adaptive PEBS 7932 * enabled for the hardware counter. 7933 * 7934 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 7935 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 7936 * 7937 * 4. Document which PMU events are effectively exposed to the 7938 * guest via adaptive PEBS, and make adaptive PEBS mutually 7939 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 7940 */ 7941 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 7942 } 7943 7944 return perf_cap; 7945 } 7946 7947 static __init void vmx_set_cpu_caps(void) 7948 { 7949 kvm_set_cpu_caps(); 7950 7951 /* CPUID 0x1 */ 7952 if (nested) 7953 kvm_cpu_cap_set(X86_FEATURE_VMX); 7954 7955 /* CPUID 0x7 */ 7956 if (kvm_mpx_supported()) 7957 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7958 if (!cpu_has_vmx_invpcid()) 7959 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7960 if (vmx_pt_mode_is_host_guest()) 7961 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7962 if (vmx_pebs_supported()) { 7963 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7964 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7965 } 7966 7967 if (!enable_pmu) 7968 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7969 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 7970 7971 if (!enable_sgx) { 7972 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7973 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7974 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7975 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7976 kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); 7977 } 7978 7979 if (vmx_umip_emulated()) 7980 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7981 7982 /* CPUID 0xD.1 */ 7983 kvm_caps.supported_xss = 0; 7984 if (!cpu_has_vmx_xsaves()) 7985 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7986 7987 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7988 if (!cpu_has_vmx_rdtscp()) { 7989 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7990 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7991 } 7992 7993 if (cpu_has_vmx_waitpkg()) 7994 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7995 } 7996 7997 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, 7998 struct x86_instruction_info *info) 7999 { 8000 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8001 unsigned short port; 8002 bool intercept; 8003 int size; 8004 8005 if (info->intercept == x86_intercept_in || 8006 info->intercept == x86_intercept_ins) { 8007 port = info->src_val; 8008 size = info->dst_bytes; 8009 } else { 8010 port = info->dst_val; 8011 size = info->src_bytes; 8012 } 8013 8014 /* 8015 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 8016 * VM-exits depend on the 'unconditional IO exiting' VM-execution 8017 * control. 8018 * 8019 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 8020 */ 8021 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 8022 intercept = nested_cpu_has(vmcs12, 8023 CPU_BASED_UNCOND_IO_EXITING); 8024 else 8025 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); 8026 8027 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 8028 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; 8029 } 8030 8031 int vmx_check_intercept(struct kvm_vcpu *vcpu, 8032 struct x86_instruction_info *info, 8033 enum x86_intercept_stage stage, 8034 struct x86_exception *exception) 8035 { 8036 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8037 8038 switch (info->intercept) { 8039 /* 8040 * RDPID causes #UD if disabled through secondary execution controls. 8041 * Because it is marked as EmulateOnUD, we need to intercept it here. 8042 * Note, RDPID is hidden behind ENABLE_RDTSCP. 8043 */ 8044 case x86_intercept_rdpid: 8045 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 8046 exception->vector = UD_VECTOR; 8047 exception->error_code_valid = false; 8048 return X86EMUL_PROPAGATE_FAULT; 8049 } 8050 break; 8051 8052 case x86_intercept_in: 8053 case x86_intercept_ins: 8054 case x86_intercept_out: 8055 case x86_intercept_outs: 8056 return vmx_check_intercept_io(vcpu, info); 8057 8058 case x86_intercept_lgdt: 8059 case x86_intercept_lidt: 8060 case x86_intercept_lldt: 8061 case x86_intercept_ltr: 8062 case x86_intercept_sgdt: 8063 case x86_intercept_sidt: 8064 case x86_intercept_sldt: 8065 case x86_intercept_str: 8066 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8067 return X86EMUL_CONTINUE; 8068 8069 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 8070 break; 8071 8072 case x86_intercept_pause: 8073 /* 8074 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8075 * with vanilla NOPs in the emulator. Apply the interception 8076 * check only to actual PAUSE instructions. Don't check 8077 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8078 * exit, i.e. KVM is within its rights to allow L2 to execute 8079 * the PAUSE. 8080 */ 8081 if ((info->rep_prefix != REPE_PREFIX) || 8082 !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) 8083 return X86EMUL_CONTINUE; 8084 8085 break; 8086 8087 /* TODO: check more intercepts... */ 8088 default: 8089 break; 8090 } 8091 8092 return X86EMUL_UNHANDLEABLE; 8093 } 8094 8095 #ifdef CONFIG_X86_64 8096 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8097 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8098 u64 divisor, u64 *result) 8099 { 8100 u64 low = a << shift, high = a >> (64 - shift); 8101 8102 /* To avoid the overflow on divq */ 8103 if (high >= divisor) 8104 return 1; 8105 8106 /* Low hold the result, high hold rem which is discarded */ 8107 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8108 "rm" (divisor), "0" (low), "1" (high)); 8109 *result = low; 8110 8111 return 0; 8112 } 8113 8114 int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8115 bool *expired) 8116 { 8117 struct vcpu_vmx *vmx; 8118 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8119 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8120 8121 vmx = to_vmx(vcpu); 8122 tscl = rdtsc(); 8123 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8124 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8125 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8126 ktimer->timer_advance_ns); 8127 8128 if (delta_tsc > lapic_timer_advance_cycles) 8129 delta_tsc -= lapic_timer_advance_cycles; 8130 else 8131 delta_tsc = 0; 8132 8133 /* Convert to host delta tsc if tsc scaling is enabled */ 8134 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8135 delta_tsc && u64_shl_div_u64(delta_tsc, 8136 kvm_caps.tsc_scaling_ratio_frac_bits, 8137 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8138 return -ERANGE; 8139 8140 /* 8141 * If the delta tsc can't fit in the 32 bit after the multi shift, 8142 * we can't use the preemption timer. 8143 * It's possible that it fits on later vmentries, but checking 8144 * on every vmentry is costly so we just use an hrtimer. 8145 */ 8146 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8147 return -ERANGE; 8148 8149 vmx->hv_deadline_tsc = tscl + delta_tsc; 8150 *expired = !delta_tsc; 8151 return 0; 8152 } 8153 8154 void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8155 { 8156 to_vmx(vcpu)->hv_deadline_tsc = -1; 8157 } 8158 #endif 8159 8160 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8161 { 8162 struct vcpu_vmx *vmx = to_vmx(vcpu); 8163 8164 if (WARN_ON_ONCE(!enable_pml)) 8165 return; 8166 8167 if (is_guest_mode(vcpu)) { 8168 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8169 return; 8170 } 8171 8172 /* 8173 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8174 * code, but in that case another update request will be made and so 8175 * the guest will never run with a stale PML value. 8176 */ 8177 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8178 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8179 else 8180 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8181 } 8182 8183 void vmx_setup_mce(struct kvm_vcpu *vcpu) 8184 { 8185 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8186 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8187 FEAT_CTL_LMCE_ENABLED; 8188 else 8189 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8190 ~FEAT_CTL_LMCE_ENABLED; 8191 } 8192 8193 #ifdef CONFIG_KVM_SMM 8194 int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8195 { 8196 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8197 if (to_vmx(vcpu)->nested.nested_run_pending) 8198 return -EBUSY; 8199 return !is_smm(vcpu); 8200 } 8201 8202 int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8203 { 8204 struct vcpu_vmx *vmx = to_vmx(vcpu); 8205 8206 /* 8207 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8208 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8209 * SMI and RSM only modify state that is saved and restored via SMRAM. 8210 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8211 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 8212 */ 8213 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8214 if (vmx->nested.smm.guest_mode) 8215 nested_vmx_vmexit(vcpu, -1, 0, 0); 8216 8217 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8218 vmx->nested.vmxon = false; 8219 vmx_clear_hlt(vcpu); 8220 return 0; 8221 } 8222 8223 int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8224 { 8225 struct vcpu_vmx *vmx = to_vmx(vcpu); 8226 int ret; 8227 8228 if (vmx->nested.smm.vmxon) { 8229 vmx->nested.vmxon = true; 8230 vmx->nested.smm.vmxon = false; 8231 } 8232 8233 if (vmx->nested.smm.guest_mode) { 8234 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8235 if (ret) 8236 return ret; 8237 8238 vmx->nested.nested_run_pending = 1; 8239 vmx->nested.smm.guest_mode = false; 8240 } 8241 return 0; 8242 } 8243 8244 void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8245 { 8246 /* RSM will cause a vmexit anyway. */ 8247 } 8248 #endif 8249 8250 bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8251 { 8252 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8253 } 8254 8255 void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8256 { 8257 if (is_guest_mode(vcpu)) { 8258 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8259 8260 if (hrtimer_try_to_cancel(timer) == 1) 8261 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8262 } 8263 } 8264 8265 void vmx_hardware_unsetup(void) 8266 { 8267 kvm_set_posted_intr_wakeup_handler(NULL); 8268 8269 if (nested) 8270 nested_vmx_hardware_unsetup(); 8271 8272 free_kvm_area(); 8273 } 8274 8275 void vmx_vm_destroy(struct kvm *kvm) 8276 { 8277 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8278 8279 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8280 } 8281 8282 /* 8283 * Note, the SDM states that the linear address is masked *after* the modified 8284 * canonicality check, whereas KVM masks (untags) the address and then performs 8285 * a "normal" canonicality check. Functionally, the two methods are identical, 8286 * and when the masking occurs relative to the canonicality check isn't visible 8287 * to software, i.e. KVM's behavior doesn't violate the SDM. 8288 */ 8289 gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags) 8290 { 8291 int lam_bit; 8292 unsigned long cr3_bits; 8293 8294 if (flags & (X86EMUL_F_FETCH | X86EMUL_F_IMPLICIT | X86EMUL_F_INVLPG)) 8295 return gva; 8296 8297 if (!is_64_bit_mode(vcpu)) 8298 return gva; 8299 8300 /* 8301 * Bit 63 determines if the address should be treated as user address 8302 * or a supervisor address. 8303 */ 8304 if (!(gva & BIT_ULL(63))) { 8305 cr3_bits = kvm_get_active_cr3_lam_bits(vcpu); 8306 if (!(cr3_bits & (X86_CR3_LAM_U57 | X86_CR3_LAM_U48))) 8307 return gva; 8308 8309 /* LAM_U48 is ignored if LAM_U57 is set. */ 8310 lam_bit = cr3_bits & X86_CR3_LAM_U57 ? 56 : 47; 8311 } else { 8312 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP)) 8313 return gva; 8314 8315 lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 56 : 47; 8316 } 8317 8318 /* 8319 * Untag the address by sign-extending the lam_bit, but NOT to bit 63. 8320 * Bit 63 is retained from the raw virtual address so that untagging 8321 * doesn't change a user access to a supervisor access, and vice versa. 8322 */ 8323 return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); 8324 } 8325 8326 static unsigned int vmx_handle_intel_pt_intr(void) 8327 { 8328 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8329 8330 /* '0' on failure so that the !PT case can use a RET0 static call. */ 8331 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8332 return 0; 8333 8334 kvm_make_request(KVM_REQ_PMI, vcpu); 8335 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8336 (unsigned long *)&vcpu->arch.pmu.global_status); 8337 return 1; 8338 } 8339 8340 static __init void vmx_setup_user_return_msrs(void) 8341 { 8342 8343 /* 8344 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8345 * will emulate SYSCALL in legacy mode if the vendor string in guest 8346 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8347 * support this emulation, MSR_STAR is included in the list for i386, 8348 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8349 * into hardware and is here purely for emulation purposes. 8350 */ 8351 const u32 vmx_uret_msrs_list[] = { 8352 #ifdef CONFIG_X86_64 8353 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8354 #endif 8355 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8356 MSR_IA32_TSX_CTRL, 8357 }; 8358 int i; 8359 8360 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8361 8362 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8363 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8364 } 8365 8366 static void __init vmx_setup_me_spte_mask(void) 8367 { 8368 u64 me_mask = 0; 8369 8370 /* 8371 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8372 * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, 8373 * boot_cpu_data.x86_phys_bits holds the actual physical address 8374 * w/o the KeyID bits, and kvm_host.maxphyaddr equals to 8375 * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. 8376 */ 8377 if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) 8378 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8379 kvm_host.maxphyaddr - 1); 8380 8381 /* 8382 * Unlike SME, host kernel doesn't support setting up any 8383 * MKTME KeyID on Intel platforms. No memory encryption 8384 * bits should be included into the SPTE. 8385 */ 8386 kvm_mmu_set_me_spte_mask(0, me_mask); 8387 } 8388 8389 __init int vmx_hardware_setup(void) 8390 { 8391 unsigned long host_bndcfgs; 8392 struct desc_ptr dt; 8393 int r; 8394 8395 store_idt(&dt); 8396 host_idt_base = dt.address; 8397 8398 vmx_setup_user_return_msrs(); 8399 8400 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8401 return -EIO; 8402 8403 if (cpu_has_perf_global_ctrl_bug()) 8404 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 8405 "does not work properly. Using workaround\n"); 8406 8407 if (boot_cpu_has(X86_FEATURE_NX)) 8408 kvm_enable_efer_bits(EFER_NX); 8409 8410 if (boot_cpu_has(X86_FEATURE_MPX)) { 8411 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 8412 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8413 } 8414 8415 if (!cpu_has_vmx_mpx()) 8416 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8417 XFEATURE_MASK_BNDCSR); 8418 8419 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8420 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8421 enable_vpid = 0; 8422 8423 if (!cpu_has_vmx_ept() || 8424 !cpu_has_vmx_ept_4levels() || 8425 !cpu_has_vmx_ept_mt_wb() || 8426 !cpu_has_vmx_invept_global()) 8427 enable_ept = 0; 8428 8429 /* NX support is required for shadow paging. */ 8430 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8431 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8432 return -EOPNOTSUPP; 8433 } 8434 8435 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8436 enable_ept_ad_bits = 0; 8437 8438 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8439 enable_unrestricted_guest = 0; 8440 8441 if (!cpu_has_vmx_flexpriority()) 8442 flexpriority_enabled = 0; 8443 8444 if (!cpu_has_virtual_nmis()) 8445 enable_vnmi = 0; 8446 8447 #ifdef CONFIG_X86_SGX_KVM 8448 if (!cpu_has_vmx_encls_vmexit()) 8449 enable_sgx = false; 8450 #endif 8451 8452 /* 8453 * set_apic_access_page_addr() is used to reload apic access 8454 * page upon invalidation. No need to do anything if not 8455 * using the APIC_ACCESS_ADDR VMCS field. 8456 */ 8457 if (!flexpriority_enabled) 8458 vt_x86_ops.set_apic_access_page_addr = NULL; 8459 8460 if (!cpu_has_vmx_tpr_shadow()) 8461 vt_x86_ops.update_cr8_intercept = NULL; 8462 8463 #if IS_ENABLED(CONFIG_HYPERV) 8464 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8465 && enable_ept) { 8466 vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8467 vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8468 } 8469 #endif 8470 8471 if (!cpu_has_vmx_ple()) { 8472 ple_gap = 0; 8473 ple_window = 0; 8474 ple_window_grow = 0; 8475 ple_window_max = 0; 8476 ple_window_shrink = 0; 8477 } 8478 8479 if (!cpu_has_vmx_apicv()) 8480 enable_apicv = 0; 8481 if (!enable_apicv) 8482 vt_x86_ops.sync_pir_to_irr = NULL; 8483 8484 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8485 enable_ipiv = false; 8486 8487 if (cpu_has_vmx_tsc_scaling()) 8488 kvm_caps.has_tsc_control = true; 8489 8490 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8491 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8492 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8493 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8494 8495 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8496 8497 if (enable_ept) 8498 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8499 cpu_has_vmx_ept_execute_only()); 8500 8501 /* 8502 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8503 * bits to shadow_zero_check. 8504 */ 8505 vmx_setup_me_spte_mask(); 8506 8507 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8508 ept_caps_to_lpage_level(vmx_capability.ept)); 8509 8510 /* 8511 * Only enable PML when hardware supports PML feature, and both EPT 8512 * and EPT A/D bit features are enabled -- PML depends on them to work. 8513 */ 8514 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8515 enable_pml = 0; 8516 8517 if (!enable_pml) 8518 vt_x86_ops.cpu_dirty_log_size = 0; 8519 8520 if (!cpu_has_vmx_preemption_timer()) 8521 enable_preemption_timer = false; 8522 8523 if (enable_preemption_timer) { 8524 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8525 8526 cpu_preemption_timer_multi = 8527 vmx_misc_preemption_timer_rate(vmcs_config.misc); 8528 8529 if (tsc_khz) 8530 use_timer_freq = (u64)tsc_khz * 1000; 8531 use_timer_freq >>= cpu_preemption_timer_multi; 8532 8533 /* 8534 * KVM "disables" the preemption timer by setting it to its max 8535 * value. Don't use the timer if it might cause spurious exits 8536 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 8537 */ 8538 if (use_timer_freq > 0xffffffffu / 10) 8539 enable_preemption_timer = false; 8540 } 8541 8542 if (!enable_preemption_timer) { 8543 vt_x86_ops.set_hv_timer = NULL; 8544 vt_x86_ops.cancel_hv_timer = NULL; 8545 } 8546 8547 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8548 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8549 8550 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8551 return -EINVAL; 8552 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8553 pt_mode = PT_MODE_SYSTEM; 8554 if (pt_mode == PT_MODE_HOST_GUEST) 8555 vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8556 else 8557 vt_init_ops.handle_intel_pt_intr = NULL; 8558 8559 setup_default_sgx_lepubkeyhash(); 8560 8561 if (nested) { 8562 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8563 8564 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8565 if (r) 8566 return r; 8567 } 8568 8569 vmx_set_cpu_caps(); 8570 8571 r = alloc_kvm_area(); 8572 if (r && nested) 8573 nested_vmx_hardware_unsetup(); 8574 8575 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8576 8577 return r; 8578 } 8579 8580 static void vmx_cleanup_l1d_flush(void) 8581 { 8582 if (vmx_l1d_flush_pages) { 8583 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8584 vmx_l1d_flush_pages = NULL; 8585 } 8586 /* Restore state so sysfs ignores VMX */ 8587 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8588 } 8589 8590 static void __vmx_exit(void) 8591 { 8592 allow_smaller_maxphyaddr = false; 8593 8594 vmx_cleanup_l1d_flush(); 8595 } 8596 8597 static void vmx_exit(void) 8598 { 8599 kvm_exit(); 8600 __vmx_exit(); 8601 kvm_x86_vendor_exit(); 8602 8603 } 8604 module_exit(vmx_exit); 8605 8606 static int __init vmx_init(void) 8607 { 8608 int r, cpu; 8609 8610 if (!kvm_is_vmx_supported()) 8611 return -EOPNOTSUPP; 8612 8613 /* 8614 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8615 * to unwind if a later step fails. 8616 */ 8617 hv_init_evmcs(); 8618 8619 r = kvm_x86_vendor_init(&vt_init_ops); 8620 if (r) 8621 return r; 8622 8623 /* 8624 * Must be called after common x86 init so enable_ept is properly set 8625 * up. Hand the parameter mitigation value in which was stored in 8626 * the pre module init parser. If no parameter was given, it will 8627 * contain 'auto' which will be turned into the default 'cond' 8628 * mitigation mode. 8629 */ 8630 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8631 if (r) 8632 goto err_l1d_flush; 8633 8634 for_each_possible_cpu(cpu) { 8635 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8636 8637 pi_init_cpu(cpu); 8638 } 8639 8640 vmx_check_vmcs12_offsets(); 8641 8642 /* 8643 * Shadow paging doesn't have a (further) performance penalty 8644 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8645 * by default 8646 */ 8647 if (!enable_ept) 8648 allow_smaller_maxphyaddr = true; 8649 8650 /* 8651 * Common KVM initialization _must_ come last, after this, /dev/kvm is 8652 * exposed to userspace! 8653 */ 8654 r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), 8655 THIS_MODULE); 8656 if (r) 8657 goto err_kvm_init; 8658 8659 return 0; 8660 8661 err_kvm_init: 8662 __vmx_exit(); 8663 err_l1d_flush: 8664 kvm_x86_vendor_exit(); 8665 return r; 8666 } 8667 module_init(vmx_init); 8668